From 3a18873d2c89483dd52d5e2f3f352ee1fa56917e Mon Sep 17 00:00:00 2001 From: Mark Davis Date: Fri, 17 Mar 2017 12:48:31 +0000 Subject: [PATCH] ICU-12812 Add new implementation, but only expose a limited API as tech preview. Note the XCldrStub class to help migration from CLDR to ICU environment. X-SVN-Rev: 39849 --- .../com/ibm/icu/impl/locale/XCldrStub.java | 390 +++++ .../ibm/icu/impl/locale/XLikelySubtags.java | 667 ++++++++ .../ibm/icu/impl/locale/XLocaleDistance.java | 1338 +++++++++++++++++ .../ibm/icu/impl/locale/XLocaleMatcher.java | 473 ++++++ .../src/com/ibm/icu/util/LocaleMatcher.java | 136 +- .../dev/test/util/DataDrivenTestHelper.java | 187 +++ .../icu/dev/test/util/LocaleMatcherTest.java | 36 +- .../dev/test/util/XLocaleDistanceTest.java | 206 +++ .../icu/dev/test/util/XLocaleMatcherTest.java | 334 ++++ .../dev/test/util/data/localeDistanceTest.txt | 66 + .../dev/test/util/data/localeMatcherTest.txt | 387 +++++ 11 files changed, 4195 insertions(+), 25 deletions(-) create mode 100644 icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XCldrStub.java create mode 100644 icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLikelySubtags.java create mode 100644 icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLocaleDistance.java create mode 100644 icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLocaleMatcher.java create mode 100644 icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/DataDrivenTestHelper.java create mode 100644 icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/XLocaleDistanceTest.java create mode 100644 icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/XLocaleMatcherTest.java create mode 100644 icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/data/localeDistanceTest.txt create mode 100644 icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/data/localeMatcherTest.txt diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XCldrStub.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XCldrStub.java new file 
mode 100644 index 00000000000..8518491b9ed --- /dev/null +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XCldrStub.java @@ -0,0 +1,390 @@ +// © 2017 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html#License +package com.ibm.icu.impl.locale; + +import java.io.BufferedReader; +import java.io.File; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.URL; +import java.nio.charset.Charset; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; +import java.util.TreeMap; +import java.util.TreeSet; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import com.ibm.icu.util.ICUException; +import com.ibm.icu.util.ICUUncheckedIOException; + +/** + * Stub class to make migration easier until we get either Guava or a higher level of Java. + */ +public class XCldrStub { + + public static class Multimap { + private final Map> map; + private final Class> setClass; + + @SuppressWarnings("unchecked") + private Multimap(Map> map, Class setClass) { + this.map = map; + this.setClass = (Class>) (setClass != null + ? setClass + : HashSet.class); + } + public Multimap putAll(K key, V... 
values) { + if (values.length != 0) { + createSetIfMissing(key).addAll(Arrays.asList(values)); + } + return this; + } + public void putAll(K key, Collection values) { + if (!values.isEmpty()) { + createSetIfMissing(key).addAll(values); + } + } + public void putAll(Collection keys, V value) { + for (K key : keys) { + put(key, value); + } + } + public void putAll(Multimap source) { + for (Entry> entry : source.map.entrySet()) { + putAll(entry.getKey(), entry.getValue()); + } + } + public void put(K key, V value) { + createSetIfMissing(key).add(value); + } + private Set createSetIfMissing(K key) { + Set old = map.get(key); + if (old == null) { + map.put(key, old = getInstance()); + } + return old; + } + private Set getInstance() { + try { + return setClass.newInstance(); + } catch (Exception e) { + throw new ICUException(e); + } + } + public Set get(K key) { + Set result = map.get(key); + return result; // == null ? Collections.emptySet() : result; + } + public Set keySet() { + return map.keySet(); + } + public Map> asMap() { + return map; + } + public Set values() { + Collection> values = map.values(); + if (values.size() == 0) { + return Collections.emptySet(); + } + Set result = getInstance(); + for ( Set valueSet : values) { + result.addAll(valueSet); + } + return result; + } + public int size() { + return map.size(); + } + public Iterable> entries() { + return new MultimapIterator(map); + } + @Override + public boolean equals(Object obj) { + Multimap other = (Multimap) obj; + return map.equals(other.map); + } + } + + public static class Multimaps { + public static > R invertFrom(Multimap source, R target) { + for (Entry> entry : source.asMap().entrySet()) { + target.putAll(entry.getValue(), entry.getKey()); + } + return target; + } + public static > R invertFrom(Map source, R target) { + for (Entry entry : source.entrySet()) { + target.put(entry.getValue(), entry.getKey()); + } + return target; + } + /** + * Warning, not functionally the same as Guava; only for 
use in invertFrom. + */ + public static Map forMap(Map map) { + return map; + } + } + + private static class MultimapIterator implements Iterator>, Iterable> { + private final Iterator>> it1; + private Iterator it2 = null; + private final ReusableEntry entry = new ReusableEntry(); + + private MultimapIterator(Map> map) { + it1 = map.entrySet().iterator(); + } + @Override + public boolean hasNext() { + return it1.hasNext() || it2 != null && it2.hasNext(); + } + @Override + public Entry next() { + if (it2 != null && it2.hasNext()) { + entry.value = it2.next(); + } else { + Entry> e = it1.next(); + entry.key = e.getKey(); + it2 = e.getValue().iterator(); + } + return entry; + } + @Override + public Iterator> iterator() { + return this; + } + } + + private static class ReusableEntry implements Entry { + K key; + V value; + @Override + public K getKey() { + return key; + } + @Override + public V getValue() { + return value; + } + @Override + public V setValue(V value) { + throw new UnsupportedOperationException(); + } + } + + public static class HashMultimap extends Multimap { + private HashMultimap() { + super(new HashMap>(), HashSet.class); + } + public static HashMultimap create() { + return new HashMultimap(); + } + } + + public static class TreeMultimap extends Multimap { + private TreeMultimap() { + super(new TreeMap>(), TreeSet.class); + } + public static TreeMultimap create() { + return new TreeMultimap(); + } + } + + public static class LinkedHashMultimap extends Multimap { + private LinkedHashMultimap() { + super(new LinkedHashMap>(), LinkedHashSet.class); + } + public static LinkedHashMultimap create() { + return new LinkedHashMultimap(); + } + } + + + public static class Counter implements Iterable{ + private Map data; + @Override + public Iterator iterator() { + return data.keySet().iterator(); + } + public long get(T s) { + Long result = data.get(s); + return result != null ? 
result : 0L; + } + public void add(T item, int count) { + Long result = data.get(item); + data.put(item, result == null ? count : result + count); + } + } + + public static String join(T[] source, String separator) { + StringBuilder result = new StringBuilder(); + for (int i = 0; i < source.length; ++i) { + if (i != 0) result.append(separator); + result.append(source[i]); + } + return result.toString(); + } + + public static String join(Iterable source, String separator) { + StringBuilder result = new StringBuilder(); + boolean first = true; + for (T item : source) { + if (!first) result.append(separator); + else first = false; + result.append(item.toString()); + } + return result.toString(); + } + + public static class CollectionUtilities { + public static > String join(U source, String separator) { + return XCldrStub.join(source, separator); + } + } + + public static class Joiner { + private final String separator; + private Joiner(String separator) { + this.separator = separator; + } + public static final Joiner on(String separator) { + return new Joiner(separator); + } + public String join(T[] source) { + return XCldrStub.join(source, separator); + } + public String join(Iterable source) { + return XCldrStub.join(source, separator); + } + } + + public static class Splitter { + Pattern pattern; + boolean trimResults = false; + public Splitter(char c) { + this(Pattern.compile("\\Q" + c + "\\E")); + } + public Splitter(Pattern p) { + pattern = p; + } + public static Splitter on(char c) { + return new Splitter(c); + } + public static Splitter on(Pattern p) { + return new Splitter(p); + } + public List splitToList(String input) { + String[] items = pattern.split(input); + if (trimResults) { + for (int i = 0; i < items.length; ++i) { + items[i] = items[i].trim(); + } + } + return Arrays.asList(items); + } + public Splitter trimResults() { + trimResults = true; + return this; + } + public Iterable split(String input) { + return splitToList(input); + } + } + + public 
static class ImmutableSet { + public static Set copyOf(Set values) { + return Collections.unmodifiableSet(new LinkedHashSet(values)); // copy set for safety, preserve order + } + } + public static class ImmutableMap { + public static Map copyOf(Map values) { + return Collections.unmodifiableMap(new LinkedHashMap(values)); // copy set for safety, preserve order + } + } + public static class ImmutableMultimap { + public static Multimap copyOf(Multimap values) { + LinkedHashMap> temp = new LinkedHashMap>(); // semi-deep copy, preserve order + for (Entry> entry : values.asMap().entrySet()) { + Set value = entry.getValue(); + temp.put(entry.getKey(), value.size() == 1 + ? Collections.singleton(value.iterator().next()) + : Collections.unmodifiableSet(new LinkedHashSet(value))); + } + return new Multimap(Collections.unmodifiableMap(temp), null); + } + } + + public static class FileUtilities { + public static final Charset UTF8 = Charset.forName("utf-8"); + + public static BufferedReader openFile(Class class1, String file) { + return openFile(class1, file, UTF8); + } + + public static BufferedReader openFile(Class class1, String file, Charset charset) { + // URL path = null; + // String externalForm = null; + try { + final InputStream resourceAsStream = class1.getResourceAsStream(file); + if (charset == null) { + charset = UTF8; + } + InputStreamReader reader = new InputStreamReader(resourceAsStream, charset); + BufferedReader bufferedReader = new BufferedReader(reader, 1024 * 64); + return bufferedReader; + } catch (Exception e) { + String className = class1 == null ? 
null : class1.getCanonicalName(); + String canonicalName = null; + try { + String relativeFileName = getRelativeFileName(class1, "../util/"); + canonicalName = new File(relativeFileName).getCanonicalPath(); + } catch (Exception e1) { + throw new ICUUncheckedIOException("Couldn't open file: " + file + "; relative to class: " + + className, e); + } + throw new ICUUncheckedIOException("Couldn't open file " + file + "; in path " + canonicalName + "; relative to class: " + + className, e); + } + } + public static String getRelativeFileName(Class class1, String filename) { + URL resource = class1.getResource(filename); + String resourceString = resource.toString(); + if (resourceString.startsWith("file:")) { + return resourceString.substring(5); + } else if (resourceString.startsWith("jar:file:")) { + return resourceString.substring(9); + } else { + throw new ICUUncheckedIOException("File not found: " + resourceString); + } + } + } + + static public class RegexUtilities { + public static int findMismatch(Matcher m, CharSequence s) { + int i; + for (i = 1; i < s.length(); ++i) { + boolean matches = m.reset(s.subSequence(0, i)).matches(); + if (!matches && !m.hitEnd()) { + break; + } + } + return i - 1; + } + public static String showMismatch(Matcher m, CharSequence s) { + int failPoint = findMismatch(m, s); + String show = s.subSequence(0, failPoint) + "☹" + s.subSequence(failPoint, s.length()); + return show; + } + } +} \ No newline at end of file diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLikelySubtags.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLikelySubtags.java new file mode 100644 index 00000000000..28c406e1a90 --- /dev/null +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLikelySubtags.java @@ -0,0 +1,667 @@ +// © 2017 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html#License +package com.ibm.icu.impl.locale; + +import java.util.Collection; +import java.util.Collections; +import java.util.Enumeration; +import java.util.HashMap; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Objects; +import java.util.Set; +import java.util.TreeMap; + +import com.ibm.icu.impl.ICUData; +import com.ibm.icu.impl.ICUResourceBundle; +import com.ibm.icu.impl.locale.XCldrStub.HashMultimap; +import com.ibm.icu.impl.locale.XCldrStub.Multimap; +import com.ibm.icu.impl.locale.XCldrStub.Multimaps; +import com.ibm.icu.util.ICUException; +import com.ibm.icu.util.ULocale; +import com.ibm.icu.util.ULocale.Minimize; +import com.ibm.icu.util.UResourceBundle; + +public class XLikelySubtags { + + private static final XLikelySubtags DEFAULT = new XLikelySubtags(); + + public static final XLikelySubtags getDefault() { + return DEFAULT; + } + + static abstract class Maker { + abstract V make(); + + @SuppressWarnings("unchecked") + public V getSubtable(Map langTable, final K language) { + V scriptTable = langTable.get(language); + if (scriptTable == null) { + langTable.put(language, scriptTable = (V) make()); + } + return scriptTable; + } + + static final Maker HASHMAP = new Maker() { + @Override + @SuppressWarnings("unchecked") + public Map make() { + return new HashMap(); + } + }; + + static final Maker TREEMAP = new Maker() { + @Override + @SuppressWarnings("unchecked") + public Map make() { + return new TreeMap(); + } + }; + } + + public static class Aliases { + final Map toCanonical; + final Multimap toAliases; + public String getCanonical(String alias) { + String canonical = toCanonical.get(alias); + return canonical == null ? alias : canonical; + } + public Set getAliases(String canonical) { + Set aliases = toAliases.get(canonical); + return aliases == null ? 
Collections.singleton(canonical) : aliases; + } + public Aliases(String key) { + UResourceBundle metadata = UResourceBundle.getBundleInstance(ICUData.ICU_BASE_NAME,"metadata",ICUResourceBundle.ICU_DATA_CLASS_LOADER); + UResourceBundle metadataAlias = metadata.get("alias"); + UResourceBundle territoryAlias = metadataAlias.get(key); + Map toCanonical1 = new HashMap(); + for ( int i = 0 ; i < territoryAlias.getSize(); i++ ) { + UResourceBundle res = territoryAlias.get(i); + String aliasFrom = res.getKey(); + if (aliasFrom.contains("_")) { + continue; // only simple aliasing + } + String aliasReason = res.get("reason").getString(); + if (aliasReason.equals("overlong")) { + continue; + } + String aliasTo = res.get("replacement").getString(); + int spacePos = aliasTo.indexOf(' '); + String aliasFirst = spacePos < 0 ? aliasTo : aliasTo.substring(0, spacePos); + if (aliasFirst.contains("_")) { + continue; // only simple aliasing + } + toCanonical1.put(aliasFrom, aliasFirst); + } + if (key.equals("language")) { + toCanonical1.put("mo", "ro"); // special case + } + toCanonical = Collections.unmodifiableMap(toCanonical1); + toAliases = Multimaps.invertFrom(toCanonical1, HashMultimap.create()); + } + } + + public static class LSR { + public final String language; + public final String script; + public final String region; + + public static Aliases LANGUAGE_ALIASES = new Aliases("language"); + public static Aliases REGION_ALIASES = new Aliases("territory"); + + public static LSR from(String language, String script, String region) { + return new LSR(language, script, region); + } + + // from http://unicode.org/reports/tr35/#Unicode_language_identifier + // but simplified to requiring language subtag, and nothing beyond region + // #1 is language + // #2 is script + // #3 is region +// static final String pat = +// "language_id = (unicode_language_subtag)" +// + "(?:sep(unicode_script_subtag))?" 
+// + "(?:sep(unicode_region_subtag))?;\n" +// + "unicode_language_subtag = alpha{2,3}|alpha{5,8};\n" +// + "unicode_script_subtag = alpha{4};\n" +// + "unicode_region_subtag = alpha{2}|digit{3};\n" +// + "sep = [-_];\n" +// + "digit = [0-9];\n" +// + "alpha = [A-Za-z];\n" +// ; +// static { +// System.out.println(pat); +// System.out.println(new UnicodeRegex().compileBnf(pat)); +// } +// static final Pattern LANGUAGE_PATTERN = Pattern.compile( +// "([a-zA-Z0-9]+)" // (?:[-_]([a-zA-Z0-9]+))?(?:[-_]([a-zA-Z0-9]+))?" +// //new UnicodeRegex().compileBnf(pat) +// ); +// + // TODO: fix this to check for format. Not required, since this is only called internally, but safer for the future. + static LSR from(String languageIdentifier) { + String[] parts = languageIdentifier.split("[-_]"); + if (parts.length < 1 || parts.length > 3) { + throw new ICUException("too many subtags"); + } + String lang = parts[0].toLowerCase(); + String p2 = parts.length < 2 ? "": parts[1]; + String p3 = parts.length < 3 ? "": parts[2]; + return p2.length() < 4 ? 
new LSR(lang, "", p2) : new LSR(lang, p2, p3); + + // Matcher matcher = LANGUAGE_PATTERN.matcher(languageIdentifier); + // if (!matcher.matches()) { + // return new LSR(matcher.group(1), matcher.group(2), matcher.group(3)); + // } + // System.out.println(RegexUtilities.showMismatch(matcher, languageIdentifier)); + // throw new ICUException("invalid language id"); + } + + public static LSR from(ULocale locale) { + return new LSR(locale.getLanguage(), locale.getScript(), locale.getCountry()); + } + + public static LSR fromMaximalized(ULocale locale) { + return fromMaximalized(locale.getLanguage(), locale.getScript(), locale.getCountry()); + } + + public static LSR fromMaximalized(String language, String script, String region) { + String canonicalLanguage = LANGUAGE_ALIASES.getCanonical(language); + // script is ok + String canonicalRegion = REGION_ALIASES.getCanonical(region); // getCanonical(REGION_ALIASES.get(region)); + + return DEFAULT.maximize(canonicalLanguage, script, canonicalRegion); + } + + public LSR(String language, String script, String region) { + this.language = language; + this.script = script; + this.region = region; + } + + @Override + public String toString() { + StringBuilder result = new StringBuilder(language); + if (!script.isEmpty()) { + result.append('-').append(script); + } + if (!region.isEmpty()) { + result.append('-').append(region); + } + return result.toString(); + } + public LSR replace(String language2, String script2, String region2) { + if (language2 == null && script2 == null && region2 == null) return this; + return new LSR( + language2 == null ? language: language2, + script2 == null ? script : script2, + region2 == null ? 
region : region2); + } + @Override + public boolean equals(Object obj) { + LSR other = (LSR) obj; + return language.equals(other.language) + && script.equals(other.script) + && region.equals(other.region); + } + @Override + public int hashCode() { + return Objects.hash(language, script, region); + } + } + + final Map>> langTable; + + public XLikelySubtags() { + this(getDefaultRawData(), true); + } + + private static Map getDefaultRawData() { + Map rawData = new TreeMap(); + UResourceBundle bundle = UResourceBundle.getBundleInstance( ICUData.ICU_BASE_NAME, "likelySubtags"); + for (Enumeration enumer = bundle.getKeys(); enumer.hasMoreElements();) { + String key = enumer.nextElement(); + rawData.put(key, bundle.getString(key)); + } + return rawData; + } + + public XLikelySubtags(Map rawData, boolean skipNoncanonical) { + this.langTable = init(rawData, skipNoncanonical); + } + + private Map>> init(final Map rawData, boolean skipNoncanonical) { + // prepare alias info. We want a mapping from the canonical form to all aliases + + //Multimap canonicalToAliasLanguage = HashMultimap.create(); + // getAliasInfo(LANGUAGE_ALIASES, canonicalToAliasLanguage); + + // Don't bother with script; there are none + + //Multimap canonicalToAliasRegion = HashMultimap.create(); + // getAliasInfo(REGION_ALIASES, canonicalToAliasRegion); + + Maker maker = Maker.TREEMAP; + Map>> result = maker.make(); +// Splitter bar = Splitter.on('_'); +// int last = -1; + // set the base data + Map internCache = new HashMap(); + for (Entry sourceTarget : rawData.entrySet()) { + LSR ltp = LSR.from(sourceTarget.getKey()); + final String language = ltp.language; + final String script = ltp.script; + final String region = ltp.region; + + ltp = LSR.from(sourceTarget.getValue()); + String languageTarget = ltp.language; + final String scriptTarget = ltp.script; + final String regionTarget = ltp.region; + + set(result, language, script, region, languageTarget, scriptTarget, regionTarget, internCache); + // now 
add aliases + Collection languageAliases = LSR.LANGUAGE_ALIASES.getAliases(language); +// if (languageAliases.isEmpty()) { +// languageAliases = Collections.singleton(language); +// } + Collection regionAliases = LSR.REGION_ALIASES.getAliases(region); +// if (regionAliases.isEmpty()) { +// regionAliases = Collections.singleton(region); +// } + for (String languageAlias : languageAliases) { + for (String regionAlias : regionAliases) { + if (languageAlias.equals(language) && regionAlias.equals(region)) { + continue; + } + set(result, languageAlias, script, regionAlias, languageTarget, scriptTarget, regionTarget, internCache); + } + } + } + // hack + set(result, "und", "Latn", "", "en", "Latn", "US", internCache); + + // hack, ensure that if und-YY => und-Xxxx-YY, then we add Xxxx=>YY to the table + // + + // so und-Latn-GH => ak-Latn-GH + Map> undScriptMap = result.get("und"); + Map undEmptyRegionMap = undScriptMap.get(""); + for (Entry regionEntry : undEmptyRegionMap.entrySet()) { + final LSR value = regionEntry.getValue(); + set(result, "und", value.script, value.region, value); + } + // + // check that every level has "" (or "und") + if (!result.containsKey("und")) { + throw new IllegalArgumentException("failure: base"); + } + for (Entry>> langEntry : result.entrySet()) { + String lang = langEntry.getKey(); + final Map> scriptMap = langEntry.getValue(); + if (!scriptMap.containsKey("")) { + throw new IllegalArgumentException("failure: " + lang); + } + for (Entry> scriptEntry : scriptMap.entrySet()) { + String script = scriptEntry.getKey(); + final Map regionMap = scriptEntry.getValue(); + if (!regionMap.containsKey("")) { + throw new IllegalArgumentException("failure: " + lang + "-" + script); + } + // for (Entry regionEntry : regionMap.entrySet()) { + // String region = regionEntry.getKey(); + // LSR value = regionEntry.getValue(); + // } + } + } + return result; + } + +// private void getAliasInfo(Map, String>> aliasInfo, Multimap canonicalToAlias) { +// for 
(Entry, String>> e : aliasInfo.entrySet()) { +// final String alias = e.getKey(); +// if (alias.contains("_")) { +// continue; // only do simple aliasing +// } +// String canonical = getCanonical(e.getValue()); +// canonicalToAlias.put(canonical, alias); +// } +// } + +// private static String getCanonical(R2, String> aliasAndReason) { +// if (aliasAndReason == null) { +// return null; +// } +// if (aliasAndReason.get1().equals("overlong")) { +// return null; +// } +// List value = aliasAndReason.get0(); +// if (value.size() != 1) { +// return null; +// } +// final String canonical = value.iterator().next(); +// if (canonical.contains("_")) { +// return null; // only do simple aliasing +// } +// return canonical; +// } + + private void set(Map>> langTable, final String language, final String script, final String region, + final String languageTarget, final String scriptTarget, final String regionTarget, Map internCache) { + LSR newValue = new LSR(languageTarget, scriptTarget, regionTarget); + LSR oldValue = internCache.get(newValue); + if (oldValue == null) { + internCache.put(newValue, newValue); + oldValue = newValue; + } + set(langTable, language, script, region, oldValue); + } + + private void set(Map>> langTable, final String language, final String script, final String region, LSR newValue) { + Map> scriptTable = Maker.TREEMAP.getSubtable(langTable, language); + Map regionTable = Maker.TREEMAP.getSubtable(scriptTable, script); + LSR oldValue = regionTable.get(region); + if (oldValue != null) { + int debug = 0; + } + regionTable.put(region, newValue); + } + + /** + * Convenience methods + * @param source + * @return + */ + public LSR maximize(String source) { + return maximize(ULocale.forLanguageTag(source)); + } + + public LSR maximize(ULocale source) { + return maximize(source.getLanguage(), source.getScript(), source.getCountry()); + } + + public LSR maximize(LSR source) { + return maximize(source.language, source.script, source.region); + } + + // public 
static ULocale addLikelySubtags(ULocale loc) { + // + // } + + /** + * Raw access to addLikelySubtags. Input must be in canonical format, eg "en", not "eng" or "EN". + */ + public LSR maximize(String language, String script, String region) { + int retainOldMask = 0; + Map> scriptTable = langTable.get(language); + if (scriptTable == null) { // cannot happen if language == "und" + retainOldMask |= 4; + scriptTable = langTable.get("und"); + } else if (!language.equals("und")) { + retainOldMask |= 4; + } + + if (script.equals("Zzzz")) { + script = ""; + } + Map regionTable = scriptTable.get(script); + if (regionTable == null) { // cannot happen if script == "" + retainOldMask |= 2; + regionTable = scriptTable.get(""); + } else if (!script.isEmpty()) { + retainOldMask |= 2; + } + + if (region.equals("ZZ")) { + region = ""; + } + LSR result = regionTable.get(region); + if (result == null) { // cannot happen if region == "" + retainOldMask |= 1; + result = regionTable.get(""); + if (result == null) { + return null; + } + } else if (!region.isEmpty()) { + retainOldMask |= 1; + } + + switch (retainOldMask) { + default: + case 0: return result; + case 1: return result.replace(null, null, region); + case 2: return result.replace(null, script, null); + case 3: return result.replace(null, script, region); + case 4: return result.replace(language, null, null); + case 5: return result.replace(language, null, region); + case 6: return result.replace(language, script, null); + case 7: return result.replace(language, script, region); + } + } + + private LSR minimizeSubtags(String languageIn, String scriptIn, String regionIn, Minimize fieldToFavor) { + LSR result = maximize(languageIn, scriptIn, regionIn); + + // We could try just a series of checks, like: + // LSR result2 = addLikelySubtags(languageIn, "", ""); + // if result.equals(result2) return result2; + // However, we can optimize 2 of the cases: + // (languageIn, "", "") + // (languageIn, "", regionIn) + + Map> scriptTable = 
langTable.get(result.language); + + Map regionTable0 = scriptTable.get(""); + LSR value00 = regionTable0.get(""); + boolean favorRegionOk = false; + if (result.script.equals(value00.script)) { //script is default + if (result.region.equals(value00.region)) { + return result.replace(null, "", ""); + } else if (fieldToFavor == fieldToFavor.FAVOR_REGION) { + return result.replace(null, "", null); + } else { + favorRegionOk = true; + } + } + + // The last case is not as easy to optimize. + // Maybe do later, but for now use the straightforward code. + LSR result2 = maximize(languageIn, scriptIn, ""); + if (result2.equals(result)) { + return result.replace(null, null, ""); + } else if (favorRegionOk) { + return result.replace(null, "", null); + } + return result; + } + + private static StringBuilder show(Map map, String indent, StringBuilder output) { + String first = indent.isEmpty() ? "" : "\t"; + for (Entry e : map.entrySet()) { + String key = e.getKey(); + V value = e.getValue(); + output.append(first + (key.isEmpty() ? 
"∅" : key)); + if (value instanceof Map) { + show((Map)value, indent+"\t", output); + } else { + output.append("\t" + Objects.toString(value)).append("\n"); + } + first = indent; + } + return output; + } + + @Override + public String toString() { + return show(langTable, "", new StringBuilder()).toString(); + } + + // public static void main(String[] args) { + // System.out.println(LSR.fromMaximalized(ULocale.ENGLISH)); + // + // final Map rawData = sdi.getLikelySubtags(); + // XLikelySubtags ls = XLikelySubtags.getDefault(); + // System.out.println(ls); + // ls.maximize(new ULocale("iw")); + // if (true) return; + // + // LanguageTagParser ltp = new LanguageTagParser(); + // + // // get all the languages, scripts, and regions + // Set languages = new TreeSet(); + // Set scripts = new TreeSet(); + // Set regions = new TreeSet(); + // Counter languageCounter = new Counter(); + // Counter scriptCounter = new Counter(); + // Counter regionCounter = new Counter(); + // + // for (Entry sourceTarget : rawData.entrySet()) { + // final String source = sourceTarget.getKey(); + // ltp.set(source); + // languages.add(ltp.getLanguage()); + // scripts.add(ltp.getScript()); + // regions.add(ltp.getRegion()); + // final String target = sourceTarget.getValue(); + // ltp.set(target); + // add(target, languageCounter, ltp.getLanguage(), 1); + // add(target, scriptCounter, ltp.getScript(), 1); + // add(target, regionCounter, ltp.getRegion(), 1); + // } + // ltp.set("und-Zzzz-ZZ"); + // languageCounter.add(ltp.getLanguage(), 1); + // scriptCounter.add(ltp.getScript(), 1); + // regionCounter.add(ltp.getRegion(), 1); + // + // if (SHORT) { + // removeSingletons(languages, languageCounter); + // removeSingletons(scripts, scriptCounter); + // removeSingletons(regions, regionCounter); + // } + // + // System.out.println("languages: " + languages.size() + "\n\t" + languages + "\n\t" + languageCounter); + // System.out.println("scripts: " + scripts.size() + "\n\t" + scripts + "\n\t" + 
scriptCounter); + // System.out.println("regions: " + regions.size() + "\n\t" + regions + "\n\t" + regionCounter); + // + // int maxCount = Integer.MAX_VALUE; + // + // int counter = maxCount; + // long tempTime = System.nanoTime(); + // newMax: + // for (String language : languages) { + // for (String script : scripts) { + // for (String region : regions) { + // if (--counter < 0) break newMax; + // LSR result = ls.maximize(language, script, region); + // } + // } + // } + // long newMaxTime = System.nanoTime() - tempTime; + // System.out.println("newMaxTime: " + newMaxTime); + // + // counter = maxCount; + // tempTime = System.nanoTime(); + // newMin: + // for (String language : languages) { + // for (String script : scripts) { + // for (String region : regions) { + // if (--counter < 0) break newMin; + // LSR minNewS = ls.minimizeSubtags(language, script, region, Minimize.FAVOR_SCRIPT); + // } + // } + // } + // long newMinTime = System.nanoTime() - tempTime; + // System.out.println("newMinTime: " + newMinTime); + // + // // ***** + // + // tempTime = System.nanoTime(); + // counter = maxCount; + // oldMax: + // for (String language : languages) { + // for (String script : scripts) { + // for (String region : regions) { + // if (--counter < 0) break oldMax; + // ULocale tempLocale = new ULocale(language, script, region); + // ULocale max = ULocale.addLikelySubtags(tempLocale); + // } + // } + // } + // long oldMaxTime = System.nanoTime() - tempTime; + // System.out.println("oldMaxTime: " + oldMaxTime + "\t" + oldMaxTime/newMaxTime + "x"); + // + // counter = maxCount; + // tempTime = System.nanoTime(); + // oldMin: + // for (String language : languages) { + // for (String script : scripts) { + // for (String region : regions) { + // if (--counter < 0) break oldMin; + // ULocale tempLocale = new ULocale(language, script, region); + // ULocale minOldS = ULocale.minimizeSubtags(tempLocale, Minimize.FAVOR_SCRIPT); + // } + // } + // } + // long oldMinTime = 
System.nanoTime() - tempTime; + // System.out.println("oldMinTime: " + oldMinTime + "\t" + oldMinTime/newMinTime + "x"); + // + // counter = maxCount; + // testMain: + // for (String language : languages) { + // System.out.println(language); + // int tests = 0; + // for (String script : scripts) { + // for (String region : regions) { + // ++tests; + // if (--counter < 0) break testMain; + // LSR maxNew = ls.maximize(language, script, region); + // LSR minNewS = ls.minimizeSubtags(language, script, region, Minimize.FAVOR_SCRIPT); + // LSR minNewR = ls.minimizeSubtags(language, script, region, Minimize.FAVOR_REGION); + // + // ULocale tempLocale = new ULocale(language, script, region); + // ULocale maxOld = ULocale.addLikelySubtags(tempLocale); + // ULocale minOldS = ULocale.minimizeSubtags(tempLocale, Minimize.FAVOR_SCRIPT); + // ULocale minOldR = ULocale.minimizeSubtags(tempLocale, Minimize.FAVOR_REGION); + // + // // check values + // final String maxNewS = String.valueOf(maxNew); + // final String maxOldS = maxOld.toLanguageTag(); + // boolean sameMax = maxOldS.equals(maxNewS); + // + // final String minNewSS = String.valueOf(minNewS); + // final String minOldSS = minOldS.toLanguageTag(); + // boolean sameMinS = minNewSS.equals(minOldSS); + // + // final String minNewRS = String.valueOf(minNewR); + // final String minOldRS = minOldS.toLanguageTag(); + // boolean sameMinR = minNewRS.equals(minOldRS); + // + // if (sameMax && sameMinS && sameMinR) continue; + // System.out.println(new LSR(language, script, region) + // + "\tmax: " + maxNew + // + (sameMax ? "" : "≠" + maxOldS) + // + "\tminS: " + minNewS + // + (sameMinS ? "" : "≠" + minOldS) + // + "\tminR: " + minNewR + // + (sameMinR ? 
"" : "≠" + minOldR) + // ); + // } + // } + // System.out.println(language + ": " + tests); + // } + // } + // + // private static void add(String target, Counter languageCounter, String language, int count) { + // if (language.equals("aa")) { + // int debug = 0; + // } + // languageCounter.add(language, count); + // } + // + // private static void removeSingletons(Set languages, Counter languageCounter) { + // for (String s : languageCounter) { + // final long count = languageCounter.get(s); + // if (count <= 1) { + // languages.remove(s); + // } + // } + // } +} diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLocaleDistance.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLocaleDistance.java new file mode 100644 index 00000000000..6f15a280cb0 --- /dev/null +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLocaleDistance.java @@ -0,0 +1,1338 @@ +// © 2017 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html#License +package com.ibm.icu.impl.locale; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.Enumeration; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Objects; +import java.util.Set; +import java.util.TreeMap; +import java.util.TreeSet; +import java.util.function.Predicate; + +import com.ibm.icu.impl.ICUResourceBundle; +import com.ibm.icu.impl.Row; +import com.ibm.icu.impl.Row.R4; +import com.ibm.icu.impl.locale.XCldrStub.CollectionUtilities; +import com.ibm.icu.impl.locale.XCldrStub.ImmutableMap; +import com.ibm.icu.impl.locale.XCldrStub.ImmutableMultimap; +import com.ibm.icu.impl.locale.XCldrStub.ImmutableSet; +import com.ibm.icu.impl.locale.XCldrStub.LinkedHashMultimap; +import com.ibm.icu.impl.locale.XCldrStub.Multimap; 
+import com.ibm.icu.impl.locale.XCldrStub.Multimaps; +import com.ibm.icu.impl.locale.XCldrStub.Splitter; +import com.ibm.icu.impl.locale.XCldrStub.TreeMultimap; +import com.ibm.icu.impl.locale.XLikelySubtags.LSR; +import com.ibm.icu.impl.locale.XLocaleDistance.RegionMapper.Builder; +import com.ibm.icu.text.LocaleDisplayNames; +import com.ibm.icu.util.LocaleMatcher; +import com.ibm.icu.util.Output; +import com.ibm.icu.util.ULocale; +import com.ibm.icu.util.UResourceBundleIterator; + +public class XLocaleDistance { + + static final boolean PRINT_OVERRIDES = false; + + public static final int ABOVE_THRESHOLD = 100; + + @Deprecated + public static final String ANY = "�"; // matches any character. Uses value above any subtag. + + private static String fixAny(String string) { + return "*".equals(string) ? ANY : string; + } + + static final LocaleDisplayNames english = LocaleDisplayNames.getInstance(ULocale.ENGLISH); + + private static List> xGetLanguageMatcherData() { + List> distanceList = new ArrayList>(); + + ICUResourceBundle suppData = LocaleMatcher.getICUSupplementalData(); + ICUResourceBundle languageMatchingNew = suppData.findTopLevel("languageMatchingNew"); + ICUResourceBundle written = (ICUResourceBundle) languageMatchingNew.get("written"); + + for(UResourceBundleIterator iter = written.getIterator(); iter.hasNext();) { + ICUResourceBundle item = (ICUResourceBundle) iter.next(); + boolean oneway = item.getSize() > 3 && "1".equals(item.getString(3)); + distanceList.add( + (R4) // note: .freeze returning wrong type, so casting. 
+ Row.of( + item.getString(0), + item.getString(1), + Integer.parseInt(item.getString(2)), + oneway) + .freeze()); + } + return Collections.unmodifiableList(distanceList); + } + + private static Set xGetParadigmLocales() { + ICUResourceBundle suppData = LocaleMatcher.getICUSupplementalData(); + ICUResourceBundle languageMatchingInfo = suppData.findTopLevel("languageMatchingInfo"); + ICUResourceBundle writtenParadigmLocales = (ICUResourceBundle) languageMatchingInfo.get("written") + .get("paradigmLocales"); +// paradigmLocales{ "en", "en-GB",... } + HashSet paradigmLocales = new HashSet(Arrays.asList(writtenParadigmLocales.getStringArray())); + return Collections.unmodifiableSet(paradigmLocales); + } + + private static Map xGetMatchVariables() { + ICUResourceBundle suppData = LocaleMatcher.getICUSupplementalData(); + ICUResourceBundle languageMatchingInfo = suppData.findTopLevel("languageMatchingInfo"); + ICUResourceBundle writtenMatchVariables = (ICUResourceBundle) languageMatchingInfo.get("written") + .get("matchVariable"); +// matchVariable{ americas{"019"} cnsar{"HK+MO"} ...} + + HashMap matchVariables = new HashMap(); + for (Enumeration enumer = writtenMatchVariables.getKeys(); enumer.hasMoreElements(); ) { + String key = enumer.nextElement(); + matchVariables.put(key, writtenMatchVariables.getString(key)); + } + return Collections.unmodifiableMap(matchVariables); + } + + private static Multimap xGetContainment() { + TreeMultimap containment = TreeMultimap.create(); + containment + .putAll("001", "019", "002", "150", "142", "009") + .putAll("011", "BF", "BJ", "CI", "CV", "GH", "GM", "GN", "GW", "LR", "ML", "MR", "NE", "NG", "SH", "SL", "SN", "TG") + .putAll("013", "BZ", "CR", "GT", "HN", "MX", "NI", "PA", "SV") + .putAll("014", "BI", "DJ", "ER", "ET", "KE", "KM", "MG", "MU", "MW", "MZ", "RE", "RW", "SC", "SO", "SS", "TZ", "UG", "YT", "ZM", "ZW") + .putAll("142", "145", "143", "030", "034", "035") + .putAll("143", "TM", "TJ", "KG", "KZ", "UZ") + .putAll("145", 
"AE", "AM", "AZ", "BH", "CY", "GE", "IL", "IQ", "JO", "KW", "LB", "OM", "PS", "QA", "SA", "SY", "TR", "YE", "NT", "YD") + .putAll("015", "DZ", "EG", "EH", "LY", "MA", "SD", "TN", "EA", "IC") + .putAll("150", "154", "155", "151", "039") + .putAll("151", "BG", "BY", "CZ", "HU", "MD", "PL", "RO", "RU", "SK", "UA", "SU") + .putAll("154", "GG", "IM", "JE", "AX", "DK", "EE", "FI", "FO", "GB", "IE", "IS", "LT", "LV", "NO", "SE", "SJ") + .putAll("155", "AT", "BE", "CH", "DE", "FR", "LI", "LU", "MC", "NL", "DD", "FX") + .putAll("017", "AO", "CD", "CF", "CG", "CM", "GA", "GQ", "ST", "TD", "ZR") + .putAll("018", "BW", "LS", "NA", "SZ", "ZA") + .putAll("019", "021", "013", "029", "005", "003", "419") + .putAll("002", "015", "011", "017", "014", "018") + .putAll("021", "BM", "CA", "GL", "PM", "US") + .putAll("029", "AG", "AI", "AW", "BB", "BL", "BQ", "BS", "CU", "CW", "DM", "DO", "GD", "GP", "HT", "JM", "KN", "KY", "LC", "MF", "MQ", "MS", "PR", "SX", "TC", "TT", "VC", "VG", "VI", "AN") + .putAll("003", "021", "013", "029") + .putAll("030", "CN", "HK", "JP", "KP", "KR", "MN", "MO", "TW") + .putAll("035", "BN", "ID", "KH", "LA", "MM", "MY", "PH", "SG", "TH", "TL", "VN", "BU", "TP") + .putAll("039", "AD", "AL", "BA", "ES", "GI", "GR", "HR", "IT", "ME", "MK", "MT", "RS", "PT", "SI", "SM", "VA", "XK", "CS", "YU") + .putAll("419", "013", "029", "005") + .putAll("005", "AR", "BO", "BR", "CL", "CO", "EC", "FK", "GF", "GY", "PE", "PY", "SR", "UY", "VE") + .putAll("053", "AU", "NF", "NZ") + .putAll("054", "FJ", "NC", "PG", "SB", "VU") + .putAll("057", "FM", "GU", "KI", "MH", "MP", "NR", "PW") + .putAll("061", "AS", "CK", "NU", "PF", "PN", "TK", "TO", "TV", "WF", "WS") + .putAll("034", "AF", "BD", "BT", "IN", "IR", "LK", "MV", "NP", "PK") + .putAll("009", "053", "054", "057", "061", "QO") + .putAll("QO", "AQ", "BV", "CC", "CX", "GS", "HM", "IO", "TF", "UM", "AC", "CP", "DG", "TA") + ; + //Can't use following, because data from CLDR is discarded +// ICUResourceBundle suppData = 
LocaleMatcher.getICUSupplementalData(); +// UResourceBundle territoryContainment = suppData.get("territoryContainment"); +// for (int i = 0 ; i < territoryContainment.getSize(); i++) { +// UResourceBundle mapping = territoryContainment.get(i); +// String parent = mapping.getKey(); +// for (int j = 0 ; j < mapping.getSize(); j++) { +// String child = mapping.getString(j); +// containment.put(parent,child); +// System.out.println(parent + " => " + child); +// } +// } + TreeMultimap containmentResolved = TreeMultimap.create(); + fill("001", containment, containmentResolved); + return ImmutableMultimap.copyOf(containmentResolved); + } + + private static Set fill(String region, TreeMultimap containment, Multimap toAddTo) { + Set contained = containment.get(region); + if (contained == null) { + return Collections.emptySet(); + } + toAddTo.putAll(region, contained); // do top level + // then recursively + for (String subregion : contained) { + toAddTo.putAll(region, fill(subregion, containment, toAddTo)); + } + return toAddTo.get(region); + } + + + static final Multimap CONTAINER_TO_CONTAINED; + static final Multimap CONTAINER_TO_CONTAINED_FINAL; + static { +// Multimap containerToContainedTemp = xGetContainment(); +// fill(Region.getInstance("001"), containerToContainedTemp); + + CONTAINER_TO_CONTAINED = xGetContainment(); + Multimap containerToFinalContainedBuilder = TreeMultimap.create(); + for (Entry> entry : CONTAINER_TO_CONTAINED.asMap().entrySet()) { + String container = entry.getKey(); + for (String contained : entry.getValue()) { + if (CONTAINER_TO_CONTAINED.get(contained) == null) { + containerToFinalContainedBuilder.put(container, contained); + } + } + } + CONTAINER_TO_CONTAINED_FINAL = ImmutableMultimap.copyOf(containerToFinalContainedBuilder); + } + + final static private Set ALL_FINAL_REGIONS = ImmutableSet.copyOf(CONTAINER_TO_CONTAINED_FINAL.get("001")); + + // end of data from CLDR + + private final DistanceTable languageDesired2Supported; + private final 
RegionMapper regionMapper; + private final int defaultLanguageDistance; + private final int defaultScriptDistance; + private final int defaultRegionDistance; + + @Deprecated + public static abstract class DistanceTable { + abstract int getDistance(String desiredLang, String supportedlang, Output table, boolean starEquals); + abstract Set getCloser(int threshold); + abstract String toString(boolean abbreviate); + public DistanceTable compact() { + return this; + } + // public Integer getInternalDistance(String a, String b) { + // return null; + // } + public DistanceNode getInternalNode(String any, String any2) { + return null; + } + public Map> getInternalMatches() { + return null; + } + public boolean isEmpty() { + return true; + } + } + + @Deprecated + public static class DistanceNode { + final int distance; + + public DistanceNode(int distance) { + this.distance = distance; + } + + public DistanceTable getDistanceTable() { + return null; + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof DistanceNode)) { + return false; + } + DistanceNode other = (DistanceNode) obj; + return distance == other.distance; + } + @Override + public int hashCode() { + return distance; + } + @Override + public String toString() { + return "\ndistance: " + distance; + } + } + + private interface IdMapper { + public V toId(K source); + } + + static class IdMakerFull implements IdMapper { + private final Map objectToInt = new HashMap(); + private final List intToObject = new ArrayList(); + final String name; // for debugging + + IdMakerFull(String name) { + this.name = name; + } + + IdMakerFull() { + this("unnamed"); + } + + IdMakerFull(String name, T zeroValue) { + this(name); + add(zeroValue); + } + + /** + * Return an id, making one if there wasn't one already. 
+ */ + public Integer add(T source) { + Integer result = objectToInt.get(source); + if (result == null) { + Integer newResult = intToObject.size(); + objectToInt.put(source, newResult); + intToObject.add(source); + return newResult; + } else { + return result; + } + } + + /** + * Return an id, or null if there is none. + */ + @Override + public Integer toId(T source) { + return objectToInt.get(source); + // return value == null ? 0 : value; + } + + /** + * Return the object for the id, or null if there is none. + */ + public T fromId(int id) { + return intToObject.get(id); + } + + /** + * Return interned object + */ + public T intern(T source) { + return fromId(add(source)); + } + + public int size() { + return intToObject.size(); + } + /** + * Same as add, except if the object didn't have an id, return null; + */ + public Integer getOldAndAdd(T source) { + Integer result = objectToInt.get(source); + if (result == null) { + Integer newResult = intToObject.size(); + objectToInt.put(source, newResult); + intToObject.add(source); + } + return result; + } + + @Override + public String toString() { + return size() + ": " + intToObject; + } + @Override + public boolean equals(Object obj) { + if (!(obj instanceof IdMakerFull)) { + return false; + } + IdMakerFull other = (IdMakerFull) obj; + return intToObject.equals(other.intToObject); + } + @Override + public int hashCode() { + return intToObject.hashCode(); + } + } + + static class StringDistanceNode extends DistanceNode { + final DistanceTable distanceTable; + + public StringDistanceNode(int distance, DistanceTable distanceTable) { + super(distance); + this.distanceTable = distanceTable; + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof StringDistanceNode)) { + return false; + } + StringDistanceNode other = (StringDistanceNode) obj; + return distance == other.distance && Objects.equals(distanceTable, other.distanceTable); + } + @Override + public int hashCode() { + return distance ^ 
Objects.hashCode(distanceTable); + } + + StringDistanceNode(int distance) { + this(distance, new StringDistanceTable()); + } + + public void addSubtables(String desiredSub, String supportedSub, CopyIfEmpty r) { + ((StringDistanceTable) distanceTable).addSubtables(desiredSub, supportedSub, r); + } + @Override + public String toString() { + return "distance: " + distance + "\n" + distanceTable; + } + + public void copyTables(StringDistanceTable value) { + if (value != null) { + ((StringDistanceTable)distanceTable).copy(value); + } + } + + @Override + public DistanceTable getDistanceTable() { + return distanceTable; + } + } + + public XLocaleDistance(DistanceTable datadistancetable2, RegionMapper regionMapper) { + languageDesired2Supported = datadistancetable2; + this.regionMapper = regionMapper; + + StringDistanceNode languageNode = (StringDistanceNode) ((StringDistanceTable) languageDesired2Supported).subtables.get(ANY).get(ANY); + defaultLanguageDistance = languageNode.distance; + StringDistanceNode scriptNode = (StringDistanceNode) ((StringDistanceTable)languageNode.distanceTable).subtables.get(ANY).get(ANY); + defaultScriptDistance = scriptNode.distance; + DistanceNode regionNode = ((StringDistanceTable)scriptNode.distanceTable).subtables.get(ANY).get(ANY); + defaultRegionDistance = regionNode.distance; + } + + private static Map newMap() { // for debugging + return new TreeMap(); + } + + /** + * Internal class + */ + @Deprecated + public static class StringDistanceTable extends DistanceTable { + final Map> subtables; + + StringDistanceTable(Map> tables) { + subtables = tables; + } + StringDistanceTable() { + this(newMap()); + } + + @Override + public boolean isEmpty() { + return subtables.isEmpty(); + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof StringDistanceTable)) { + return false; + } + StringDistanceTable other = (StringDistanceTable) obj; + return subtables.equals(other.subtables); + } + @Override + public int hashCode() { + 
return subtables.hashCode(); + } + + @Override + public int getDistance(String desired, String supported, Output distanceTable, boolean starEquals) { + boolean star = false; + Map sub2 = subtables.get(desired); + if (sub2 == null) { + sub2 = subtables.get(ANY); // <*, supported> + star = true; + } + DistanceNode value = sub2.get(supported); // <*/desired, supported> + if (value == null) { + value = sub2.get(ANY); // <*/desired, *> + if (value == null && !star) { + sub2 = subtables.get(ANY); // <*, supported> + value = sub2.get(supported); + if (value == null) { + value = sub2.get(ANY); // <*, *> + } + } + star = true; + } + if (distanceTable != null) { + distanceTable.value = ((StringDistanceNode) value).distanceTable; + } + return starEquals && star && desired.equals(supported) ? 0 : value.distance; + } + + public void copy(StringDistanceTable other) { + for (Entry> e1 : other.subtables.entrySet()) { + for (Entry e2 : e1.getValue().entrySet()) { + DistanceNode value = e2.getValue(); + DistanceNode subNode = addSubtable(e1.getKey(), e2.getKey(), value.distance); + } + } + } + + DistanceNode addSubtable(String desired, String supported, int distance) { + Map sub2 = subtables.get(desired); + if (sub2 == null) { + subtables.put(desired, sub2 = newMap()); + } + DistanceNode oldNode = sub2.get(supported); + if (oldNode != null) { + return oldNode; + } + + final StringDistanceNode newNode = new StringDistanceNode(distance); + sub2.put(supported, newNode); + return newNode; + } + + /** + * Return null if value doesn't exist + */ + private DistanceNode getNode(String desired, String supported) { + Map sub2 = subtables.get(desired); + if (sub2 == null) { + return null; + } + return sub2.get(supported); + } + + + /** add table for each subitem that matches and doesn't have a table already + */ + public void addSubtables( + String desired, String supported, + Predicate action) { + int count = 0; + DistanceNode node = getNode(desired, supported); + if (node == null) { + // get 
the distance it would have + Output node2 = new Output(); + int distance = getDistance(desired, supported, node2, true); + // now add it + node = addSubtable(desired, supported, distance); + if (node2.value != null) { + ((StringDistanceNode)node).copyTables((StringDistanceTable)(node2.value)); + } + } + action.test(node); + } + + public void addSubtables(String desiredLang, String supportedLang, + String desiredScript, String supportedScript, + int percentage) { + + // add to all the values that have the matching desiredLang and supportedLang + boolean haveKeys = false; + for (Entry> e1 : subtables.entrySet()) { + String key1 = e1.getKey(); + final boolean desiredIsKey = desiredLang.equals(key1); + if (desiredIsKey || desiredLang.equals(ANY)) { + for (Entry e2 : e1.getValue().entrySet()) { + String key2 = e2.getKey(); + final boolean supportedIsKey = supportedLang.equals(key2); + haveKeys |= (desiredIsKey && supportedIsKey); + if (supportedIsKey || supportedLang.equals(ANY)) { + DistanceNode value = e2.getValue(); + ((StringDistanceTable)value.getDistanceTable()).addSubtable(desiredScript, supportedScript, percentage); + } + } + } + } + // now add the sequence explicitly + StringDistanceTable dt = new StringDistanceTable(); + dt.addSubtable(desiredScript, supportedScript, percentage); + CopyIfEmpty r = new CopyIfEmpty(dt); + addSubtables(desiredLang, supportedLang, r); + } + + public void addSubtables(String desiredLang, String supportedLang, + String desiredScript, String supportedScript, + String desiredRegion, String supportedRegion, + int percentage) { + + // add to all the values that have the matching desiredLang and supportedLang + boolean haveKeys = false; + for (Entry> e1 : subtables.entrySet()) { + String key1 = e1.getKey(); + final boolean desiredIsKey = desiredLang.equals(key1); + if (desiredIsKey || desiredLang.equals(ANY)) { + for (Entry e2 : e1.getValue().entrySet()) { + String key2 = e2.getKey(); + final boolean supportedIsKey = 
supportedLang.equals(key2); + haveKeys |= (desiredIsKey && supportedIsKey); + if (supportedIsKey || supportedLang.equals(ANY)) { + StringDistanceNode value = (StringDistanceNode) e2.getValue(); + ((StringDistanceTable)value.distanceTable).addSubtables(desiredScript, supportedScript, desiredRegion, supportedRegion, percentage); + } + } + } + } + // now add the sequence explicitly + + StringDistanceTable dt = new StringDistanceTable(); + dt.addSubtable(desiredRegion, supportedRegion, percentage); + AddSub r = new AddSub(desiredScript, supportedScript, dt); + addSubtables(desiredLang, supportedLang, r); + } + + @Override + public String toString() { + return toString(false); + } + + @Override + public String toString(boolean abbreviate) { + return toString(abbreviate, "", new IdMakerFull("interner"), new StringBuilder()).toString(); + } + + public StringBuilder toString(boolean abbreviate, String indent, IdMakerFull intern, StringBuilder buffer) { + String indent2 = indent.isEmpty() ? "" : "\t"; + Integer id = abbreviate ? intern.getOldAndAdd(subtables) : null; + if (id != null) { + buffer.append(indent2).append('#').append(id).append('\n'); + } else for (Entry> e1 : subtables.entrySet()) { + final Map subsubtable = e1.getValue(); + buffer.append(indent2).append(e1.getKey()); + String indent3 = "\t"; + id = abbreviate ? intern.getOldAndAdd(subsubtable) : null; + if (id != null) { + buffer.append(indent3).append('#').append(id).append('\n'); + } else for (Entry e2 : subsubtable.entrySet()) { + DistanceNode value = e2.getValue(); + buffer.append(indent3).append(e2.getKey()); + id = abbreviate ? intern.getOldAndAdd(value) : null; + if (id != null) { + buffer.append('\t').append('#').append(id).append('\n'); + } else { + buffer.append('\t').append(value.distance); + final DistanceTable distanceTable = value.getDistanceTable(); + if (distanceTable != null) { + id = abbreviate ? 
intern.getOldAndAdd(distanceTable) : null; + if (id != null) { + buffer.append('\t').append('#').append(id).append('\n'); + } else { + ((StringDistanceTable)distanceTable).toString(abbreviate, indent+"\t\t\t", intern, buffer); + } + } else { + buffer.append('\n'); + } + } + indent3 = indent+'\t'; + } + indent2 = indent; + } + return buffer; + } + + @Override + public StringDistanceTable compact() { + return new CompactAndImmutablizer().compact(this); + } + + @Override + public Set getCloser(int threshold) { + Set result = new HashSet(); + for (Entry> e1 : subtables.entrySet()) { + String desired = e1.getKey(); + for (Entry e2 : e1.getValue().entrySet()) { + if (e2.getValue().distance < threshold) { + result.add(desired); + break; + } + } + } + return result; + } + + public Integer getInternalDistance(String a, String b) { + Map subsub = subtables.get(a); + if (subsub == null) { + return null; + } + DistanceNode dnode = subsub.get(b); + return dnode == null ? null : dnode.distance; + } + + @Override + public DistanceNode getInternalNode(String a, String b) { + Map subsub = subtables.get(a); + if (subsub == null) { + return null; + } + return subsub.get(b); + } + + @Override + public Map> getInternalMatches() { + Map> result = new LinkedHashMap>(); + for (Entry> entry : subtables.entrySet()) { + result.put(entry.getKey(), new LinkedHashSet(entry.getValue().keySet())); + } + return result; + } + } + + static class CopyIfEmpty implements Predicate { + private final StringDistanceTable toCopy; + CopyIfEmpty(StringDistanceTable resetIfNotNull) { + this.toCopy = resetIfNotNull; + } + @Override + public boolean test(DistanceNode node) { + final StringDistanceTable subtables = (StringDistanceTable) node.getDistanceTable(); + if (subtables.subtables.isEmpty()) { + subtables.copy(toCopy); + } + return true; + } + } + + static class AddSub implements Predicate { + private final String desiredSub; + private final String supportedSub; + private final CopyIfEmpty r; + + 
AddSub(String desiredSub, String supportedSub, StringDistanceTable distanceTableToCopy) { + this.r = new CopyIfEmpty(distanceTableToCopy); + this.desiredSub = desiredSub; + this.supportedSub = supportedSub; + } + @Override + public boolean test(DistanceNode node) { + if (node == null) { + throw new IllegalArgumentException("bad structure"); + } else { + ((StringDistanceNode)node).addSubtables(desiredSub, supportedSub, r); + } + return true; + } + } + + public int distance(ULocale desired, ULocale supported, int threshold, DistanceOption distanceOption) { + LSR supportedLSR = LSR.fromMaximalized(supported); + LSR desiredLSR = LSR.fromMaximalized(desired); + return distanceRaw(desiredLSR, supportedLSR, threshold, distanceOption); + } + + /** + * Returns distance, from 0 to ABOVE_THRESHOLD. + * ULocales must be in canonical, addLikelySubtags format. Returns distance + * @param desired + * @param supported + * @param distanceOption + * @return + */ + public int distanceRaw(LSR desired, LSR supported, int threshold, DistanceOption distanceOption) { + return distanceRaw(desired.language, supported.language, + desired.script, supported.script, + desired.region, supported.region, + threshold, distanceOption); + } + + public enum DistanceOption {NORMAL, SCRIPT_FIRST} + + /** + * Returns distance, from 0 to ABOVE_THRESHOLD. + * ULocales must be in canonical, addLikelySubtags format. 
Returns distance + */ + public int distanceRaw( + String desiredLang, String supportedlang, + String desiredScript, String supportedScript, + String desiredRegion, String supportedRegion, + int threshold, + DistanceOption distanceOption) { + + Output subtable = new Output(); + + int distance = languageDesired2Supported.getDistance(desiredLang, supportedlang, subtable, true); + boolean scriptFirst = distanceOption == DistanceOption.SCRIPT_FIRST; + if (scriptFirst) { + distance >>= 2; + } + if (distance < 0) { + distance = 0; + } else if (distance >= threshold) { + return ABOVE_THRESHOLD; + } + + int scriptDistance = subtable.value.getDistance(desiredScript, supportedScript, subtable, true); + if (scriptFirst) { + scriptDistance >>= 1; + } + distance += scriptDistance; + if (distance >= threshold) { + return ABOVE_THRESHOLD; + } + + if (desiredRegion.equals(supportedRegion)) { + return distance; + } + + // From here on we know the regions are not equal + + final String desiredPartition = regionMapper.toId(desiredRegion); + final String supportedPartition = regionMapper.toId(supportedRegion); + int subdistance; + + // check for macros. If one is found, we take the maximum distance + // this could be optimized by adding some more structure, but probably not worth it. + + Collection desiredPartitions = desiredPartition.isEmpty() ? regionMapper.macroToPartitions.get(desiredRegion) : null; + Collection supportedPartitions = supportedPartition.isEmpty() ? 
regionMapper.macroToPartitions.get(supportedRegion) : null; + if (desiredPartitions != null || supportedPartitions != null) { + subdistance = 0; + // make the code simple for now + if (desiredPartitions == null) { + desiredPartitions = Collections.singleton(desiredPartition); + } + if (supportedPartitions == null) { + supportedPartitions = Collections.singleton(supportedPartition); + } + + for (String desiredPartition2 : desiredPartitions) { + for (String supportedPartition2 : supportedPartitions) { + int tempSubdistance = subtable.value.getDistance(desiredPartition2, supportedPartition2, null, false); + if (subdistance < tempSubdistance) { + subdistance = tempSubdistance; + } + } + } + } else { + subdistance = subtable.value.getDistance(desiredPartition, supportedPartition, null, false); + } + distance += subdistance; + return distance >= threshold ? ABOVE_THRESHOLD : distance; + } + + + private static final XLocaleDistance DEFAULT; + + public static XLocaleDistance getDefault() { + return DEFAULT; + } + + static { + String[][] variableOverrides = { + {"$enUS", "AS+GU+MH+MP+PR+UM+US+VI"}, + + {"$cnsar", "HK+MO"}, + + {"$americas", "019"}, + + {"$maghreb", "MA+DZ+TN+LY+MR+EH"}, + }; + String[] paradigmRegions = { + "en", "en-GB", "es", "es-419", "pt-BR", "pt-PT" + }; + String[][] regionRuleOverrides = { + {"ar_*_$maghreb", "ar_*_$maghreb", "96"}, + {"ar_*_$!maghreb", "ar_*_$!maghreb", "96"}, + {"ar_*_*", "ar_*_*", "95"}, + + {"en_*_$enUS", "en_*_$enUS", "96"}, + {"en_*_$!enUS", "en_*_$!enUS", "96"}, + {"en_*_*", "en_*_*", "95"}, + + {"es_*_$americas", "es_*_$americas", "96"}, + {"es_*_$!americas", "es_*_$!americas", "96"}, + {"es_*_*", "es_*_*", "95"}, + + {"pt_*_$americas", "pt_*_$americas", "96"}, + {"pt_*_$!americas", "pt_*_$!americas", "96"}, + {"pt_*_*", "pt_*_*", "95"}, + + {"zh_Hant_$cnsar", "zh_Hant_$cnsar", "96"}, + {"zh_Hant_$!cnsar", "zh_Hant_$!cnsar", "96"}, + {"zh_Hant_*", "zh_Hant_*", "95"}, + + {"*_*_*", "*_*_*", "96"}, + }; + + Builder rmb = new 
RegionMapper.Builder().addParadigms(paradigmRegions); + for (String[] variableRule : variableOverrides) { + rmb.add(variableRule[0], variableRule[1]); + } + if (PRINT_OVERRIDES) { + System.out.println("\t\t"); + System.out.println("\t\t\t"); + for (String[] variableRule : variableOverrides) { + System.out.println("\t\t\t"); + } + } + + final StringDistanceTable defaultDistanceTable = new StringDistanceTable(); + final RegionMapper defaultRegionMapper = rmb.build(); + + Splitter bar = Splitter.on('_'); + + List, List, Integer, Boolean>>[] sorted = new ArrayList[3]; + sorted[0] = new ArrayList, List, Integer, Boolean>>(); + sorted[1] = new ArrayList, List, Integer, Boolean>>(); + sorted[2] = new ArrayList, List, Integer, Boolean>>(); + + // sort the rules so that the language-only are first, then the language-script, and finally the language-script-region. + for (R4 info : xGetLanguageMatcherData()) { + String desiredRaw = info.get0(); + String supportedRaw = info.get1(); + List desired = bar.splitToList(desiredRaw); + List supported = bar.splitToList(supportedRaw); + Boolean oneway = info.get3(); + int distance = desiredRaw.equals("*_*") ? 
50 : info.get2(); + int size = desired.size(); + + // for now, skip size == 3 + if (size == 3) continue; + + sorted[size-1].add(Row.of(desired, supported, distance, oneway)); + } + + for (List, List, Integer, Boolean>> item1 : sorted) { + int debug = 0; + for (Row.R4, List, Integer, Boolean> item2 : item1) { + List desired = item2.get0(); + List supported = item2.get1(); + Integer distance = item2.get2(); + Boolean oneway = item2.get3(); + add(defaultDistanceTable, desired, supported, distance); + if (oneway != Boolean.TRUE && !desired.equals(supported)) { + add(defaultDistanceTable, supported, desired, distance); + } + printMatchXml(desired, supported, distance, oneway); + } + } + + // add new size=3 + for (String[] rule : regionRuleOverrides) { + // if (PRINT_OVERRIDES) System.out.println("\t\t\t"); + if (rule[0].equals("en_*_*") || rule[1].equals("*_*_*")) { + int debug = 0; + } + List desiredBase = new ArrayList(bar.splitToList(rule[0])); + List supportedBase = new ArrayList(bar.splitToList(rule[1])); + Integer distance = 100-Integer.parseInt(rule[2]); + printMatchXml(desiredBase, supportedBase, distance, false); + + Collection desiredRegions = defaultRegionMapper.getIdsFromVariable(desiredBase.get(2)); + if (desiredRegions.isEmpty()) { + throw new IllegalArgumentException("Bad region variable: " + desiredBase.get(2)); + } + Collection supportedRegions = defaultRegionMapper.getIdsFromVariable(supportedBase.get(2)); + if (supportedRegions.isEmpty()) { + throw new IllegalArgumentException("Bad region variable: " + supportedBase.get(2)); + } + for (String desiredRegion2 : desiredRegions) { + desiredBase.set(2, desiredRegion2.toString()); // fix later + for (String supportedRegion2 : supportedRegions) { + supportedBase.set(2, supportedRegion2.toString()); // fix later + add(defaultDistanceTable, desiredBase, supportedBase, distance); + add(defaultDistanceTable, supportedBase, desiredBase, distance); + } + } + } + if (PRINT_OVERRIDES) { + System.out.println("\t\t"); 
+ } + + DEFAULT = new XLocaleDistance(defaultDistanceTable.compact(), defaultRegionMapper); + + if (false && PRINT_OVERRIDES) { + System.out.println(defaultRegionMapper); + System.out.println(defaultDistanceTable); + throw new IllegalArgumentException(); + } + } + + private static void printMatchXml(List desired, List supported, Integer distance, Boolean oneway) { + if (PRINT_OVERRIDES) { + String desiredStr = CollectionUtilities.join(desired, "_"); + String supportedStr = CollectionUtilities.join(supported, "_"); + String desiredName = fixedName(desired); + String supportedName = fixedName(supported); + System.out.println("\t\t\t\t"); + } + } + + private static String fixedName(List match) { + List alt = new ArrayList(match); + StringBuilder result = new StringBuilder(); + switch(alt.size()) { + case 3: + String region = alt.get(2); + if (region.equals("*") || region.startsWith("$")) { + result.append(region); + } else { + result.append(english.regionDisplayName(region)); + } + case 2: + String script = alt.get(1); + if (script.equals("*")) { + result.insert(0, script); + } else { + result.insert(0, english.scriptDisplayName(script)); + } + case 1: + String language = alt.get(0); + if (language.equals("*")) { + result.insert(0, language); + } else { + result.insert(0, english.languageDisplayName(language)); + } + } + return CollectionUtilities.join(alt, "; "); + } + + static public void add(StringDistanceTable languageDesired2Supported, List desired, List supported, int percentage) { + int size = desired.size(); + if (size != supported.size() || size < 1 || size > 3) { + throw new IllegalArgumentException(); + } + final String desiredLang = fixAny(desired.get(0)); + final String supportedLang = fixAny(supported.get(0)); + if (size == 1) { + languageDesired2Supported.addSubtable(desiredLang, supportedLang, percentage); + } else { + final String desiredScript = fixAny(desired.get(1)); + final String supportedScript = fixAny(supported.get(1)); + if (size == 2) { + 
languageDesired2Supported.addSubtables(desiredLang, supportedLang, desiredScript, supportedScript, percentage); + } else { + final String desiredRegion = fixAny(desired.get(2)); + final String supportedRegion = fixAny(supported.get(2)); + languageDesired2Supported.addSubtables(desiredLang, supportedLang, desiredScript, supportedScript, desiredRegion, supportedRegion, percentage); + } + } + } + + @Override + public String toString() { + return toString(false); + } + + public String toString(boolean abbreviate) { + return regionMapper + "\n" + languageDesired2Supported.toString(abbreviate); + } + + + // public static XLocaleDistance createDefaultInt() { + // IntDistanceTable d = new IntDistanceTable(DEFAULT_DISTANCE_TABLE); + // return new XLocaleDistance(d, DEFAULT_REGION_MAPPER); + // } + + static Set getContainingMacrosFor(Collection input, Set output) { + output.clear(); + for (Entry> entry : CONTAINER_TO_CONTAINED.asMap().entrySet()) { + if (input.containsAll(entry.getValue())) { // example; if all southern Europe are contained, then add S. Europe + output.add(entry.getKey()); + } + } + return output; + } + + static class RegionMapper implements IdMapper { + /** + * Used for processing rules. At the start we have a variable setting like $A1=US+CA+MX. We generate a mapping from $A1 to a set of partitions {P1, P2} + * When we hit a rule that contains a variable, we replace that rule by multiple rules for the partitions. + */ + final Multimap variableToPartition; + /** + * Used for executing the rules. We map a region to a partition before processing. + */ + final Map regionToPartition; + /** + * Used to support es_419 compared to es_AR, etc. 
+ * @param variableToPartitionIn + * @param regionToPartitionIn + */ + final Multimap macroToPartitions; + /** + * Used to get the paradigm region for a cluster, if there is one + */ + final Set paradigms; + + private RegionMapper( + Multimap variableToPartitionIn, + Map regionToPartitionIn, + Multimap macroToPartitionsIn, + Set paradigmsIn) { + variableToPartition = ImmutableMultimap.copyOf(variableToPartitionIn); + regionToPartition = ImmutableMap.copyOf(regionToPartitionIn); + macroToPartitions = ImmutableMultimap.copyOf(macroToPartitionsIn); + paradigms = ImmutableSet.copyOf(paradigmsIn); + } + + @Override + public String toId(String region) { + String result = regionToPartition.get(region); + return result == null ? "" : result; + } + + public Collection getIdsFromVariable(String variable) { + if (variable.equals("*")) { + return Collections.singleton("*"); + } + Collection result = variableToPartition.get(variable); + if (result == null || result.isEmpty()) { + throw new IllegalArgumentException("Variable not defined: " + variable); + } + return result; + } + + public Set regions() { + return regionToPartition.keySet(); + } + + public Set variables() { + return variableToPartition.keySet(); + } + + @Override + public String toString() { + TreeMultimap partitionToVariables = Multimaps.invertFrom(variableToPartition, + TreeMultimap.create()); + TreeMultimap partitionToRegions = TreeMultimap.create(); + for (Entry e : regionToPartition.entrySet()) { + partitionToRegions.put(e.getValue(), e.getKey()); + } + StringBuilder buffer = new StringBuilder(); + buffer.append("Partition ➠ Variables ➠ Regions (final)"); + for (Entry> e : partitionToVariables.asMap().entrySet()) { + buffer.append('\n'); + buffer.append(e.getKey() + "\t" + e.getValue() + "\t" + partitionToRegions.get(e.getKey())); + } + buffer.append("\nMacro ➠ Partitions"); + for (Entry> e : macroToPartitions.asMap().entrySet()) { + buffer.append('\n'); + buffer.append(e.getKey() + "\t" + e.getValue()); + } 
+ + return buffer.toString(); + } + + static class Builder { + final private Multimap regionToRawPartition = TreeMultimap.create(); + final private RegionSet regionSet = new RegionSet(); + final private Set paradigms = new LinkedHashSet(); + + void add(String variable, String barString) { + Set tempRegions = regionSet.parseSet(barString); + + for (String region : tempRegions) { + regionToRawPartition.put(region, variable); + } + + // now add the inverse variable + + Set inverse = regionSet.inverse(); + String inverseVariable = "$!" + variable.substring(1); + for (String region : inverse) { + regionToRawPartition.put(region, inverseVariable); + } + } + + public Builder addParadigms(String... paradigmRegions) { + for (String paradigm : paradigmRegions) { + paradigms.add(new ULocale(paradigm)); + } + return this; + } + + RegionMapper build() { + final IdMakerFull> id = new IdMakerFull>("partition"); + Multimap variableToPartitions = TreeMultimap.create(); + Map regionToPartition = new TreeMap(); + Multimap partitionToRegions = TreeMultimap.create(); + + for (Entry> e : regionToRawPartition.asMap().entrySet()) { + final String region = e.getKey(); + final Collection rawPartition = e.getValue(); + String partition = String.valueOf((char)('α' + id.add(rawPartition))); + + regionToPartition.put(region, partition); + partitionToRegions.put(partition, region); + + for (String variable : rawPartition) { + variableToPartitions.put(variable, partition); + } + } + + // we get a mapping of each macro to the partitions it intersects with + Multimap macroToPartitions = TreeMultimap.create(); + for (Entry> e : CONTAINER_TO_CONTAINED.asMap().entrySet()) { + String macro = e.getKey(); + for (Entry> e2 : partitionToRegions.asMap().entrySet()) { + String partition = e2.getKey(); + if (!Collections.disjoint(e.getValue(), e2.getValue())) { + macroToPartitions.put(macro, partition); + } + } + } + + return new RegionMapper( + variableToPartitions, + regionToPartition, + macroToPartitions, 
+ paradigms); + } + } + } + + /** + * Parses a string of regions like "US+005-BR" and produces a set of resolved regions. + * All macroregions are fully resolved to sets of non-macro regions. + *
Syntax is simple for now: + *
regionSet := region ([-+] region)*
+ * No precedence, so "x+y-y+z" is (((x+y)-y)+z) NOT (x+y)-(y+z) + */ + private static class RegionSet { + private enum Operation {add, remove} + // temporaries used in processing + final private Set tempRegions = new TreeSet(); + private Operation operation = null; + + private Set parseSet(String barString) { + operation = Operation.add; + int last = 0; + tempRegions.clear(); + int i = 0; + for (; i < barString.length(); ++i) { + char c = barString.charAt(i); // UTF16 is ok, since syntax is only ascii + switch(c) { + case '+': + add(barString, last, i); + last = i+1; + operation = Operation.add; + break; + case '-': + add(barString, last, i); + last = i+1; + operation = Operation.remove; + break; + } + } + add(barString, last, i); + return tempRegions; + } + + private Set inverse() { + TreeSet result = new TreeSet(ALL_FINAL_REGIONS); + result.removeAll(tempRegions); + return result; + } + + private void add(String barString, int last, int i) { + if (i > last) { + String region = barString.substring(last,i); + changeSet(operation, region); + } + } + + private void changeSet(Operation operation, String region) { + Collection contained = CONTAINER_TO_CONTAINED_FINAL.get(region); + if (contained != null && !contained.isEmpty()) { + if (Operation.add == operation) { + tempRegions.addAll(contained); + } else { + tempRegions.removeAll(contained); + } + } else if (Operation.add == operation) { + tempRegions.add(region); + } else { + tempRegions.remove(region); + } + } + } + + public static Multimap invertMap(Map map) { + return Multimaps.invertFrom(Multimaps.forMap(map), LinkedHashMultimap.create()); + } + + public Set getParadigms() { + return regionMapper.paradigms; + } + + public int getDefaultLanguageDistance() { + return defaultLanguageDistance; + } + + public int getDefaultScriptDistance() { + return defaultScriptDistance; + } + + public int getDefaultRegionDistance() { + return defaultRegionDistance; + } + + static class CompactAndImmutablizer extends IdMakerFull { 
+ StringDistanceTable compact(StringDistanceTable item) { + if (toId(item) != null) { + return (StringDistanceTable) intern(item); + } + return new StringDistanceTable(compact(item.subtables, 0)); + } + Map compact(Map item, int level) { + if (toId(item) != null) { + return (Map)intern(item); + } + Map copy = new LinkedHashMap(); + for (Entry entry : item.entrySet()) { + T value = entry.getValue(); + if (value instanceof Map) { + copy.put(entry.getKey(), (T)compact((Map)value, level+1)); + } else { + copy.put(entry.getKey(), (T)compact((DistanceNode)value)); + } + } + return ImmutableMap.copyOf(copy); + } + DistanceNode compact(DistanceNode item) { + if (toId(item) != null) { + return (DistanceNode) intern(item); + } + final DistanceTable distanceTable = item.getDistanceTable(); + if (distanceTable == null || distanceTable.isEmpty()) { + return new DistanceNode(item.distance); + } else { + return new StringDistanceNode(item.distance, compact((StringDistanceTable)((StringDistanceNode)item).distanceTable)); + } + } + } + + @Deprecated + public StringDistanceTable internalGetDistanceTable() { + return (StringDistanceTable) languageDesired2Supported; + } + + public static void main(String[] args) { + // for (Entry> entry : containerToContained.asMap().entrySet()) { + // System.out.println(entry.getKey() + "\t⥢" + entry.getValue() + "; " + containerToFinalContained.get(entry.getKey())); + // } + // final Multimap regionToMacros = ImmutableMultimap.copyOf(Multimaps.invertFrom(containerToContained, TreeMultimap.create())); + // for (Entry> entry : regionToMacros.asMap().entrySet()) { + // System.out.println(entry.getKey() + "\t⥤ " + entry.getValue()); + // } + if (PRINT_OVERRIDES) { + System.out.println(getDefault().toString(true)); + } + DistanceTable table = getDefault().languageDesired2Supported; + DistanceTable compactedTable = table.compact(); + if (!table.equals(compactedTable)) { + throw new IllegalArgumentException("Compaction isn't equal"); + } + } +} diff --git 
a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLocaleMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLocaleMatcher.java new file mode 100644 index 00000000000..3bd8a16e8d8 --- /dev/null +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLocaleMatcher.java @@ -0,0 +1,473 @@ +// © 2017 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html#License +package com.ibm.icu.impl.locale; + +import java.util.Arrays; +import java.util.Collection; +import java.util.LinkedHashSet; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; + +import com.ibm.icu.impl.locale.XCldrStub.ImmutableMultimap; +import com.ibm.icu.impl.locale.XCldrStub.ImmutableSet; +import com.ibm.icu.impl.locale.XCldrStub.LinkedHashMultimap; +import com.ibm.icu.impl.locale.XCldrStub.Multimap; +import com.ibm.icu.impl.locale.XLikelySubtags.LSR; +import com.ibm.icu.impl.locale.XLocaleDistance.DistanceOption; +import com.ibm.icu.util.LocalePriorityList; +import com.ibm.icu.util.Output; +import com.ibm.icu.util.ULocale; + +/** + * Immutable class that picks best match between user's desired locales and application's supported locales. + * @author markdavis + */ +public class XLocaleMatcher { + private static final LSR UND = new LSR("und","",""); + private static final ULocale UND_LOCALE = new ULocale("und"); + + // normally the default values, but can be set via constructor + + private final XLocaleDistance localeDistance; + private final int thresholdDistance; + private final int demotionPerAdditionalDesiredLocale; + private final DistanceOption distanceOption; + + // built based on application's supported languages in constructor + + private final Map> supportedLanguages; // the locales in the collection are ordered! + private final Set exactSupportedLocales; // the locales in the collection are ordered! 
+ private final ULocale defaultLanguage; + + + public static class Builder { + private Set supportedLanguagesList; + private int thresholdDistance = -1; + private int demotionPerAdditionalDesiredLocale = -1;; + private ULocale defaultLanguage; + private XLocaleDistance localeDistance; + private DistanceOption distanceOption; + /** + * @param languagePriorityList the languagePriorityList to set + * @return + */ + public Builder setSupportedLocales(String languagePriorityList) { + this.supportedLanguagesList = asSet(LocalePriorityList.add(languagePriorityList).build()); + return this; + } + public Builder setSupportedLocales(LocalePriorityList languagePriorityList) { + this.supportedLanguagesList = asSet(languagePriorityList); + return this; + } + public Builder setSupportedLocales(Set languagePriorityList) { + this.supportedLanguagesList = languagePriorityList; + return this; + } + + /** + * @param thresholdDistance the thresholdDistance to set, with -1 = default + * @return + */ + public Builder setThresholdDistance(int thresholdDistance) { + this.thresholdDistance = thresholdDistance; + return this; + } + /** + * @param demotionPerAdditionalDesiredLocale the demotionPerAdditionalDesiredLocale to set, with -1 = default + * @return + */ + public Builder setDemotionPerAdditionalDesiredLocale(int demotionPerAdditionalDesiredLocale) { + this.demotionPerAdditionalDesiredLocale = demotionPerAdditionalDesiredLocale; + return this; + } + + /** + * @param localeDistance the localeDistance to set, with default = XLocaleDistance.getDefault(). 
+ * @return + */ + public Builder setLocaleDistance(XLocaleDistance localeDistance) { + this.localeDistance = localeDistance; + return this; + } + + /** + * Set the default language, with null = default = first supported language + * @param defaultLanguage + * @return + */ + public Builder setDefaultLanguage(ULocale defaultLanguage) { + this.defaultLanguage = defaultLanguage; + return this; + } + + /** + * If true, then the language differences are smaller than than script differences. + * This is used in situations (such as maps) where it is better to fall back to the same script than a similar language. + * @param distanceOption + * @return + */ + public Builder setDistanceOption(DistanceOption distanceOption) { + this.distanceOption = distanceOption; + return this; + } + + public XLocaleMatcher build() { + return new XLocaleMatcher(this); + } + } + + /** + * Returns a builder used in chaining parameters for building a Locale Matcher. + * @return + */ + public static Builder builder() { + return new Builder(); + } + + /** Convenience method */ + public XLocaleMatcher(String supportedLocales) { + this(builder().setSupportedLocales(supportedLocales)); + } + /** Convenience method */ + public XLocaleMatcher(LocalePriorityList supportedLocales) { + this(builder().setSupportedLocales(supportedLocales)); + } + /** Convenience method */ + public XLocaleMatcher(Set supportedLocales) { + this(builder().setSupportedLocales(supportedLocales)); + } + + /** + * Create a locale matcher with the given parameters. + * @param supportedLocales + * @param thresholdDistance + * @param demotionPerAdditionalDesiredLocale + * @param localeDistance + * @param likelySubtags + */ + private XLocaleMatcher(Builder builder) { + localeDistance = builder.localeDistance == null ? XLocaleDistance.getDefault() + : builder.localeDistance; + thresholdDistance = builder.thresholdDistance < 0 ? 
localeDistance.getDefaultScriptDistance() + : builder.thresholdDistance; + // only do AFTER above are set + Set paradigms = extractLsrSet(localeDistance.getParadigms()); + final Multimap temp2 = extractLsrMap(builder.supportedLanguagesList, paradigms); + supportedLanguages = temp2.asMap(); + exactSupportedLocales = ImmutableSet.copyOf(temp2.values()); + defaultLanguage = builder.defaultLanguage != null ? builder.defaultLanguage + : supportedLanguages.isEmpty() ? null + : supportedLanguages.entrySet().iterator().next().getValue().iterator().next(); // first language + demotionPerAdditionalDesiredLocale = builder.demotionPerAdditionalDesiredLocale < 0 ? localeDistance.getDefaultRegionDistance()+1 + : builder.demotionPerAdditionalDesiredLocale; + distanceOption = builder.distanceOption; + } + + // Result is not immutable! + private Set extractLsrSet(Set languagePriorityList) { + Set result = new LinkedHashSet(); + for (ULocale item : languagePriorityList) { + final LSR max = item.equals(UND_LOCALE) ? UND : LSR.fromMaximalized(item); + result.add(max); + } + return result; + } + + private Multimap extractLsrMap(Set languagePriorityList, Set priorities) { + Multimap builder = LinkedHashMultimap.create(); + for (ULocale item : languagePriorityList) { + final LSR max = item.equals(UND_LOCALE) ? UND : LSR.fromMaximalized(item); + builder.put(max, item); + } + if (builder.size() > 1 && priorities != null) { + // for the supported list, we put any priorities before all others, except for the first. 
+ Multimap builder2 = LinkedHashMultimap.create(); + + // copy the long way so the priorities are in the same order as in the original + boolean first = true; + for (Entry> entry : builder.asMap().entrySet()) { + final LSR key = entry.getKey(); + if (first || priorities.contains(key)) { + builder2.putAll(key, entry.getValue()); + first = false; + } + } + // now copy the rest + builder2.putAll(builder); + if (!builder2.equals(builder)) { + throw new IllegalArgumentException(); + } + builder = builder2; + } + return ImmutableMultimap.copyOf(builder); + } + + + /** Convenience method */ + public ULocale getBestMatch(ULocale ulocale) { + return getBestMatch(ulocale, null); + } + /** Convenience method */ + public ULocale getBestMatch(String languageList) { + return getBestMatch(LocalePriorityList.add(languageList).build(), null); + } + /** Convenience method */ + public ULocale getBestMatch(ULocale... locales) { + return getBestMatch(new LinkedHashSet(Arrays.asList(locales)), null); + } + /** Convenience method */ + public ULocale getBestMatch(Set desiredLanguages) { + return getBestMatch(desiredLanguages, null); + } + /** Convenience method */ + public ULocale getBestMatch(LocalePriorityList desiredLanguages) { + return getBestMatch(desiredLanguages, null); + } + /** Convenience method */ + public ULocale getBestMatch(LocalePriorityList desiredLanguages, Output outputBestDesired) { + return getBestMatch(asSet(desiredLanguages), outputBestDesired); + } + + // TODO add LocalePriorityList method asSet() for ordered Set view backed by LocalePriorityList + private static Set asSet(LocalePriorityList languageList) { + Set temp = new LinkedHashSet(); // maintain order + for (ULocale locale : languageList) { + temp.add(locale); + }; + return temp; + } + + /** + * Get the best match between the desired languages and supported languages + * @param desiredLanguages Typically the supplied user's languages, in order of preference, with best first. 
+ * @param outputBestDesired The one of the desired languages that matched best. + * Set to null if the best match was not below the threshold distance. + * @return + */ + public ULocale getBestMatch(Set desiredLanguages, Output outputBestDesired) { + // fast path for singleton + if (desiredLanguages.size() == 1) { + return getBestMatch(desiredLanguages.iterator().next(), outputBestDesired); + } + // TODO produce optimized version for single desired ULocale + Multimap desiredLSRs = extractLsrMap(desiredLanguages,null); + int bestDistance = Integer.MAX_VALUE; + ULocale bestDesiredLocale = null; + Collection bestSupportedLocales = null; + int delta = 0; + mainLoop: + for (final Entry desiredLsrAndLocale : desiredLSRs.entries()) { + // quick check for exact match + ULocale desiredLocale = desiredLsrAndLocale.getValue(); + LSR desiredLSR = desiredLsrAndLocale.getKey(); + if (delta < bestDistance) { + if (exactSupportedLocales.contains(desiredLocale)) { + if (outputBestDesired != null) { + outputBestDesired.value = desiredLocale; + } + return desiredLocale; + } + // quick check for maximized locale + Collection found = supportedLanguages.get(desiredLSR); + if (found != null) { + // if we find one in the set, return first (lowest). We already know the exact one isn't there. 
+ if (outputBestDesired != null) { + outputBestDesired.value = desiredLocale; + } + return found.iterator().next(); + } + } + for (final Entry> supportedLsrAndLocale : supportedLanguages.entrySet()) { + int distance = delta + localeDistance.distanceRaw(desiredLSR, supportedLsrAndLocale.getKey(), + thresholdDistance, distanceOption); + if (distance < bestDistance) { + bestDistance = distance; + bestDesiredLocale = desiredLocale; + bestSupportedLocales = supportedLsrAndLocale.getValue(); + if (distance == 0) { + break mainLoop; + } + } + } + delta += demotionPerAdditionalDesiredLocale; + } + if (bestDistance >= thresholdDistance) { + if (outputBestDesired != null) { + outputBestDesired.value = null; + } + return defaultLanguage; + } + if (outputBestDesired != null) { + outputBestDesired.value = bestDesiredLocale; + } + // pick exact match if there is one + if (bestSupportedLocales.contains(bestDesiredLocale)) { + return bestDesiredLocale; + } + // otherwise return first supported, combining variants and extensions from bestDesired + return bestSupportedLocales.iterator().next(); + } + + /** + * Get the best match between the desired languages and supported languages + * @param desiredLanguages Typically the supplied user's languages, in order of preference, with best first. + * @param outputBestDesired The one of the desired languages that matched best. + * Set to null if the best match was not below the threshold distance. + * @return + */ + public ULocale getBestMatch(ULocale desiredLocale, Output outputBestDesired) { + int bestDistance = Integer.MAX_VALUE; + ULocale bestDesiredLocale = null; + Collection bestSupportedLocales = null; + + // quick check for exact match, with hack for und + final LSR desiredLSR = desiredLocale.equals(UND_LOCALE) ? 
UND : LSR.fromMaximalized(desiredLocale); + + if (exactSupportedLocales.contains(desiredLocale)) { + if (outputBestDesired != null) { + outputBestDesired.value = desiredLocale; + } + return desiredLocale; + } + // quick check for maximized locale + if (distanceOption == DistanceOption.NORMAL) { + Collection found = supportedLanguages.get(desiredLSR); + if (found != null) { + // if we find one in the set, return first (lowest). We already know the exact one isn't there. + if (outputBestDesired != null) { + outputBestDesired.value = desiredLocale; + } + return found.iterator().next(); + } + } + for (final Entry> supportedLsrAndLocale : supportedLanguages.entrySet()) { + int distance = localeDistance.distanceRaw(desiredLSR, supportedLsrAndLocale.getKey(), + thresholdDistance, distanceOption); + if (distance < bestDistance) { + bestDistance = distance; + bestDesiredLocale = desiredLocale; + bestSupportedLocales = supportedLsrAndLocale.getValue(); + if (distance == 0) { + break; + } + } + } + if (bestDistance >= thresholdDistance) { + if (outputBestDesired != null) { + outputBestDesired.value = null; + } + return defaultLanguage; + } + if (outputBestDesired != null) { + outputBestDesired.value = bestDesiredLocale; + } + // pick exact match if there is one + if (bestSupportedLocales.contains(bestDesiredLocale)) { + return bestDesiredLocale; + } + // otherwise return first supported, combining variants and extensions from bestDesired + return bestSupportedLocales.iterator().next(); + } + + /** Combine features of the desired locale into those of the supported, and return result. 
*/ + public static ULocale combine(ULocale bestSupported, ULocale bestDesired) { + // for examples of extensions, variants, see + // http://unicode.org/repos/cldr/tags/latest/common/bcp47/ + // http://unicode.org/repos/cldr/tags/latest/common/validity/variant.xml + + if (!bestSupported.equals(bestDesired) && bestDesired != null) { + // add region, variants, extensions + ULocale.Builder b = new ULocale.Builder().setLocale(bestSupported); + + // copy the region from the desired, if there is one + String region = bestDesired.getCountry(); + if (!region.isEmpty()) { + b.setRegion(region); + } + + // copy the variants from desired, if there is one + // note that this will override any subvariants. Eg "sco-ulster-fonipa" + "…-fonupa" => "sco-fonupa" (nuking ulster) + String variants = bestDesired.getVariant(); + if (!variants.isEmpty()) { + b.setVariant(variants); + } + + // copy the extensions from desired, if there are any + // note that this will override any subkeys. Eg "th-u-nu-latn-ca-buddhist" + "…-u-nu-native" => "th-u-nu-native" (nuking calendar) + for (char extensionKey : bestDesired.getExtensionKeys()) { + b.setExtension(extensionKey, bestDesired.getExtension(extensionKey)); + } + bestSupported = b.build(); + } + return bestSupported; + } + + /** Returns the distance between the two languages. The values are not necessarily symmetric. + * @param desired A locale desired by the user + * @param supported A locale supported by a program. + * @return A return of 0 is a complete match, and 100 is a failure case (above the thresholdDistance). + * A language is first maximized with add likely subtags, then compared. 
+ */ + public int distance(ULocale desired, ULocale supported) { + return localeDistance.distanceRaw( + LSR.fromMaximalized(desired), + LSR.fromMaximalized(supported), thresholdDistance, distanceOption); + } + + /** Convenience method */ + public int distance(String desiredLanguage, String supportedLanguage) { + return localeDistance.distanceRaw( + LSR.fromMaximalized(new ULocale(desiredLanguage)), + LSR.fromMaximalized(new ULocale(supportedLanguage)), + thresholdDistance, distanceOption); + } + + @Override + public String toString() { + return exactSupportedLocales.toString(); + } + + /** Return the inverse of the distance: that is, 1-distance(desired, supported) */ + public double match(ULocale desired, ULocale supported) { + return (100-distance(desired, supported))/100.0; + } + + /** + * Returns a fraction between 0 and 1, where 1 means that the languages are a + * perfect match, and 0 means that they are completely different. This is (100-distance(desired, supported))/100.0. + *
Note that + * the precise values may change over time; no code should be made dependent + * on the values remaining constant. + * @param desired Desired locale + * @param desiredMax Maximized locale (using likely subtags) + * @param supported Supported locale + * @param supportedMax Maximized locale (using likely subtags) + * @return value between 0 and 1, inclusive. + * @deprecated Use the form with 2 parameters instead. + */ + @Deprecated + public double match(ULocale desired, ULocale desiredMax, ULocale supported, ULocale supportedMax) { + return match(desired, supported); + } + + /** + * Canonicalize a locale (language). Note that for now, it is canonicalizing + * according to CLDR conventions (he vs iw, etc), since that is what is needed + * for likelySubtags. + * @param ulocale language/locale code + * @return ULocale with remapped subtags. + * @stable ICU 4.4 + */ + public ULocale canonicalize(ULocale ulocale) { + // TODO + return null; + } + + /** + * @return the thresholdDistance. Any distance above this value is treated as a match failure. + */ + public int getThresholdDistance() { + return thresholdDistance; + } +} diff --git a/icu4j/main/classes/core/src/com/ibm/icu/util/LocaleMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/util/LocaleMatcher.java index 1a53cbbcffd..0b39b16083f 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/util/LocaleMatcher.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/util/LocaleMatcher.java @@ -25,19 +25,22 @@ import com.ibm.icu.impl.Relation; import com.ibm.icu.impl.Row; import com.ibm.icu.impl.Row.R3; import com.ibm.icu.impl.Utility; +import com.ibm.icu.impl.locale.XLocaleDistance.DistanceOption; +import com.ibm.icu.impl.locale.XLocaleMatcher; +import com.ibm.icu.impl.locale.XLocaleMatcher.Builder; /** * Provides a way to match the languages (locales) supported by a product to the * languages (locales) acceptable to a user, and get the best match. For * example: - * + * *
  * LocaleMatcher matcher = new LocaleMatcher("fr, en-GB, en");
- * 
+ *
  * // afterwards:
  * matcher.getBestMatch("en-US").toLanguageTag() => "en"
  * 
- * + * * It takes into account when languages are close to one another, such as fil * and tl, and when language regional variants are close, like en-GB and en-AU. * It also handles scripts, like zh-Hant vs zh-TW. For examples, see the test @@ -46,7 +49,7 @@ import com.ibm.icu.impl.Utility; * product will just need one static instance, built with the languages * that it supports. However, it may want multiple instances with different * default languages based on additional information, such as the domain. - * + * * @author markdavis@google.com * @stable ICU 4.4 */ @@ -83,7 +86,7 @@ public class LocaleMatcher { * threshold, that default language is chosen. Typically the default is English, * but it could be different based on additional information, such as the domain * of the page. - * + * * @param languagePriorityList weighted list * @stable ICU 4.4 */ @@ -94,7 +97,7 @@ public class LocaleMatcher { /** * Create a new language matcher from a String form. The highest-weighted * language is the default. - * + * * @param languagePriorityListString String form of LanguagePriorityList * @stable ICU 4.4 */ @@ -124,6 +127,7 @@ public class LocaleMatcher { @Deprecated public LocaleMatcher(LocalePriorityList languagePriorityList, LanguageMatcherData matcherData, double threshold) { this.matcherData = matcherData == null ? 
defaultWritten : matcherData.freeze(); + this.languagePriorityList = languagePriorityList; for (final ULocale language : languagePriorityList) { add(language, languagePriorityList.getWeight(language)); } @@ -179,7 +183,7 @@ public class LocaleMatcher { /** * Get the best match for a LanguagePriorityList - * + * * @param languageList list to match * @return best matching language code * @stable ICU 4.4 @@ -206,7 +210,7 @@ public class LocaleMatcher { /** * Convenience method: Get the best match for a LanguagePriorityList - * + * * @param languageList String form of language priority list * @return best matching language code * @stable ICU 4.4 @@ -217,7 +221,7 @@ public class LocaleMatcher { /** * Get the best match for an individual language code. - * + * * @param ulocale locale/language code to match * @return best matching language code * @stable ICU 4.4 @@ -241,14 +245,14 @@ public class LocaleMatcher { */ @Override public String toString() { - return "{" + defaultLanguage + ", " + return "{" + defaultLanguage + ", " + localeToMaxLocaleAndWeight + "}"; } // ================= Privates ===================== /** * Get the best match for an individual language code. - * + * * @param languageCode * @return best matching language code and weight (as per * {@link #match(ULocale, ULocale)}) @@ -291,9 +295,9 @@ public class LocaleMatcher { } return bestTableMatch; } - + /** - * @internal + * @internal * @deprecated This API is ICU internal only. */ @Deprecated @@ -309,7 +313,7 @@ public class LocaleMatcher { } /** - * We preprocess the data to get just the possible matches for each desired base language. + * We preprocess the data to get just the possible matches for each desired base language. 
*/ private void processMapping() { for (Entry> desiredToMatchingLanguages : matcherData.matchingLanguages().keyValuesSet()) { @@ -343,7 +347,7 @@ public class LocaleMatcher { } Set> localeToMaxLocaleAndWeight = new LinkedHashSet>(); - Map>> desiredLanguageToPossibleLocalesToMaxLocaleToData + Map>> desiredLanguageToPossibleLocalesToMaxLocaleToData = new LinkedHashMap>>(); // =============== Special Mapping Information ============== @@ -444,6 +448,7 @@ public class LocaleMatcher { return (region == null ? "*" : region); } + @Override public String toString() { String result = getLanguage(); if (level != Level.language) { @@ -487,7 +492,7 @@ public class LocaleMatcher { enum Level { language(0.99), - script(0.2), + script(0.2), region(0.04); final double worst; @@ -527,7 +532,7 @@ public class LocaleMatcher { } } - double getScore(ULocale dMax, String desiredRaw, String desiredMax, + double getScore(ULocale dMax, String desiredRaw, String desiredMax, ULocale sMax, String supportedRaw, String supportedMax) { double distance = 0; if (!desiredMax.equals(supportedMax)) { @@ -543,7 +548,7 @@ public class LocaleMatcher { System.out.println("\t\t\t" + level + " Raw Score:\t" + desiredLocale + ";\t" + supportedLocale); } for (R3 datum : scores) { // : result - if (datum.get0().matches(desiredLocale) + if (datum.get0().matches(desiredLocale) && datum.get1().matches(supportedLocale)) { if (DEBUG) { System.out.println("\t\t\t\tFOUND\t" + datum); @@ -557,6 +562,7 @@ public class LocaleMatcher { return level.worst; } + @Override public String toString() { StringBuilder result = new StringBuilder().append(level); for (R3 score : scores) { @@ -566,6 +572,7 @@ public class LocaleMatcher { } + @Override @SuppressWarnings("unchecked") public ScoreData cloneAsThawed() { try { @@ -581,10 +588,12 @@ public class LocaleMatcher { private volatile boolean frozen = false; + @Override public ScoreData freeze() { return this; } + @Override public boolean isFrozen() { return frozen; } @@ -638,6 
+647,7 @@ public class LocaleMatcher { * @internal * @deprecated This API is ICU internal only. */ + @Override @Deprecated public String toString() { return languageScores + "\n\t" + scriptScores + "\n\t" + regionScores; @@ -746,11 +756,12 @@ public class LocaleMatcher { return this; } - /** + /** * {@inheritDoc} * @internal * @deprecated This API is ICU internal only. */ + @Override @Deprecated public LanguageMatcherData cloneAsThawed() { LanguageMatcherData result; @@ -766,11 +777,12 @@ public class LocaleMatcher { } } - /** + /** * {@inheritDoc} * @internal * @deprecated This API is ICU internal only. */ + @Override @Deprecated public LanguageMatcherData freeze() { languageScores.freeze(); @@ -781,11 +793,12 @@ public class LocaleMatcher { return this; } - /** + /** * {@inheritDoc} * @internal * @deprecated This API is ICU internal only. */ + @Override @Deprecated public boolean isFrozen() { return frozen; @@ -793,6 +806,7 @@ public class LocaleMatcher { } LanguageMatcherData matcherData; + LocalePriorityList languagePriorityList; private static final LanguageMatcherData defaultWritten; @@ -845,4 +859,84 @@ public class LocaleMatcher { final LocaleMatcher matcher = new LocaleMatcher(""); return matcher.match(a, matcher.addLikelySubtags(a), b, matcher.addLikelySubtags(b)); } + + transient XLocaleMatcher xLocaleMatcher = null; + transient ULocale xDefaultLanguage = null; + transient boolean xFavorScript = false; + + /** + * Returns the distance between the two languages, using the new CLDR syntax (see getBestMatch). + * The values are not necessarily symmetric. + * @param desired A locale desired by the user + * @param supported A locale supported by a program. + * @return A return of 0 is a complete match, and 100 is a complete mismatch (above the thresholdDistance). + * A language is first maximized with add likely subtags, then compared. + * @internal + * @deprecated ICU 59: This API is a technical preview. It may change in an upcoming release. 
+ */ + @Deprecated + public int distance(ULocale desired, ULocale supported) { + return getLocaleMatcher().distance(desired, supported); + } + + private synchronized XLocaleMatcher getLocaleMatcher() { + if (xLocaleMatcher == null) { + Builder builder = XLocaleMatcher.builder(); + builder.setSupportedLocales(languagePriorityList); + if (xDefaultLanguage != null) { + builder.setDefaultLanguage(xDefaultLanguage); + } + if (xFavorScript) { + builder.setDistanceOption(DistanceOption.SCRIPT_FIRST); + } + xLocaleMatcher = builder.build(); + } + return xLocaleMatcher; + } + + /** + * Get the best match between the desired languages and supported languages. + * This supports the new CLDR syntax to provide for better matches within + * regional clusters (such as Maghreb Arabic vs non-Maghreb Arabic, or regions that use en-GB vs en-US) + * and also matching between regions and macroregions (such as comparing es-419 to es-AR). + * @param desiredLanguages Typically the supplied user's languages, in order of preference, with best first. + * @param outputBestDesired The one of the desired languages that matched best. + * Set to null if the best match was not below the threshold distance. + * @return best-match supported language + * @internal + * @deprecated ICU 59: This API is a technical preview. It may change in an upcoming release. + */ + @Deprecated + public ULocale getBestMatch(LinkedHashSet desiredLanguages, Output outputBestDesired) { + return getLocaleMatcher().getBestMatch(desiredLanguages, outputBestDesired); + } + + /** + * Set the default language, with null = default = first supported language. + * @param defaultLanguage Language to use in case the threshold for distance is exceeded. + * @return this, for chaining + * @internal + * @deprecated ICU 59: This API is a technical preview. It may change in an upcoming release. 
+ */ + @Deprecated + public synchronized LocaleMatcher setDefaultLanguage(ULocale defaultLanguage) { + this.xDefaultLanguage = defaultLanguage; + xLocaleMatcher = null; + return this; + } + + /** + * If true, then the language differences are smaller than script differences. + * This is used in situations (such as maps) where it is better to fall back to the same script than a similar language. + * @param favorScript Set to true to treat script as most important. + * @return this, for chaining. + * @internal + * @deprecated ICU 59: This API is a technical preview. It may change in an upcoming release. + */ + @Deprecated + public synchronized LocaleMatcher setFavorScript(boolean favorScript) { + this.xFavorScript = favorScript; + xLocaleMatcher = null; + return this; + } } diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/DataDrivenTestHelper.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/DataDrivenTestHelper.java new file mode 100644 index 00000000000..308be8c34e7 --- /dev/null +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/DataDrivenTestHelper.java @@ -0,0 +1,187 @@ +// © 2017 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html#License +package com.ibm.icu.dev.test.util; + +import java.io.BufferedReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Objects; + +import com.ibm.icu.dev.test.AbstractTestLog; +import com.ibm.icu.dev.test.TestFmwk; +import com.ibm.icu.dev.util.CollectionUtilities; +import com.ibm.icu.impl.locale.XCldrStub.FileUtilities; +import com.ibm.icu.impl.locale.XCldrStub.Splitter; +import com.ibm.icu.util.ICUUncheckedIOException; + +abstract public class DataDrivenTestHelper { + + public static final List DEBUG_LINE = Collections.singletonList("@debug"); + public static final Splitter SEMICOLON = Splitter.on(';').trimResults(); + public static final Splitter EQUAL_SPLIT = Splitter.on('=').trimResults(); + public static final String SEPARATOR = " ; \t"; + + protected TestFmwk framework = null; + protected int minArgumentCount = 3; + protected int maxArgumentCount = 4; + private List> lines = new ArrayList>(); + private List comments = new ArrayList(); + + public DataDrivenTestHelper setFramework(TestFmwk testFramework) { + this.framework = testFramework; + return this; + } + + public T appendLines(T out) { + try { + for (int i = 0; i < lines.size(); ++i) { + List components = lines.get(i); + String comment = comments.get(i); + if (components.isEmpty()) { + if(!comment.isEmpty()) { + out.append("# ").append(comment); + } + } else { + String first = components.iterator().next(); + String sep = first.startsWith("@") ? 
"=" : SEPARATOR; + out.append(CollectionUtilities.join(components, sep)); + if (!comment.isEmpty()) { + out.append("\t# ").append(comment); + } + } + out.append('\n'); + } + return out; + } catch (IOException e) { + throw new ICUUncheckedIOException(e); + } + } + + protected DataDrivenTestHelper addLine(List arguments, String commentBase) { + lines.add(Collections.unmodifiableList(arguments)); + comments.add(commentBase); + return this; + } + + public DataDrivenTestHelper run(Class classFileIsRelativeTo, String file) { + return load(classFileIsRelativeTo, file) + .test(); + } + + public boolean isTestLine(List arguments) { + return !arguments.isEmpty() && !arguments.equals(DEBUG_LINE); + } + + public DataDrivenTestHelper test() { + boolean breakpoint = false; + for (int i = 0; i < lines.size(); ++i) { + List arguments = lines.get(i); + String comment = comments.get(i); + if (arguments.isEmpty()) { + if (!comment.isEmpty()) { + AbstractTestLog.logln(comment); + } + continue; + } else if (arguments.equals(DEBUG_LINE)) { + breakpoint = true; + continue; + } else { + String first = arguments.get(0); + if (first.startsWith("@")) { + handleParams(comment, arguments); + continue; + } + } + try { + handle(i, breakpoint, comment, arguments); + } catch (Exception e) { + e.printStackTrace(); + AbstractTestLog.errln("Illegal data test file entry (" + i + "): " + arguments + " # " + comment); + } + breakpoint = false; + } + return this; + } + + public DataDrivenTestHelper load(Class classFileIsRelativeTo, String file) { + BufferedReader in = null; + try { + in = FileUtilities.openFile(classFileIsRelativeTo, file); + //boolean breakpoint = false; + + while (true) { + String line = in.readLine(); + if (line == null) { + break; + } + line = line.trim(); + if (line.isEmpty()) { + addLine(Collections.emptyList(), ""); + continue; + } + int hash = line.indexOf('#'); + String comment = ""; + String commentBase = ""; + if (hash >= 0) { + commentBase = line.substring(hash+1).trim(); + 
line = line.substring(0,hash).trim(); + comment = "# " + commentBase; + if (!line.isEmpty()) { + comment = "\t" + comment; + } + } + if (line.isEmpty()) { + addLine(Collections.emptyList(), commentBase); + continue; + } + if (line.startsWith("@")) { + List keyValue = EQUAL_SPLIT.splitToList(line); + addLine(keyValue, comment); + continue; + } + List arguments = SEMICOLON.splitToList(line); + if (arguments.size() < minArgumentCount || arguments.size() > maxArgumentCount) { + AbstractTestLog.errln("Malformed data line:" + line + comment); + continue; + } + addLine(arguments, commentBase); + } + } catch (IOException e) { + throw new ICUUncheckedIOException(e); + } finally { + if (in != null) { + try { + in.close(); + } catch (IOException e) { + throw new ICUUncheckedIOException(e); + } + } + } + lines = Collections.unmodifiableList(lines); // should do deep unmodifiable... + comments = Collections.unmodifiableList(comments); + return this; + } + + protected boolean assertEquals(String message, Object expected, Object actual) { + return TestFmwk.handleAssert(Objects.equals(expected, actual), message, stringFor(expected), stringFor(actual), null, false); + } + + private final String stringFor(Object obj) { + return obj == null ? "null" + : obj instanceof String ? "\"" + obj + '"' + : obj instanceof Number ? 
String.valueOf(obj) + : obj.getClass().getName() + "<" + obj + ">"; + } + + abstract public void handle(int lineNumber, boolean breakpoint, String commentBase, List arguments); + + public void handleParams(String comment, List arguments) { + throw new IllegalArgumentException("Unrecognized parameter: " + arguments); + } + + public List> getLines() { + return lines; + } +} \ No newline at end of file diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/LocaleMatcherTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/LocaleMatcherTest.java index f8f24908df2..9c82ba59d59 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/LocaleMatcherTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/LocaleMatcherTest.java @@ -9,6 +9,8 @@ package com.ibm.icu.dev.test.util; +import java.util.Arrays; +import java.util.LinkedHashSet; import java.util.Set; import java.util.TreeSet; @@ -18,11 +20,12 @@ import com.ibm.icu.dev.test.TestFmwk; import com.ibm.icu.util.LocaleMatcher; import com.ibm.icu.util.LocaleMatcher.LanguageMatcherData; import com.ibm.icu.util.LocalePriorityList; +import com.ibm.icu.util.Output; import com.ibm.icu.util.ULocale; /** * Test the LocaleMatcher. 
- * + * * @author markdavis */ @SuppressWarnings("deprecation") @@ -490,7 +493,7 @@ public class LocaleMatcherTest extends TestFmwk { LocaleMatcher matcher; matcher = new LocaleMatcher("mul, nl"); assertEquals("nl", matcher.getBestMatch("af").toString()); // af => nl - + matcher = new LocaleMatcher("mul, af"); assertEquals("mul", matcher.getBestMatch("nl").toString()); // but nl !=> af } @@ -618,7 +621,7 @@ public class LocaleMatcherTest extends TestFmwk { } } - private long timeLocaleMatcher(String title, String desired, LocaleMatcher matcher, + private long timeLocaleMatcher(String title, String desired, LocaleMatcher matcher, boolean showmessage, int iterations, long comparisonTime) { long start = System.nanoTime(); for (int i = iterations; i > 0; --i) { @@ -629,11 +632,36 @@ public class LocaleMatcherTest extends TestFmwk { + (comparisonTime > 0 ? (delta * 100 / comparisonTime - 100) + "% longer" : "")); return delta; } - + @Test public void Test8288() { final LocaleMatcher matcher = newLocaleMatcher("it, en"); assertEquals("it", matcher.getBestMatch("und").toString()); assertEquals("en", matcher.getBestMatch("und, en").toString()); } + + @Test + public void TestTechPreview() { + final LocaleMatcher matcher = newLocaleMatcher("it, en, ru"); + ULocale und = new ULocale("und"); + ULocale bulgarian = new ULocale("bg"); + ULocale russian = new ULocale("ru"); + + assertEquals("es-419/MX", 4, matcher.distance(new ULocale("es","419"), new ULocale("es","MX"))); + assertEquals("es-ES/DE", 4, matcher.distance(new ULocale("es","DE"), new ULocale("es","ES"))); + + Output outputBestDesired = new Output(); + + ULocale best = matcher.getBestMatch(new LinkedHashSet(Arrays.asList(und, ULocale.GERMAN)), outputBestDesired); + assertEquals(ULocale.ITALIAN, best); + assertEquals(null, outputBestDesired.value); + + matcher.setDefaultLanguage(ULocale.JAPANESE); + best = matcher.getBestMatch(new LinkedHashSet(Arrays.asList(und, ULocale.GERMAN)), outputBestDesired); + 
assertEquals(ULocale.JAPANESE, best); + + matcher.setFavorScript(true); + best = matcher.getBestMatch(new LinkedHashSet(Arrays.asList(und, bulgarian)), outputBestDesired); + assertEquals(russian, best); + } } diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/XLocaleDistanceTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/XLocaleDistanceTest.java new file mode 100644 index 00000000000..a1cb2085c4d --- /dev/null +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/XLocaleDistanceTest.java @@ -0,0 +1,206 @@ +// © 2017 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html#License +package com.ibm.icu.dev.test.util; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; + +import org.junit.Test; + +import com.ibm.icu.dev.test.TestFmwk; +import com.ibm.icu.impl.locale.XLikelySubtags.LSR; +import com.ibm.icu.impl.locale.XLocaleDistance; +import com.ibm.icu.impl.locale.XLocaleDistance.DistanceNode; +import com.ibm.icu.impl.locale.XLocaleDistance.DistanceOption; +import com.ibm.icu.impl.locale.XLocaleDistance.DistanceTable; +import com.ibm.icu.util.LocaleMatcher; +import com.ibm.icu.util.Output; +import com.ibm.icu.util.ULocale; + +/** + * Test the XLocaleDistance. 
+ * + * @author markdavis + */ +public class XLocaleDistanceTest extends TestFmwk { + private static final boolean REFORMAT = false; // set to true to get a reformatted data file listed + + public static final int FAIL = XLocaleDistance.ABOVE_THRESHOLD; + + private XLocaleDistance localeMatcher = XLocaleDistance.getDefault(); + DataDrivenTestHelper tfh = new MyTestFileHandler() + .setFramework(this) + .load(XLocaleDistanceTest.class, "data/localeDistanceTest.txt"); + + static class Arguments { + final ULocale desired; + final ULocale supported; + final int desiredToSupported; + final int supportedToDesired; + + public Arguments(List args) { + this.desired = new ULocale.Builder().setLanguageTag(args.get(0)).build(); // use more complicated expression to check syntax + this.supported = new ULocale.Builder().setLanguageTag(args.get(1)).build(); + this.desiredToSupported = Integer.parseInt(args.get(2)); + this.supportedToDesired = args.size() > 3 ? Integer.parseInt(args.get(3)) : this.desiredToSupported; + } + } + + @Test + public void testTiming() { + List testArgs = new ArrayList(); + for (List line : tfh.getLines()) { + if (tfh.isTestLine(line)) { + testArgs.add(new Arguments(line)); + } + } + Arguments[] tests = testArgs.toArray(new Arguments[testArgs.size()]); + + final LocaleMatcher oldLocaleMatcher = new LocaleMatcher(""); + + long likelyTime = 0; + long newLikelyTime = 0; + long newTimeMinusLikely = 0; + //long intTime = 0; + long oldTimeMinusLikely = 0; + final int maxIterations = 1000; + + for (int iterations = maxIterations; iterations > 0; --iterations) { + // int count=0; + for (Arguments test : tests) { + final ULocale desired = test.desired; + final ULocale supported = test.supported; + //final int desiredToSupported = test.desiredToSupported; + //final int supportedToDesired = test.supportedToDesired; + + long temp = System.nanoTime(); + final ULocale desiredMax = ULocale.addLikelySubtags(desired); + final ULocale supportedMax = 
ULocale.addLikelySubtags(supported); + likelyTime += System.nanoTime()-temp; + + temp = System.nanoTime(); + //double distOld1 = oldLocaleMatcher.match(desired, desiredMax, supported, supportedMax); + //double distOld2 = oldLocaleMatcher.match(supported, supportedMax, desired, desiredMax); + oldTimeMinusLikely += System.nanoTime()-temp; + + temp = System.nanoTime(); + final LSR desiredLSR = LSR.fromMaximalized(desired); + final LSR supportedLSR = LSR.fromMaximalized(supported); + newLikelyTime += System.nanoTime()-temp; + + temp = System.nanoTime(); + int dist1 = localeMatcher.distanceRaw(desiredLSR, supportedLSR, 1000, DistanceOption.NORMAL); + int dist2 = localeMatcher.distanceRaw(supportedLSR, desiredLSR, 1000, DistanceOption.NORMAL); + newTimeMinusLikely += System.nanoTime()-temp; + } + } + final long oldTime = oldTimeMinusLikely+likelyTime; + final long newTime = newLikelyTime+newTimeMinusLikely; + logln("\n"); + logln("\tlikelyTime:\t" + likelyTime/maxIterations); + logln("\toldTime-likelyTime:\t" + oldTimeMinusLikely/maxIterations); + logln("totalOld:\t" + oldTime/maxIterations); + logln("\tnewLikelyTime:\t" + newLikelyTime/maxIterations); + logln("totalNew:\t" + newTime/maxIterations); + assertTrue("newTime < 20% of oldTime", newTime * 5 < oldTime); + //logln("\tnewIntTime-newLikelyTime-extractTime:\t" + intTime/maxIterations); + //logln("totalInt:\t" + (intTime)/maxIterations); + } + + @Test + @SuppressWarnings("deprecation") + public void testInternalTable() { + checkTables(localeMatcher.internalGetDistanceTable(), "", 1); + } + + @SuppressWarnings("deprecation") + private void checkTables(DistanceTable internalGetDistanceTable, String title, int depth) { + // Check that ANY, ANY is always present, and that the table has a depth of exactly 3 everyplace. 
+ Map> matches = internalGetDistanceTable.getInternalMatches(); + + // must have ANY,ANY + boolean haveANYANY = false; + for (Entry> entry : matches.entrySet()) { + String first = entry.getKey(); + boolean haveANYfirst = first.equals(XLocaleDistance.ANY); + for (String second : entry.getValue()) { + haveANYANY |= haveANYfirst && second.equals(XLocaleDistance.ANY); + DistanceNode distanceNode = internalGetDistanceTable.getInternalNode(first, second); + DistanceTable subDistanceTable = distanceNode.getDistanceTable(); + if (subDistanceTable == null || subDistanceTable.isEmpty()) { + if (depth != 3) { + logln("depth should be 3"); + } + if (distanceNode.getClass() != DistanceNode.class) { + logln("should be plain DistanceNode"); + } + } else { + if (depth >= 3) { + logln("depth should be ≤ 3"); + } + if (distanceNode.getClass() == DistanceNode.class) { + logln("should NOT be plain DistanceNode"); + } + checkTables(subDistanceTable, first + "," + second + ",", depth+1); + } + } + } + if (!haveANYANY) { + logln("ANY-ANY not in" + matches); + } + } + + @Test + public void testShowDistanceTable() { + if (isVerbose()) { + System.out.println(XLocaleDistance.getDefault().toString(false)); + } + } + + @Test + public void testDataDriven() throws IOException { + tfh.test(); + if (REFORMAT) { + System.out.println(tfh.appendLines(new StringBuffer())); + } + } + + class MyTestFileHandler extends DataDrivenTestHelper { + final XLocaleDistance distance = XLocaleDistance.getDefault(); + Output bestDesired = new Output(); + private DistanceOption distanceOption = DistanceOption.NORMAL; + private Integer threshold = distance.getDefaultScriptDistance(); + + @Override + public void handle(int lineNumber, boolean breakpoint, String commentBase, List arguments) { + if (breakpoint) { + breakpoint = false; // put debugger breakpoint here to break at @debug in test file + } + Arguments args = new Arguments(arguments); + int supportedToDesiredActual = distance.distance(args.supported, 
args.desired, threshold, distanceOption); + int desiredToSupportedActual = distance.distance(args.desired, args.supported, threshold, distanceOption); + String desiredTag = args.desired.toLanguageTag(); + String supportedTag = args.supported.toLanguageTag(); + final String comment = commentBase.isEmpty() ? "" : "\t# " + commentBase; + if (assertEquals("(" + lineNumber + ") " + desiredTag + " to " + supportedTag + comment, args.desiredToSupported, desiredToSupportedActual)) { + assertEquals("(" + lineNumber + ") " + supportedTag + " to " + desiredTag + comment, args.supportedToDesired, supportedToDesiredActual); + } + } + @Override + public void handleParams(String comment, List arguments) { + String switchArg = arguments.get(0); + if (switchArg.equals("@DistanceOption")) { + distanceOption = DistanceOption.valueOf(arguments.get(1)); + } else if (switchArg.equals("@Threshold")) { + threshold = Integer.valueOf(arguments.get(1)); + } else { + super.handleParams(comment, arguments); + } + return; + } + } +} diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/XLocaleMatcherTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/XLocaleMatcherTest.java new file mode 100644 index 00000000000..8e3b083fa88 --- /dev/null +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/XLocaleMatcherTest.java @@ -0,0 +1,334 @@ +// © 2017 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html#License +package com.ibm.icu.dev.test.util; + + +import java.io.IOException; +import java.util.List; +import java.util.Random; +import java.util.Set; +import java.util.TreeSet; +import java.util.regex.Pattern; + +import org.junit.Test; + +import com.ibm.icu.dev.test.TestFmwk; +import com.ibm.icu.impl.locale.XCldrStub.Joiner; +import com.ibm.icu.impl.locale.XCldrStub.Splitter; +import com.ibm.icu.impl.locale.XLocaleDistance; +import com.ibm.icu.impl.locale.XLocaleDistance.DistanceOption; +import com.ibm.icu.impl.locale.XLocaleMatcher; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.LocaleMatcher; +import com.ibm.icu.util.LocalePriorityList; +import com.ibm.icu.util.Output; +import com.ibm.icu.util.ULocale; + +/** + * Test the XLocaleMatcher. + * + * @author markdavis + */ +public class XLocaleMatcherTest extends TestFmwk { + private static final boolean REFORMAT = false; // set to true to get a reformatted data file listed + + private static final int REGION_DISTANCE = 4; + + private static final XLocaleDistance LANGUAGE_MATCHER_DATA = XLocaleDistance.getDefault(); + + private XLocaleMatcher newXLocaleMatcher() { + return new XLocaleMatcher(""); + } + + private XLocaleMatcher newXLocaleMatcher(LocalePriorityList build) { + return new XLocaleMatcher(build); + } + + private XLocaleMatcher newXLocaleMatcher(String string) { + return new XLocaleMatcher(LocalePriorityList.add(string).build()); + } + + private XLocaleMatcher newXLocaleMatcher(LocalePriorityList string, int d) { + return XLocaleMatcher.builder().setSupportedLocales(string).setThresholdDistance(d).build(); + } + + private XLocaleMatcher newXLocaleMatcher(LocalePriorityList string, int d, DistanceOption distanceOption) { + return XLocaleMatcher + .builder() + .setSupportedLocales(string) + .setThresholdDistance(d) + .setDistanceOption(distanceOption) + .build(); + } + + // public void testParentLocales() { + // // find all the 
regions that have a closer relation because of an explicit parent + // Set explicitParents = new HashSet<>(INFO.getExplicitParents()); + // explicitParents.remove("root"); + // Set otherParents = new HashSet<>(INFO.getExplicitParents()); + // for (String locale : explicitParents) { + // while (true) { + // locale = LocaleIDParser.getParent(locale); + // if (locale == null || locale.equals("root")) { + // break; + // } + // otherParents.add(locale); + // } + // } + // otherParents.remove("root"); + // + // for (String locale : CONFIG.getCldrFactory().getAvailable()) { + // String parentId = LocaleIDParser.getParent(locale); + // String parentIdSimple = LocaleIDParser.getSimpleParent(locale); + // if (!explicitParents.contains(parentId) && !otherParents.contains(parentIdSimple)) { + // continue; + // } + // System.out.println(locale + "\t" + CONFIG.getEnglish().getName(locale) + "\t" + parentId + "\t" + parentIdSimple); + // } + // } + + +// TBD reenable with override data +// public void testOverrideData() { +// double threshold = 0.05; +// XLocaleDistance XLocaleMatcherData = new XLocaleDistance() +// .addDistance("br", "fr", 10, true) +// .addDistance("es", "cy", 10, true); +// logln(XLocaleMatcherData.toString()); +// +// final XLocaleMatcher matcher = newXLocaleMatcher( +// LocalePriorityList +// .add(ULocale.ENGLISH) +// .add(ULocale.FRENCH) +// .add(ULocale.UK) +// .build(), XLocaleMatcherData, threshold); +// logln(matcher.toString()); +// +// assertEquals(ULocale.FRENCH, matcher.getBestMatch(new ULocale("br"))); +// assertEquals(ULocale.ENGLISH, matcher.getBestMatch(new ULocale("es"))); // one +// // way +// } + + + private void assertEquals(Object expected, Object string) { + assertEquals("", expected, string); + } + + /** + * If all the base languages are the same, then each sublocale matches + * itself most closely + */ + @Test + public void testExactMatches() { + String lastBase = ""; + TreeSet sorted = new TreeSet(); + for (ULocale loc : 
ULocale.getAvailableLocales()) { + String language = loc.getLanguage(); + if (!lastBase.equals(language)) { + check(sorted); + sorted.clear(); + lastBase = language; + } + sorted.add(loc); + } + check(sorted); + } + + private void check(Set sorted) { + if (sorted.isEmpty()) { + return; + } + check2(sorted); + ULocale first = sorted.iterator().next(); + ULocale max = ULocale.addLikelySubtags(first); + sorted.add(max); + check2(sorted); + } + + /** + * @param sorted + */ + private void check2(Set sorted) { + // TODO Auto-generated method stub + logln("Checking: " + sorted); + XLocaleMatcher matcher = newXLocaleMatcher( + LocalePriorityList.add( + sorted.toArray(new ULocale[sorted.size()])) + .build()); + for (ULocale loc : sorted) { + String stringLoc = loc.toString(); + assertEquals(stringLoc, matcher.getBestMatch(stringLoc).toString()); + } + } + + @Test + public void testComputeDistance_monkeyTest() { + String[] codes = ULocale.getISOCountries(); + Random random = new Random(); + XLocaleMatcher lm = newXLocaleMatcher(); + for (int i = 0; i < 1000; ++i) { + String x = codes[random.nextInt(codes.length)]; + String y = codes[random.nextInt(codes.length)]; + double d = lm.distance(ULocale.forLanguageTag("xx-Xxxx-"+x), ULocale.forLanguageTag("xx-Xxxx-"+y)); + if (x.equals("ZZ") || y.equals("ZZ")) { + assertEquals("dist(regionDistance," + x + ") = 0", REGION_DISTANCE, d); + } else if (x.equals(y)) { + assertEquals("dist(x,x) = 0", 0.0, d); + } else { + assertTrue("dist(" + x + "," + y + ") > 0", d > 0); + assertTrue("dist(" + x + "," + y + ") ≤ " + REGION_DISTANCE, d <= REGION_DISTANCE); + } + } + } + + + @Test + public void testPerf() { + if (LANGUAGE_MATCHER_DATA == null) { + return; // skip except when testing data + } + final ULocale desired = new ULocale("sv"); + + final String shortList = "en, sv"; + final String longList = "af, am, ar, az, be, bg, bn, bs, ca, cs, cy, cy, da, de, el, en, en-GB, es, es-419, et, eu, fa, fi, fil, fr, ga, gl, gu, hi, hr, hu, hy, id, 
is, it, iw, ja, ka, kk, km, kn, ko, ky, lo, lt, lv, mk, ml, mn, mr, ms, my, ne, nl, no, pa, pl, pt, pt-PT, ro, ru, si, sk, sl, sq, sr, sr-Latn, sv, sw, ta, te, th, tr, uk, ur, uz, vi, zh-CN, zh-TW, zu"; + final String veryLongList = "af, af_NA, af_ZA, agq, agq_CM, ak, ak_GH, am, am_ET, ar, ar_001, ar_AE, ar_BH, ar_DJ, ar_DZ, ar_EG, ar_EH, ar_ER, ar_IL, ar_IQ, ar_JO, ar_KM, ar_KW, ar_LB, ar_LY, ar_MA, ar_MR, ar_OM, ar_PS, ar_QA, ar_SA, ar_SD, ar_SO, ar_SS, ar_SY, ar_TD, ar_TN, ar_YE, as, as_IN, asa, asa_TZ, ast, ast_ES, az, az_Cyrl, az_Cyrl_AZ, az_Latn, az_Latn_AZ, bas, bas_CM, be, be_BY, bem, bem_ZM, bez, bez_TZ, bg, bg_BG, bm, bm_ML, bn, bn_BD, bn_IN, bo, bo_CN, bo_IN, br, br_FR, brx, brx_IN, bs, bs_Cyrl, bs_Cyrl_BA, bs_Latn, bs_Latn_BA, ca, ca_AD, ca_ES, ca_ES_VALENCIA, ca_FR, ca_IT, ce, ce_RU, cgg, cgg_UG, chr, chr_US, ckb, ckb_IQ, ckb_IR, cs, cs_CZ, cu, cu_RU, cy, cy_GB, da, da_DK, da_GL, dav, dav_KE, de, de_AT, de_BE, de_CH, de_DE, de_LI, de_LU, dje, dje_NE, dsb, dsb_DE, dua, dua_CM, dyo, dyo_SN, dz, dz_BT, ebu, ebu_KE, ee, ee_GH, ee_TG, el, el_CY, el_GR, en, en_001, en_150, en_AG, en_AI, en_AS, en_AT, en_AU, en_BB, en_BE, en_BI, en_BM, en_BS, en_BW, en_BZ, en_CA, en_CC, en_CH, en_CK, en_CM, en_CX, en_CY, en_DE, en_DG, en_DK, en_DM, en_ER, en_FI, en_FJ, en_FK, en_FM, en_GB, en_GD, en_GG, en_GH, en_GI, en_GM, en_GU, en_GY, en_HK, en_IE, en_IL, en_IM, en_IN, en_IO, en_JE, en_JM, en_KE, en_KI, en_KN, en_KY, en_LC, en_LR, en_LS, en_MG, en_MH, en_MO, en_MP, en_MS, en_MT, en_MU, en_MW, en_MY, en_NA, en_NF, en_NG, en_NL, en_NR, en_NU, en_NZ, en_PG, en_PH, en_PK, en_PN, en_PR, en_PW, en_RW, en_SB, en_SC, en_SD, en_SE, en_SG, en_SH, en_SI, en_SL, en_SS, en_SX, en_SZ, en_TC, en_TK, en_TO, en_TT, en_TV, en_TZ, en_UG, en_UM, en_US, en_US_POSIX, en_VC, en_VG, en_VI, en_VU, en_WS, en_ZA, en_ZM, en_ZW, eo, eo_001, es, es_419, es_AR, es_BO, es_CL, es_CO, es_CR, es_CU, es_DO, es_EA, es_EC, es_ES, es_GQ, es_GT, es_HN, es_IC, es_MX, es_NI, es_PA, es_PE, es_PH, es_PR, es_PY, 
es_SV, es_US, es_UY, es_VE, et, et_EE, eu, eu_ES, ewo, ewo_CM, fa, fa_AF, fa_IR, ff, ff_CM, ff_GN, ff_MR, ff_SN, fi, fi_FI, fil, fil_PH, fo, fo_DK, fo_FO, fr, fr_BE, fr_BF, fr_BI, fr_BJ, fr_BL, fr_CA, fr_CD, fr_CF, fr_CG, fr_CH, fr_CI, fr_CM, fr_DJ, fr_DZ, fr_FR, fr_GA, fr_GF, fr_GN, fr_GP, fr_GQ, fr_HT, fr_KM, fr_LU, fr_MA, fr_MC, fr_MF, fr_MG, fr_ML, fr_MQ, fr_MR, fr_MU, fr_NC, fr_NE, fr_PF, fr_PM, fr_RE, fr_RW, fr_SC, fr_SN, fr_SY, fr_TD, fr_TG, fr_TN, fr_VU, fr_WF, fr_YT, fur, fur_IT, fy, fy_NL, ga, ga_IE, gd, gd_GB, gl, gl_ES, gsw, gsw_CH, gsw_FR, gsw_LI, gu, gu_IN, guz, guz_KE, gv, gv_IM, ha, ha_GH, ha_NE, ha_NG, haw, haw_US, he, he_IL, hi, hi_IN, hr, hr_BA, hr_HR, hsb, hsb_DE, hu, hu_HU, hy, hy_AM, id, id_ID, ig, ig_NG, ii, ii_CN, is, is_IS, it, it_CH, it_IT, it_SM, ja, ja_JP, jgo, jgo_CM, jmc, jmc_TZ, ka, ka_GE, kab, kab_DZ, kam, kam_KE, kde, kde_TZ, kea, kea_CV, khq, khq_ML, ki, ki_KE, kk, kk_KZ, kkj, kkj_CM, kl, kl_GL, kln, kln_KE, km, km_KH, kn, kn_IN, ko, ko_KP, ko_KR, kok, kok_IN, ks, ks_IN, ksb, ksb_TZ, ksf, ksf_CM, ksh, ksh_DE, kw, kw_GB, ky, ky_KG, lag, lag_TZ, lb, lb_LU, lg, lg_UG, lkt, lkt_US, ln, ln_AO, ln_CD, ln_CF, ln_CG, lo, lo_LA, lrc, lrc_IQ, lrc_IR, lt, lt_LT, lu, lu_CD, luo, luo_KE, luy, luy_KE, lv, lv_LV, mas, mas_KE, mas_TZ, mer, mer_KE, mfe, mfe_MU, mg, mg_MG, mgh, mgh_MZ, mgo, mgo_CM, mk, mk_MK, ml, ml_IN, mn, mn_MN, mr, mr_IN, ms, ms_BN, ms_MY, ms_SG, mt, mt_MT, mua, mua_CM, my, my_MM, mzn, mzn_IR, naq, naq_NA, nb, nb_NO, nb_SJ, nd, nd_ZW, ne, ne_IN, ne_NP, nl, nl_AW, nl_BE, nl_BQ, nl_CW, nl_NL, nl_SR, nl_SX, nmg, nmg_CM, nn, nn_NO, nnh, nnh_CM, nus, nus_SS, nyn, nyn_UG, om, om_ET, om_KE, or, or_IN, os, os_GE, os_RU, pa, pa_Arab, pa_Arab_PK, pa_Guru, pa_Guru_IN, pl, pl_PL, prg, prg_001, ps, ps_AF, pt, pt_AO, pt_BR, pt_CV, pt_GW, pt_MO, pt_MZ, pt_PT, pt_ST, pt_TL, qu, qu_BO, qu_EC, qu_PE, rm, rm_CH, rn, rn_BI, ro, ro_MD, ro_RO, rof, rof_TZ, root, ru, ru_BY, ru_KG, ru_KZ, ru_MD, ru_RU, ru_UA, rw, rw_RW, rwk, rwk_TZ, sah, sah_RU, saq, 
saq_KE, sbp, sbp_TZ, se, se_FI, se_NO, se_SE, seh, seh_MZ, ses, ses_ML, sg, sg_CF, shi, shi_Latn, shi_Latn_MA, shi_Tfng, shi_Tfng_MA, si, si_LK, sk, sk_SK, sl, sl_SI, smn, smn_FI, sn, sn_ZW, so, so_DJ, so_ET, so_KE, so_SO, sq, sq_AL, sq_MK, sq_XK, sr, sr_Cyrl, sr_Cyrl_BA, sr_Cyrl_ME, sr_Cyrl_RS, sr_Cyrl_XK, sr_Latn, sr_Latn_BA, sr_Latn_ME, sr_Latn_RS, sr_Latn_XK, sv, sv_AX, sv_FI, sv_SE, sw, sw_CD, sw_KE, sw_TZ, sw_UG, ta, ta_IN, ta_LK, ta_MY, ta_SG, te, te_IN, teo, teo_KE, teo_UG, th, th_TH, ti, ti_ER, ti_ET, tk, tk_TM, to, to_TO, tr, tr_CY, tr_TR, twq, twq_NE, tzm, tzm_MA, ug, ug_CN, uk, uk_UA, ur, ur_IN, ur_PK, uz, uz_Arab, uz_Arab_AF, uz_Cyrl, uz_Cyrl_UZ, uz_Latn, uz_Latn_UZ, vai, vai_Latn, vai_Latn_LR, vai_Vaii, vai_Vaii_LR, vi, vi_VN, vo, vo_001, vun, vun_TZ, wae, wae_CH, xog, xog_UG, yav, yav_CM, yi, yi_001, yo, yo_BJ, yo_NG, zgh, zgh_MA, zh, zh_Hans, zh_Hans_CN, zh_Hans_HK, zh_Hans_MO, zh_Hans_SG, zh_Hant, zh_Hant_HK, zh_Hant_MO, zh_Hant_TW, zu, zu_ZA"; + + final XLocaleMatcher matcherShort = newXLocaleMatcher(shortList); + final XLocaleMatcher matcherLong = newXLocaleMatcher(longList); + final XLocaleMatcher matcherVeryLong = newXLocaleMatcher(veryLongList); + + final LocaleMatcher matcherShortOld = new LocaleMatcher(shortList); + final LocaleMatcher matcherLongOld = new LocaleMatcher(longList); + final LocaleMatcher matcherVeryLongOld = new LocaleMatcher(veryLongList); + + //XLocaleMatcher.DEBUG = true; + ULocale expected = new ULocale("sv"); + assertEquals(expected, matcherShort.getBestMatch(desired)); + assertEquals(expected, matcherLong.getBestMatch(desired)); + assertEquals(expected, matcherVeryLong.getBestMatch(desired)); + //XLocaleMatcher.DEBUG = false; + + long timeShortNew=0; + long timeMediumNew=0; + long timeLongNew=0; + + for (int i = 0; i < 2; ++i) { + int iterations = i == 0 ? 
1000 : 1000000; + boolean showMessage = i != 0; + timeShortNew = timeXLocaleMatcher("Duration (few supported):\t", desired, matcherShort, showMessage, iterations); + timeMediumNew = timeXLocaleMatcher("Duration (med. supported):\t", desired, matcherLong, showMessage, iterations); + timeLongNew = timeXLocaleMatcher("Duration (many supported):\t", desired, matcherVeryLong, showMessage, iterations); + } + + long timeShortOld=0; + long timeMediumOld=0; + long timeLongOld=0; + + for (int i = 0; i < 2; ++i) { + int iterations = i == 0 ? 1000 : 100000; + boolean showMessage = i != 0; + timeShortOld = timeLocaleMatcher("Old Duration (few supported):\t", desired, matcherShortOld, showMessage, iterations); + timeMediumOld = timeLocaleMatcher("Old Duration (med. supported):\t", desired, matcherLongOld, showMessage, iterations); + timeLongOld = timeLocaleMatcher("Old Duration (many supported):\t", desired, matcherVeryLongOld, showMessage, iterations); + } + + assertTrue("timeShortNew (=" + timeShortNew + ") < 25% of timeShortOld (=" + timeShortOld + ")", timeShortNew * 4 < timeShortOld); + assertTrue("timeMediumNew (=" + timeMediumNew + ") < 25% of timeMediumOld (=" + timeMediumOld + ")", timeMediumNew * 4 < timeMediumOld); + assertTrue("timeLongNew (=" + timeLongNew + ") < 25% of timeLongOld (=" + timeLongOld + ")", timeLongNew * 4 < timeLongOld); + + } + + private long timeXLocaleMatcher(String title, ULocale desired, XLocaleMatcher matcher, + boolean showmessage, int iterations) { + long start = System.nanoTime(); + for (int i = iterations; i > 0; --i) { + matcher.getBestMatch(desired); + } + long delta = System.nanoTime() - start; + if (showmessage) logln(title + (delta / iterations) + " nanos"); + return (delta / iterations); + } + + private long timeLocaleMatcher(String title, ULocale desired, LocaleMatcher matcher, + boolean showmessage, int iterations) { + long start = System.nanoTime(); + for (int i = iterations; i > 0; --i) { + matcher.getBestMatch(desired); + } + 
long delta = System.nanoTime() - start; + if (showmessage) logln(title + (delta / iterations) + " nanos"); + return (delta / iterations); + } + + @Test + public void testDataDriven() throws IOException { + DataDrivenTestHelper tfh = new MyTestFileHandler() + .setFramework(this) + .run(XLocaleMatcherTest.class, "data/localeMatcherTest.txt"); + if (REFORMAT) { + System.out.println(tfh.appendLines(new StringBuilder())); + } + } + + private static final Splitter COMMA_SPACE = Splitter.on(Pattern.compile(",\\s*|\\s+")).trimResults(); + private static final Joiner JOIN_COMMA_SPACE = Joiner.on(", "); + private static final UnicodeSet DIGITS = new UnicodeSet("[0-9]").freeze(); + + class MyTestFileHandler extends DataDrivenTestHelper { + + Output bestDesired = new Output(); + DistanceOption distanceOption = DistanceOption.NORMAL; + int threshold = -1; + + @Override + public void handle(int lineNumber, boolean breakpoint, String commentBase, List arguments) { + List supported = COMMA_SPACE.splitToList(arguments.get(0)); + final String supportedReformatted = JOIN_COMMA_SPACE.join(supported); + LocalePriorityList supportedList = LocalePriorityList.add(supportedReformatted).build(); + + Iterable desired = COMMA_SPACE.split(arguments.get(1)); + final String desiredReformatted = JOIN_COMMA_SPACE.join(desired); + LocalePriorityList desiredList = LocalePriorityList.add(desiredReformatted).build(); + + String expected = arguments.get(2); + String expectedLanguageTag = expected.equals("null") ? null : new ULocale(expected).toLanguageTag(); + + String expectedUi = arguments.size() < 4 ? null : arguments.get(3); + String expectedUiLanguageTag = expectedUi == null || expectedUi.equals("null") ? null + : new ULocale(expectedUi).toLanguageTag(); + + if (breakpoint) { + breakpoint = false; // put debugger breakpoint here to break at @debug in test file + } + + XLocaleMatcher matcher = threshold < 0 && distanceOption == DistanceOption.NORMAL + ? 
newXLocaleMatcher(supportedList) + : newXLocaleMatcher(supportedList, threshold, distanceOption); + commentBase = "(" + lineNumber + ") " + commentBase; + + ULocale bestSupported; + if (expectedUi != null) { + bestSupported = matcher.getBestMatch(desiredList, bestDesired); + ULocale bestUI = XLocaleMatcher.combine(bestSupported, bestDesired.value); + assertEquals(commentBase + " (UI)", expectedUiLanguageTag, bestUI == null ? null : bestUI.toLanguageTag()); + } else { + bestSupported = matcher.getBestMatch(desiredList); + } + String bestMatchLanguageTag = bestSupported == null ? null : bestSupported.toLanguageTag(); + assertEquals(commentBase, expectedLanguageTag, bestMatchLanguageTag); + } + + @Override + public void handleParams(String comment, List arguments) { + String switchItem = arguments.get(0); + if (switchItem.equals("@DistanceOption")) { + distanceOption = DistanceOption.valueOf(arguments.get(1)); + } else if (switchItem.equals("@Threshold")) { + threshold = Integer.valueOf(arguments.get(1)); + } else { + super.handleParams(comment, arguments); + } + return; + } + } +} diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/data/localeDistanceTest.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/data/localeDistanceTest.txt new file mode 100644 index 00000000000..ba783b56984 --- /dev/null +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/data/localeDistanceTest.txt @@ -0,0 +1,66 @@ +# © 2017 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html#License +# +# Data-driven test for XLocaleDistance. +# Format +# • supported ; desired ; dist(s,d) ; dist(d,x) +# • argument 4 only used when different +# • 100 = fail +# A line starting with @debug will reach a statement in the test code where you can put a breakpoint for debugging +# The test code also supports reformatting this file, by setting the REFORMAT flag. 
+ +en-CA ; en-CA ; 0 +ar-MK ; en-CA ; 100 + +iw ; he ; 0 +zh ; cmn ; 0 + +# fallback languages get closer distances, between script (40) and region (4) + +@debug +to ; en ; 14 ; 100 +no ; no-DE ; 4 +nn ; no ; 10 +no-DE ; nn ; 14 +no ; no ; 0 +no ; da ; 12 +da ; zh-Hant ; 100 +zh-Hant ; zh-Hans ; 23 ; 19 +zh-Hans ; en ; 100 + +en-US ; en-AU ; 5 # across clusters +en-VI ; en-GU ; 4 # within cluster +en-AU ; en-CA ; 4 # within cluster + +# testScript +en-CA ; en-Cyrl ; 100 +en-Cyrl ; es-MX ; 100 + +hr ; sr ; 100 +#hr ; sr-Latn ; 8 +sr ; sr-Latn ; 5 + +# test419 +# Should be as good as any in cluster +es-MX ; es-AR ; 4 +@debug +es-MX ; es-419 ; 4 +es-MX ; es-MX ; 0 +es-MX ; es-ES ; 5 +es-MX ; es-PT ; 5 +es-MX ; es-150 ; 5 +es-419 ; es-AR ; 4 +es-419 ; es-419 ; 0 +es-419 ; es-MX ; 4 +es-419 ; es-ES ; 5 +es-419 ; es-PT ; 5 +es-419 ; es-150 ; 5 +es-ES ; es-AR ; 5 +es-ES ; es-419 ; 5 +es-ES ; es-MX ; 5 +es-ES ; es-ES ; 0 +es-ES ; es-PT ; 4 +es-419 ; es-150 ; 5 + +# testEuEc +xx-Xxxx-EC; xx-Xxxx-EU; 4 diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/data/localeMatcherTest.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/data/localeMatcherTest.txt new file mode 100644 index 00000000000..0e3e3a582f7 --- /dev/null +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/data/localeMatcherTest.txt @@ -0,0 +1,387 @@ +# © 2017 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html#License +# +# Data-driven test for the XLocaleMatcher. +# Format +# • Everything after "#" is a comment +# • Arguments are separated by ";". They are: + +# supported ; desired ; expected + +# • The supported may have the threshold distance reset as a first item, eg 50, en, fr +# A line starting with @debug will reach a statement in the test code where you can put a breakpoint for debugging +# The test code also supports reformatting this file, by setting the REFORMAT flag. 
+ +################################################## +# testParentLocales + +# es-419, es-AR, and es-MX are in a cluster; es is in a different one + +@debug +es-419, es-ES ; es-AR ; es-419 +es-ES, es-419 ; es-AR ; es-419 + +es-419, es ; es-AR ; es-419 +es, es-419 ; es-AR ; es-419 + +es-MX, es ; es-AR ; es-MX +es, es-MX ; es-AR ; es-MX + +# en-GB, en-AU, and en-NZ are in a cluster; en in a different one + +en-GB, en-US ; en-AU ; en-GB +en-US, en-GB ; en-AU ; en-GB + +en-GB, en ; en-AU ; en-GB +en, en-GB ; en-AU ; en-GB + +en-NZ, en-US ; en-AU ; en-NZ +en-US, en-NZ ; en-AU ; en-NZ + +en-NZ, en ; en-AU ; en-NZ +en, en-NZ ; en-AU ; en-NZ + +# pt-AO and pt-PT in one cluster; pt-BR in another + +pt-PT, pt-BR ; pt-AO ; pt-PT +pt-BR, pt-PT ; pt-AO ; pt-PT + +pt-PT, pt ; pt-AO ; pt-PT +pt, pt-PT ; pt-AO ; pt-PT + +zh-MO, zh-TW ; zh-HK ; zh-MO +zh-TW, zh-MO ; zh-HK ; zh-MO + +zh-MO, zh-TW ; zh-HK ; zh-MO +zh-TW, zh-MO ; zh-HK ; zh-MO + +zh-MO, zh-CN ; zh-HK ; zh-MO +zh-CN, zh-MO ; zh-HK ; zh-MO + +zh-MO, zh ; zh-HK ; zh-MO +zh, zh-MO ; zh-HK ; zh-MO + +################################################## +# testChinese + +zh-CN, zh-TW, iw ; zh-Hant-TW ; zh-TW +zh-CN, zh-TW, iw ; zh-Hant ; zh-TW +zh-CN, zh-TW, iw ; zh-TW ; zh-TW +zh-CN, zh-TW, iw ; zh-Hans-CN ; zh-CN +zh-CN, zh-TW, iw ; zh-CN ; zh-CN +zh-CN, zh-TW, iw ; zh ; zh-CN + +################################################## +# testenGB + +fr, en, en-GB, es-419, es-MX, es ; en-NZ ; en-GB +fr, en, en-GB, es-419, es-MX, es ; es-ES ; es +fr, en, en-GB, es-419, es-MX, es ; es-AR ; es-419 +fr, en, en-GB, es-419, es-MX, es ; es-MX ; es-MX + +################################################## +# testFallbacks + +91, en, hi ; sa ; hi + +################################################## +# testBasics + +fr, en-GB, en ; en-GB ; en-GB +fr, en-GB, en ; en ; en +fr, en-GB, en ; fr ; fr +fr, en-GB, en ; ja ; fr # return first if no match + +################################################## +# testFallback + +# check that script
fallbacks are handled right + +zh-CN, zh-TW, iw ; zh-Hant ; zh-TW +zh-CN, zh-TW, iw ; zh ; zh-CN +zh-CN, zh-TW, iw ; zh-Hans-CN ; zh-CN +zh-CN, zh-TW, iw ; zh-Hant-HK ; zh-TW +zh-CN, zh-TW, iw ; he-IT ; iw + +################################################## +# testSpecials + +# check that nearby languages are handled + +en, fil, ro, nn ; tl ; fil +en, fil, ro, nn ; mo ; ro +en, fil, ro, nn ; nb ; nn + +# make sure default works + +en, fil, ro, nn ; ja ; en + +################################################## +# testRegionalSpecials + +# verify that en-AU is closer to en-GB than to en (which is en-US) + +en, en-GB, es, es-419 ; es-MX ; es-419 +en, en-GB, es, es-419 ; en-AU ; en-GB +en, en-GB, es, es-419 ; es-ES ; es + +################################################## +# testHK + +# HK and MO are closer to each other for Hant than to TW + +zh, zh-TW, zh-MO ; zh-HK ; zh-MO +zh, zh-TW, zh-HK ; zh-MO ; zh-HK + +################################################## +# testMatch-exact + +# see localeDistanceTest.txt + +################################################## +# testMatch-none + +# see localeDistanceTest.txt + +################################################## +# testMatch-matchOnMaximized + +zh, zh-Hant ; und-TW ; zh-Hant # und-TW should be closer to zh-Hant than to zh +en-Hant-TW, und-TW ; zh-Hant ; und-TW # zh-Hant should be closer to und-TW than to en-Hant-TW +en-Hant-TW, und-TW ; zh ; und-TW # zh should be closer to und-TW than to en-Hant-TW + +################################################## +# testMatchGrandfatheredCode + +fr, i-klingon, en-Latn-US ; en-GB-oed ; en-Latn-US + +################################################## +# testGetBestMatchForList-exactMatch +fr, en-GB, ja, es-ES, es-MX ; ja, de ; ja + +################################################## +# testGetBestMatchForList-simpleVariantMatch +fr, en-GB, ja, es-ES, es-MX ; de, en-US ; en-GB # Intentionally avoiding a perfect-match or two candidates for variant matches. + +# Fallback.
+ +fr, en-GB, ja, es-ES, es-MX ; de, zh ; fr + +################################################## +# testGetBestMatchForList-matchOnMaximized +# Check that if the preference is maximized already, it works as well. + +en, ja ; ja-Jpan-JP, en-AU ; ja # Match for ja-Jpan-JP (maximized already) + +# ja-JP matches ja on likely subtags, and it's listed first, thus it wins over the second preference en-US. + +en, ja ; ja-JP, en-US ; ja # Match for ja-JP, with likely region subtag + +# Check that if the preference is maximized already, it works as well. + +en, ja ; ja-Jpan-JP, en-US ; ja # Match for ja-Jpan-JP (maximized already) + +################################################## +# testGetBestMatchForList-noMatchOnMaximized +# Regression test for http://b/5714572 . +# de maximizes to de-DE. Pick the exact match for the secondary language instead. +en, de, fr, ja ; de-CH, fr ; de + +################################################## +# testBestMatchForTraditionalChinese + +# Scenario: An application that only supports Simplified Chinese (and some other languages), +# but does not support Traditional Chinese. zh-Hans-CN could be replaced with zh-CN, zh, or +# zh-Hans, it wouldn't make much of a difference. + +# The script distance (simplified vs. traditional Han) is considered small enough +# to be an acceptable match. The regional difference is considered almost insignificant. + +fr, zh-Hans-CN, en-US ; zh-TW ; zh-Hans-CN +fr, zh-Hans-CN, en-US ; zh-Hant ; zh-Hans-CN + +# For geo-political reasons, you might want to avoid a zh-Hant -> zh-Hans match. +# In this case, if zh-TW, zh-HK or a tag starting with zh-Hant is requested, you can +# change your call to getBestMatch to include a 2nd language preference. +# "en" is a better match since its distance to "en-US" is closer than the distance +# from "zh-TW" to "zh-CN" (script distance).
+ +fr, zh-Hans-CN, en-US ; zh-TW, en ; en-US +fr, zh-Hans-CN, en-US ; zh-Hant-CN, en, en ; en-US +fr, zh-Hans-CN, en-US ; zh-Hans, en ; zh-Hans-CN + +################################################## +# testUndefined +# When the undefined language doesn't match anything in the list, +# getBestMatch returns the default, as usual. + +it, fr ; und ; it + +# When it *does* occur in the list, bestMatch returns it, as expected. +it, und ; und ; und + +# The unusual part: max("und") = "en-Latn-US", and since matching is based on maximized +# tags, the undefined language would normally match English. But that would produce the +# counterintuitive results that getBestMatch("und", XLocaleMatcher("it,en")) would be "en", and +# getBestMatch("en", XLocaleMatcher("it,und")) would be "und". + +# To avoid that, we change the matcher's definitions of max +# so that max("und")="und". That produces the following, more desirable +# results: + +it, en ; und ; it +it, und ; en ; it + +################################################## +# testGetBestMatch-regionDistance + +es-AR, es ; es-MX ; es-AR +fr, en, en-GB ; en-CA ; en-GB +de-AT, de-DE, de-CH ; de ; de-DE + +################################################## +# testAsymmetry + +mul, nl ; af ; nl # af => nl +mul, af ; nl ; mul # but nl !=> af + +################################################## +# testGetBestMatchForList-matchOnMaximized2 + +# ja-JP matches ja on likely subtags, and it's listed first, thus it wins over the second preference en-GB. + +fr, en-GB, ja, es-ES, es-MX ; ja-JP, en-GB ; ja # Match for ja-JP, with likely region subtag + +# Check that if the preference is maximized already, it works as well. 
+ +fr, en-GB, ja, es-ES, es-MX ; ja-Jpan-JP, en-GB ; ja # Match for ja-Jpan-JP (maximized already) + +################################################## +# testGetBestMatchForList-closeEnoughMatchOnMaximized + +en-GB, en, de, fr, ja ; de-CH, fr ; de +en-GB, en, de, fr, ja ; en-US, ar, nl, de, ja ; en + +################################################## +# testGetBestMatchForPortuguese + +# pt might be supported and not pt-PT + +# European user who prefers Spanish over Brazilian Portuguese as a fallback. + +pt-PT, pt-BR, es, es-419 ; pt-PT, es, pt ; pt-PT +pt-PT, pt, es, es-419 ; pt-PT, es, pt ; pt-PT # pt implicit + +# Brazilian user who prefers South American Spanish over European Portuguese as a fallback. +# The asymmetry between this case and above is because it's "pt-PT" that's missing between the +# matchers as "pt-BR" is a much more common language. + +pt-PT, pt-BR, es, es-419 ; pt, es-419, pt-PT ; pt-BR +pt-PT, pt-BR, es, es-419 ; pt-PT, es, pt ; pt-PT +pt-PT, pt, es, es-419 ; pt-PT, es, pt ; pt-PT +pt-PT, pt, es, es-419 ; pt, es-419, pt-PT ; pt + +pt-BR, es, es-419 ; pt, es-419, pt-PT ; pt-BR + +# Code that adds the user's country can get "pt-US" for a user's language. +# That should fall back to "pt-BR".
+ +pt-PT, pt-BR, es, es-419 ; pt-US, pt-PT ; pt-BR +pt-PT, pt, es, es-419 ; pt-US, pt-PT, pt ; pt # pt-BR implicit + +################################################## +# testVariantWithScriptMatch 1 and 2 + +fr, en, sv ; en-GB ; en +fr, en, sv ; en-GB ; en +en, sv ; en-GB, sv ; en + +################################################## +# testLongLists + +en, sv ; sv ; sv +af, am, ar, az, be, bg, bn, bs, ca, cs, cy, cy, da, de, el, en, en-GB, es, es-419, et, eu, fa, fi, fil, fr, ga, gl, gu, hi, hr, hu, hy, id, is, it, iw, ja, ka, kk, km, kn, ko, ky, lo, lt, lv, mk, ml, mn, mr, ms, my, ne, nl, no, pa, pl, pt, pt-PT, ro, ru, si, sk, sl, sq, sr, sr-Latn, sv, sw, ta, te, th, tr, uk, ur, uz, vi, zh-CN, zh-TW, zu ; sv ; sv +af, af-NA, af-ZA, agq, agq-CM, ak, ak-GH, am, am-ET, ar, ar-001, ar-AE, ar-BH, ar-DJ, ar-DZ, ar-EG, ar-EH, ar-ER, ar-IL, ar-IQ, ar-JO, ar-KM, ar-KW, ar-LB, ar-LY, ar-MA, ar-MR, ar-OM, ar-PS, ar-QA, ar-SA, ar-SD, ar-SO, ar-SS, ar-SY, ar-TD, ar-TN, ar-YE, as, as-IN, asa, asa-TZ, ast, ast-ES, az, az-Cyrl, az-Cyrl-AZ, az-Latn, az-Latn-AZ, bas, bas-CM, be, be-BY, bem, bem-ZM, bez, bez-TZ, bg, bg-BG, bm, bm-ML, bn, bn-BD, bn-IN, bo, bo-CN, bo-IN, br, br-FR, brx, brx-IN, bs, bs-Cyrl, bs-Cyrl-BA, bs-Latn, bs-Latn-BA, ca, ca-AD, ca-ES, ca-ES-VALENCIA, ca-FR, ca-IT, ce, ce-RU, cgg, cgg-UG, chr, chr-US, ckb, ckb-IQ, ckb-IR, cs, cs-CZ, cu, cu-RU, cy, cy-GB, da, da-DK, da-GL, dav, dav-KE, de, de-AT, de-BE, de-CH, de-DE, de-LI, de-LU, dje, dje-NE, dsb, dsb-DE, dua, dua-CM, dyo, dyo-SN, dz, dz-BT, ebu, ebu-KE, ee, ee-GH, ee-TG, el, el-CY, el-GR, en, en-001, en-150, en-AG, en-AI, en-AS, en-AT, en-AU, en-BB, en-BE, en-BI, en-BM, en-BS, en-BW, en-BZ, en-CA, en-CC, en-CH, en-CK, en-CM, en-CX, en-CY, en-DE, en-DG, en-DK, en-DM, en-ER, en-FI, en-FJ, en-FK, en-FM, en-GB, en-GD, en-GG, en-GH, en-GI, en-GM, en-GU, en-GY, en-HK, en-IE, en-IL, en-IM, en-IN, en-IO, en-JE, en-JM, en-KE, en-KI, en-KN, en-KY, en-LC, en-LR, en-LS, en-MG, en-MH, en-MO, en-MP, en-MS, en-MT, en-MU, 
en-MW, en-MY, en-NA, en-NF, en-NG, en-NL, en-NR, en-NU, en-NZ, en-PG, en-PH, en-PK, en-PN, en-PR, en-PW, en-RW, en-SB, en-SC, en-SD, en-SE, en-SG, en-SH, en-SI, en-SL, en-SS, en-SX, en-SZ, en-TC, en-TK, en-TO, en-TT, en-TV, en-TZ, en-UG, en-UM, en-US, en-US-POSIX, en-VC, en-VG, en-VI, en-VU, en-WS, en-ZA, en-ZM, en-ZW, eo, eo-001, es, es-419, es-AR, es-BO, es-CL, es-CO, es-CR, es-CU, es-DO, es-EA, es-EC, es-ES, es-GQ, es-GT, es-HN, es-IC, es-MX, es-NI, es-PA, es-PE, es-PH, es-PR, es-PY, es-SV, es-US, es-UY, es-VE, et, et-EE, eu, eu-ES, ewo, ewo-CM, fa, fa-AF, fa-IR, ff, ff-CM, ff-GN, ff-MR, ff-SN, fi, fi-FI, fil, fil-PH, fo, fo-DK, fo-FO, fr, fr-BE, fr-BF, fr-BI, fr-BJ, fr-BL, fr-CA, fr-CD, fr-CF, fr-CG, fr-CH, fr-CI, fr-CM, fr-DJ, fr-DZ, fr-FR, fr-GA, fr-GF, fr-GN, fr-GP, fr-GQ, fr-HT, fr-KM, fr-LU, fr-MA, fr-MC, fr-MF, fr-MG, fr-ML, fr-MQ, fr-MR, fr-MU, fr-NC, fr-NE, fr-PF, fr-PM, fr-RE, fr-RW, fr-SC, fr-SN, fr-SY, fr-TD, fr-TG, fr-TN, fr-VU, fr-WF, fr-YT, fur, fur-IT, fy, fy-NL, ga, ga-IE, gd, gd-GB, gl, gl-ES, gsw, gsw-CH, gsw-FR, gsw-LI, gu, gu-IN, guz, guz-KE, gv, gv-IM, ha, ha-GH, ha-NE, ha-NG, haw, haw-US, he, he-IL, hi, hi-IN, hr, hr-BA, hr-HR, hsb, hsb-DE, hu, hu-HU, hy, hy-AM, id, id-ID, ig, ig-NG, ii, ii-CN, is, is-IS, it, it-CH, it-IT, it-SM, ja, ja-JP, jgo, jgo-CM, jmc, jmc-TZ, ka, ka-GE, kab, kab-DZ, kam, kam-KE, kde, kde-TZ, kea, kea-CV, khq, khq-ML, ki, ki-KE, kk, kk-KZ, kkj, kkj-CM, kl, kl-GL, kln, kln-KE, km, km-KH, kn, kn-IN, ko, ko-KP, ko-KR, kok, kok-IN, ks, ks-IN, ksb, ksb-TZ, ksf, ksf-CM, ksh, ksh-DE, kw, kw-GB, ky, ky-KG, lag, lag-TZ, lb, lb-LU, lg, lg-UG, lkt, lkt-US, ln, ln-AO, ln-CD, ln-CF, ln-CG, lo, lo-LA, lrc, lrc-IQ, lrc-IR, lt, lt-LT, lu, lu-CD, luo, luo-KE, luy, luy-KE, lv, lv-LV, mas, mas-KE, mas-TZ, mer, mer-KE, mfe, mfe-MU, mg, mg-MG, mgh, mgh-MZ, mgo, mgo-CM, mk, mk-MK, ml, ml-IN, mn, mn-MN, mr, mr-IN, ms, ms-BN, ms-MY, ms-SG, mt, mt-MT, mua, mua-CM, my, my-MM, mzn, mzn-IR, naq, naq-NA, nb, nb-NO, nb-SJ, nd, nd-ZW, ne, ne-IN, 
ne-NP, nl, nl-AW, nl-BE, nl-BQ, nl-CW, nl-NL, nl-SR, nl-SX, nmg, nmg-CM, nn, nn-NO, nnh, nnh-CM, nus, nus-SS, nyn, nyn-UG, om, om-ET, om-KE, or, or-IN, os, os-GE, os-RU, pa, pa-Arab, pa-Arab-PK, pa-Guru, pa-Guru-IN, pl, pl-PL, prg, prg-001, ps, ps-AF, pt, pt-AO, pt-BR, pt-CV, pt-GW, pt-MO, pt-MZ, pt-PT, pt-ST, pt-TL, qu, qu-BO, qu-EC, qu-PE, rm, rm-CH, rn, rn-BI, ro, ro-MD, ro-RO, rof, rof-TZ, root, ru, ru-BY, ru-KG, ru-KZ, ru-MD, ru-RU, ru-UA, rw, rw-RW, rwk, rwk-TZ, sah, sah-RU, saq, saq-KE, sbp, sbp-TZ, se, se-FI, se-NO, se-SE, seh, seh-MZ, ses, ses-ML, sg, sg-CF, shi, shi-Latn, shi-Latn-MA, shi-Tfng, shi-Tfng-MA, si, si-LK, sk, sk-SK, sl, sl-SI, smn, smn-FI, sn, sn-ZW, so, so-DJ, so-ET, so-KE, so-SO, sq, sq-AL, sq-MK, sq-XK, sr, sr-Cyrl, sr-Cyrl-BA, sr-Cyrl-ME, sr-Cyrl-RS, sr-Cyrl-XK, sr-Latn, sr-Latn-BA, sr-Latn-ME, sr-Latn-RS, sr-Latn-XK, sv, sv-AX, sv-FI, sv-SE, sw, sw-CD, sw-KE, sw-TZ, sw-UG, ta, ta-IN, ta-LK, ta-MY, ta-SG, te, te-IN, teo, teo-KE, teo-UG, th, th-TH, ti, ti-ER, ti-ET, tk, tk-TM, to, to-TO, tr, tr-CY, tr-TR, twq, twq-NE, tzm, tzm-MA, ug, ug-CN, uk, uk-UA, ur, ur-IN, ur-PK, uz, uz-Arab, uz-Arab-AF, uz-Cyrl, uz-Cyrl-UZ, uz-Latn, uz-Latn-UZ, vai, vai-Latn, vai-Latn-LR, vai-Vaii, vai-Vaii-LR, vi, vi-VN, vo, vo-001, vun, vun-TZ, wae, wae-CH, xog, xog-UG, yav, yav-CM, yi, yi-001, yo, yo-BJ, yo-NG, zgh, zgh-MA, zh, zh-Hans, zh-Hans-CN, zh-Hans-HK, zh-Hans-MO, zh-Hans-SG, zh-Hant, zh-Hant-HK, zh-Hant-MO, zh-Hant-TW, zu, zu-ZA ; sv ; sv + +################################################## +# test8288 + +it, en ; und ; it +it, en ; und, en ; en + +# examples from +# http://unicode.org/repos/cldr/tags/latest/common/bcp47/ +# http://unicode.org/repos/cldr/tags/latest/common/validity/variant.xml + +################################################## +# testUnHack + +en-NZ, en-IT ; en-US ; en-NZ + +################################################## +# testEmptySupported => null + ; en ; null + +################################################## +# 
testVariantsAndExtensions +################################################## +# tests the .combine() method + +und, fr ; fr-BE-fonipa ; fr ; fr-BE-fonipa +und, fr-CA ; fr-BE-fonipa ; fr-CA ; fr-BE-fonipa +und, fr-fonupa ; fr-BE-fonipa ; fr-fonupa ; fr-BE-fonipa +und, no ; nn-BE-fonipa ; no ; no-BE-fonipa +und, en-GB-u-sd-gbsct ; en-fonipa-u-nu-Arab-ca-buddhist-t-m0-iso-i0-pinyin ; en-GB-u-sd-gbsct ; en-GB-fonipa-u-nu-Arab-ca-buddhist-t-m0-iso-i0-pinyin + +en-PSCRACK, de-PSCRACK, fr-PSCRACK, pt-PT-PSCRACK ; fr-PSCRACK ; fr-PSCRACK +en-PSCRACK, de-PSCRACK, fr-PSCRACK, pt-PT-PSCRACK ; fr ; fr-PSCRACK +en-PSCRACK, de-PSCRACK, fr-PSCRACK, pt-PT-PSCRACK ; de-CH ; de-PSCRACK + +################################################## +# testClusters +# we favor es-419 over others in cluster. Clusters: es- {ES, MA, EA} {419, AR, MX} + +und, es, es-MA, es-MX, es-419 ; es-AR ; es-419 +und, es-MA, es, es-419, es-MX ; es-AR ; es-419 +und, es, es-MA, es-MX, es-419 ; es-EA ; es +und, es-MA, es, es-419, es-MX ; es-EA ; es + +# of course, fall back to within cluster + +und, es, es-MA, es-MX ; es-AR ; es-MX +und, es-MA, es, es-MX ; es-AR ; es-MX +und, es-MA, es-MX, es-419 ; es-EA ; es-MA +und, es-MA, es-419, es-MX ; es-EA ; es-MA + +# we favor en-GB over others in cluster.
Clusters: en- {US, GU, VI} {GB, IN, ZA} + +und, en, en-GU, en-IN, en-GB ; en-ZA ; en-GB +und, en-GU, en, en-GB, en-IN ; en-ZA ; en-GB +und, en, en-GU, en-IN, en-GB ; en-VI ; en +und, en-GU, en, en-GB, en-IN ; en-VI ; en + +# of course, fall back to within cluster + +und, en, en-GU, en-IN ; en-ZA ; en-IN +und, en-GU, en, en-IN ; en-ZA ; en-IN +und, en-GU, en-IN, en-GB ; en-VI ; en-GU +und, en-GU, en-GB, en-IN ; en-VI ; en-GU + +################################################## +# testThreshold +@Threshold=60 + +50, und, fr-CA-fonupa ; fr-BE-fonipa ; fr-CA-fonupa ; fr-BE-fonipa +50, und, fr-Cyrl-CA-fonupa ; fr-BE-fonipa ; fr-Cyrl-CA-fonupa ; fr-Cyrl-BE-fonipa + +@Threshold=-1 # restore + +################################################## +# testScriptFirst +@DistanceOption=SCRIPT_FIRST +@debug + +ru, fr ; zh, pl ; fr +ru, fr ; zh-Cyrl, pl ; ru +#hr, en-Cyrl; sr ; en-Cyrl +da, ru, hr; sr ; ru \ No newline at end of file -- 2.40.0