ICU-10705 More substantive changes were needed. The code didn't get the CLDR data...

author Mark Davis <mark@macchiato.com>

Fri, 21 Feb 2014 14:39:12 +0000 (14:39 +0000)

committer Mark Davis <mark@macchiato.com>

Fri, 21 Feb 2014 14:39:12 +0000 (14:39 +0000)
author Mark Davis <mark@macchiato.com>
Fri, 21 Feb 2014 14:39:12 +0000 (14:39 +0000)
committer Mark Davis <mark@macchiato.com>
Fri, 21 Feb 2014 14:39:12 +0000 (14:39 +0000)
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/util/LocaleMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/util/LocaleMatcher.java

index 26135ea212f33970aa5b39c0711c9ff0b6f998e0..c9a6d239e86d4423d2f8656a823b94fe6d0dcbc1 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/util/LocaleMatcher.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/util/LocaleMatcher.java
@@ -1,6 +1,6 @@
  /*
   ****************************************************************************************
- * Copyright (C) 2009-2013, Google, Inc.; International Business Machines Corporation   *
+ * Copyright (C) 2009-2014, Google, Inc.; International Business Machines Corporation   *
   * and others. All Rights Reserved.                                                     *
   ****************************************************************************************
   */
@@ -11,9 +11,11 @@ import java.util.Iterator;
  import java.util.LinkedHashMap;
  import java.util.LinkedHashSet;
  import java.util.Map;
+import java.util.TreeSet;
  import java.util.regex.Matcher;
  import java.util.regex.Pattern;
  
+import com.ibm.icu.impl.ICUResourceBundle;
  import com.ibm.icu.impl.Row;
  import com.ibm.icu.impl.Row.R2;
  import com.ibm.icu.impl.Row.R3;
@@ -43,7 +45,10 @@ import com.ibm.icu.impl.Row.R3;
   * @stable ICU 4.4
   */
  public class LocaleMatcher {
-    private static final boolean DEBUG = false;
+    
+    private static boolean DEBUG = false;
+
+    private static final ULocale UNKNOWN_LOCALE = new ULocale("und");
  
      /**
       * Threshold for falling back to the default (first) language. May make this
@@ -56,6 +61,11 @@ public class LocaleMatcher {
       */
      private final ULocale defaultLanguage;
  
+    /**
+     * Threshold for falling back to the default language: a best weight below this value selects the default.
+     */
+    private final double threshold;
+
      /**
       * Create a new language matcher. The highest-weighted language is the
       * default. That means that if no other language is matches closer than a given
@@ -89,12 +99,24 @@ public class LocaleMatcher {
       * @deprecated This API is ICU internal only.
       */
      public LocaleMatcher(LocalePriorityList languagePriorityList, LanguageMatcherData matcherData) {
+        this(languagePriorityList, matcherData, DEFAULT_THRESHOLD);
+    }
+
+    /**
+     * Internal testing function; may expose API later.
+     * @param languagePriorityList LocalePriorityList to match
+     * @param matcherData Internal matching data
+     * @internal
+     * @deprecated This API is ICU internal only.
+     */
+    public LocaleMatcher(LocalePriorityList languagePriorityList, LanguageMatcherData matcherData, double threshold) {
          this.matcherData = matcherData;
          for (final ULocale language : languagePriorityList) {
              add(language, languagePriorityList.getWeight(language));
          }
          Iterator<ULocale> it = languagePriorityList.iterator();
          defaultLanguage = it.hasNext() ? it.next() : null;
+        this.threshold = threshold;
      }
  
  
@@ -136,7 +158,7 @@ public class LocaleMatcher {
                      lang2 == null ? lang : lang2,
                              script2 == null ? script : script2,
                                      region2 == null ? region : region2
-            );
+                    );
          }
          return ulocale;
      }
@@ -159,7 +181,7 @@ public class LocaleMatcher {
                  bestTableMatch = matchRow.get0();
              }
          }
-        if (bestWeight < DEFAULT_THRESHOLD) {
+        if (bestWeight < threshold) {
              bestTableMatch = defaultLanguage;
          }
          return bestTableMatch;
@@ -187,6 +209,14 @@ public class LocaleMatcher {
          return getBestMatchInternal(ulocale).get0();
      }
  
+    /**
+     * @internal
+     * @deprecated This API is ICU internal only.
+     */
+    public ULocale getBestMatch(ULocale... ulocales) {
+        return getBestMatch(LocalePriorityList.add(ulocales).build());
+    }
+
      /**
       * {@inheritDoc}
       * @stable ICU 4.4
@@ -194,7 +224,7 @@ public class LocaleMatcher {
      @Override
      public String toString() {
          return "{" + defaultLanguage + ", " 
-        + maximizedLanguageToWeight + "}";
+                + maximizedLanguageToWeight + "}";
      }
      // ================= Privates =====================
  
@@ -217,7 +247,7 @@ public class LocaleMatcher {
              R2<ULocale, Double> row = maximizedLanguageToWeight.get(tableKey);
              final double match = match(languageCode, maximized, tableKey, row.get0());
              if (DEBUG) {
-                System.out.println("\t" + tableKey + ";\t" + row.toString() + ";\t" + match);
+                System.out.println("\t" + tableKey + ";\t" + row.toString() + ";\t" + match + "\n");
              }
              final double weight = match * row.get1();
              if (weight > bestWeight) {
@@ -225,7 +255,7 @@ public class LocaleMatcher {
                  bestTableMatch = tableKey;
              }
          }
-        if (bestWeight < DEFAULT_THRESHOLD) {
+        if (bestWeight < threshold) {
              bestTableMatch = defaultLanguage;
          }
          return Row.R2.of(bestTableMatch, bestWeight);
@@ -252,6 +282,16 @@ public class LocaleMatcher {
       */
      // TODO(markdavis): update the above when CLDR 1.6 is final.
      private ULocale addLikelySubtags(ULocale languageCode) {
+        // max("und") = "en_Latn_US", and since matching is based on maximized tags, the undefined
+        // language would normally match English.  But that would produce the counterintuitive results
+        // that getBestMatch("und", LocaleMatcher("it,en")) would be "en", and
+        // getBestMatch("en", LocaleMatcher("it,und")) would be "und".
+        //
+        // To avoid that, we change the matcher's definitions of max (AddLikelySubtagsWithDefaults)
+        // so that max("und")="und", i.e. "und" is returned unchanged instead of being maximized to English.
+        if (languageCode.equals(UNKNOWN_LOCALE)) {
+            return UNKNOWN_LOCALE;
+        }
          final ULocale result = ULocale.addLikelySubtags(languageCode);
          // should have method on getLikelySubtags for this
          if (result == null || result.equals(languageCode)) {
@@ -275,9 +315,9 @@ public class LocaleMatcher {
          private String region;
          private Level level;
          static Pattern pattern = Pattern.compile(
-                "([a-zA-Z]{1,8}|\\*)" +
-                "(?:-([a-zA-Z]{4}|\\*))?" +
-        "(?:-([a-zA-Z]{2}|[0-9]{3}|\\*))?");
+                "([a-z]{1,8}|\\*)"
+                        + "(?:[_-]([A-Z][a-z]{3}|\\*))?"
+                        + "(?:[_-]([A-Z]{2}|[0-9]{3}|\\*))?");
  
          public LocalePatternMatcher(String toMatch) {
              Matcher matcher = pattern.matcher(toMatch);
@@ -341,16 +381,32 @@ public class LocaleMatcher {
          }
      }
  
-    enum Level {language, script, region}
+    enum Level {
+        language(0.99),
+        script(0.2), 
+        region(0.04);
+
+        final double worst;
+
+        Level(double d) {
+            worst = d;
+        }
+    }
  
      private static class ScoreData implements Freezable<ScoreData> {
+        /**
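+         * Scale factor for the (maxD≠maxS, changeD, !changeS) case; its use below is currently commented out.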
+         */
+        private static final double maxUnequal_changeD_sameS = 0.5;
+        /**
+         * Scale factor for the (maxD≠maxS, changeD=changeS) case; its use below is currently commented out.
+         */
+        private static final double maxUnequal_changeEqual = 0.75;
          LinkedHashSet<Row.R3<LocalePatternMatcher,LocalePatternMatcher,Double>> scores = new LinkedHashSet<R3<LocalePatternMatcher, LocalePatternMatcher, Double>>();
-        final double worst;
          final Level level;
  
          public ScoreData(Level level) {
              this.level = level;
-            this.worst = (1-(level == Level.language ? 90 : level == Level.script ? 20 : 4))/100.0;
          }
  
          void addDataToScores(String desired, String supported, R3<LocalePatternMatcher,LocalePatternMatcher,Double> data) {
@@ -385,10 +441,13 @@ public class LocaleMatcher {
               *  else
               *   rd = 0.25*StdRDiff // lines 2,5
               */
+            
+            // example: input en-GB, supported en en-GB
+            // we want to have a closer match with en-GB than with en
  
              boolean desiredChange = desiredRaw.equals(desiredMax);
              boolean supportedChange = supportedRaw.equals(supportedMax);
-            double distance;
+            double distance = 0;
              if (!desiredMax.equals(supportedMax)) {
                  //                Map<String, Set<R3<LocalePatternMatcher,LocalePatternMatcher,Double>>> lang_result = scores.get(desiredMax);
                  //                if (lang_result == null) {
@@ -401,42 +460,63 @@ public class LocaleMatcher {
                  //                    } else {
                  distance = getRawScore(dMax, sMax);
                  //                }
-                if (desiredChange == supportedChange) {
-                    distance *= 0.75;
-                } else if (desiredChange) {
-                    distance *= 0.5;
-                }
-            } else if (desiredChange == supportedChange) { // maxes are equal, changes are equal
-                distance = 0;
+//                if (desiredChange == supportedChange) {
+//                    distance *= maxUnequal_changeEqual;
+//                    if (DEBUG) {
+//                        System.out.println("\t\t\t" + level + " Distance (maxD≠maxS, changeD=changeS)\t" + distance);
+//                    }
+//                } else if (desiredChange) {
+//                    distance *= maxUnequal_changeD_sameS;
+//                    if (DEBUG) {
+//                        System.out.println("\t\t\t" + level + " Distance (maxD≠maxS, changeD, !changeS)\t" + distance);
+//                    }
+//                } else {
+//                    if (DEBUG) {
+//                        System.out.println("\t\t\t" + level + " Distance (maxD≠maxS, !changeD, changeS)\t" + distance);
+//                    }
+//                }
+            } else if (!desiredRaw.equals(supportedRaw)) { // maxes are equal, changes are equal
+                distance += 0.001;
+//                if (DEBUG) {
+//                    System.out.println("\t\t\t" + level + " Distance (maxD=maxS, changeD=changeS)\t" + distance);
+//                }
              } else { // maxes are equal, changes are different
-                distance = 0.25*worst;
+//                distance = 0.25*level.worst;
+//                if (DEBUG) {
+//                    System.out.println("\t\t\t" + level + " Distance (maxD=maxS, changeD≠changeS)\t" + distance);
+//                }
              }
              return distance;
          }
  
          private double getRawScore(ULocale desiredLocale, ULocale supportedLocale) {
              if (DEBUG) {
-                System.out.println("\t\t\tRaw Score:\t" + desiredLocale + ";\t" + supportedLocale);
+                System.out.println("\t\t\t" + level + " Raw Score:\t" + desiredLocale + ";\t" + supportedLocale);
              }
              for (R3<LocalePatternMatcher,LocalePatternMatcher,Double> datum : scores) { // : result
                  if (datum.get0().matches(desiredLocale) 
                          && datum.get1().matches(supportedLocale)) {
                      if (DEBUG) {
-                        System.out.println("\t\t\tFOUND\t" + datum);
+                        System.out.println("\t\t\t\tFOUND\t" + datum);
                      }
                      return datum.get2();
                  }
              }
              if (DEBUG) {
-                System.out.println("\t\t\tNOTFOUND\t" + worst);
+                System.out.println("\t\t\t\tNOTFOUND\t" + level.worst);
              }
-            return worst;
+            return level.worst;
          }
  
          public String toString() {
-            return level + ", " + scores;
+            StringBuilder result = new StringBuilder().append(level);
+            for (R3<LocalePatternMatcher, LocalePatternMatcher, Double> score : scores) {
+                result.append("\n\t\t").append(score);
+            }
+            return result.toString();
          }
  
+
          @SuppressWarnings("unchecked")
          public ScoreData cloneAsThawed() {
              try {
@@ -478,6 +558,14 @@ public class LocaleMatcher {
          public LanguageMatcherData() {
          }
  
+        /**
+         * @internal
+         * @deprecated This API is ICU internal only.
+         */
+        public String toString() {
+            return languageScores + "\n\t" + scriptScores + "\n\t" + regionScores;
+        }
+
          /**
           * @internal
           * @deprecated This API is ICU internal only.
@@ -489,13 +577,16 @@ public class LocaleMatcher {
              diff += regionScores.getScore(a, aMax, a.getCountry(), aMax.getCountry(), b, bMax, b.getCountry(), bMax.getCountry());
  
              if (!a.getVariant().equals(b.getVariant())) {
-                diff += 1;
+                diff += 0.01;
              }
              if (diff < 0.0d) {
                  diff = 0.0d;
              } else if (diff > 1.0d) {
                  diff = 1.0d;
              }
+            if (DEBUG) {
+                System.out.println("\t\t\tTotal Distance\t" + diff);
+            }
              return 1.0 - diff;
          }
  
@@ -551,7 +642,7 @@ public class LocaleMatcher {
              LocalePatternMatcher supportedMatcher = new LocalePatternMatcher(supported);
              Level supportedLen = supportedMatcher.getLevel();
              if (desiredLen != supportedLen) {
-                throw new IllegalArgumentException();
+                throw new IllegalArgumentException("Lengths unequal: " + desired + ", " + supported);
              }
              R3<LocalePatternMatcher,LocalePatternMatcher,Double> data = Row.of(desiredMatcher, supportedMatcher, score);
              R3<LocalePatternMatcher,LocalePatternMatcher,Double> data2 = oneway ? null : Row.of(supportedMatcher, desiredMatcher, score);
@@ -626,39 +717,150 @@ public class LocaleMatcher {
  
      LanguageMatcherData matcherData;
  
-    private static LanguageMatcherData defaultWritten = new LanguageMatcherData()
-    // TODO get data from CLDR
-    .addDistance("no", "nb", 100, "The language no is normally taken as nb in content; we might alias this for lookup.")
-    .addDistance("nn", "nb", 96)
-    .addDistance("nn", "no", 96)
-    .addDistance("da", "no", 90, "Danish and norwegian are reasonably close.")
-    .addDistance("da", "nb", 90)
-    .addDistance("hr", "br", 96, "Serbo-croatian variants are all very close.")
-    .addDistance("sh", "br", 96)
-    .addDistance("sr", "br", 96)
-    .addDistance("sh", "hr", 96)
-    .addDistance("sr", "hr", 96)
-    .addDistance("sh", "sr", 96)
-    .addDistance("sr-Latn", "sr-Cyrl", 90, "Most serbs can read either script.")
-    .addDistance("*-Hans", "*-Hant", 85, true, "Readers of simplified can read traditional much better than reverse.")
-    .addDistance("*-Hant", "*-Hans", 75, true)
-    .addDistance("en-*-US", "en-*-CA", 98, "US is different than others, and Canadian is inbetween.")
-    .addDistance("en-*-US", "en-*-*", 97)
-    .addDistance("en-*-CA", "en-*-*", 98)
-    .addDistance("en-*-*", "en-*-*", 99)
-    .addDistance("es-*-ES", "es-*-ES", 100, "Latin American Spanishes are closer to each other. Approximate by having es-ES be further from everything else.")
-    .addDistance("es-*-ES", "es-*-*", 93)
-    .addDistance("*", "*", 1, "[Default value -- must be at end!] Normally there is no comprehension of different languages.")
-    .addDistance("*-*", "*-*", 20, "[Default value -- must be at end!] Normally there is little comprehension of different scripts.")
-    .addDistance("*-*-*", "*-*-*", 96, "[Default value -- must be at end!] Normally there are small differences across regions.")
-    .freeze();
+    private static final LanguageMatcherData defaultWritten;
+//    = new LanguageMatcherData()
+//    // TODO get data from CLDR
+//    .addDistance("no", "nb", 100, "The language no is normally taken as nb in content; we might alias this for lookup.")
+//    .addDistance("nn", "nb", 96)
+//    .addDistance("nn", "no", 96)
+//    .addDistance("da", "no", 90, "Danish and norwegian are reasonably close.")
+//    .addDistance("da", "nb", 90)
+//    .addDistance("hr", "br", 96, "Serbo-croatian variants are all very close.")
+//    .addDistance("sh", "br", 96)
+//    .addDistance("sr", "br", 96)
+//    .addDistance("sh", "hr", 96)
+//    .addDistance("sr", "hr", 96)
+//    .addDistance("sh", "sr", 96)
+//    .addDistance("sr-Latn", "sr-Cyrl", 90, "Most serbs can read either script.")
+//    .addDistance("*-Hans", "*-Hant", 85, true, "Readers of simplified can read traditional much better than reverse.")
+//    .addDistance("*-Hant", "*-Hans", 75, true)
+//    .addDistance("en-*-US", "en-*-*", 97, "Non-US English variants are closer to each other (written). Make en-US be further from everything else.")
+//    .addDistance("en-*-*", "en-*-*", 99)
+//    .addDistance("es-*-ES", "es-*-*", 97, "Latin American Spanishes are closer to each other. Make es-ES be further from everything else.")
+//    .addDistance("es-*-419", "es-*-*", 99, "Have es-MX, es-AR, etc be closer to es-419 than to each other")
+//    .addDistance("es-*-*", "es-*-*", 97)
+//    .addDistance("*", "*", 1, "[Default value -- must be at end!] Normally there is no comprehension of different languages.")
+//    .addDistance("*-*", "*-*", 20, "[Default value -- must be at end!] Normally there is little comprehension of different scripts.")
+//    .addDistance("*-*-*", "*-*-*", 96, "[Default value -- must be at end!] Normally there are small differences across regions.")
+//    .freeze();
  
      private static HashMap<String,String> canonicalMap = new HashMap<String, String>();
  
+    static class DataHack implements Comparable<DataHack>{
+        final String source;
+        final String target;
+        int percent;
+        public DataHack(String source, String target, int percent) {
+            this.source = source;
+            this.target = target.equals("de_CH") ? "de" : target; // hack to fix bad data
+            this.percent = percent;
+        }
+        static final Pattern STAR_KEEP = Pattern.compile("([^_]+)(?:_[^_]+(?:_[^_]+)?)?");
+        public int compareTo(DataHack other) {
+            // this is just a one-time hack so we don't need to optimize
+            int diff = getUnderbars(source) - getUnderbars(other.source);
+            if (0 != diff) {
+                return diff;
+            }
+            String thisSource = source.replace('*', 'þ'); // just something after Z
+            String otherSource = other.source.replace('*', 'þ'); // just something after Z
+            diff = thisSource.compareTo(otherSource);
+            if (0 != diff) {
+                return diff;
+            }
+            String thisTarget = target.replace('*', 'þ'); // just something after Z
+            String otherTarget = other.target.replace('*', 'þ'); // just something after Z
+            diff = thisTarget.compareTo(otherTarget);
+
+//            Matcher matcher = STAR_KEEP.matcher(source);
+//            matcher.matches();
+//            String first = matcher.group(0);
+//            String second = matcher.group(1);
+//            String third = matcher.group(2);
+//            Matcher matcherB = STAR_KEEP.matcher(source);
+//            String firstB = matcher.group(0);
+//            String secondB = matcher.group(1);
+//            String thirdB = matcher.group(2);
+//
+//            int diff = onlyStars.length() - onlyStarsOther.length();
+            
+            if (0 != diff) {
+                return diff;
+            }
+            diff = source.compareTo(other.source);
+            if (0 != diff) {
+                return diff;
+            }
+            return target.compareTo(other.target);
+        }
+        /**
+         * @param source2 pattern string such as "*_*_*"; the result is its number of '_' separators, capped at 2
+         */
+        private int getUnderbars(String source2) {
+            int pos = source2.indexOf('_');
+            if (pos < 0) {
+                return 0;
+            }
+            pos = source2.indexOf('_',pos+1);
+            return pos < 0 ? 1 : 2;
+        }
+        public String toString() {
+            return source + ", " + target + " => " + percent;
+        }
+    }
+    
      static {
          // TODO get data from CLDR
          canonicalMap.put("iw", "he");
          canonicalMap.put("mo", "ro");
          canonicalMap.put("tl", "fil");
+        
+        ICUResourceBundle suppData = getICUSupplementalData();
+        ICUResourceBundle languageMatching = suppData.findTopLevel("languageMatching");
+        ICUResourceBundle written = (ICUResourceBundle) languageMatching.get("written");
+        defaultWritten = new LanguageMatcherData();
+        // HACK
+        // The data coming from ICU may be old, and badly ordered.
+        TreeSet<DataHack> hack = new TreeSet<DataHack>();
+        defaultWritten.addDistance("en_*_US", "en_*_*", 97);
+        defaultWritten.addDistance("en_*_GB", "en_*_*", 98);
+        defaultWritten.addDistance("es_*_ES", "es_*_*", 97);
+        defaultWritten.addDistance("es_*_419", "es_*_*", 99);
+        defaultWritten.addDistance("es_*_*", "es_*_*", 98);
+
+        for(UResourceBundleIterator iter = written.getIterator(); iter.hasNext();) {
+            ICUResourceBundle item = (ICUResourceBundle) iter.next();
+            /*
+            "*_*_*",
+            "*_*_*",
+            "96",
+             */
+            hack.add(new DataHack(item.getString(0), item.getString(1), Integer.parseInt(item.getString(2))));
+        }
+        for (DataHack dataHack : hack) {
+            defaultWritten.addDistance(dataHack.source, dataHack.target, dataHack.percent);
+        }
+        defaultWritten.freeze();
+    }
+    
+    /**
+     * @internal
+     * @deprecated This API is ICU internal only.
+     */
+    public static ICUResourceBundle getICUSupplementalData() {
+        ICUResourceBundle suppData = (ICUResourceBundle) UResourceBundle.getBundleInstance(
+                ICUResourceBundle.ICU_BASE_NAME,
+                "supplementalData",
+                ICUResourceBundle.ICU_DATA_CLASS_LOADER);
+        return suppData;
+    }
+
+    /**
+     * @internal
+     * @deprecated This API is ICU internal only.
+     */
+    public static double match(ULocale a, ULocale b) {
+        final LocaleMatcher matcher = new LocaleMatcher("");
+        return matcher.match(a, matcher.addLikelySubtags(a), b, matcher.addLikelySubtags(b));
      }
  }
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/util/LocalePriorityList.java b/icu4j/main/classes/core/src/com/ibm/icu/util/LocalePriorityList.java

index e70378d75592a39fe79e4702fc2be98a59f60fac..acb687939c411f8f1589b4a696478c429072d0ee 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/util/LocalePriorityList.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/util/LocalePriorityList.java
@@ -1,6 +1,6 @@
  /*
   *******************************************************************************
- * Copyright (C) 2010-2011, Google, Inc.; International Business Machines      *
+ * Copyright (C) 2010-2014, Google, Inc.; International Business Machines      *
   * Corporation and others. All Rights Reserved.                                *
   *******************************************************************************
   */
@@ -81,7 +81,7 @@ public class LocalePriorityList implements Iterable<ULocale> {
       * @return internal builder, for chaining
       * @stable ICU 4.4
       */
-    public static Builder add(ULocale languageCode) {
+    public static Builder add(ULocale... languageCode) {
          return new Builder().add(languageCode);
      }
  
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/LocaleMatcherTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/LocaleMatcherTest.java

index 515e99221fc03a76a2cbffd785698ffc8d15aaf7..11d7d6eefe1d440a65d8512ef23d6c704f036c0e 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/LocaleMatcherTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/LocaleMatcherTest.java
@@ -1,12 +1,15 @@
  /*
   ******************************************************************************************
- * Copyright (C) 2009-2010, Google, Inc.; International Business Machines Corporation and *
+ * Copyright (C) 2009-2014, Google, Inc.; International Business Machines Corporation and *
   * others. All Rights Reserved.                                                           *
   ******************************************************************************************
   */
  
  package com.ibm.icu.dev.test.util;
  
+import java.util.Set;
+import java.util.TreeSet;
+
  import com.ibm.icu.dev.test.TestFmwk;
  import com.ibm.icu.util.LocaleMatcher;
  import com.ibm.icu.util.LocaleMatcher.LanguageMatcherData;
@@ -24,6 +27,41 @@ public class LocaleMatcherTest extends TestFmwk {
          new LocaleMatcherTest().run(args);
      }
  
+    public void testenGB() {
+        final LocaleMatcher matcher = new LocaleMatcher("fr, en, en_GB, es_MX, es_419, es");
+        assertEquals("en_GB", matcher.getBestMatch("en_NZ").toString());
+        assertEquals("es", matcher.getBestMatch("es_ES").toString());
+        assertEquals("es_419", matcher.getBestMatch("es_AR").toString());
+        assertEquals("es_MX", matcher.getBestMatch("es_MX").toString());
+    }
+
+    public void testFallbacks() {
+        final LocaleMatcher matcher = new LocaleMatcher("en, hi");
+        if (!logKnownIssue("10705", "Need new data from CLDR for languageMatching")) {
+            assertEquals("hi", matcher.getBestMatch("sa").toString());
+        }
+    }
+
+    public void testOverrideData() {
+        double threshold = 0.05;
+        LanguageMatcherData localeMatcherData = new LanguageMatcherData()
+        .addDistance("br", "fr", 10, true)
+        .addDistance("es", "cy", 10, true)
+        ;
+        logln(localeMatcherData.toString());
+
+        final LocaleMatcher matcher = new LocaleMatcher(
+                LocalePriorityList
+                .add(ULocale.ENGLISH)
+                .add(ULocale.FRENCH)
+                .add(ULocale.UK)
+                .build(), localeMatcherData , threshold);
+        logln(matcher.toString());
+
+        assertEquals(ULocale.FRENCH, matcher.getBestMatch(new ULocale("br")));
+        assertEquals(ULocale.ENGLISH, matcher.getBestMatch(new ULocale("es"))); // one way
+    }
+
      public void testBasics() {
          final LocaleMatcher matcher = new LocaleMatcher(LocalePriorityList.add(ULocale.FRENCH).add(ULocale.UK)
                  .add(ULocale.ENGLISH).build());
@@ -84,5 +122,224 @@ public class LocaleMatcherTest extends TestFmwk {
      private void assertEquals(Object expected, Object string) {
          assertEquals("", expected, string);
      }
+    private void assertNull(Object bestMatch) {
+        assertNull("", bestMatch);
+    }
+
+    public void testEmpty() {
+        final LocaleMatcher matcher = new LocaleMatcher("");
+        assertNull(matcher.getBestMatch(ULocale.FRENCH));
+    }
+
+    static final ULocale ENGLISH_CANADA = new ULocale("en_CA");
+
+    public void testMatch_exact() {
+        assertEquals(1.0,
+                LocaleMatcher.match(ENGLISH_CANADA, ENGLISH_CANADA));
+    }
+
+    public void testMatch_none() {
+        double match = LocaleMatcher.match(
+                new ULocale("ar_MK"),
+                ENGLISH_CANADA);
+        assertTrue("Actual < 0: " + match, 0 <= match);
+        assertTrue("Actual > 0.15 (~ language + script distance): " + match, 0.2 > match);
+    }
+
+    public void testMatch_matchOnMazimized() {
+        ULocale undTw = new ULocale("und_TW");
+        ULocale zhHant = new ULocale("zh_Hant");
+        double matchZh = LocaleMatcher.match(undTw, new ULocale("zh"));
+        double matchZhHant = LocaleMatcher.match(undTw, zhHant);
+        assertTrue("und_TW should be closer to zh_Hant (" + matchZhHant +
+                ") than to zh (" + matchZh + ")",
+                matchZh < matchZhHant);
+        double matchEnHantTw = LocaleMatcher.match(new ULocale("en_Hant_TW"),
+                zhHant);
+        assertTrue("zh_Hant should be closer to und_TW (" + matchZhHant +
+                ") than to en_Hant_TW (" + matchEnHantTw + ")",
+                matchEnHantTw < matchZhHant);
+        assertTrue("zh should be closer to und_TW (" + matchZh +
+                ") than to en_Hant_TW (" + matchEnHantTw + ")",
+                matchEnHantTw < matchZh);
+    }
+
+    public void testMatchGrandfatheredCode() {
+        final LocaleMatcher matcher = new LocaleMatcher("fr, i_klingon, en_Latn_US");
+        assertEquals("en_Latn_US", matcher.getBestMatch("en_GB_oed").toString());
+        //assertEquals("tlh", matcher.getBestMatch("i_klingon").toString());
+    }
+
+    public void testGetBestMatchForList_exactMatch() {
+        final LocaleMatcher matcher = new LocaleMatcher("fr, en_GB, ja, es_ES, es_MX");
+        assertEquals("ja", matcher.getBestMatch("ja, de").toString());
+    }
+
+    public void testGetBestMatchForList_simpleVariantMatch() {
+        final LocaleMatcher matcher = new LocaleMatcher("fr, en_GB, ja, es_ES, es_MX");
+        // Intentionally avoiding a perfect_match or two candidates for variant matches.
+        assertEquals("en_GB", matcher.getBestMatch("de, en_US").toString());
+        // Fall back.
+        assertEquals("fr", matcher.getBestMatch("de, zh").toString());
+    }
+
+    public void testGetBestMatchForList_matchOnMaximized() {
+        final LocaleMatcher matcher = new LocaleMatcher("en, ja");
+        //final LocaleMatcher matcher = new LocaleMatcher("fr, en, ja, es_ES, es_MX");
+        // Check that if the preference is maximized already, it works as well.
+        assertEquals("Match for ja_Jpan_JP (maximized already)",
+                "ja", matcher.getBestMatch("ja_Jpan_JP, en-AU").toString());
+        if (true) return;
+        // ja_JP matches ja on likely subtags, and it's listed first,
+        // thus it wins over the second preference en_US.
+        assertEquals("Match for ja_JP, with likely region subtag",
+                "ja", matcher.getBestMatch("ja_JP, en_US").toString());
+        // Check that if the preference is maximized already, it works as well.
+        assertEquals("Match for ja_Jpan_JP (maximized already)",
+                "ja", matcher.getBestMatch("ja_Jpan_JP, en_US").toString());
+    }
+
+    public void testGetBestMatchForList_noMatchOnMaximized() {
+        // Regression test for http://b/5714572 .
+        final LocaleMatcher matcher = new LocaleMatcher("en, de, fr, ja");
+        // de maximizes to de_DE. Pick the exact match for the secondary language instead.
+        assertEquals("fr", matcher.getBestMatch("de_CH, fr").toString());
+    }
+
+    public void testBestMatchForTraditionalChinese() {
+        // Scenario: An application that only supports Simplified Chinese (and some other languages),
+        // but does not support Traditional Chinese. zh_Hans_CN could be replaced with zh_CN, zh, or
+        // zh_Hans, it wouldn't make much of a difference.
+        final LocaleMatcher matcher = new LocaleMatcher("fr, zh_Hans_CN, en_US");
+
+        // The script distance (simplified vs. traditional Han) is considered small enough
+        // to be an acceptable match. The regional difference is considered almost insignificant.
+        assertEquals("zh_Hans_CN", matcher.getBestMatch("zh_TW").toString());
+        assertEquals("zh_Hans_CN", matcher.getBestMatch("zh_Hant").toString());
+
+        // For geopolitical reasons, you might want to avoid a zh_Hant -> zh_Hans match.
+        // In this case, if zh_TW, zh_HK or a tag starting with zh_Hant is requested, you can
+        // change your call to getBestMatch to include a 2nd language preference.
+        // "en" is a better match since its distance to "en_US" is closer than the distance
+        // from "zh_TW" to "zh_CN" (script distance).
+        assertEquals("en_US", matcher.getBestMatch("zh_TW, en").toString());
+        assertEquals("en_US", matcher.getBestMatch("zh_Hant_CN, en").toString());
+        assertEquals("zh_Hans_CN", matcher.getBestMatch("zh_Hans, en").toString());
+    }
+
+    public void testUndefined() {
+        // When the undefined language doesn't match anything in the list, getBestMatch returns
+        // the default, as usual.
+        LocaleMatcher matcher = new LocaleMatcher("it,fr");
+        assertEquals("it", matcher.getBestMatch("und").toString());
+
+        // When it *does* occur in the list, BestMatch returns it, as expected.
+        matcher = new LocaleMatcher("it,und");
+        assertEquals("und", matcher.getBestMatch("und").toString());
+
+        // The unusual part:
+        // max("und") = "en_Latn_US", and since matching is based on maximized tags, the undefined
+        // language would normally match English.  But that would produce the counterintuitive results
+        // that getBestMatch("und", LocaleMatcher("it,en")) would be "en", and
+        // getBestMatch("en", LocaleMatcher("it,und")) would be "und".
+        //
+        // To avoid that, we change the matcher's definitions of max (AddLikelySubtagsWithDefaults)
+        // so that max("und")="und". That produces the following, more desirable results:
+        matcher = new LocaleMatcher("it,en");
+        assertEquals("it", matcher.getBestMatch("und").toString());
+        matcher = new LocaleMatcher("it,und");
+        assertEquals("it", matcher.getBestMatch("en").toString());
+    }
+
+    //    public void testGetBestMatch_emptyList() {
+    //        final LocaleMatcher matcher = new LocaleMatcher(
+    //                new LocalePriorityList(new HashMap()));
+    //        assertNull(matcher.getBestMatch(ULocale.ENGLISH));
+    //    }
+
+    public void testGetBestMatch_googlePseudoLocales() {
+        // Google pseudo locales are primarily based on variant subtags.
+        // See http://sites/intl_eng/pseudo_locales.
+        // (See below for the region code based fall back options.)
+        final LocaleMatcher matcher = new LocaleMatcher(
+                "fr, pt");
+        assertEquals("fr", matcher.getBestMatch("de").toString());
+        assertEquals("fr", matcher.getBestMatch("en_US").toString());
+        assertEquals("fr", matcher.getBestMatch("en").toString());
+        assertEquals("pt", matcher.getBestMatch("pt_BR").toString());
+    }
+
+    public void testGetBestMatch_regionDistance() {
+        LocaleMatcher matcher = new LocaleMatcher("es_AR, es");
+        assertEquals("es_AR", matcher.getBestMatch("es_MX").toString());
+
+        matcher = new LocaleMatcher("fr, en, en_CA");
+        assertEquals("en_CA", matcher.getBestMatch("en_GB").toString());
+
+        matcher = new LocaleMatcher("de_AT, de_DE, de_CH");
+        assertEquals("de_DE", matcher.getBestMatch("de").toString());
+    }
+
+    /**
+     * If all the base languages are the same, then each sublocale matches itself most closely
+     */
+    public void testExactMatches() {
+        String lastBase = "";
+        TreeSet<ULocale> sorted = new TreeSet();
+        for (ULocale loc : ULocale.getAvailableLocales()) {
+            String language = loc.getLanguage();
+            if (!lastBase.equals(language)) {
+                check(sorted);
+                sorted.clear();
+                lastBase = language;
+            }
+            sorted.add(loc);
+        }
+        check(sorted);
+    }
+
+    private void check(Set<ULocale> sorted) {
+        if (sorted.isEmpty()) {
+            return;
+        }
+        check2(sorted);
+        ULocale first = sorted.iterator().next();
+        ULocale max = ULocale.addLikelySubtags(first);
+        sorted.add(max);
+        check2(sorted);
+    }
+    /**
+     * @param sorted locales used as the matcher's supported list; each must be its own best match
+     */
+    private void check2(Set<ULocale> sorted) {
+        // Build a matcher over the whole set, then verify every member matches itself.
+        logln("Checking: " + sorted);
+        LocaleMatcher matcher = new LocaleMatcher(
+                LocalePriorityList.add(
+                        sorted.toArray(new ULocale[sorted.size()]))
+                        .build());
+        for (ULocale loc : sorted) {
+            String stringLoc = loc.toString();
+            assertEquals(stringLoc, matcher.getBestMatch(stringLoc).toString());
+        }
+    }
+
  
+    //      public void testComputeDistance_monkeyTest() {
+    //        RegionCode[] codes = RegionCode.values();
+    //        Random random = new Random();
+    //        for (int i = 0; i < 1000; ++i) {
+    //          RegionCode x = codes[random.nextInt(codes.length)];
+    //          RegionCode y = codes[random.nextInt(codes.length)];
+    //          double d = LocaleMatcher.getRegionDistance(x, y, null, null);
+    //          if (x == RegionCode.ZZ || y == RegionCode.ZZ) {
+    //            assertEquals(LocaleMatcher.REGION_DISTANCE, d);
+    //          } else if (x == y) {
+    //            assertEquals(0.0, d);
+    //          } else {
+    //            assertTrue(d > 0);
+    //            assertTrue(d <= LocaleMatcher.REGION_DISTANCE);
+    //          }
+    //        }
+    //      }
  }
author	Mark Davis <mark@macchiato.com>
	Fri, 21 Feb 2014 14:39:12 +0000 (14:39 +0000)
committer	Mark Davis <mark@macchiato.com>
	Fri, 21 Feb 2014 14:39:12 +0000 (14:39 +0000)
icu4j/main/classes/core/src/com/ibm/icu/util/LocaleMatcher.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/util/LocalePriorityList.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/LocaleMatcherTest.java		patch \| blob \| history