]> granicus.if.org Git - icu/commitdiff
ICU-20092 Integrates new languagematcher enhancements into ICU4J.
author Norbert Runge <nrunge@google.com>
Thu, 23 Aug 2018 20:31:30 +0000 (13:31 -0700)
committer Shane Carr <shane@unicode.org>
Thu, 27 Sep 2018 21:27:39 +0000 (14:27 -0700)
icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLikelySubtags.java
icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLocaleDistance.java
icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLocaleMatcher.java
icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/XLocaleDistanceTest.java
icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/XLocaleMatcherTest.java
icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/data/localeMatcherTest.txt

index 745f12a25cbde640936bb32300f081211f60492f..b5110dde8b72f765eb7fe5a03f8a141dac145450 100644 (file)
@@ -136,7 +136,9 @@ public class XLikelySubtags {
         //                //new UnicodeRegex().compileBnf(pat)
         //                );
         //
-        // TODO: fix this to check for format. Not required, since this is only called internally, but safer for the future.
+        // NOTE: Should we fix this to check for format?
+        // ANSWER: Not required, since this is only called internally. Moreover, we deliberately
+        // use invalid language tags ("x1", "x2", etc.) to represent pseudo-locales. See below.
         static LSR from(String languageIdentifier) {
             String[] parts = languageIdentifier.split("[-_]");
             if (parts.length < 1 || parts.length > 3) {
@@ -147,19 +149,64 @@ public class XLikelySubtags {
             String p3 = parts.length < 3 ? "" : parts[2];
             return p2.length() < 4 ? new LSR(lang, "", p2) : new LSR(lang, p2, p3);
 
-            //            Matcher matcher = LANGUAGE_PATTERN.matcher(languageIdentifier);
-            //            if (!matcher.matches()) {
-            //                return new LSR(matcher.group(1), matcher.group(2), matcher.group(3));
-            //            }
-            //            System.out.println(RegexUtilities.showMismatch(matcher, languageIdentifier));
-            //            throw new ICUException("invalid language id");
+            //        Matcher matcher = LANGUAGE_PATTERN.matcher(languageIdentifier);
+            //        if (!matcher.matches()) {
+            //            return new LSR(matcher.group(1), matcher.group(2), matcher.group(3));
+            //        }
+            //        System.out.println(RegexUtilities.showMismatch(matcher, languageIdentifier));
+            //        throw new ICUException("invalid language id");
+        }
+
+        private static final HashMap<ULocale, LSR> pseudoReplacements = new HashMap<ULocale, LSR>(11);
+
+        // Note: code in XLocaleDistance.java handles pseudo-regions XA, XB, and XC, making them
+        // very distant from any other locale. Similarly, it establishes that any of the
+        // invalid locales below ("x1", "x2", ..., "x7", and "x8-en") are very distant
+        // from any other locale.
+        static {
+      String[][] source = {
+        {"x-bork", "x1", "", ""},
+        {"x-elmer", "x2", "", ""},
+        {"x-hacker", "x3", "", ""},
+        {"x-piglatin", "x4", "", ""},
+        {"x-pirate", "x5", "", ""},
+        {"en-XA", "x6", "", ""},
+        {"en-PSACCENT", "x6", "", ""}, // Note: same as for en-XA
+        {"ar-XB", "x7", "", ""},
+        {"ar-PSBIDI", "x7", "", ""}, // Note: same as for ar-XB
+        {"en-XC", "x8", "en", ""}, // Note: language is stored in LSR.script field
+        {"en-PSCRACK", "x8", "en", ""}, // Note: same as for en-XC
+      };
+            for (int i = 0; i < source.length; ++i) {
+                pseudoReplacements.put(new ULocale(source[i][0]),
+                    new LSR(source[i][1], source[i][2], source[i][3]));
+            }
+
         }
 
         public static LSR from(ULocale locale) {
+            LSR replacement = pseudoReplacements.get(locale);
+            if (replacement != null) {
+                return replacement;
+            }
+            // Map *-*-*-PSCRACK to x8-***, same as for en-PSCRACK.
+            if ("PSCRACK".equals(locale.getVariant())) {
+                return new LSR(
+                    "x8", locale.getLanguage() + locale.getScript() + locale.getCountry(), "");
+            }
             return new LSR(locale.getLanguage(), locale.getScript(), locale.getCountry());
         }
 
         public static LSR fromMaximalized(ULocale locale) {
+            LSR replacement = pseudoReplacements.get(locale);
+            if (replacement != null) {
+                return replacement;
+            }
+            // Map *-*-*-PSCRACK to x8-***, same as for en-PSCRACK.
+            if ("PSCRACK".equals(locale.getVariant())) {
+                return new LSR(
+                    "x8", locale.getLanguage() + locale.getScript() + locale.getCountry(), "");
+            }
             return fromMaximalized(locale.getLanguage(), locale.getScript(), locale.getCountry());
         }
 
index 48f043e7858f68a66ea21661052a0511bad7ac6a..bfde807b4ce5e40da34da489aad8ce46cbe2e409 100644 (file)
@@ -46,6 +46,10 @@ public class XLocaleDistance {
 
     public static final int ABOVE_THRESHOLD = 100;
 
+    // Activates debugging output to stderr with details of getDistance and distanceRaw calls.
+    // Be sure to set this to false before checking this in for production!
+    private static final boolean TRACE_DISTANCE = false;
+
     @Deprecated
     public static final String ANY = "�"; // matches any character. Uses value above any subtag.
 
@@ -441,6 +445,10 @@ public class XLocaleDistance {
 
         @Override
         public int getDistance(String desired, String supported, Output<DistanceTable> distanceTable, boolean starEquals) {
+            if (TRACE_DISTANCE) {
+                System.err.printf("    Entering       getDistance: desired=%s supported=%s starEquals=%s\n",
+                    desired, supported, Boolean.toString(starEquals));
+            }
             boolean star = false;
             Map<String, DistanceNode> sub2 = subtables.get(desired);
             if (sub2 == null) {
@@ -462,7 +470,11 @@ public class XLocaleDistance {
             if (distanceTable != null) {
                 distanceTable.value = ((StringDistanceNode) value).distanceTable;
             }
-            return starEquals && star && desired.equals(supported) ? 0 : value.distance;
+            int result = starEquals && star && desired.equals(supported) ? 0 : value.distance;
+            if (TRACE_DISTANCE) {
+                System.err.printf("    Returning from getDistance: %d\n", result);
+            }
+            return result;
         }
 
         public void copy(StringDistanceTable other) {
@@ -619,6 +631,7 @@ public class XLocaleDistance {
                                 buffer.append('\t').append('#').append(id).append('\n');
                             } else {
                                 ((StringDistanceTable)distanceTable).toString(abbreviate, indent+"\t\t\t", intern, buffer);
+                                buffer.append('\n');
                             }
                         } else {
                             buffer.append('\n');
@@ -726,17 +739,31 @@ public class XLocaleDistance {
      * ULocales must be in canonical, addLikelySubtags format. Returns distance
      */
     public int distanceRaw(LSR desired, LSR supported, int threshold, DistanceOption distanceOption) {
-        return distanceRaw(desired.language, supported.language,
+        if (TRACE_DISTANCE) {
+            System.err.printf("  Entering       distanceRaw: desired=%s supported=%s "
+            + "threshold=%d preferred=%s\n",
+            desired, supported, threshold,
+            distanceOption.name());
+        }
+        int result = distanceRaw(desired.language, supported.language,
                 desired.script, supported.script,
                 desired.region, supported.region,
                 threshold, distanceOption);
+        if (TRACE_DISTANCE) {
+            System.err.printf("  Returning from distanceRaw: %d\n", result);
+        }
+        return result;
     }
 
-    public enum DistanceOption {NORMAL, SCRIPT_FIRST}
+    public enum DistanceOption {REGION_FIRST, SCRIPT_FIRST}
+    // NOTE: Replaced "NORMAL" with "REGION_FIRST". By default, scripts have greater weight
+    // than regions, so they might be considered the "normal" case.
 
     /**
      * Returns distance, from 0 to ABOVE_THRESHOLD.
-     * ULocales must be in canonical, addLikelySubtags format. Returns distance
+     * ULocales must be in canonical, addLikelySubtags format.
+     * (Exception: internal calls may pass any strings. They do this for pseudo-locales.)
+     * Returns distance.
      */
     public int distanceRaw(
             String desiredLang, String supportedLang,
@@ -942,6 +969,28 @@ public class XLocaleDistance {
                 }
             }
         }
+
+        // Pseudo regions (XA, XB, XC) and pseudo languages (x1 ... x8) should match nothing else.
+        // {"*-*-XA", "*-*-*", "0"},
+        // {"*-*-XB", "*-*-*", "0"},
+        // {"*-*-XC", "*-*-*", "0"},
+        // {"x1-*-*", "*-*-*", "0"},
+        // {"x2-*-*", "*-*-*", "0"},
+        // ...
+        // {"x8-*-*", "*-*-*", "0"},
+        List<String> supported = Arrays.asList("*", "*", "*");
+        for (String x : Arrays.asList("XA", "XB", "XC")) {
+            List<String> desired = Arrays.asList("*", "*", x);
+            add(defaultDistanceTable, desired, supported, 100);
+            add(defaultDistanceTable, supported, desired, 100);
+        }
+        // See XLikelySubtags.java for the mapping of pseudo-locales to x1 ... x8.
+        for (int i = 1; i <= 8; ++i) {
+            List<String> desired = Arrays.asList("x" + String.valueOf(i), "*", "*");
+            add(defaultDistanceTable, desired, supported, 100);
+            add(defaultDistanceTable, supported, desired, 100);
+        }
+
         if (PRINT_OVERRIDES) {
             System.out.println("\t\t</languageMatches>");
         }
index 09ef5f02d424c5b006219f6fb3aad6a77d547762..8782aab0416d4b57648f2b87ee50ee619cba5576 100644 (file)
@@ -27,6 +27,9 @@ public class XLocaleMatcher {
     private static final LSR UND = new LSR("und","","");
     private static final ULocale UND_LOCALE = new ULocale("und");
 
+    // Activates debugging output to stderr with details of GetBestMatch.
+    private static final boolean TRACE_MATCHER = false;
+
     // normally the default values, but can be set via constructor
 
     private final XLocaleDistance localeDistance;
@@ -60,7 +63,9 @@ public class XLocaleMatcher {
             return this;
         }
         public Builder setSupportedLocales(Set<ULocale> languagePriorityList) {
-            this.supportedLanguagesList = languagePriorityList;
+            Set<ULocale> temp = new LinkedHashSet<ULocale>(); // maintain order
+            temp.addAll(languagePriorityList);
+            this.supportedLanguagesList = temp;
             return this;
         }
 
@@ -114,6 +119,22 @@ public class XLocaleMatcher {
         public XLocaleMatcher build() {
             return new XLocaleMatcher(this);
         }
+
+        @Override
+        public String toString() {
+          StringBuilder s = new StringBuilder().append("{XLocaleMatcher.Builder");
+          if (!supportedLanguagesList.isEmpty()) {
+            s.append(" supported={").append(supportedLanguagesList.toString()).append("}");
+          }
+          if (defaultLanguage != null) {
+            s.append(" default=").append(defaultLanguage.toString());
+          }
+          if (thresholdDistance >= 0) {
+            s.append(String.format(" thresholdDistance=%d", thresholdDistance));
+          }
+          s.append(" preference=").append(distanceOption.name());
+          return s.append("}").toString();
+        }
     }
 
     /**
@@ -176,7 +197,8 @@ public class XLocaleMatcher {
     private Multimap<LSR,ULocale> extractLsrMap(Set<ULocale> languagePriorityList, Set<LSR> priorities) {
         Multimap<LSR, ULocale> builder = LinkedHashMultimap.create();
         for (ULocale item : languagePriorityList) {
-            final LSR max = item.equals(UND_LOCALE) ? UND : LSR.fromMaximalized(item);
+            final LSR max = item.equals(UND_LOCALE) ? UND :
+            LSR.fromMaximalized(item);
             builder.put(max, item);
         }
         if (builder.size() > 1 && priorities != null) {
@@ -255,46 +277,65 @@ public class XLocaleMatcher {
         ULocale bestDesiredLocale = null;
         Collection<ULocale> bestSupportedLocales = null;
         int delta = 0;
-        mainLoop:
-            for (final Entry<LSR, ULocale> desiredLsrAndLocale : desiredLSRs.entries()) {
-                // quick check for exact match
-                ULocale desiredLocale = desiredLsrAndLocale.getValue();
-                LSR desiredLSR = desiredLsrAndLocale.getKey();
-                if (delta < bestDistance) {
-                    if (exactSupportedLocales.contains(desiredLocale)) {
-                        if (outputBestDesired != null) {
-                            outputBestDesired.value = desiredLocale;
-                        }
-                        return desiredLocale;
-                    }
-                    // quick check for maximized locale
-                    Collection<ULocale> found = supportedLanguages.get(desiredLSR);
-                    if (found != null) {
-                        // if we find one in the set, return first (lowest). We already know the exact one isn't there.
-                        if (outputBestDesired != null) {
-                            outputBestDesired.value = desiredLocale;
-                        }
-                        return found.iterator().next();
-                    }
+    mainLoop:
+        for (final Entry<LSR, Set<ULocale>> desiredLsrAndLocales : desiredLSRs.asMap().entrySet()) {
+          LSR desiredLSR = desiredLsrAndLocales.getKey();
+          for (ULocale desiredLocale : desiredLsrAndLocales.getValue()) {
+            // quick check for exact match
+            if (delta < bestDistance) {
+              if (exactSupportedLocales.contains(desiredLocale)) {
+                if (outputBestDesired != null) {
+                  outputBestDesired.value = desiredLocale;
+                }
+                if (TRACE_MATCHER) {
+                    System.err.printf(
+                              "Returning %s, which is an exact match for a supported language\n",
+                              desiredLocale);
+                 }
+                return desiredLocale;
+              }
+              // quick check for maximized locale
+              Collection<ULocale> found = supportedLanguages.get(desiredLSR);
+              if (found != null) {
+                // if we find one in the set, return first (lowest). We already know the exact one isn't
+                // there.
+                if (outputBestDesired != null) {
+                  outputBestDesired.value = desiredLocale;
                 }
-                for (final Entry<LSR, Set<ULocale>> supportedLsrAndLocale : supportedLanguages.entrySet()) {
-                    int distance = delta + localeDistance.distanceRaw(desiredLSR, supportedLsrAndLocale.getKey(),
-                        thresholdDistance, distanceOption);
-                    if (distance < bestDistance) {
-                        bestDistance = distance;
-                        bestDesiredLocale = desiredLocale;
-                        bestSupportedLocales = supportedLsrAndLocale.getValue();
-                        if (distance == 0) {
-                            break mainLoop;
-                        }
-                    }
+                ULocale result = found.iterator().next();
+                if (TRACE_MATCHER) {
+                  System.err.printf("Returning %s\n", result.toString());
                 }
-                delta += demotionPerAdditionalDesiredLocale;
+                return result;
+              }
             }
+            for (final Entry<LSR, Set<ULocale>> supportedLsrAndLocale : supportedLanguages.entrySet()) {
+              int distance =
+                  delta
+                      + localeDistance.distanceRaw(
+                          desiredLSR,
+                          supportedLsrAndLocale.getKey(),
+                          thresholdDistance,
+                          distanceOption);
+              if (distance < bestDistance) {
+                bestDistance = distance;
+                bestDesiredLocale = desiredLocale;
+                bestSupportedLocales = supportedLsrAndLocale.getValue();
+                if (distance == 0) {
+                  break mainLoop;
+                }
+              }
+            }
+            delta += demotionPerAdditionalDesiredLocale;
+          }
+        }
         if (bestDistance >= thresholdDistance) {
             if (outputBestDesired != null) {
                 outputBestDesired.value = null;
             }
+            if (TRACE_MATCHER) {
+              System.err.printf("Returning default %s\n", defaultLanguage.toString());
+            }
             return defaultLanguage;
         }
         if (outputBestDesired != null) {
@@ -302,10 +343,18 @@ public class XLocaleMatcher {
         }
         // pick exact match if there is one
         if (bestSupportedLocales.contains(bestDesiredLocale)) {
+            if (TRACE_MATCHER) {
+              System.err.printf(
+                  "Returning %s which matches a supported language\n", bestDesiredLocale.toString());
+            }
             return bestDesiredLocale;
         }
         // otherwise return first supported, combining variants and extensions from bestDesired
-        return bestSupportedLocales.iterator().next();
+        ULocale result = bestSupportedLocales.iterator().next();
+        if (TRACE_MATCHER) {
+          System.err.printf("Returning first supported language %s\n", result.toString());
+        }
+        return result;
     }
 
     /**
@@ -327,17 +376,24 @@ public class XLocaleMatcher {
             if (outputBestDesired != null) {
                 outputBestDesired.value = desiredLocale;
             }
+            if (TRACE_MATCHER) {
+              System.err.printf("Exact match with a supported locale.\n");
+            }
             return desiredLocale;
         }
         // quick check for maximized locale
-        if (distanceOption == DistanceOption.NORMAL) {
+        if (distanceOption == DistanceOption.REGION_FIRST) {
             Collection<ULocale> found = supportedLanguages.get(desiredLSR);
             if (found != null) {
                 // if we find one in the set, return first (lowest). We already know the exact one isn't there.
                 if (outputBestDesired != null) {
                     outputBestDesired.value = desiredLocale;
                 }
-                return found.iterator().next();
+                ULocale result = found.iterator().next();
+                if (TRACE_MATCHER) {
+                  System.err.printf("Matches a maximized supported locale: %s\n", result);
+                }
+                return result;
             }
         }
         for (final Entry<LSR, Set<ULocale>> supportedLsrAndLocale : supportedLanguages.entrySet()) {
@@ -356,6 +412,11 @@ public class XLocaleMatcher {
             if (outputBestDesired != null) {
                 outputBestDesired.value = null;
             }
+            if (TRACE_MATCHER) {
+              System.err.printf(
+                  "Returning default %s because everything exceeded the threshold of %d.\n",
+                  defaultLanguage, thresholdDistance);
+            }
             return defaultLanguage;
         }
         if (outputBestDesired != null) {
@@ -366,7 +427,11 @@ public class XLocaleMatcher {
             return bestDesiredLocale;
         }
         // otherwise return first supported, combining variants and extensions from bestDesired
-        return bestSupportedLocales.iterator().next();
+        ULocale result = bestSupportedLocales.iterator().next();
+        if (TRACE_MATCHER) {
+          System.err.printf("First in the list of supported locales: %s\n", result);
+        }
+        return result;
     }
 
     /** Combine features of the desired locale into those of the supported, and return result. */
index 9f331e26ebf561314ca4ccdebd5e95582ccb6f10..2ea96a7fb1bc1af7b729bb3b5eea049c0347b8a4 100644 (file)
@@ -99,8 +99,8 @@ public class XLocaleDistanceTest extends TestFmwk {
                 newLikelyTime += System.nanoTime()-temp;
 
                 temp = System.nanoTime();
-                int dist1 = localeMatcher.distanceRaw(desiredLSR, supportedLSR, 1000, DistanceOption.NORMAL);
-                int dist2 = localeMatcher.distanceRaw(supportedLSR, desiredLSR, 1000, DistanceOption.NORMAL);
+                int dist1 = localeMatcher.distanceRaw(desiredLSR, supportedLSR, 1000, DistanceOption.REGION_FIRST);
+                int dist2 = localeMatcher.distanceRaw(supportedLSR, desiredLSR, 1000, DistanceOption.REGION_FIRST);
                 newTimeMinusLikely += System.nanoTime()-temp;
             }
         }
@@ -178,7 +178,7 @@ public class XLocaleDistanceTest extends TestFmwk {
     class MyTestFileHandler extends DataDrivenTestHelper {
         final XLocaleDistance distance = XLocaleDistance.getDefault();
         Output<ULocale> bestDesired = new Output<ULocale>();
-        private DistanceOption distanceOption = DistanceOption.NORMAL;
+        private DistanceOption distanceOption = DistanceOption.REGION_FIRST;
         private Integer threshold = distance.getDefaultScriptDistance();
 
         @Override
index 9fc94b1abb817101871e9b0518ecaf2ef8a8b7ef..c84d8c0a2d042d4141d598a9675548fb3094aa4a 100644 (file)
@@ -282,7 +282,7 @@ public class XLocaleMatcherTest extends TestFmwk {
     class MyTestFileHandler extends DataDrivenTestHelper {
 
         Output<ULocale> bestDesired = new Output<ULocale>();
-        DistanceOption distanceOption = DistanceOption.NORMAL;
+        DistanceOption distanceOption = DistanceOption.REGION_FIRST;
         int threshold = -1;
 
         @Override
@@ -305,8 +305,7 @@ public class XLocaleMatcherTest extends TestFmwk {
             if (breakpoint) {
                 breakpoint = false; // put debugger breakpoint here to break at @debug in test file
             }
-
-            XLocaleMatcher matcher = threshold < 0 && distanceOption == DistanceOption.NORMAL
+            XLocaleMatcher matcher = threshold < 0 && distanceOption == DistanceOption.REGION_FIRST
                 ? newXLocaleMatcher(supportedList)
                 : newXLocaleMatcher(supportedList, threshold, distanceOption);
             commentBase = "(" + lineNumber + ") " + commentBase;
index 0e3e3a582f733ebc7bb846d07f19b49810a40548..55c0f3f5a962ae2f24f0fbb18f313b1bf5dd7648 100644 (file)
@@ -334,8 +334,8 @@ und, no ;   nn-BE-fonipa ;  no ;    no-BE-fonipa
 und, en-GB-u-sd-gbsct ;        en-fonipa-u-nu-Arab-ca-buddhist-t-m0-iso-i0-pinyin ;    en-GB-u-sd-gbsct ;      en-GB-fonipa-u-nu-Arab-ca-buddhist-t-m0-iso-i0-pinyin
 
 en-PSCRACK, de-PSCRACK, fr-PSCRACK, pt-PT-PSCRACK ;    fr-PSCRACK ;    fr-PSCRACK
-en-PSCRACK, de-PSCRACK, fr-PSCRACK, pt-PT-PSCRACK ;    fr ;    fr-PSCRACK
-en-PSCRACK, de-PSCRACK, fr-PSCRACK, pt-PT-PSCRACK ;    de-CH ;         de-PSCRACK
+en-PSCRACK, de-PSCRACK, fr-PSCRACK, pt-PT-PSCRACK ;    fr ;    en-PSCRACK           # was: fr-PSCRACK
+en-PSCRACK, de-PSCRACK, fr-PSCRACK, pt-PT-PSCRACK ;    de-CH ;         en-PSCRACK   # was: de-PSCRACK
 
 ##################################################
 # testClusters
@@ -384,4 +384,4 @@ und, en-GU, en-GB, en-IN ;  en-VI ;         en-GU
 ru, fr ; zh, pl ; fr
 ru, fr ; zh-Cyrl, pl ; ru
 #hr, en-Cyrl; sr ; en-Cyrl
-da, ru, hr; sr ; ru
\ No newline at end of file
+da, ru, hr; sr ; ru