From: Norbert Runge Date: Thu, 23 Aug 2018 20:31:30 +0000 (-0700) Subject: ICU-20092 Integrates new languagematcher enhancements into ICU4J. X-Git-Tag: release-63-rc~94^2 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=c854dd0d5472e6596c1456b1655867de19968798;p=icu ICU-20092 Integrates new languagematcher enhancements into ICU4J. --- diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLikelySubtags.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLikelySubtags.java index 745f12a25cb..b5110dde8b7 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLikelySubtags.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLikelySubtags.java @@ -136,7 +136,9 @@ public class XLikelySubtags { // //new UnicodeRegex().compileBnf(pat) // ); // - // TODO: fix this to check for format. Not required, since this is only called internally, but safer for the future. + // NOTE: Should we fix this to check for format? + // ANSWER: Not required, since this is only called internally. Moreover, we deliberately + // use invalid language tags ("x1", "x2", etc.) to represent pseudo-locales. See below. static LSR from(String languageIdentifier) { String[] parts = languageIdentifier.split("[-_]"); if (parts.length < 1 || parts.length > 3) { @@ -147,19 +149,64 @@ public class XLikelySubtags { String p3 = parts.length < 3 ? "" : parts[2]; return p2.length() < 4 ? 
new LSR(lang, "", p2) : new LSR(lang, p2, p3); - // Matcher matcher = LANGUAGE_PATTERN.matcher(languageIdentifier); - // if (!matcher.matches()) { - // return new LSR(matcher.group(1), matcher.group(2), matcher.group(3)); - // } - // System.out.println(RegexUtilities.showMismatch(matcher, languageIdentifier)); - // throw new ICUException("invalid language id"); + // Matcher matcher = LANGUAGE_PATTERN.matcher(languageIdentifier); + // if (!matcher.matches()) { + // return new LSR(matcher.group(1), matcher.group(2), matcher.group(3)); + // } + // System.out.println(RegexUtilities.showMismatch(matcher, languageIdentifier)); + // throw new ICUException("invalid language id"); + } + + private static final HashMap pseudoReplacements = new HashMap(11); + + // Note: code in XLocaleDistance.java handles pseudo-regions XA, XB, and XC, making them + // very distant from any other locale. Similarly, it establishes that any of the + // invalid locales below ("x1", "x2", ..., "x7", and "x8-en") are very distant + // from any other locale. + static { + String[][] source = { + {"x-bork", "x1", "", ""}, + {"x-elmer", "x2", "", ""}, + {"x-hacker", "x3", "", ""}, + {"x-piglatin", "x4", "", ""}, + {"x-pirate", "x5", "", ""}, + {"en-XA", "x6", "", ""}, + {"en-PSACCENT", "x6", "", ""}, // Note: same as for en-XA + {"ar-XB", "x7", "", ""}, + {"ar-PSBIDI", "x7", "", ""}, // Note: same as for ar-XB + {"en-XC", "x8", "en", ""}, // Note: language is stored in LSR.script field + {"en-PSCRACK", "x8", "en", ""}, // Note: same as for en-XC + }; + for (int i = 0; i < source.length; ++i) { + pseudoReplacements.put(new ULocale(source[i][0]), + new LSR(source[i][1], source[i][2], source[i][3])); + } + } public static LSR from(ULocale locale) { + LSR replacement = pseudoReplacements.get(locale); + if (replacement != null) { + return replacement; + } + // Map *-*-*-PSCRACK to x8-***, same as for en-PSCRACK. 
+ if ("PSCRACK".equals(locale.getVariant())) { + return new LSR( + "x8", locale.getLanguage() + locale.getScript() + locale.getCountry(), ""); + } return new LSR(locale.getLanguage(), locale.getScript(), locale.getCountry()); } public static LSR fromMaximalized(ULocale locale) { + LSR replacement = pseudoReplacements.get(locale); + if (replacement != null) { + return replacement; + } + // Map *-*-*-PSCRACK to x8-***, same as for en-PSCRACK. + if ("PSCRACK".equals(locale.getVariant())) { + return new LSR( + "x8", locale.getLanguage() + locale.getScript() + locale.getCountry(), ""); + } return fromMaximalized(locale.getLanguage(), locale.getScript(), locale.getCountry()); } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLocaleDistance.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLocaleDistance.java index 48f043e7858..bfde807b4ce 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLocaleDistance.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLocaleDistance.java @@ -46,6 +46,10 @@ public class XLocaleDistance { public static final int ABOVE_THRESHOLD = 100; + // Activates debugging output to stderr with details of GetBestMatch. + // Be sure to set this to false before checking this in for production! + private static final boolean TRACE_DISTANCE = false; + @Deprecated public static final String ANY = "�"; // matches any character. Uses value above any subtag. 
@@ -441,6 +445,10 @@ public class XLocaleDistance { @Override public int getDistance(String desired, String supported, Output distanceTable, boolean starEquals) { + if (TRACE_DISTANCE) { + System.err.printf(" Entering getDistance: desired=%s supported=%s starEquals=%s\n", + desired, supported, Boolean.toString(starEquals)); + } boolean star = false; Map sub2 = subtables.get(desired); if (sub2 == null) { @@ -462,7 +470,11 @@ public class XLocaleDistance { if (distanceTable != null) { distanceTable.value = ((StringDistanceNode) value).distanceTable; } - return starEquals && star && desired.equals(supported) ? 0 : value.distance; + int result = starEquals && star && desired.equals(supported) ? 0 : value.distance; + if (TRACE_DISTANCE) { + System.err.printf(" Returning from getDistance: %d\n", result); + } + return result; } public void copy(StringDistanceTable other) { @@ -619,6 +631,7 @@ public class XLocaleDistance { buffer.append('\t').append('#').append(id).append('\n'); } else { ((StringDistanceTable)distanceTable).toString(abbreviate, indent+"\t\t\t", intern, buffer); + buffer.append('\n'); } } else { buffer.append('\n'); @@ -726,17 +739,31 @@ public class XLocaleDistance { * ULocales must be in canonical, addLikelySubtags format. 
Returns distance */ public int distanceRaw(LSR desired, LSR supported, int threshold, DistanceOption distanceOption) { - return distanceRaw(desired.language, supported.language, + if (TRACE_DISTANCE) { + System.err.printf(" Entering distanceRaw: desired=%s supported=%s " + + "threshold=%d preferred=%s\n", + desired, supported, threshold, + distanceOption.name()); + } + int result = distanceRaw(desired.language, supported.language, desired.script, supported.script, desired.region, supported.region, threshold, distanceOption); + if (TRACE_DISTANCE) { + System.err.printf(" Returning from distanceRaw: %d\n", result); + } + return result; } - public enum DistanceOption {NORMAL, SCRIPT_FIRST} + public enum DistanceOption {REGION_FIRST, SCRIPT_FIRST} + // NOTE: Replaced "NORMAL" with "REGION_FIRST". By default, scripts have greater weight + // than regions, so they might be considered the "normal" case. /** * Returns distance, from 0 to ABOVE_THRESHOLD. - * ULocales must be in canonical, addLikelySubtags format. Returns distance + * ULocales must be in canonical, addLikelySubtags format. + * (Exception: internal calls may pass any strings. They do this for pseudo-locales.) + * Returns distance. */ public int distanceRaw( String desiredLang, String supportedLang, @@ -942,6 +969,28 @@ public class XLocaleDistance { } } } + + // Pseudo regions should match no other regions. + // {"*-*-XA", "*-*-*", "0"}, + // {"*-*-XB", "*-*-*", "0"}, + // {"*-*-XC", "*-*-*", "0"}, + // {"x1-*-*", "*-*-*", "0"}, + // {"x2-*-*", "*-*-*", "0"}, + // ... + // {"x8-*-*", "*-*-*", "0"}, + List supported = Arrays.asList("*", "*", "*"); + for (String x : Arrays.asList("XA", "XB", "XC")) { + List desired = Arrays.asList("*", "*", x); + add(defaultDistanceTable, desired, supported, 100); + add(defaultDistanceTable, supported, desired, 100); + } + // See XLikelySubtags.java for the mapping of pseudo-locales to x1 ... x8. 
+ for (int i = 1; i <= 8; ++i) { + List desired = Arrays.asList("x" + String.valueOf(i), "*", "*"); + add(defaultDistanceTable, desired, supported, 100); + add(defaultDistanceTable, supported, desired, 100); + } + if (PRINT_OVERRIDES) { System.out.println("\t\t"); } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLocaleMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLocaleMatcher.java index 09ef5f02d42..8782aab0416 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLocaleMatcher.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLocaleMatcher.java @@ -27,6 +27,9 @@ public class XLocaleMatcher { private static final LSR UND = new LSR("und","",""); private static final ULocale UND_LOCALE = new ULocale("und"); + // Activates debugging output to stderr with details of GetBestMatch. + private static final boolean TRACE_MATCHER = false; + // normally the default values, but can be set via constructor private final XLocaleDistance localeDistance; @@ -60,7 +63,9 @@ public class XLocaleMatcher { return this; } public Builder setSupportedLocales(Set languagePriorityList) { - this.supportedLanguagesList = languagePriorityList; + Set temp = new LinkedHashSet(); // maintain order + temp.addAll(languagePriorityList); + this.supportedLanguagesList = temp; return this; } @@ -114,6 +119,22 @@ public class XLocaleMatcher { public XLocaleMatcher build() { return new XLocaleMatcher(this); } + + @Override + public String toString() { + StringBuilder s = new StringBuilder().append("{XLocaleMatcher.Builder"); + if (!supportedLanguagesList.isEmpty()) { + s.append(" supported={").append(supportedLanguagesList.toString()).append("}"); + } + if (defaultLanguage != null) { + s.append(" default=").append(defaultLanguage.toString()); + } + if (thresholdDistance >= 0) { + s.append(String.format(" thresholdDistance=%d", thresholdDistance)); + } + s.append(" preference=").append(distanceOption.name()); + return 
s.append("}").toString(); + } } /** @@ -176,7 +197,8 @@ public class XLocaleMatcher { private Multimap extractLsrMap(Set languagePriorityList, Set priorities) { Multimap builder = LinkedHashMultimap.create(); for (ULocale item : languagePriorityList) { - final LSR max = item.equals(UND_LOCALE) ? UND : LSR.fromMaximalized(item); + final LSR max = item.equals(UND_LOCALE) ? UND : + LSR.fromMaximalized(item); builder.put(max, item); } if (builder.size() > 1 && priorities != null) { @@ -255,46 +277,65 @@ public class XLocaleMatcher { ULocale bestDesiredLocale = null; Collection bestSupportedLocales = null; int delta = 0; - mainLoop: - for (final Entry desiredLsrAndLocale : desiredLSRs.entries()) { - // quick check for exact match - ULocale desiredLocale = desiredLsrAndLocale.getValue(); - LSR desiredLSR = desiredLsrAndLocale.getKey(); - if (delta < bestDistance) { - if (exactSupportedLocales.contains(desiredLocale)) { - if (outputBestDesired != null) { - outputBestDesired.value = desiredLocale; - } - return desiredLocale; - } - // quick check for maximized locale - Collection found = supportedLanguages.get(desiredLSR); - if (found != null) { - // if we find one in the set, return first (lowest). We already know the exact one isn't there. 
- if (outputBestDesired != null) { - outputBestDesired.value = desiredLocale; - } - return found.iterator().next(); - } + mainLoop: + for (final Entry> desiredLsrAndLocales : desiredLSRs.asMap().entrySet()) { + LSR desiredLSR = desiredLsrAndLocales.getKey(); + for (ULocale desiredLocale : desiredLsrAndLocales.getValue()) { + // quick check for exact match + if (delta < bestDistance) { + if (exactSupportedLocales.contains(desiredLocale)) { + if (outputBestDesired != null) { + outputBestDesired.value = desiredLocale; + } + if (TRACE_MATCHER) { + System.err.printf( + "Returning %s, which is an exact match for a supported language\n", + desiredLocale); + } + return desiredLocale; + } + // quick check for maximized locale + Collection found = supportedLanguages.get(desiredLSR); + if (found != null) { + // if we find one in the set, return first (lowest). We already know the exact one isn't + // there. + if (outputBestDesired != null) { + outputBestDesired.value = desiredLocale; } - for (final Entry> supportedLsrAndLocale : supportedLanguages.entrySet()) { - int distance = delta + localeDistance.distanceRaw(desiredLSR, supportedLsrAndLocale.getKey(), - thresholdDistance, distanceOption); - if (distance < bestDistance) { - bestDistance = distance; - bestDesiredLocale = desiredLocale; - bestSupportedLocales = supportedLsrAndLocale.getValue(); - if (distance == 0) { - break mainLoop; - } - } + ULocale result = found.iterator().next(); + if (TRACE_MATCHER) { + System.err.printf("Returning %s\n", result.toString()); } - delta += demotionPerAdditionalDesiredLocale; + return result; + } } + for (final Entry> supportedLsrAndLocale : supportedLanguages.entrySet()) { + int distance = + delta + + localeDistance.distanceRaw( + desiredLSR, + supportedLsrAndLocale.getKey(), + thresholdDistance, + distanceOption); + if (distance < bestDistance) { + bestDistance = distance; + bestDesiredLocale = desiredLocale; + bestSupportedLocales = supportedLsrAndLocale.getValue(); + if (distance == 
0) { + break mainLoop; + } + } + } + delta += demotionPerAdditionalDesiredLocale; + } + } if (bestDistance >= thresholdDistance) { if (outputBestDesired != null) { outputBestDesired.value = null; } + if (TRACE_MATCHER) { + System.err.printf("Returning default %s\n", defaultLanguage.toString()); + } return defaultLanguage; } if (outputBestDesired != null) { @@ -302,10 +343,18 @@ public class XLocaleMatcher { } // pick exact match if there is one if (bestSupportedLocales.contains(bestDesiredLocale)) { + if (TRACE_MATCHER) { + System.err.printf( + "Returning %s which matches a supported language\n", bestDesiredLocale.toString()); + } return bestDesiredLocale; } // otherwise return first supported, combining variants and extensions from bestDesired - return bestSupportedLocales.iterator().next(); + ULocale result = bestSupportedLocales.iterator().next(); + if (TRACE_MATCHER) { + System.err.printf("Returning first supported language %s\n", result.toString()); + } + return result; } /** @@ -327,17 +376,24 @@ public class XLocaleMatcher { if (outputBestDesired != null) { outputBestDesired.value = desiredLocale; } + if (TRACE_MATCHER) { + System.err.printf("Exact match with a supported locale.\n"); + } return desiredLocale; } // quick check for maximized locale - if (distanceOption == DistanceOption.NORMAL) { + if (distanceOption == DistanceOption.REGION_FIRST) { Collection found = supportedLanguages.get(desiredLSR); if (found != null) { // if we find one in the set, return first (lowest). We already know the exact one isn't there. 
if (outputBestDesired != null) { outputBestDesired.value = desiredLocale; } - return found.iterator().next(); + ULocale result = found.iterator().next(); + if (TRACE_MATCHER) { + System.err.printf("Matches a maximized supported locale: %s\n", result); + } + return result; } } for (final Entry> supportedLsrAndLocale : supportedLanguages.entrySet()) { @@ -356,6 +412,11 @@ public class XLocaleMatcher { if (outputBestDesired != null) { outputBestDesired.value = null; } + if (TRACE_MATCHER) { + System.err.printf( + "Returning default %s because everything exceeded the threshold of %d.\n", + defaultLanguage, thresholdDistance); + } return defaultLanguage; } if (outputBestDesired != null) { @@ -366,7 +427,11 @@ public class XLocaleMatcher { return bestDesiredLocale; } // otherwise return first supported, combining variants and extensions from bestDesired - return bestSupportedLocales.iterator().next(); + ULocale result = bestSupportedLocales.iterator().next(); + if (TRACE_MATCHER) { + System.err.printf("First in the list of supported locales: %s\n", result); + } + return result; } /** Combine features of the desired locale into those of the supported, and return result. 
*/ diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/XLocaleDistanceTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/XLocaleDistanceTest.java index 9f331e26ebf..2ea96a7fb1b 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/XLocaleDistanceTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/XLocaleDistanceTest.java @@ -99,8 +99,8 @@ public class XLocaleDistanceTest extends TestFmwk { newLikelyTime += System.nanoTime()-temp; temp = System.nanoTime(); - int dist1 = localeMatcher.distanceRaw(desiredLSR, supportedLSR, 1000, DistanceOption.NORMAL); - int dist2 = localeMatcher.distanceRaw(supportedLSR, desiredLSR, 1000, DistanceOption.NORMAL); + int dist1 = localeMatcher.distanceRaw(desiredLSR, supportedLSR, 1000, DistanceOption.REGION_FIRST); + int dist2 = localeMatcher.distanceRaw(supportedLSR, desiredLSR, 1000, DistanceOption.REGION_FIRST); newTimeMinusLikely += System.nanoTime()-temp; } } @@ -178,7 +178,7 @@ public class XLocaleDistanceTest extends TestFmwk { class MyTestFileHandler extends DataDrivenTestHelper { final XLocaleDistance distance = XLocaleDistance.getDefault(); Output bestDesired = new Output(); - private DistanceOption distanceOption = DistanceOption.NORMAL; + private DistanceOption distanceOption = DistanceOption.REGION_FIRST; private Integer threshold = distance.getDefaultScriptDistance(); @Override diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/XLocaleMatcherTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/XLocaleMatcherTest.java index 9fc94b1abb8..c84d8c0a2d0 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/XLocaleMatcherTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/XLocaleMatcherTest.java @@ -282,7 +282,7 @@ public class XLocaleMatcherTest extends TestFmwk { class MyTestFileHandler extends DataDrivenTestHelper { Output bestDesired = new Output(); - DistanceOption distanceOption = DistanceOption.NORMAL; + DistanceOption 
distanceOption = DistanceOption.REGION_FIRST; int threshold = -1; @Override @@ -305,8 +305,7 @@ public class XLocaleMatcherTest extends TestFmwk { if (breakpoint) { breakpoint = false; // put debugger breakpoint here to break at @debug in test file } - - XLocaleMatcher matcher = threshold < 0 && distanceOption == DistanceOption.NORMAL + XLocaleMatcher matcher = threshold < 0 && distanceOption == DistanceOption.REGION_FIRST ? newXLocaleMatcher(supportedList) : newXLocaleMatcher(supportedList, threshold, distanceOption); commentBase = "(" + lineNumber + ") " + commentBase; diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/data/localeMatcherTest.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/data/localeMatcherTest.txt index 0e3e3a582f7..55c0f3f5a96 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/data/localeMatcherTest.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/data/localeMatcherTest.txt @@ -334,8 +334,8 @@ und, no ; nn-BE-fonipa ; no ; no-BE-fonipa und, en-GB-u-sd-gbsct ; en-fonipa-u-nu-Arab-ca-buddhist-t-m0-iso-i0-pinyin ; en-GB-u-sd-gbsct ; en-GB-fonipa-u-nu-Arab-ca-buddhist-t-m0-iso-i0-pinyin en-PSCRACK, de-PSCRACK, fr-PSCRACK, pt-PT-PSCRACK ; fr-PSCRACK ; fr-PSCRACK -en-PSCRACK, de-PSCRACK, fr-PSCRACK, pt-PT-PSCRACK ; fr ; fr-PSCRACK -en-PSCRACK, de-PSCRACK, fr-PSCRACK, pt-PT-PSCRACK ; de-CH ; de-PSCRACK +en-PSCRACK, de-PSCRACK, fr-PSCRACK, pt-PT-PSCRACK ; fr ; en-PSCRACK # was: fr-PSCRACK +en-PSCRACK, de-PSCRACK, fr-PSCRACK, pt-PT-PSCRACK ; de-CH ; en-PSCRACK # was: de-PSCRACK ################################################## # testClusters @@ -384,4 +384,4 @@ und, en-GU, en-GB, en-IN ; en-VI ; en-GU ru, fr ; zh, pl ; fr ru, fr ; zh-Cyrl, pl ; ru #hr, en-Cyrl; sr ; en-Cyrl -da, ru, hr; sr ; ru \ No newline at end of file +da, ru, hr; sr ; ru