// //new UnicodeRegex().compileBnf(pat)
// );
//
- // TODO: fix this to check for format. Not required, since this is only called internally, but safer for the future.
+ // NOTE: Should we fix this to check for format?
+ // ANSWER: Not required, since this is only called internally. Moreover, we deliberately
+ // use invalid language tags ("x1", "x2", etc.) to represent pseudo-locales. See below.
static LSR from(String languageIdentifier) {
String[] parts = languageIdentifier.split("[-_]");
if (parts.length < 1 || parts.length > 3) {
String p3 = parts.length < 3 ? "" : parts[2];
return p2.length() < 4 ? new LSR(lang, "", p2) : new LSR(lang, p2, p3);
- // Matcher matcher = LANGUAGE_PATTERN.matcher(languageIdentifier);
- // if (!matcher.matches()) {
- // return new LSR(matcher.group(1), matcher.group(2), matcher.group(3));
- // }
- // System.out.println(RegexUtilities.showMismatch(matcher, languageIdentifier));
- // throw new ICUException("invalid language id");
+ // Matcher matcher = LANGUAGE_PATTERN.matcher(languageIdentifier);
+ // if (!matcher.matches()) {
+ // return new LSR(matcher.group(1), matcher.group(2), matcher.group(3));
+ // }
+ // System.out.println(RegexUtilities.showMismatch(matcher, languageIdentifier));
+ // throw new ICUException("invalid language id");
+ }
+
+ private static final HashMap<ULocale, LSR> pseudoReplacements = new HashMap<ULocale, LSR>(11);
+
+ // Note: code in XLocaleDistance.java handles pseudo-regions XA, XB, and XC, making them
+ // very distant from any other locale. Similarly, it establishes that any of the
+ // invalid locales below ("x1", "x2", ..., "x7", and "x8-en") are very distant
+ // from any other locale.
+ static {
+ String[][] source = {
+ {"x-bork", "x1", "", ""},
+ {"x-elmer", "x2", "", ""},
+ {"x-hacker", "x3", "", ""},
+ {"x-piglatin", "x4", "", ""},
+ {"x-pirate", "x5", "", ""},
+ {"en-XA", "x6", "", ""},
+ {"en-PSACCENT", "x6", "", ""}, // Note: same as for en-XA
+ {"ar-XB", "x7", "", ""},
+ {"ar-PSBIDI", "x7", "", ""}, // Note: same as for ar-XB
+ {"en-XC", "x8", "en", ""}, // Note: language is stored in LSR.script field
+ {"en-PSCRACK", "x8", "en", ""}, // Note: same as for en-XC
+ };
+ for (int i = 0; i < source.length; ++i) {
+ pseudoReplacements.put(new ULocale(source[i][0]),
+ new LSR(source[i][1], source[i][2], source[i][3]));
+ }
+
}
public static LSR from(ULocale locale) {
+ LSR replacement = pseudoReplacements.get(locale);
+ if (replacement != null) {
+ return replacement;
+ }
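+ // Map any locale whose variant is PSCRACK to language "x8", with its language, script, and country concatenated into the script field (same result as the en-PSCRACK entry).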
+ if ("PSCRACK".equals(locale.getVariant())) {
+ return new LSR(
+ "x8", locale.getLanguage() + locale.getScript() + locale.getCountry(), "");
+ }
return new LSR(locale.getLanguage(), locale.getScript(), locale.getCountry());
}
public static LSR fromMaximalized(ULocale locale) {
+ LSR replacement = pseudoReplacements.get(locale);
+ if (replacement != null) {
+ return replacement;
+ }
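+ // Map any locale whose variant is PSCRACK to language "x8", with its language, script, and country concatenated into the script field (same result as the en-PSCRACK entry).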
+ if ("PSCRACK".equals(locale.getVariant())) {
+ return new LSR(
+ "x8", locale.getLanguage() + locale.getScript() + locale.getCountry(), "");
+ }
return fromMaximalized(locale.getLanguage(), locale.getScript(), locale.getCountry());
}
public static final int ABOVE_THRESHOLD = 100;
+ // Activates debugging output to stderr with details of GetBestMatch.
+ // Be sure to set this to false before checking this in for production!
+ private static final boolean TRACE_DISTANCE = false;
+
@Deprecated
public static final String ANY = "�"; // matches any character. Uses value above any subtag.
@Override
public int getDistance(String desired, String supported, Output<DistanceTable> distanceTable, boolean starEquals) {
+ if (TRACE_DISTANCE) {
+ System.err.printf(" Entering getDistance: desired=%s supported=%s starEquals=%s\n",
+ desired, supported, Boolean.toString(starEquals));
+ }
boolean star = false;
Map<String, DistanceNode> sub2 = subtables.get(desired);
if (sub2 == null) {
if (distanceTable != null) {
distanceTable.value = ((StringDistanceNode) value).distanceTable;
}
- return starEquals && star && desired.equals(supported) ? 0 : value.distance;
+ int result = starEquals && star && desired.equals(supported) ? 0 : value.distance;
+ if (TRACE_DISTANCE) {
+ System.err.printf(" Returning from getDistance: %d\n", result);
+ }
+ return result;
}
public void copy(StringDistanceTable other) {
buffer.append('\t').append('#').append(id).append('\n');
} else {
((StringDistanceTable)distanceTable).toString(abbreviate, indent+"\t\t\t", intern, buffer);
+ buffer.append('\n');
}
} else {
buffer.append('\n');
* ULocales must be in canonical, addLikelySubtags format. Returns distance
*/
public int distanceRaw(LSR desired, LSR supported, int threshold, DistanceOption distanceOption) {
- return distanceRaw(desired.language, supported.language,
+ if (TRACE_DISTANCE) {
+ System.err.printf(" Entering distanceRaw: desired=%s supported=%s "
+ + "threshold=%d preferred=%s\n",
+ desired, supported, threshold,
+ distanceOption.name());
+ }
+ int result = distanceRaw(desired.language, supported.language,
desired.script, supported.script,
desired.region, supported.region,
threshold, distanceOption);
+ if (TRACE_DISTANCE) {
+ System.err.printf(" Returning from distanceRaw: %d\n", result);
+ }
+ return result;
}
- public enum DistanceOption {NORMAL, SCRIPT_FIRST}
+ public enum DistanceOption {REGION_FIRST, SCRIPT_FIRST}
+ // NOTE: Replaced "NORMAL" with "REGION_FIRST". By default, scripts have greater weight
+ // than regions, so they might be considered the "normal" case.
/**
* Returns distance, from 0 to ABOVE_THRESHOLD.
- * ULocales must be in canonical, addLikelySubtags format. Returns distance
+ * ULocales must be in canonical, addLikelySubtags format.
+ * (Exception: internal calls may pass any strings. They do this for pseudo-locales.)
+ * Returns distance.
*/
public int distanceRaw(
String desiredLang, String supportedLang,
}
}
}
+
+ // Pseudo regions should match no other regions.
+ // {"*-*-XA", "*-*-*", "0"},
+ // {"*-*-XB", "*-*-*", "0"},
+ // {"*-*-XC", "*-*-*", "0"},
+ // {"x1-*-*", "*-*-*", "0"},
+ // {"x2-*-*", "*-*-*", "0"},
+ // ...
+ // {"x8-*-*", "*-*-*", "0"},
+ List<String> supported = Arrays.asList("*", "*", "*");
+ for (String x : Arrays.asList("XA", "XB", "XC")) {
+ List<String> desired = Arrays.asList("*", "*", x);
+ add(defaultDistanceTable, desired, supported, 100);
+ add(defaultDistanceTable, supported, desired, 100);
+ }
+ // See XLikelySubtags.java for the mapping of pseudo-locales to x1 ... x8.
+ for (int i = 1; i <= 8; ++i) {
+ List<String> desired = Arrays.asList("x" + String.valueOf(i), "*", "*");
+ add(defaultDistanceTable, desired, supported, 100);
+ add(defaultDistanceTable, supported, desired, 100);
+ }
+
if (PRINT_OVERRIDES) {
System.out.println("\t\t</languageMatches>");
}
private static final LSR UND = new LSR("und","","");
private static final ULocale UND_LOCALE = new ULocale("und");
+ // Activates debugging output to stderr with details of GetBestMatch.
+ private static final boolean TRACE_MATCHER = false;
+
// normally the default values, but can be set via constructor
private final XLocaleDistance localeDistance;
return this;
}
public Builder setSupportedLocales(Set<ULocale> languagePriorityList) {
- this.supportedLanguagesList = languagePriorityList;
+ Set<ULocale> temp = new LinkedHashSet<ULocale>(); // maintain order
+ temp.addAll(languagePriorityList);
+ this.supportedLanguagesList = temp;
return this;
}
public XLocaleMatcher build() {
return new XLocaleMatcher(this);
}
+
+ @Override
+ public String toString() {
+ StringBuilder s = new StringBuilder().append("{XLocaleMatcher.Builder");
+ if (!supportedLanguagesList.isEmpty()) {
+ s.append(" supported={").append(supportedLanguagesList.toString()).append("}");
+ }
+ if (defaultLanguage != null) {
+ s.append(" default=").append(defaultLanguage.toString());
+ }
+ if (thresholdDistance >= 0) {
+ s.append(String.format(" thresholdDistance=%d", thresholdDistance));
+ }
+ s.append(" preference=").append(distanceOption.name());
+ return s.append("}").toString();
+ }
}
/**
private Multimap<LSR,ULocale> extractLsrMap(Set<ULocale> languagePriorityList, Set<LSR> priorities) {
Multimap<LSR, ULocale> builder = LinkedHashMultimap.create();
for (ULocale item : languagePriorityList) {
- final LSR max = item.equals(UND_LOCALE) ? UND : LSR.fromMaximalized(item);
+ final LSR max = item.equals(UND_LOCALE) ? UND :
+ LSR.fromMaximalized(item);
builder.put(max, item);
}
if (builder.size() > 1 && priorities != null) {
ULocale bestDesiredLocale = null;
Collection<ULocale> bestSupportedLocales = null;
int delta = 0;
- mainLoop:
- for (final Entry<LSR, ULocale> desiredLsrAndLocale : desiredLSRs.entries()) {
- // quick check for exact match
- ULocale desiredLocale = desiredLsrAndLocale.getValue();
- LSR desiredLSR = desiredLsrAndLocale.getKey();
- if (delta < bestDistance) {
- if (exactSupportedLocales.contains(desiredLocale)) {
- if (outputBestDesired != null) {
- outputBestDesired.value = desiredLocale;
- }
- return desiredLocale;
- }
- // quick check for maximized locale
- Collection<ULocale> found = supportedLanguages.get(desiredLSR);
- if (found != null) {
- // if we find one in the set, return first (lowest). We already know the exact one isn't there.
- if (outputBestDesired != null) {
- outputBestDesired.value = desiredLocale;
- }
- return found.iterator().next();
- }
+ mainLoop:
+ for (final Entry<LSR, Set<ULocale>> desiredLsrAndLocales : desiredLSRs.asMap().entrySet()) {
+ LSR desiredLSR = desiredLsrAndLocales.getKey();
+ for (ULocale desiredLocale : desiredLsrAndLocales.getValue()) {
+ // quick check for exact match
+ if (delta < bestDistance) {
+ if (exactSupportedLocales.contains(desiredLocale)) {
+ if (outputBestDesired != null) {
+ outputBestDesired.value = desiredLocale;
+ }
+ if (TRACE_MATCHER) {
+ System.err.printf(
+ "Returning %s, which is an exact match for a supported language\n",
+ desiredLocale);
+ }
+ return desiredLocale;
+ }
+ // quick check for maximized locale
+ Collection<ULocale> found = supportedLanguages.get(desiredLSR);
+ if (found != null) {
+ // if we find one in the set, return first (lowest). We already know the exact one isn't
+ // there.
+ if (outputBestDesired != null) {
+ outputBestDesired.value = desiredLocale;
}
- for (final Entry<LSR, Set<ULocale>> supportedLsrAndLocale : supportedLanguages.entrySet()) {
- int distance = delta + localeDistance.distanceRaw(desiredLSR, supportedLsrAndLocale.getKey(),
- thresholdDistance, distanceOption);
- if (distance < bestDistance) {
- bestDistance = distance;
- bestDesiredLocale = desiredLocale;
- bestSupportedLocales = supportedLsrAndLocale.getValue();
- if (distance == 0) {
- break mainLoop;
- }
- }
+ ULocale result = found.iterator().next();
+ if (TRACE_MATCHER) {
+ System.err.printf("Returning %s\n", result.toString());
}
- delta += demotionPerAdditionalDesiredLocale;
+ return result;
+ }
}
+ for (final Entry<LSR, Set<ULocale>> supportedLsrAndLocale : supportedLanguages.entrySet()) {
+ int distance =
+ delta
+ + localeDistance.distanceRaw(
+ desiredLSR,
+ supportedLsrAndLocale.getKey(),
+ thresholdDistance,
+ distanceOption);
+ if (distance < bestDistance) {
+ bestDistance = distance;
+ bestDesiredLocale = desiredLocale;
+ bestSupportedLocales = supportedLsrAndLocale.getValue();
+ if (distance == 0) {
+ break mainLoop;
+ }
+ }
+ }
+ delta += demotionPerAdditionalDesiredLocale;
+ }
+ }
if (bestDistance >= thresholdDistance) {
if (outputBestDesired != null) {
outputBestDesired.value = null;
}
+ if (TRACE_MATCHER) {
+ System.err.printf("Returning default %s\n", defaultLanguage.toString());
+ }
return defaultLanguage;
}
if (outputBestDesired != null) {
}
// pick exact match if there is one
if (bestSupportedLocales.contains(bestDesiredLocale)) {
+ if (TRACE_MATCHER) {
+ System.err.printf(
+ "Returning %s which matches a supported language\n", bestDesiredLocale.toString());
+ }
return bestDesiredLocale;
}
// otherwise return first supported, combining variants and extensions from bestDesired
- return bestSupportedLocales.iterator().next();
+ ULocale result = bestSupportedLocales.iterator().next();
+ if (TRACE_MATCHER) {
+ System.err.printf("Returning first supported language %s\n", result.toString());
+ }
+ return result;
}
/**
if (outputBestDesired != null) {
outputBestDesired.value = desiredLocale;
}
+ if (TRACE_MATCHER) {
+ System.err.printf("Exact match with a supported locale.\n");
+ }
return desiredLocale;
}
// quick check for maximized locale
- if (distanceOption == DistanceOption.NORMAL) {
+ if (distanceOption == DistanceOption.REGION_FIRST) {
Collection<ULocale> found = supportedLanguages.get(desiredLSR);
if (found != null) {
// if we find one in the set, return first (lowest). We already know the exact one isn't there.
if (outputBestDesired != null) {
outputBestDesired.value = desiredLocale;
}
- return found.iterator().next();
+ ULocale result = found.iterator().next();
+ if (TRACE_MATCHER) {
+ System.err.printf("Matches a maximized supported locale: %s\n", result);
+ }
+ return result;
}
}
for (final Entry<LSR, Set<ULocale>> supportedLsrAndLocale : supportedLanguages.entrySet()) {
if (outputBestDesired != null) {
outputBestDesired.value = null;
}
+ if (TRACE_MATCHER) {
+ System.err.printf(
+ "Returning default %s because everything exceeded the threshold of %d.\n",
+ defaultLanguage, thresholdDistance);
+ }
return defaultLanguage;
}
if (outputBestDesired != null) {
return bestDesiredLocale;
}
// otherwise return first supported, combining variants and extensions from bestDesired
- return bestSupportedLocales.iterator().next();
+ ULocale result = bestSupportedLocales.iterator().next();
+ if (TRACE_MATCHER) {
+ System.err.printf("First in the list of supported locales: %s\n", result);
+ }
+ return result;
}
/** Combine features of the desired locale into those of the supported, and return result. */
newLikelyTime += System.nanoTime()-temp;
temp = System.nanoTime();
- int dist1 = localeMatcher.distanceRaw(desiredLSR, supportedLSR, 1000, DistanceOption.NORMAL);
- int dist2 = localeMatcher.distanceRaw(supportedLSR, desiredLSR, 1000, DistanceOption.NORMAL);
+ int dist1 = localeMatcher.distanceRaw(desiredLSR, supportedLSR, 1000, DistanceOption.REGION_FIRST);
+ int dist2 = localeMatcher.distanceRaw(supportedLSR, desiredLSR, 1000, DistanceOption.REGION_FIRST);
newTimeMinusLikely += System.nanoTime()-temp;
}
}
class MyTestFileHandler extends DataDrivenTestHelper {
final XLocaleDistance distance = XLocaleDistance.getDefault();
Output<ULocale> bestDesired = new Output<ULocale>();
- private DistanceOption distanceOption = DistanceOption.NORMAL;
+ private DistanceOption distanceOption = DistanceOption.REGION_FIRST;
private Integer threshold = distance.getDefaultScriptDistance();
@Override
class MyTestFileHandler extends DataDrivenTestHelper {
Output<ULocale> bestDesired = new Output<ULocale>();
- DistanceOption distanceOption = DistanceOption.NORMAL;
+ DistanceOption distanceOption = DistanceOption.REGION_FIRST;
int threshold = -1;
@Override
if (breakpoint) {
breakpoint = false; // put debugger breakpoint here to break at @debug in test file
}
-
- XLocaleMatcher matcher = threshold < 0 && distanceOption == DistanceOption.NORMAL
+ XLocaleMatcher matcher = threshold < 0 && distanceOption == DistanceOption.REGION_FIRST
? newXLocaleMatcher(supportedList)
: newXLocaleMatcher(supportedList, threshold, distanceOption);
commentBase = "(" + lineNumber + ") " + commentBase;
und, en-GB-u-sd-gbsct ; en-fonipa-u-nu-Arab-ca-buddhist-t-m0-iso-i0-pinyin ; en-GB-u-sd-gbsct ; en-GB-fonipa-u-nu-Arab-ca-buddhist-t-m0-iso-i0-pinyin
en-PSCRACK, de-PSCRACK, fr-PSCRACK, pt-PT-PSCRACK ; fr-PSCRACK ; fr-PSCRACK
-en-PSCRACK, de-PSCRACK, fr-PSCRACK, pt-PT-PSCRACK ; fr ; fr-PSCRACK
-en-PSCRACK, de-PSCRACK, fr-PSCRACK, pt-PT-PSCRACK ; de-CH ; de-PSCRACK
+en-PSCRACK, de-PSCRACK, fr-PSCRACK, pt-PT-PSCRACK ; fr ; en-PSCRACK # was: fr-PSCRACK
+en-PSCRACK, de-PSCRACK, fr-PSCRACK, pt-PT-PSCRACK ; de-CH ; en-PSCRACK # was: de-PSCRACK
##################################################
# testClusters
ru, fr ; zh, pl ; fr
ru, fr ; zh-Cyrl, pl ; ru
#hr, en-Cyrl; sr ; en-Cyrl
-da, ru, hr; sr ; ru
\ No newline at end of file
+da, ru, hr; sr ; ru