<retain path="icudata.rc"/>
<retain path="icustd.txt"/>
<retain path="icuver.txt"/>
- <retain path="langInfo.txt"/>
<retain path="zoneinfo64.txt"/>
<!-- This file should be removed before the next release. -->
<retain path="miscfiles.mk"/>
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu;
+import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static java.nio.charset.StandardCharsets.UTF_8;
import static java.nio.file.StandardOpenOption.CREATE;
}
private void open(String label, PrintWriter out) {
- newLineAndIndent(out);
+ newLineAndIndent(out, FormatOptions.PATH_FORMAT);
depth++;
// This handles the "magic" pseudo indexing paths that are added by RegexTransformer.
// These take the form of "<any-string>" and are used to ensure that path order can be
private void close(PrintWriter out) {
depth--;
- newLineAndIndent(out);
+ newLineAndIndent(out, FormatOptions.PATH_FORMAT);
out.print('}');
}
- private void newLineAndIndent(PrintWriter out) {
+ private void newLineAndIndent(PrintWriter out, FormatOptions format) {
out.println();
- for (int i = 0; i < depth; i++) {
- out.print(INDENT);
+ if (format.shouldIndent) {
+ for (int i = 0; i < depth; i++) {
+ out.print(INDENT);
+ }
}
}
}
}
+ private static final class FormatOptions {
+ // Only the indent flag is used
+ final static FormatOptions PATH_FORMAT = new FormatOptions(true, true, true);
+
+ static FormatOptions forPath(RbPath rbPath) {
+ return new FormatOptions(
+ !rbPath.isIntPath() && !rbPath.isBinPath(),
+ !rbPath.endsWith(RB_SEQUENCE) && !rbPath.isBinPath(),
+ !rbPath.isBinPath());
+ }
+
+ final boolean shouldQuote;
+ final boolean shouldUseComma;
+ final boolean shouldIndent;
+
+ private FormatOptions(boolean shouldQuote, boolean shouldUseComma, boolean shouldIndent) {
+ this.shouldQuote = shouldQuote;
+ this.shouldUseComma = shouldUseComma;
+ this.shouldIndent = shouldIndent;
+ }
+ }
+
/** Inserts padding and values between braces. */
+ // TODO: Get rid of the need for icuDataName by adding type information to RbPath.
private boolean appendValues(
- String name, RbPath rbPath, List<RbValue> values, PrintWriter out) {
+ String icuDataName, RbPath rbPath, List<RbValue> values, PrintWriter out) {
RbValue onlyValue;
boolean wasSingular = false;
- boolean quote = !rbPath.isIntPath();
- boolean isSequence = rbPath.endsWith(RB_SEQUENCE);
- if (values.size() == 1 && !mustBeArray(true, name, rbPath)) {
+ FormatOptions format = FormatOptions.forPath(rbPath);
+ if (values.size() == 1 && !mustBeArray(true, icuDataName, rbPath)) {
onlyValue = values.get(0);
- if (onlyValue.isSingleton() && !mustBeArray(false, name, rbPath)) {
+ if (onlyValue.isSingleton() && !mustBeArray(false, icuDataName, rbPath)) {
// Value has a single element and is not being forced to be an array.
String onlyElement = Iterables.getOnlyElement(onlyValue.getElements());
- if (quote) {
+ if (format.shouldQuote) {
onlyElement = quoteInside(onlyElement);
}
// The numbers below are simply tuned to match the line wrapping in the original
int maxWidth = Math.max(68, 80 - Math.min(4, rbPath.length()) * INDENT.length());
if (onlyElement.length() <= maxWidth) {
// Single element for path: don't add newlines.
- printValue(out, onlyElement, quote);
+ printValue(out, onlyElement, format);
wasSingular = true;
} else {
// Element too long to fit in one line, so wrap.
for (int i = 0; i < onlyElement.length(); i = end) {
end = goodBreak(onlyElement, i + maxWidth);
String part = onlyElement.substring(i, end);
- newLineAndIndent(out);
- printValue(out, part, quote);
+ newLineAndIndent(out, format);
+ printValue(out, part, format);
}
}
} else {
// Only one array for the rbPath, so don't add an extra set of braces.
- printArray(onlyValue, quote, isSequence, out);
+ printElements(out, onlyValue, format);
}
} else {
for (RbValue value : values) {
if (value.isSingleton()) {
// Single-value array: print normally.
- printArray(value, quote, isSequence, out);
+ printElements(out, value, format);
} else {
// Enclose this array in braces to separate it from other values.
open("", out);
- printArray(value, quote, isSequence, out);
+ printElements(out, value, format);
close(out);
}
}
|| rbPath.startsWith(RB_METAZONE_INFO);
}
- private void printArray(RbValue rbValue, boolean quote, boolean isSequence, PrintWriter out) {
- for (String v : rbValue.getElements()) {
- newLineAndIndent(out);
- printValue(out, quoteInside(v), quote);
- if (!isSequence) {
- out.print(",");
+ private void printElements(PrintWriter out, RbValue rbValue, FormatOptions format) {
+ // TODO: If "shouldUseComma" is made obsolete, just use the "else" block always.
+ if (rbValue.getElementsPerLine() == 1) {
+ for (String v : rbValue.getElements()) {
+ newLineAndIndent(out, format);
+ printValue(out, quoteInside(v), format);
+ if (format.shouldUseComma) {
+ out.print(",");
+ }
+ }
+ } else {
+ checkArgument(format.shouldUseComma, "cannot group non-sequence values");
+ Iterable<List<String>> partitions =
+ Iterables.partition(rbValue.getElements(), rbValue.getElementsPerLine());
+ for (List<String> tuple : partitions) {
+ newLineAndIndent(out, format);
+ for (String v : tuple) {
+ printValue(out, quoteInside(v), format);
+ out.print(",");
+ }
}
}
}
- private static void printValue(PrintWriter out, String value, boolean quote) {
- if (quote) {
+ private static void printValue(PrintWriter out, String value, FormatOptions format) {
+ if (format.shouldQuote) {
out.append('"').append(value).append('"');
} else {
out.append(value);
import org.unicode.cldr.api.CldrPath;
import org.unicode.cldr.api.PathMatcher;
import org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir;
+import org.unicode.icu.tool.cldrtoicu.localedistance.LocaleDistanceMapper;
import org.unicode.icu.tool.cldrtoicu.mapper.Bcp47Mapper;
import org.unicode.icu.tool.cldrtoicu.mapper.BreakIteratorMapper;
import org.unicode.icu.tool.cldrtoicu.mapper.CollationMapper;
PLURAL_RANGES(SUPPLEMENTAL),
WINDOWS_ZONES(SUPPLEMENTAL),
TRANSFORMS(SUPPLEMENTAL),
+ LOCALE_DISTANCE(SUPPLEMENTAL),
KEY_TYPE_DATA(BCP47);
public static final ImmutableSet<OutputType> ALL = ImmutableSet.copyOf(OutputType.values());
write(PluralRangesMapper.process(src), "misc");
break;
+ case LOCALE_DISTANCE:
+ write(LocaleDistanceMapper.process(src), "misc");
+ break;
+
case WINDOWS_ZONES:
processSupplemental("windowsZones", WINDOWS_ZONES_PATHS, "misc", false);
break;
}
// TODO: Remove this and isAlias() in favour of having properly typed paths.
- boolean isIntPath() {
- String lastElement = segments.get(segments.size() - 1);
- return lastElement.endsWith(":int") || lastElement.endsWith(":intvector");
+ public boolean isIntPath() {
+ return typeSuffixIsAnyOf(":int", ":intvector");
+ }
+
+ public boolean isBinPath() {
+ return typeSuffixIsAnyOf(":bin");
}
public boolean isAlias() {
- return getSegment(length() - 1).endsWith(":alias");
+ return typeSuffixIsAnyOf(":alias");
+ }
+
+ private boolean typeSuffixIsAnyOf(String... types) {
+ String lastElement = getSegment(length() - 1);
+ for (String type : types) {
+ if (lastElement.endsWith(type)) {
+ return true;
+ }
+ }
+ return false;
}
@Override public int compareTo(RbPath other) {
package org.unicode.icu.tool.cldrtoicu;
import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.collect.ImmutableList.toImmutableList;
-import java.util.Arrays;
import java.util.Objects;
+import java.util.stream.Stream;
import com.google.common.collect.ImmutableList;
*/
public final class RbValue {
private final ImmutableList<String> elements;
+ private final int elementsPerLine;
/** Returns a resource bundle value of the given elements. */
public static RbValue of(String... elements) {
- return of(Arrays.asList(elements));
+ return new RbValue(ImmutableList.copyOf(elements), 1);
}
/** Returns a resource bundle value of the given elements. */
public static RbValue of(Iterable<String> elements) {
- return new RbValue(elements);
+ return new RbValue(ImmutableList.copyOf(elements), 1);
}
- private RbValue(Iterable<String> elements) {
- this.elements = ImmutableList.copyOf(elements);
- checkArgument(!this.elements.isEmpty(), "Resource bundle values cannot be empty");
+ /** Returns a resource bundle value of the given elements by consuming the given stream. */
+ public static RbValue of(Stream<String> elements) {
+ return new RbValue(elements.collect(toImmutableList()), 1);
+ }
+
+ private RbValue(ImmutableList<String> elements, int elementsPerLine) {
+ checkArgument(!elements.isEmpty(), "Resource bundle values cannot be empty");
+ checkArgument(elementsPerLine > 0, "invalid elements per line: %s", elementsPerLine);
+ this.elements = elements;
+ this.elementsPerLine = elementsPerLine;
+ }
+
+ public RbValue elementsPerLine(int n) {
+ return new RbValue(elements, n);
}
/** Returns the non-empty list of value elements. */
* Returns whether this is a single element value. Singleton values are treated different when
* writing out ICU data files.
*/
- public boolean isSingleton() {
+ boolean isSingleton() {
return elements.size() == 1;
}
+ int getElementsPerLine() {
+ return elementsPerLine;
+ }
+
@Override public int hashCode() {
return Objects.hashCode(elements);
}
--- /dev/null
+// © 2017 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package org.unicode.icu.tool.cldrtoicu.localedistance;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkState;
+import static com.ibm.icu.impl.locale.LocaleDistance.DISTANCE_SKIP_SCRIPT;
+import static com.ibm.icu.impl.locale.LocaleDistance.IX_DEF_LANG_DISTANCE;
+import static com.ibm.icu.impl.locale.LocaleDistance.IX_DEF_REGION_DISTANCE;
+import static com.ibm.icu.impl.locale.LocaleDistance.IX_DEF_SCRIPT_DISTANCE;
+import static com.ibm.icu.impl.locale.LocaleDistance.IX_LIMIT;
+import static com.ibm.icu.impl.locale.LocaleDistance.IX_MIN_REGION_DISTANCE;
+import static java.util.Arrays.asList;
+
+import java.util.Arrays;
+import java.util.Map;
+import java.util.logging.Logger;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Table;
+import com.google.common.collect.TreeBasedTable;
+
+/**
+ * Represents the conceptual distance between pairs of language specifications.
+ *
+ * <p>Mappings for {@code (desired, supported)} pairs are added at one of three
+ * levels in the table; language, script and region. Distances can be resolved at
+ * any level in the table (e.g. {@code ("en","fr")}, {@code ("en_Latn","ru_Cyrl")}
+ * or {@code ("en_Latn_GB", "en_Latn_AU")}).
+ *
+ * <p>However in reality the "regions" in the table are actually "partition IDs"
+ * representing groups of regions with the same language characteristics. For more
+ * information on partitions and how they are generated, see {@link PartitionInfo}.
+ *
+ * <p>This is mentioned here because anyone debugging this code might be surprised
+ * to see values like {@code "5"} for a "region" in the code. Using the term
+ * "region" matches the conceptual level of the data and is more familiar to most
+ * people, whereas "partition ID" would probably be jarring.
+ *
+ * <p>The builder class is not reusable, and once a table is built, the builder is
+ * invalid. Furthermore, since the table data itself is mutable, care must be taken
+ * to avoid modifying either the Trie or the returned distance array.
+ *
+ * <p>Note that internally the {@code '*'} character used as a wildcard for subtags
+ * is replaced by the {@code '�'} character (a.k.a ANY), whenever a subtag is
+ * passed into the API. This is because the underlying Trie structure generated by
+ * the distance table reserves {@code '*'} for a different purpose. This difference
+ * is encapsulated within this class and the {@link Trie} class only.
+ */
+final class DistanceTable {
+ private static final Logger logger = Logger.getLogger(DistanceTable.class.getName());
+
+ // Represents a wildcard match in the data table (the equivalent of '*' in
+ // <languageMatch> locale subtag). Any incoming subtags are normalized to
+ // convert '*' to this character by the builder.
+ private static final String ANY = "�";
+
+ // Distances must be in the range [0-127] because bit 7 of the distance value
+ // is used for a special flag (DISTANCE_SKIP_SCRIPT). Setting the explicit max
+ // to 100 is just a more human readable maximum that satisfies that constraint.
+ private static final int MAX_REGION_DISTANCE = 100;
+
+ static final class Builder {
+ private final Node rootNode = new Node(-1);
+ private int minRegionDistance = MAX_REGION_DISTANCE;
+
+ private Builder() {}
+
+ /**
+ * Adds a distance to the table between the specified and desired tuples.
+ * This method takes 1, 2 or 3 sequential {@code (desired, supported)} pairs
+ * of values corresponding to language subtags, script subtags and regions
+ * (partition IDs). All values can be the wildcard '*'.
+ */
+ public void addDistance(int distance, boolean oneway, String... args) {
+ MappingKey key = MappingKey.fromSubtags(args, distance);
+ logger.fine(key::toString);
+ // Minimum region distance needs to be tracked specially.
+ if (key.getDepth() == 3 && distance < minRegionDistance) {
+ minRegionDistance = distance;
+ }
+ addMapping(key);
+ if (!oneway && !key.isSymmetrical()) {
+ addMapping(key.reverse());
+ }
+ }
+
+ private void addMapping(MappingKey key) {
+ rootNode.addExplicitMapping(key);
+ if (key.hasWildcardMappings()) {
+ rootNode.addWildcardMappings(key);
+ }
+ }
+
+ /** Returns the final minimized distance table information. */
+ public DistanceTable build() {
+ Node defLangNode = rootNode.getAnyNode();
+ checkState(defLangNode != null, "missing default language mapping: %s", rootNode);
+ Node defScriptNode = defLangNode.getAnyNode();
+ checkState(defScriptNode != null, "missing default script mapping: %s", rootNode);
+ Node defRegionNode = defScriptNode.getAnyNode();
+ checkState(defRegionNode != null, "missing default region mapping: %s", rootNode);
+
+ // Because we prune the data table, it's important to store the default
+ // distance values separately.
+ int[] distances = new int[IX_LIMIT];
+ distances[IX_DEF_LANG_DISTANCE] = defLangNode.distance;
+ distances[IX_DEF_SCRIPT_DISTANCE] = defScriptNode.distance;
+ distances[IX_DEF_REGION_DISTANCE] = defRegionNode.distance;
+ distances[IX_MIN_REGION_DISTANCE] = minRegionDistance;
+
+ // Having determined the distances, prune the Trie to remove any sub-tables
+ // where distances could only be determined to be the default value (i.e.
+ // where the existence of that sub-table has no effect).
+ pruneDefaultDistances(defScriptNode.distance, defRegionNode.distance);
+ return new DistanceTable(rootNode, distances);
+ }
+
+ @Override
+ public String toString() {
+ return String.format("minimum region distance: %d\n%s\n", minRegionDistance, rootNode);
+ }
+
+ /**
+  * Removes sub-tables whose only effect would be to yield the default distance.
+  *
+  * <p>Script nodes holding a single region mapping with the default region distance
+  * have that mapping cleared. Language nodes holding a single script mapping with
+  * the default script distance are either cleared (no region data below) or flagged
+  * with {@code DISTANCE_SKIP_SCRIPT} (region data must be retained). Finally the
+  * top-level {@code <ANY,ANY>} row is removed from the root table.
+  *
+  * @param defScriptDistance the default script distance.
+  * @param defRegionDistance the default region distance.
+  * @throws IllegalStateException if an expected default node is missing, or if the
+  *     top-level default table still contains data after pruning.
+  */
+ private void pruneDefaultDistances(int defScriptDistance, int defRegionDistance) {
+     logger.fine("==== pruning subtables ====");
+     rootNode.subtables.values().forEach(langNode -> {
+         langNode.subtables.values().forEach(scriptNode -> {
+             if (scriptNode.subtables.size() == 1) {
+                 // If a script node *only* contains region data with the default
+                 // region distance, that region data can be removed. Since region
+                 // is the lowest level, there's no need to worry about "skipping"
+                 // anything during lookup (unlike the case below).
+                 Node defRegionNode = scriptNode.getAnyNode();
+                 checkState(defRegionNode != null,
+                     "missing default region node for script: %s", scriptNode);
+                 if (defRegionNode.distance == defRegionDistance) {
+                     scriptNode.subtables.clear();
+                 }
+             }
+         });
+         // Do the pruning in the "upwards" phase of visitation (after recursion) so
+         // if script subtables are pruned, it's visible here.
+         if (langNode.subtables.size() == 1) {
+             // If a language node *only* contains script data with the default
+             // script distance, we can't just remove it (because it might contain
+             // region data).
+             Node defScriptNode = langNode.getAnyNode();
+             // Validate before dereferencing, so a missing node is reported with a
+             // descriptive message rather than a NullPointerException.
+             checkState(defScriptNode != null,
+                 "missing default script node for language: %s", langNode);
+             if (defScriptNode.distance == defScriptDistance) {
+                 if (defScriptNode.subtables.isEmpty()) {
+                     // If the default script node has no region data, remove it.
+                     langNode.subtables.clear();
+                 } else {
+                     // Otherwise mark script data as "skippable", which indicates
+                     // it should be written in a compact form in the Trie (while
+                     // retaining any region data as normal).
+                     langNode.distance |= DISTANCE_SKIP_SCRIPT;
+                 }
+             }
+         }
+     });
+     // After pruning we don't expect any data in the top-level default table.
+     checkState(rootNode.getAnyNode().subtables.isEmpty(),
+         "invalid table state: %s", rootNode.getAnyNode());
+     rootNode.subtables.rowMap().remove(ANY);
+ }
+ }
+
+ public static Builder builder() {
+ return new Builder();
+ }
+
+ private final Node rootNode;
+ private final int[] distances;
+
+ private DistanceTable(Node rootNode, int[] distances) {
+ this.rootNode = rootNode;
+ this.distances = distances;
+ }
+
+ public Trie getTrie() {
+ Trie trie = new Trie();
+ rootNode.writeTo(trie.root());
+ return trie;
+ }
+
+ public int[] getDefaultDistances() {
+ return distances;
+ }
+
+ @Override
+ public String toString() {
+ return String.format("default distances: %s\n%s\n", Arrays.toString(distances), rootNode);
+ }
+
+ private static final class Node {
+ private final Table<String, String, Node> subtables = TreeBasedTable.create();
+ // Distance for the lookup so far (-1 for top level nodes).
+ private int distance;
+
+ Node(int distance) {
+ checkArgument(distance >= -1, "invalid distance: %s", distance);
+ this.distance = distance;
+ }
+
+ /** Returns the subtable node for the top-level mapping of a key. */
+ private Node getNode(MappingKey key) {
+ return subtables.get(key.getDesired(), key.getSupported());
+ }
+
+ /** Returns the subtable node for the {@code <ANY,ANY>} mapping. */
+ Node getAnyNode() {
+ return subtables.get(ANY, ANY);
+ }
+
+ void addExplicitMapping(MappingKey key) {
+ if (key.isLeaf()) {
+ if (!putIfAbsent(key)) {
+ logger.fine(() -> String.format("Ignore existing mapping: %s", key));
+ }
+ } else {
+ getIntermediateNode(key).addExplicitMapping(key.getSuffix());
+ }
+ }
+
+ void addWildcardMappings(MappingKey key) {
+ if (key.isLeaf()) {
+ putIfAbsent(key);
+ } else if (key.isWildcard()) {
+ // An intermediate wildcard mapping is applied to all existing sub-nodes.
+ // NOTE: This will need to change if we want to support "mixed" wildcard mappings.
+ for (Node node : subtables.values()) {
+ node.addWildcardMappings(key.getSuffix());
+ }
+ } else {
+ // An explicit intermediate mapping only affects an existing exact match.
+ Node node = getNode(key);
+ if (node != null) {
+ node.addWildcardMappings(key.getSuffix());
+ }
+ }
+ }
+
+ /**
+ * Adds a new mapping to this node with the specified distance if it didn't already
+ * exist.
+ *
+ * <p>Note: If a mapping already exists, then this method has no effect (even if the
+ * existing distance differs from the given distance). This is necessary for two
+ * reasons:
+ * <ol>
+ * <li>An earlier match rule may have set an explicit value for the mapping,
+ * and we subsequently try to set a default value (via a wildcard mapping).
+ * This should be ignored, since we want the non-default value to win.
+ * This means it's important to always have explicit {@code <languageMatch>}
+ * rules before any related wildcard rules in the CLDR data.
+ *
+ * <li>A preferential {@code <languageMatch>} rule appears earlier in CLDR data.
+ * This occurs because of the way partitions are defined and allows for two
+ * distinct {@code <languageMatch>} rules to generate the same mapping (with
+ * different distances). This is because region variables reference sets of
+ * partition IDs and these are not always disjoint (e.g. "en_*_$!enUS" and
+ * "en_*_GB" both contain the partition ID for "GB").
+ * </ol>
+ *
+ * @return true if a new mapping was added, or if the distances were equal (i.e.
+ * the operation was idempotent).
+ */
+ private boolean putIfAbsent(MappingKey key) {
+ Node node = getNode(key);
+ if (node == null) {
+ logger.fine(() -> String.format("add: %s", key));
+ subtables.put(key.getDesired(), key.getSupported(), new Node(key.getDistance()));
+ return true;
+ }
+ return (key.getDistance() == node.distance);
+ }
+
+ /**
+ * Returns a sub-node corresponding to the given {@code (desired, supported)} mapping.
+ * If the node already exists, it is simply returned, otherwise a new node is created
+ * and any existing wildcard mappings are copied into it.
+ */
+ private Node getIntermediateNode(MappingKey key) {
+ Node node = getNode(key);
+ if (node == null) {
+ // This is expected to succeed because match rules are given in length
+ // order (i.e. language only before language+script etc.) and we always
+ // expect each group to end with an <ANY,ANY> mapping for the default
+ // distance. Thus, for any longer match rule, we should find (at least)
+ // the <ANY,ANY> node when looking for intermediate nodes.
+ //
+ // NOTE: Currently (desired==ANY) if-and-only-if (supported=ANY), so the
+ // only non-exact match we can get here is the <ANY,ANY> node. If we ever
+ // allow a mix of wildcard/non-wildcard keys, replace the getAnyNode() call
+ // with something like the line below:
+ // ----
+ // Node wildcardMatch = Iterables.find(
+ // asList(getNode(desired, ANY), getNode(ANY, supported), getNode(ANY,ANY)),
+ // Objects::nonNull);
+ // ----
+ Node wildcardMatch = getAnyNode();
+ checkState(wildcardMatch != null, "missing <ANY,ANY> mapping: %s", this);
+ // Default distances are the distance between any two *different* unknown
+ // subtags (so if the subtags are the same, the distance is zero).
+ int distance = key.getDesired().equals(key.getSupported()) ? 0 : wildcardMatch.distance;
+ node = new Node(distance);
+ node.copySubtablesFrom(wildcardMatch);
+ subtables.put(key.getDesired(), key.getSupported(), node);
+ }
+ return node;
+ }
+
+ /** Copies all subtable mappings from the given node into this one. */
+ private void copySubtablesFrom(Node src) {
+ checkState(subtables.isEmpty());
+ src.subtables.cellSet().forEach(
+ c -> subtables.put(c.getRowKey(), c.getColumnKey(), new Node(c.getValue().distance)));
+ }
+
+ /**
+ * Writes all the mappings in the distance table sequentially to given Trie in sorted
+ * table order.
+ *
+ * <p>Mappings are written in a top-down recursive visitation with sub-tables inheriting
+ * the current prefix from parent tables via the given Trie span. At each level any
+ * mapped distances are written before recursing into the sub-tables.
+ */
+ private void writeTo(Trie.Span trieSpan) {
+ if (distance >= 0 && (distance & DISTANCE_SKIP_SCRIPT) != 0) {
+ // If a node has a distance set and has been explicitly marked as "skippable",
+ // then write the "default" subtable using the current Trie prefix (effectively
+ // having an "empty" prefix for this case).
+ getAnyNode().writeTo(trieSpan);
+ } else {
+ // In the normal case, just write the mappings explicitly.
+ subtables.rowMap().forEach(
+ (desired, supportedNodes) -> writeSupported(trieSpan, desired, supportedNodes));
+ }
+ }
+
+ private void writeSupported(Trie.Span trieSpan, String desired, Map<String, Node> supportedNodes) {
+ // Collapse any (desired=ANY, supported=ANY) mappings into a single '*' in the trie.
+ if (desired.equals(ANY)) {
+ // If desired is ANY, the only supported subtag must also be ANY.
+ Node node = supportedNodes.get(ANY);
+ checkState(node != null && supportedNodes.size() == 1,
+ "invalid supported subtags for desired='ANY': %s", supportedNodes);
+ // Remember that ANY != "*", even though it corresponds to "*" in the original
+ // language match rules. Putting "*" in a Trie means something different (but
+ // similar enough to be a bit confusing).
+ trieSpan.with("*", node::writeDistancePlusSubtables);
+ } else {
+ // In the general case, just write the <desired,supported> distance mapping.
+ trieSpan.with(desired, withDesiredSpan ->
+ supportedNodes.forEach((supported, node) -> {
+ checkState(!supported.equals(ANY),
+ "unexpected supported='ANY' subtag: %s", supported);
+ withDesiredSpan.with(supported, node::writeDistancePlusSubtables);
+ })
+ );
+ }
+ }
+
+ // Writes the distance of this node to the given trie, then recursively writes any
+ // subtable information.
+ private void writeDistancePlusSubtables(Trie.Span trieSpan) {
+ trieSpan.putPrefixAndValue(distance);
+ writeTo(trieSpan);
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder buffer = new StringBuilder("distance: ").append(distance).append('\n');
+ return appendToString("", buffer).toString();
+ }
+
+ private StringBuilder appendToString(String indent, StringBuilder buffer) {
+ // Top level values are not padded with tabs.
+ String rowIndent = indent.isEmpty() ? "" : "\t";
+ for (Map.Entry<String, Map<String, Node>> row : subtables.rowMap().entrySet()) {
+ buffer.append(rowIndent).append(row.getKey());
+ // First column extends the current row, so single tab indent.
+ String colIndent = "\t";
+ for (Map.Entry<String, Node> col : row.getValue().entrySet()) {
+ buffer.append(colIndent).append(col.getKey());
+ Node subnode = col.getValue();
+ buffer.append('\t').append(subnode.distance);
+ // Append any sub-nodes (starting on the same line).
+ subnode.appendToString(indent + "\t\t\t", buffer).append('\n');
+ // Later columns need full indent (including skipping row key).
+ colIndent = indent + '\t';
+ }
+ // Later rows need full indent.
+ rowIndent = indent;
+ }
+ return buffer;
+ }
+ }
+
+ /**
+ * Encapsulates a sequence of {@code <desired,supported>} pairwise mappings over
+ * language, script and region, with an associated distance. This is an alternate
+ * way to represent a mapping of desired and supported language match rules.
+ *
+ * <p>For example:
+ * <pre>{@code
+ * <languageMatch desired="en_*_$!enUS", supported="en_*_$GB", distance="3"/>
+ * }</pre>
+ * results in a set of keys of the form:
+ * <pre>{@code
+ * <en,en> -> <ANY,ANY> -> <X,Y> = 3
+ * }</pre>
+ * where the "region" part {@code <X,Y>} is constructed from all the possible
+ * combinations of partition IDs associated with the original region variables.
+ *
+ * <p>Mapping keys have several useful properties:
+ * <ul>
+ * <li>They can be reversed (e.g. {@code <A,B> -> <C,D> = N} becomes
+ * {@code <B,A> -> <D,C> = N}).
+ * <li>They can be symmetrical (e.g. {@code <X,X> -> <Y,Y> = N}), in which
+ * case the reversed key is the same as the original.
+ * <li>They can have wildcard mappings (i.e. {@code <ANY,ANY>}).
+ * <li>They can produce "suffix" keys (e.g. the suffix of
+ * {@code <A,B> -> <C,D> = N} is {@code <C,D> = N}).
+ * </ul>
+ */
+ private static final class MappingKey {
+ /**
+ * Returns a new key from the specified subtag pairs, converting {@code '*'}
+ * subtags to the special {@code ANY} string and performing consistency checks.
+ *
+ * @param subtagPairs a sequence of {@code <desired,supported>} pairs.
+ * @param distance the distance associated with the subtag mapping.
+ */
+ static MappingKey fromSubtags(String[] subtagPairs, int distance) {
+ int pairCount = subtagPairs.length;
+ checkArgument(pairCount == 2 || pairCount == 4 || pairCount == 6,
+ "invalid number of arguments (expected 1, 2 or 3 pairs): %s", asList(subtagPairs));
+ ImmutableList.Builder<String> keyPairs = ImmutableList.builder();
+ for (String subtag : subtagPairs) {
+ keyPairs.add(fixAny(subtag));
+ }
+ return new MappingKey(keyPairs.build(), distance, false);
+ }
+
+ // Converts a '*' (from a subtag) into the wildcard match character used by the Trie.
+ // The Trie uses '*' to mean something else, so we convert it at the boundary.
+ private static String fixAny(String subtag) {
+ return subtag.equals("*") ? ANY : subtag;
+ }
+
+ private final ImmutableList<String> pairs;
+ private final int distance;
+ private final boolean isReversed;
+ private final boolean isSymmetrical;
+ private final boolean hasWildcardMappings;
+
+ private MappingKey(ImmutableList<String> pairs, int distance, boolean isReversed) {
+ this.pairs = pairs;
+ this.distance = distance;
+ this.isReversed = isReversed;
+ checkArgument(distance >= 0 && distance <= MAX_REGION_DISTANCE,
+ "invalid mapping key distance: %s", distance);
+ // Check that if a key has "ANY" mappings, it is consistent. We expect to only
+ // get <ANY,ANY> pairs (e.g. not <X,ANY> or <ANY,X>).
+ boolean isSymmetrical = true;
+ boolean hasWildcardMappings = false;
+ for (int i = 0; i < pairs.size(); i += 2) {
+ String desired = pairs.get(i);
+ String supported = pairs.get(i + 1);
+ checkArgument(desired.equals(ANY) == supported.equals(ANY),
+ "invalid mapping key pairs: %s", pairs);
+ hasWildcardMappings |= desired.equals(ANY);
+ isSymmetrical &= desired.equals(supported);
+ }
+ this.isSymmetrical = isSymmetrical;
+ this.hasWildcardMappings = hasWildcardMappings;
+ }
+
+ /** Returns the "desired" value of the current (top-level) mapping. */
+ String getDesired() {
+ return pairs.get(isReversed ? 1 : 0);
+ }
+
+ /** Returns the "supported" value of the current (top-level) mapping. */
+ String getSupported() {
+ return pairs.get(isReversed ? 0 : 1);
+ }
+
+ /** Returns the non-negative distance mapped to by this key. */
+ int getDistance() {
+ return distance;
+ }
+
+ /**
+ * Returns the number of {@code <desired,supported>} mappings in this key; this is
+ * either 1 (language-only), 2 (language & script) or 3 (language, script & region).
+ */
+ int getDepth() {
+ return pairs.size() / 2;
+ }
+
+ /** Returns true if this key does not have a suffix. */
+ boolean isLeaf() {
+ return getDepth() == 1;
+ }
+
+ /**
+ * Returns if any of the {@code <desired,supported>} mappings are {@code <ANY,ANY>}.
+ */
+ boolean hasWildcardMappings() {
+ return hasWildcardMappings;
+ }
+
+ /**
+ * Returns if the top-level {@code <desired,supported>} mapping is {@code <ANY,ANY>}.
+ */
+ boolean isWildcard() {
+ return getDesired().equals(ANY);
+ }
+
+ /**
+ * Returns if this key is pair-wise symmetrical (e.g. {@code "<X,X> -> <Y,Y> = N"}).
+ * Symmetrical mappings don't need to be added in reverse.
+ */
+ boolean isSymmetrical() {
+ return isSymmetrical;
+ }
+
+ /**
+  * Returns a new key where each {@code <desired,supported>} mapping is reversed.
+  *
+  * <p>The returned key shares this key's pair list and distance; only the
+  * "reversed" flag differs, which swaps what {@link #getDesired()} and
+  * {@link #getSupported()} return.
+  *
+  * @throws IllegalStateException if this key is already a reversed key.
+  */
+ MappingKey reverse() {
+     checkState(!isReversed, "cannot reverse a reversed key");
+     return new MappingKey(pairs, distance, true);
+ }
+
+ /**
+ * Returns the suffix of this non-leaf key with the top-level mapping removed. For
+ * example, the suffix of {@code "<A,B> -> <C,D> = N"} is {@code "<C,D> = N"}).
+ */
+ MappingKey getSuffix() {
+ checkState(!isLeaf(), "cannot get 'next' for an empty key");
+ return new MappingKey(pairs.subList(2, pairs.size()), distance, isReversed);
+ }
+
+ @Override
+ public String toString() {
+ return isLeaf()
+ ? String.format("<%s, %s> = %d", getDesired(), getSupported(), getDistance())
+ : String.format("<%s, %s> -> %s", getDesired(), getSupported(), getSuffix());
+ }
+ }
+}
--- /dev/null
+// © 2020 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package org.unicode.icu.tool.cldrtoicu.localedistance;
+
+import static com.google.common.base.Preconditions.checkNotNull;
+
+import java.util.Collections;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.util.Set;
+import java.util.function.Function;
+
+/**
+ * Returns a canonicalized value for each unique value encountered, the memoized value is
+ * created using the zero-based index of the value and the given transformation function.
+ */
+final class Indexer<T, R> implements Function<T, R> {
+ /** Returns a plain indexer which returns the index directly. */
+ public static <T> Indexer<T, Integer> create() {
+ return create(Function.identity());
+ }
+
+ /** Returns an indexer which transforms the returned index by the given function. */
+ public static <T, R> Indexer<T, R> create(Function<Integer, R> convertIndexFn) {
+ return new Indexer<>(convertIndexFn);
+ }
+
+ private final Map<T, Integer> indexMap = new LinkedHashMap<>();
+ private final Function<Integer, R> convertIndexFn;
+
+ private Indexer(Function<Integer, R> convertIndexFn) {
+ this.convertIndexFn = checkNotNull(convertIndexFn);
+ }
+
+ /** Memoizes the given value and returns the derived value. */
+ @Override
+ public R apply(T value) {
+ indexMap.putIfAbsent(checkNotNull(value), indexMap.size());
+ return convertIndexFn.apply(indexMap.get(value));
+ }
+
+ /** Returns a set of the indexed values, in the order they were first encountered. */
+ public Set<T> getValues() {
+ return Collections.unmodifiableSet(indexMap.keySet());
+ }
+}
--- /dev/null
+// © 2017 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package org.unicode.icu.tool.cldrtoicu.localedistance;
+
+import com.google.common.base.CharMatcher;
+import com.google.common.base.Splitter;
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.ImmutableSetMultimap;
+import com.google.common.collect.ImmutableSortedMap;
+import com.ibm.icu.impl.locale.LSR;
+import com.ibm.icu.impl.locale.XLikelySubtags;
+import org.unicode.cldr.api.AttributeKey;
+import org.unicode.cldr.api.CldrData;
+import org.unicode.cldr.api.CldrPath;
+import org.unicode.cldr.api.PathMatcher;
+
+import java.util.Comparator;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.logging.Logger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import static com.google.common.base.Preconditions.*;
+import static com.google.common.base.Strings.nullToEmpty;
+import static org.unicode.cldr.api.CldrData.PathOrder.DTD;
+
+/**
+ * Generates likely subtag information from CLDR supplemental data.
+ *
+ * <p>Likely subtag information and language aliases are combined to produce a
+ * Trie table of lookup data to canonicalize any incoming language ID to its
+ * most likely fully qualified form.
+ */
+final class LikelySubtagsBuilder {
+ private static final Logger logger = Logger.getLogger(LikelySubtagsBuilder.class.getName());
+
+ private static final PathMatcher ALIAS =
+ PathMatcher.of("//supplementalData/metadata/alias/*[@type=*]");
+
+ private static final PathMatcher LIKELY_SUBTAG =
+ PathMatcher.of("//supplementalData/likelySubtags/likelySubtag[@from=*]");
+ private static final AttributeKey SUBTAG_FROM = AttributeKey.keyOf("likelySubtag", "from");
+ private static final AttributeKey SUBTAG_TO = AttributeKey.keyOf("likelySubtag", "to");
+
+ // NOTE: You must omit empty strings, since otherwise " foo " becomes ("", "foo", "").
+ private static final Splitter LIST_SPLITTER =
+ Splitter.on(' ').trimResults().omitEmptyStrings();
+
+ // A language identifier is "xx", "xx_Yyyy", "xx_ZZ" or "xx_Yyyy_ZZ", where the language
+ // may have 2 or 3 letters and the region is either 2 letters or 3 digits.
+ private static final Pattern LOCALE_ID =
+ Pattern.compile("([a-z]{2,3})(?:_([A-Z][a-z]{3}))?(?:_([A-Z]{2}|[0-9]{3}))?");
+
+ // While likely subtags are only separated by '_', language aliases can use '-' for
+ // legacy values. E.g.:
+ // <languageAlias type="zh-min" replacement="nan-x-zh-min" reason="legacy"/>
+ // Territory aliases never have a separator, so are always "simple".
+ private static final CharMatcher ALIAS_SEPARATOR = CharMatcher.anyOf("-_");
+
+ // This is a bit of a hack to let this newer implementation behave exactly like the original
+ // ICU4J version of the code. In particular, this version of the code normalizes the keys of
+ // the LSR table to "*" earlier than before (previously the "special" keys were "und" for
+ // the top-level language subtags and "" for script or region). By normalizing earlier,
+ // there's no longer any reason to have special case code in the Trie logic, but if we just
+ // do that, the table keys are now sorted differently.
+ //
+ // Normally sort order wouldn't matter, when writing the Trie, but in order to demonstrate
+ // that this code produces the same binary output as before, the old ordering is replicated.
+ //
+ // TODO: When the dust settles, consider moving this to a star-first or star-last ordering??
+ // Natural string ordering, except that "*" is compared as if it were the given stand-in.
+ private static Comparator<String> sortingStarLike(String t) {
+     return (lhs, rhs) -> {
+         String lhsKey = lhs.equals("*") ? t : lhs;
+         String rhsKey = rhs.equals("*") ? t : rhs;
+         return lhsKey.compareTo(rhsKey);
+     };
+ }
+
+ private static final Comparator<String> LSR_TABLE_ORDER = sortingStarLike("und");
+ private static final Comparator<String> SUBTABLE_ORDER = sortingStarLike("");
+
+ /**
+ * Possible alias types. Each constant carries the name of its alias element together
+ * with the attribute keys for that element's "type", "reason" and "replacement"
+ * attributes.
+ */
+ private enum AliasType {
+ LANGUAGE("languageAlias"),
+ TERRITORY("territoryAlias");
+
+ // Name of the alias element (compared against the CLDR path name when reading aliases).
+ private final String elementName;
+ // Key of the "type" attribute (read as the alias source value).
+ private final AttributeKey typeKey;
+ // Key of the "reason" attribute (used to ignore "overlong" aliases).
+ private final AttributeKey reasonKey;
+ // Key of the "replacement" attribute (read as the list of replacement values).
+ private final AttributeKey replacementKey;
+
+ AliasType(String elementName) {
+ this.elementName = elementName;
+ this.typeKey = AttributeKey.keyOf(elementName, "type");
+ this.reasonKey = AttributeKey.keyOf(elementName, "reason");
+ this.replacementKey = AttributeKey.keyOf(elementName, "replacement");
+ }
+ }
+
+ /** Alias mappings for base languages and territories. */
+ private static final class Aliases {
+     /**
+      * Returns the alias mapping for the given type. Note that for language aliases,
+      * only "simple" aliases (between base languages) are mapped.
+      *
+      * <p>Aliases whose reason is "overlong" are skipped, as are aliases where either
+      * the source or the first replacement value contains a subtag separator.
+      *
+      * @param supplementalData the CLDR data which is visited (in DTD order) for aliases.
+      * @param type the kind of alias element to collect.
+      */
+     public static Aliases getAliases(CldrData supplementalData, AliasType type) {
+         ImmutableSortedMap.Builder<String, String> canonicalMap =
+             ImmutableSortedMap.naturalOrder();
+         supplementalData.accept(DTD, v -> {
+             CldrPath path = v.getPath();
+             if (ALIAS.matches(path) && path.getName().equals(type.elementName)) {
+                 // TODO: Find out why we ignore "overlong" aliases?
+                 String aliasFrom = v.get(type.typeKey);
+                 // Constant-first comparison so that a missing "reason" attribute (should
+                 // the lookup yield null) is treated as "not overlong" rather than
+                 // causing a NullPointerException.
+                 if (isSimpleAlias(aliasFrom) && !"overlong".equals(v.get(type.reasonKey))) {
+                     // Replacement locale IDs must be non-empty (but can be a list) and we
+                     // use only the first (default) mapping.
+                     String aliasTo = LIST_SPLITTER.splitToList(v.get(type.replacementKey)).get(0);
+                     if (isSimpleAlias(aliasTo)) {
+                         canonicalMap.put(aliasFrom, aliasTo);
+                     }
+                 }
+             }
+         });
+         return new Aliases(canonicalMap.build());
+     }
+
+     // A simple language alias references only a base language (territory alias are
+     // always "simple" so this check is harmless).
+     private static boolean isSimpleAlias(String localeId) {
+         return ALIAS_SEPARATOR.matchesNoneOf(localeId);
+     }
+
+     // Maps each alias to its canonical value.
+     private final ImmutableSortedMap<String, String> toCanonical;
+     // Inverse view of the above: maps each canonical value to all of its aliases.
+     private final ImmutableSetMultimap<String, String> toAliases;
+
+     private Aliases(ImmutableSortedMap<String, String> toCanonical) {
+         this.toCanonical = checkNotNull(toCanonical);
+         this.toAliases = toCanonical.asMultimap().inverse();
+     }
+
+     /** Returns the alias-to-canonical-value mapping. */
+     public ImmutableSortedMap<String, String> getCanonicalMap() {
+         return toCanonical;
+     }
+
+     /**
+      * Returns the aliases for a given canonical value (if there are no aliases
+      * then a singleton set containing the given canonical value is returned).
+      */
+     public ImmutableSet<String> getAliases(String canonical) {
+         ImmutableSet<String> aliases = toAliases.get(canonical);
+         return aliases.isEmpty() ? ImmutableSet.of(canonical) : aliases;
+     }
+ }
+
+ /**
+ * Builds the likely subtags data from the given CLDR supplemental data.
+ *
+ * @param supplementalData the CLDR data from which aliases and likely subtag mappings
+ * are read.
+ * @return a data holder containing the language and region alias maps, the serialized
+ * Trie bytes and the LSR values in the order of their assigned indices.
+ */
+ public static XLikelySubtags.Data build(CldrData supplementalData) {
+ // Build the table of LSR data from CLDR aliases and likely subtag information.
+ Aliases languageAliases = Aliases.getAliases(supplementalData, AliasType.LANGUAGE);
+ Aliases regionAliases = Aliases.getAliases(supplementalData, AliasType.TERRITORY);
+ Map<String, Map<String, Map<String, LSR>>> lsrTable =
+ makeTable(languageAliases, regionAliases, supplementalData);
+
+ // In the output Trie we must reference LSR instance by their special index
+ // (which is calculated by client code in order to lookup values).
+ //
+ // Note: We could pre-load this indexer with common locales to give them small
+ // indices, and see if that improves performance a little.
+ Indexer<LSR, Integer> lsrToIndex = Indexer.create();
+
+ // The two calls below must stay first and in this order, since the indexer hands
+ // out indices in order of first use.
+ //
+ // Reserve index 0 as "no value":
+ // The runtime lookup returns 0 for an intermediate match with no value, so we
+ // need that index to be reserved by something (but the value is arbitrary).
+ lsrToIndex.apply(lsr("", "", ""));
+ // Reserve index 1 for SKIP_SCRIPT:
+ // The runtime lookup returns 1 for an intermediate match with a value.
+ // This value is also arbitrary so use a value that is easy to debug.
+ lsrToIndex.apply(lsr("skip", "script", ""));
+
+ // Build the Trie of the LSR table data.
+ Trie trie = writeLsrTable(lsrTable, lsrToIndex);
+
+ // Note: Using XLikelySubtags as a fairly "dumb" container for the return values
+ // requires us to do slightly awkward things like passing mutable arrays around, but
+ // the advantage it has is that this data structure is also what's used in client code,
+ // so if the likely subtags data changes, it will be a forcing function to change this
+ // code.
+ return new XLikelySubtags.Data(
+ languageAliases.getCanonicalMap(),
+ regionAliases.getCanonicalMap(),
+ trie.toByteArray(),
+ lsrToIndex.getValues().toArray(new LSR[0]));
+ }
+
+ // Writes the whole language -> script -> region table into a new Trie, visiting the
+ // languages in the iteration order of the given map.
+ private static Trie writeLsrTable(
+         Map<String, Map<String, Map<String, LSR>>> languages,
+         Indexer<LSR, Integer> lsrToIndex) {
+
+     Trie trie = new Trie();
+     Trie.Span rootSpan = trie.root();
+     for (Map.Entry<String, Map<String, Map<String, LSR>>> entry : languages.entrySet()) {
+         Map<String, Map<String, LSR>> scripts = entry.getValue();
+         rootSpan.with(entry.getKey(), span -> writeScripts(span, scripts, lsrToIndex));
+     }
+     return trie;
+ }
+
+ // Writes the script-level table for one language into the given span.
+ private static void writeScripts(
+         Trie.Span languageSpan, Map<String, Map<String, LSR>> scripts, Indexer<LSR, Integer> lsrToIndex) {
+     checkArgument(!scripts.isEmpty(), "invalid script table: %s", scripts);
+     // General case first: with more than one script, every script gets its own span.
+     if (scripts.size() != 1) {
+         for (Map.Entry<String, Map<String, LSR>> entry : scripts.entrySet()) {
+             Map<String, LSR> regions = entry.getValue();
+             languageSpan.with(entry.getKey(), span -> writeRegions(span, regions, lsrToIndex));
+         }
+         return;
+     }
+     // If we only have '*' for scripts, but there is more than one region then we can prune
+     // the Trie at the script level and just write "<language><region>:<value>". However in
+     // order to let the lookup code know that it should not expect a script prefix for the
+     // following entries, we must add the special "skip" value before writing the regions.
+     //
+     // However if there is also only one region, we can just write "<language>:<value>" and
+     // must avoid adding the "skip" value.
+     //
+     // We already checked '*' is in every scripts table.
+     Map<String, LSR> onlyRegions = scripts.get("*");
+     if (onlyRegions.size() > 1) {
+         languageSpan.putPrefixAndValue(XLikelySubtags.SKIP_SCRIPT);
+     }
+     writeRegions(languageSpan, onlyRegions, lsrToIndex);
+ }
+
+ // Writes the region-level table for one language (or language and script) into the span.
+ private static void writeRegions(
+         Trie.Span languageOrScriptSpan, Map<String, LSR> regions, Indexer<LSR, Integer> lsrToIndex) {
+     checkArgument(!regions.isEmpty(), "invalid region table: %s", regions);
+     // General case first: with more than one region, every region gets its own span.
+     if (regions.size() != 1) {
+         for (Map.Entry<String, LSR> entry : regions.entrySet()) {
+             LSR lsr = entry.getValue();
+             languageOrScriptSpan.with(
+                 entry.getKey(),
+                 span -> span.putPrefixAndValue(lsrToIndex.apply(lsr)));
+         }
+         return;
+     }
+     // Prune anything ending with '*' (either <language-*-*> or <language-script-*>)
+     // by writing the value immediately and omitting the '*' from the Trie.
+     //
+     // We already checked '*' is in every region table.
+     languageOrScriptSpan.putPrefixAndValue(lsrToIndex.apply(regions.get("*")));
+ }
+
+ /**
+ * Builds the nested language -> script -> region table of likely subtag targets.
+ *
+ * <p>Keys at every level use "*" in place of "und" or an empty subtag. The table is
+ * filled from the likely subtag elements (plus every language/region alias combination
+ * of each source), then extended with a special case for "und-Latn" and with
+ * script-qualified copies of the default mappings, and finally checked to contain a
+ * "*" entry at every script and region level.
+ *
+ * @throws IllegalStateException if there is no "*" -> "*" subtable at all.
+ * @throws IllegalArgumentException if any script or region subtable lacks a "*" entry.
+ */
+ private static Map<String, Map<String, Map<String, LSR>>> makeTable(
+ Aliases languageAliases, Aliases regionAliases, CldrData supplementalData) {
+
+ Map<String, Map<String, Map<String, LSR>>> lsrTable = new TreeMap<>(LSR_TABLE_ORDER);
+
+ // Set the base data from the likely subtag elements.
+ supplementalData.accept(DTD, v -> {
+ CldrPath path = v.getPath();
+ if (LIKELY_SUBTAG.matches(path)) {
+ // Add the canonical subtag mapping.
+ LSR source = lsrFromLocaleID(v.get(SUBTAG_FROM));
+ LSR target = lsrFromLocaleID(v.get(SUBTAG_TO));
+ set(lsrTable, source, target);
+
+ // Add all combinations of language and region aliases. This lets the
+ // matcher process aliases in locales in a single step.
+ for (String languageAlias : languageAliases.getAliases(source.language)) {
+ for (String regionAlias : regionAliases.getAliases(source.region)) {
+ // Skip the combination which is the source itself (already added above).
+ if (languageAlias.equals(source.language) && regionAlias.equals(source.region)) {
+ continue;
+ }
+ set(lsrTable, languageAlias, source.script, regionAlias, target);
+ }
+ }
+ }
+ });
+
+ // Add the special case for "und-Latn" => "en-Latn-US" (which is a bit of a
+ // hack for language matching).
+ // TODO: Find out the history of this line and document it better.
+ set(lsrTable, "und", "Latn", "", lsr("en", "Latn", "US"));
+ logger.fine(lsrTable::toString);
+
+ // Ensure that if "und-RR" => "ll-Ssss-RR", then we also add
+ // "und-Ssss-RR" => "ll-Ssss-RR".
+ // For example, given:
+ // <likelySubtag from="und_GH" to="ak_Latn_GH"/>
+ // we add an additional mapping for "und-Latn-GH" => "ak-Latn-GH" since there
+ // will be cases where the language subtag is just missing in data, but given
+ // the script and region we can at least make a best guess.
+ //
+ // Note: We can't move this code after the checks below because it might add
+ // more mappings which then need to be checked. However realistically, the only
+ // time the mapping "*" -> "*" would not appear is if the likely subtag data was
+ // completely broken (since it implies no region-only mappings).
+ checkState(lsrTable.containsKey("*") && lsrTable.get("*").containsKey("*"),
+ "missing likely subtag data (no default region mappings): %s", lsrTable);
+ // NOTE(review): set() writes into the "*" language table while its "*" -> "*" region
+ // map is being iterated; this looks safe only as long as every default target has a
+ // non-empty script (so that a different region map is modified) - confirm.
+ lsrTable.get("*").get("*").forEach((key, lsr) -> set(lsrTable, "und", lsr.script, lsr.region, lsr));
+
+ // Check that every level has "*" (mapped from "und" or "").
+ lsrTable.forEach((lang, scripts) -> {
+ checkArgument(scripts.containsKey("*"), "missing likely subtag mapping for: %s", asLocale(lang));
+ scripts.forEach(
+ (script, regions) -> checkArgument(regions.containsKey("*"),
+ "missing likely subtag mapping for: %s", asLocale(lang, script)));
+ });
+ return lsrTable;
+ }
+
+ // Converts a subtable key sequence back into the original locale ID (for debugging).
+ // asLocale("*", "*", "GB") -> "und_GB"
+ private static String asLocale(String... parts) {
+     StringBuilder localeId = new StringBuilder(parts[0].equals("*") ? "und" : parts[0]);
+     // Only the script and region positions (indices 1 and 2) may follow the language,
+     // and wildcard subtags are omitted from the result.
+     for (int i = 1; i < parts.length && i <= 2; i++) {
+         if (!parts[i].equals("*")) {
+             localeId.append('_').append(parts[i]);
+         }
+     }
+     return localeId.toString();
+ }
+
+ // Convenience overload: records newValue under the language, script and region of key.
+ private static void set(
+ Map<String, Map<String, Map<String, LSR>>> langTable, LSR key, LSR newValue) {
+ set(langTable, key.language, key.script, key.region, newValue);
+ }
+
+ // Records the given LSR under language -> script -> region, creating any missing
+ // subtables on the way; "und" and empty subtags are stored under the "*" key.
+ private static void set(Map<String, Map<String, Map<String, LSR>>> langTable,
+         String language, String script, String region, LSR lsr) {
+     getSubtable(getSubtable(langTable, subtagOrStar(language)), subtagOrStar(script))
+         .put(subtagOrStar(region), lsr);
+ }
+
+ // Returns the subtable stored for the given subtag, first adding an empty one (sorted
+ // by SUBTABLE_ORDER) if the table has no entry for it yet.
+ private static <T> Map<String, T> getSubtable(Map<String, Map<String, T>> table, String subtag) {
+     Map<String, T> subtable = table.get(subtag);
+     if (subtable == null) {
+         subtable = new TreeMap<>(SUBTABLE_ORDER);
+         table.put(subtag, subtable);
+     }
+     return subtable;
+ }
+
+ // Normalizes a table key: "und" and the empty string both become "*", anything else is
+ // returned as-is. A literal "*" is rejected since callers must pass real subtags.
+ private static String subtagOrStar(String s) {
+     checkArgument(!s.equals("*"), "language subtags should not be '*'");
+     if (s.equals("und") || s.isEmpty()) {
+         return "*";
+     }
+     return s;
+ }
+
+ // Parses the simple locale IDs found in the <likelySubtags> data (not arbitrary language
+ // tags); script and region may be absent, in which case they end up as empty strings.
+ private static LSR lsrFromLocaleID(String languageIdentifier) {
+     Matcher m = LOCALE_ID.matcher(languageIdentifier);
+     checkArgument(m.matches(), "invalid language identifier: %s", languageIdentifier);
+     String language = m.group(1);
+     String script = m.group(2);
+     String region = m.group(3);
+     return lsr(language, script, region);
+ }
+
+ // Lenient factory method which accepts null for missing script or region (but not language).
+ // A null script or region becomes the empty string, and every instance is created with
+ // LSR.DONT_CARE_FLAGS.
+ private static LSR lsr(String language, String script, String region) {
+ return new LSR(checkNotNull(language), nullToEmpty(script), nullToEmpty(region), LSR.DONT_CARE_FLAGS);
+ }
+}
--- /dev/null
+// © 2017 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package org.unicode.icu.tool.cldrtoicu.localedistance;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkNotNull;
+import static com.google.common.base.Preconditions.checkState;
+import static java.util.Arrays.asList;
+import static org.unicode.cldr.api.CldrData.PathOrder.DTD;
+import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Set;
+import java.util.logging.Logger;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import org.unicode.cldr.api.AttributeKey;
+import org.unicode.cldr.api.CldrData;
+import org.unicode.cldr.api.CldrDataSupplier;
+import org.unicode.cldr.api.CldrPath;
+import org.unicode.cldr.api.CldrValue;
+import org.unicode.cldr.api.PathMatcher;
+import org.unicode.icu.tool.cldrtoicu.DebugWriter;
+import org.unicode.icu.tool.cldrtoicu.IcuData;
+import org.unicode.icu.tool.cldrtoicu.RbPath;
+import org.unicode.icu.tool.cldrtoicu.RbValue;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Splitter;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Iterables;
+import com.google.common.primitives.Bytes;
+import com.ibm.icu.impl.locale.LSR;
+import com.ibm.icu.impl.locale.LocaleDistance;
+import com.ibm.icu.impl.locale.XLikelySubtags;
+import com.ibm.icu.util.ULocale;
+
+/**
+ * Mapper for generating locale distance tables from CLDR language data.
+ *
+ * <p>Note that this is an atypical mapper which does a lot more processing than other
+ * ICU mapper classes and relies on several auxiliary classes (which is why it's in a
+ * different package). Conceptually it's still a "mapper" though, just not a simple one.
+ *
+ * <p>This mapper was converted from the LocaleDistanceBuilder code in the ICU4J project.
+ */
+public final class LocaleDistanceMapper {
+ private static final Logger logger = Logger.getLogger(LocaleDistanceMapper.class.getName());
+
+ // All the language matching data comes from the "written_new" language data in
+ // "common/supplemental/languageInfo.xml".
+ private static final PathMatcher WRITTEN_LANGUAGE_PREFIX =
+ PathMatcher.of("//supplementalData/languageMatching/languageMatches[@type=\"written_new\"]");
+
+ // Definitions of region containment variables used when expressing match distances. E.g.:
+ // <matchVariable id="$maghreb" value="MA+DZ+TN+LY+MR+EH"/>
+ private static final PathMatcher VARIABLE_PATH =
+ WRITTEN_LANGUAGE_PREFIX.withSuffix("matchVariable[@id=*]");
+ private static final AttributeKey VARIABLE_ID = AttributeKey.keyOf("matchVariable", "id");
+ private static final AttributeKey VARIABLE_VALUE = AttributeKey.keyOf("matchVariable", "value");
+
+ // Language distance data, including wildcards and variable references (possibly negated). E.g.:
+ // <languageMatch desired="ja_Latn" supported="ja_Jpan" distance="5" oneway="true"/>
+ // <languageMatch desired="ar_*_$maghreb" supported="ar_*_$maghreb" distance="4"/>
+ // <languageMatch desired="en_*_$!enUS" supported="en_*_GB" distance="3"/>
+ private static final PathMatcher LANGUAGE_MATCH_PATH =
+ WRITTEN_LANGUAGE_PREFIX.withSuffix("languageMatch[@desired=*][@supported=*]");
+ private static final AttributeKey MATCH_DESIRED =
+ AttributeKey.keyOf("languageMatch", "desired");
+ private static final AttributeKey MATCH_SUPPORTED =
+ AttributeKey.keyOf("languageMatch", "supported");
+ private static final AttributeKey MATCH_DISTANCE =
+ AttributeKey.keyOf("languageMatch", "distance");
+ // Optional, assume false if not present.
+ private static final AttributeKey MATCH_ONEWAY =
+ AttributeKey.keyOf("languageMatch", "oneway");
+
+ // Singleton element containing the list of special case "paradigm" locales, which should
+ // always be preferred if there is a tie. E.g.:
+ // <paradigmLocales locales="en en_GB es es_419 pt_BR pt_PT"/>
+ //
+ // Since there are no distinguishing attributes for this path, there can only be one
+ // instance which we can just lookup directly.
+ private static final CldrPath PARADIGM_LOCALES_PATH = CldrPath.parseDistinguishingPath(
+ "//supplementalData/languageMatching/languageMatches[@type=\"written_new\"]/paradigmLocales");
+ private static final AttributeKey PARADIGM_LOCALES =
+ AttributeKey.keyOf("paradigmLocales", "locales");
+
+ // NOTE: You must omit empty strings, since otherwise " foo " becomes ("", "foo", "").
+ private static final Splitter LIST_SPLITTER =
+ Splitter.on(' ').trimResults().omitEmptyStrings();
+
+ // Output resource bundle paths, split into two basic groups for likely locale mappings
+ // and match data.
+ private static final RbPath LIKELY_LANGUAGES = RbPath.of("likely", "languageAliases");
+ private static final RbPath LIKELY_REGIONS = RbPath.of("likely", "regionAliases");
+ private static final RbPath LIKELY_TRIE = RbPath.of("likely", "trie:bin");
+ private static final RbPath LIKELY_LSRS = RbPath.of("likely", "lsrs");
+
+ private static final RbPath MATCH_TRIE = RbPath.of("match", "trie:bin");
+ private static final RbPath MATCH_REGION_TO_PARTITIONS = RbPath.of("match", "regionToPartitions:bin");
+ private static final RbPath MATCH_PARTITIONS = RbPath.of("match", "partitions");
+ private static final RbPath MATCH_PARADIGMS = RbPath.of("match", "paradigms");
+ private static final RbPath MATCH_DISTANCES = RbPath.of("match", "distances:intvector");
+
+ // To split locale specifications (e.g. "ja_Latn" or "en_*_$!enUS").
+ private static final Splitter UNDERSCORE = Splitter.on('_');
+
+ /**
+ * Processes data from the given supplier to generate locale matcher ICU data.
+ *
+ * @param src the CLDR data supplier to process.
+ * @return the IcuData instance to be written to a file.
+ */
+ public static IcuData process(CldrDataSupplier src) {
+ return process(src.getDataForType(SUPPLEMENTAL));
+ }
+
+ // Builds the "langInfo" ICU data from the given supplemental data: first the likely
+ // subtags values (under "likely"), then the locale distance values (under "match").
+ @VisibleForTesting // It's easier to supply a fake data instance than a fake supplier.
+ static IcuData process(CldrData data) {
+ IcuData icuData = new IcuData("langInfo", false);
+
+ // Likely subtags: alias maps, the lookup Trie bytes and the indexed LSR values.
+ XLikelySubtags.Data likelyData = LikelySubtagsBuilder.build(data);
+ icuData.add(LIKELY_LANGUAGES, ofMapEntries(likelyData.languageAliases));
+ icuData.add(LIKELY_REGIONS, ofMapEntries(likelyData.regionAliases));
+ icuData.add(LIKELY_TRIE, ofBytes(likelyData.trie));
+ icuData.add(LIKELY_LSRS, ofLsrs(asList(likelyData.lsrs)));
+
+ // Locale distances: the Trie bytes, region partition data, paradigm locales and
+ // the distance values (written as decimal strings).
+ LocaleDistance.Data distanceData = buildDistanceData(data);
+ icuData.add(MATCH_TRIE, ofBytes(distanceData.trie));
+ icuData.add(MATCH_REGION_TO_PARTITIONS, ofBytes(distanceData.regionToPartitionsIndex));
+ icuData.add(MATCH_PARTITIONS, RbValue.of(distanceData.partitionArrays));
+ icuData.add(MATCH_PARADIGMS, ofLsrs(distanceData.paradigmLSRs));
+ icuData.add(MATCH_DISTANCES, RbValue.of(Arrays.stream(distanceData.distances).mapToObj(Integer::toString)));
+ return icuData;
+ }
+
+ /**
+  * A simple holder for language, script and region which allows for wildcards (i.e. "*")
+  * and variables to represent partitions of regions (e.g. "$enUS"). Minimal additional
+  * validation is done on incoming fields as data is assumed to be correct.
+  */
+ private static final class LsrSpec {
+     /**
+      * Parse a raw specification string (e.g. "en", "ja_Latn", "*_*_*", "ar_*_$maghreb"
+      * or "en_*_GB") into a structured spec. Note that if the specification string
+      * contains a "bare" region (e.g. "en_*_GB") then it is registered as a variable in
+      * the given partition info builder, so the returned {@code LsrSpec} will be
+      * {@code "en_*_$GB"}.
+      *
+      * @param rawSpec the underscore separated specification with one to three parts.
+      * @param rmb the builder with which any region part is registered as a variable.
+      * @throws IllegalArgumentException if the specification has more than three parts,
+      *     or has a wildcard language with a non-wildcard script or region.
+      */
+     public static LsrSpec parse(String rawSpec, PartitionInfo.Builder rmb) {
+         List<String> parts = UNDERSCORE.splitToList(rawSpec);
+         checkArgument(parts.size() <= 3, "invalid raw LSR specification: %s", rawSpec);
+         String language = parts.get(0);
+         Optional<String> script = parts.size() > 1 ? Optional.of(parts.get(1)) : Optional.empty();
+         // While parsing the region part, ensure any "bare" region subtags are converted
+         // to variables (e.g. "GB" -> "$GB") and registered with the partition map.
+         Optional<String> region =
+             parts.size() > 2 ? Optional.of(rmb.ensureVariable(parts.get(2))) : Optional.empty();
+         return new LsrSpec(language, script, region);
+     }
+
+     // A language subtag (e.g. "en") or "*".
+     private final String language;
+     // If present, a script subtag (e.g. "Latn") or "*".
+     private final Optional<String> script;
+     // If present, a registered variable with '$' prefix (e.g. "$foo" or "$GB") or "*".
+     private final Optional<String> regionVariable;
+
+     private LsrSpec(String language, Optional<String> script, Optional<String> regionVariable) {
+         this.language = language;
+         this.script = script;
+         this.regionVariable = regionVariable;
+         // Implementation shortcuts assume:
+         // - If the language subtags are '*', the other-level subtags must also be '*' (if present).
+         // If there are rules that do not fit these constraints, we need to revise the implementation.
+         if (isAny(language)) {
+             // Report the offending subtag itself rather than its Optional wrapper, so the
+             // message reads "got: Latn" instead of "got: Optional[Latn]".
+             script.ifPresent(
+                 s -> checkArgument(isAny(s), "expected wildcard script, got: %s", s));
+             regionVariable.ifPresent(
+                 r -> checkArgument(isAny(r), "expected wildcard region, got: %s", r));
+         }
+     }
+
+     /** Returns the language subtag or "*". */
+     public String getLanguage() {
+         return language;
+     }
+
+     /**
+      * Returns the script subtag or "*".
+      *
+      * @throws IllegalArgumentException if this spec has no script part.
+      */
+     public String getScript() {
+         return script.orElseThrow(() -> new IllegalArgumentException("no script available: " + this));
+     }
+
+     /**
+      * Returns the region variable or "*".
+      *
+      * @throws IllegalArgumentException if this spec has no region part.
+      */
+     public String getRegionVariable() {
+         return regionVariable.orElseThrow(() -> new IllegalArgumentException("no region available: " + this));
+     }
+
+     /** Returns 3 if a region is present, otherwise 2 if a script is present, otherwise 1. */
+     public int size() {
+         return regionVariable.isPresent() ? 3 : script.isPresent() ? 2 : 1;
+     }
+
+     @Override
+     public String toString() {
+         return language + script.map(s -> "_" + s).orElse("") + regionVariable.map(r -> "_" + r).orElse("");
+     }
+ }
+
+ /**
+  * Represents a {@code <languageMatch>} rule derived from supplemental data, such as:
+  * <pre>{@code
+  * <languageMatch desired="zh_Hans" supported="zh_Hant" distance="15" oneway="true"/>
+  * }</pre>
+  * or:
+  * <pre>{@code
+  * <languageMatch desired="ar_*_$maghreb" supported="ar_*_$maghreb" distance="4"/>
+  * }</pre>
+  *
+  * <p>The job of a {@code LanguageMatchRule} is to provide a mechanism for capturing the
+  * data in {@code <languageMatch>} elements and subsequently adding that information to a
+  * {@link DistanceTable.Builder} in a structured way.
+  *
+  * <p>NOTE(review): this class does not override {@code equals()}/{@code hashCode()}, so
+  * instances compare by identity. Any duplicate detection based on
+  * {@code Stream.distinct()} over rules therefore cannot spot two rules with equal
+  * values - confirm whether value equality was intended.
+  */
+ private static final class LanguageMatchRule {
+     private final LsrSpec desired;
+     private final LsrSpec supported;
+     private final int distance;
+     private final boolean oneway;
+
+     /**
+      * Creates a rule from the parsed desired and supported specifications.
+      *
+      * @throws IllegalArgumentException if the two specifications differ in size, or if
+      *     a wildcard is used at some level in one specification but not in the other.
+      */
+     public LanguageMatchRule(LsrSpec desired, LsrSpec supported, int distance, boolean oneway) {
+         this.desired = checkNotNull(desired);
+         this.supported = checkNotNull(supported);
+         this.distance = distance;
+         this.oneway = oneway;
+         // Implementation shortcuts assume:
+         // - At any level, either both or neither spec subtags are *.
+         // If there are rules that do not fit these constraints, we need to revise the implementation.
+         checkArgument(desired.size() == supported.size(),
+             "mismatched rule specifications in: %s, %s", desired, supported);
+         checkArgument(isAny(desired.language) == isAny(supported.language),
+             "wildcard mismatch for languages in: %s, %s", desired, supported);
+         checkArgument(isAny(desired.script) == isAny(supported.script),
+             "wildcard mismatch for scripts in: %s, %s", desired, supported);
+         checkArgument(isAny(desired.regionVariable) == isAny(supported.regionVariable),
+             "wildcard mismatch for regions in: %s, %s", desired, supported);
+     }
+
+     /** Returns the number of subtag levels in this rule (1, 2 or 3). */
+     int size() {
+         return desired.size();
+     }
+
+     /** Returns whether this rule has a wildcard language. */
+     boolean isDefaultRule() {
+         // We already know that in LsrSpec, if the language is "*" then all subtags are too.
+         return isAny(desired.language);
+     }
+
+     /**
+      * Adds this rule to the given distance table, using the given partition map to
+      * resolve any region variables present in the desired or supported specs.
+      */
+     void addTo(DistanceTable.Builder distanceTable, PartitionInfo partitions) {
+         // Note that rather than using the rule's "size" to mediate the different
+         // cases, we could have had 3 distinct sub-types of a common rule API (e.g.
+         // "LanguageRule", "ScriptRule" and "RegionRule"), each with a different
+         // addTo() callback. However this would have been quite a lot more code
+         // for not much real gain.
+         switch (size()) {
+         case 1:  // Language only.
+             distanceTable.addDistance(distance, oneway,
+                 desired.getLanguage(), supported.getLanguage());
+             break;
+
+         case 2:  // Language and script present.
+             distanceTable.addDistance(distance, oneway,
+                 desired.getLanguage(), supported.getLanguage(),
+                 desired.getScript(), supported.getScript());
+             break;
+
+         case 3:  // Language, script and region variable present.
+             // Add the rule distance for every combination of desired/supported
+             // partition IDs for the region variables. This is important for
+             // variables like "$americas" which overlap with multiple partitions.
+             //
+             // Note that in this case (because region variables map to sets of
+             // partition IDs) we can get situations where "shouldReverse" is true,
+             // but the desired/supported pairs being passed in are identical (e.g.
+             // different region variables map to distinct partition groups which
+             // share some common elements).
+             //
+             // This is fine, providing that the distance table is going to ignore
+             // identical mappings (which it does). Alternatively we could just
+             // re-calculate "shouldReverse" inside this loop to account for partition
+             // IDs rather than region variables.
+             ImmutableSet<String> desiredPartitionIds =
+                 partitions.getPartitionIds(desired.getRegionVariable());
+             ImmutableSet<String> supportedPartitionIds =
+                 partitions.getPartitionIds(supported.getRegionVariable());
+             for (String desiredPartitionId : desiredPartitionIds) {
+                 for (String supportedPartitionId : supportedPartitionIds) {
+                     distanceTable.addDistance(distance, oneway,
+                         desired.getLanguage(), supported.getLanguage(),
+                         desired.getScript(), supported.getScript(),
+                         desiredPartitionId, supportedPartitionId);
+                 }
+             }
+             break;
+
+         default:
+             throw new IllegalStateException("invalid size for LsrSpec: " + this);
+         }
+     }
+
+     @Override
+     public String toString() {
+         return String.format(
+             "Rule{ desired=%s, supported=%s, distance=%d, oneway=%b }",
+             desired, supported, distance, oneway);
+     }
+ }
+
+ private static LocaleDistance.Data buildDistanceData(CldrData supplementalData) {
+ // Resolve any explicitly declared region variables into the partition map.
+ // Territory containment information is used to recursively resolve region
+ // variables (e.g. "$enUS") into a collection of non-macro regions.
+ PartitionInfo.Builder partitionBuilder =
+ PartitionInfo.builder(TerritoryContainment.getContainment(supplementalData));
+ supplementalData.accept(DTD, v -> {
+ CldrPath path = v.getPath();
+ if (VARIABLE_PATH.matches(path)) {
+ partitionBuilder.addVariableExpression(v.get(VARIABLE_ID), v.get(VARIABLE_VALUE));
+ }
+ });
+
+ // Parse the rules from <languageMatch> elements. Note that the <languageMatch>
+ // element is marked as "ORDERED" in the DTD, which means the elements always
+ // appear in the same order as in the CLDR XML file (even when using DTD order).
+ //
+ // This is one of the relatively rare situations in which using DTD order will
+ // not isolate the ICU data from reordering of the CLDR data. In particular this
+ // matters when specifying language matcher preferences (such as "en_*_GB" vs
+ // "en_*_!enUS").
+ //
+ // We could almost process the rules while reading them from the source data, but
+ // rules may contain region codes rather than variables, and we need to create a
+ // variable for each such region code before the RegionMapper is built, and
+ // before processing the rules (this happens when the LsrSpec is parsed).
+ List<LanguageMatchRule> rules = new ArrayList<>();
+ supplementalData.accept(DTD, v -> {
+ CldrPath path = v.getPath();
+ if (LANGUAGE_MATCH_PATH.matches(path)) {
+ int distance = Integer.parseInt(v.get(MATCH_DISTANCE));
+ // Lenient against there being no "oneway" attribute.
+ boolean oneway = "true".equalsIgnoreCase(v.get(MATCH_ONEWAY));
+ LsrSpec desired = LsrSpec.parse(v.get(MATCH_DESIRED), partitionBuilder);
+ LsrSpec supported = LsrSpec.parse(v.get(MATCH_SUPPORTED), partitionBuilder);
+ LanguageMatchRule rule = new LanguageMatchRule(desired, supported, distance, oneway);
+ logger.fine(() -> String.format("rule: %s", rule));
+ rules.add(rule);
+ }
+ });
+ // Check that the rules are in the expected order. Rule order is important in ensuring
+ // data correctness and incorrect order may violate business logic assumptions later.
+ // TODO: Consider what other ordering/sanity checks make sense here.
+ for (int n = 0, prevSize = 1; n < rules.size(); n++) {
+ LanguageMatchRule rule = rules.get(n);
+ checkArgument(rule.size() >= prevSize, "<languageMatch> elements out of order at: %s", rule);
+ checkArgument(rule.size() == prevSize || (n > 0 && rules.get(n - 1).isDefaultRule()),
+ "missing default rule before: %s", rule);
+ prevSize = rule.size();
+ }
+ checkState(rules.stream().distinct().count() == rules.size(), "duplicated rule in: %s", rules);
+
+ // Build region partition data after all the variables have been accounted for
+ // (including the implicit variables found while processing LsrSpecs).
+ PartitionInfo partitions = partitionBuilder.build();
+
+ // Add all the rules (in order) to the distance table.
+ DistanceTable.Builder distanceTableBuilder = DistanceTable.builder();
+ rules.forEach(r -> r.addTo(distanceTableBuilder, partitions));
+ DistanceTable distanceTable = distanceTableBuilder.build();
+
+ // Note: Using LocaleDistance.Data as a fairly "dumb" container for the return values
+ // requires us to do slightly awkward things, like passing mutable arrays and LSR
+ // instances around, but the advantage it has is that this data structure is also what's
+ // used in client code, so if the likely subtags data changes, it will be a forcing
+ // function to change this code.
+ return new LocaleDistance.Data(
+ distanceTable.getTrie().toByteArray(),
+ partitions.getPartitionLookupArray(),
+ partitions.getPartitionStrings(),
+ getParadigmLsrs(supplementalData),
+ distanceTable.getDefaultDistances());
+ }
+
+    // Reads the <paradigmLocales> element and returns the maximized LSR of each listed
+    // locale, in the order in which the locales are listed.
+    private static Set<LSR> getParadigmLsrs(CldrData supplementalData) {
+        CldrValue element = supplementalData.get(PARADIGM_LOCALES_PATH);
+        checkState(element != null,
+                "<paradigmLocales> element was missing: %s", PARADIGM_LOCALES_PATH);
+        String locales = element.get(PARADIGM_LOCALES);
+        checkState(locales != null,
+                "<paradigmLocales> 'locales' attribute was missing: %s", element);
+
+        // A LinkedHashSet keeps the order stable; otherwise a unit test is flaky.
+        Set<LSR> result = new LinkedHashSet<>();
+        LIST_SPLITTER.split(locales).forEach(tag -> {
+            LSR maximized = XLikelySubtags.INSTANCE.makeMaximizedLsrFrom(new ULocale(tag));
+            // The LSR flags are cleared to keep the data equality test in LocaleDistanceTest happy.
+            result.add(
+                    new LSR(maximized.language, maximized.script, maximized.region, LSR.DONT_CARE_FLAGS));
+        });
+        checkArgument(result.size() % 2 == 0, "unpaired paradigm locales: %s", result);
+        return result;
+    }
+
+    // Converts a map into an RbValue whose elements alternate between keys and values
+    // (written as one "key, value" pair per line in the IcuData file).
+    //
+    // E.g.
+    //   foo{
+    //     key1, value1,
+    //     ...
+    //     keyN, valueN,
+    //   }
+    private static RbValue ofMapEntries(Map<String, String> map) {
+        List<String> flattened = new ArrayList<>();
+        map.forEach((key, value) -> {
+            flattened.add(key);
+            flattened.add(value);
+        });
+        return RbValue.of(flattened).elementsPerLine(2);
+    }
+
+    // Converts a collection of LSR instances into an RbValue holding a flat sequence of
+    // (language, script, region) tuples (written as one tuple per line in the IcuData file).
+    //
+    // E.g.
+    //   foo{
+    //     lang1, script1, region1,
+    //     ...
+    //     langN, scriptN, regionN,
+    //   }
+    private static RbValue ofLsrs(Collection<LSR> lsrs) {
+        List<String> subtags =
+                lsrs.stream()
+                        .flatMap(lsr -> Stream.of(lsr.language, lsr.script, lsr.region))
+                        .collect(Collectors.toList());
+        return RbValue.of(subtags).elementsPerLine(3);
+    }
+
+    // Converts a byte array into an RbValue in which each element is a row of up to 16
+    // bytes, written as concatenated two-digit lower-case hex values. This is intended
+    // only for RbPaths using the ":bin" suffix.
+    //
+    // E.g.
+    //   foo{
+    //     0123456789abcdef0123456789abcdef
+    //     ...
+    //     1c0de4c0ffee
+    //   }
+    //
+    // Note that typically no indentation is used when writing this binary "blob".
+    private static RbValue ofBytes(byte[] data) {
+        ImmutableList.Builder<String> rows = ImmutableList.builder();
+        for (List<Byte> chunk : Iterables.partition(Bytes.asList(data), 16)) {
+            StringBuilder hex = new StringBuilder();
+            for (Byte b : chunk) {
+                hex.append(String.format("%02x", b));
+            }
+            rows.add(hex.toString());
+        }
+        return RbValue.of(rows.build());
+    }
+
+ // Returns whether the given subtag is exactly the '*' wildcard. This is not to be
+ // confused with the "ANY" character used in DistanceTable.
+ private static boolean isAny(String subtag) {
+ return subtag.equals("*");
+ }
+
+    // Returns whether the optional subtag is present and is the '*' wildcard.
+    private static boolean isAny(Optional<String> subtag) {
+        return subtag.filter(LocaleDistanceMapper::isAny).isPresent();
+    }
+
+ // Main method for running this mapper directly with logging enabled.
+ // CLDR_DIR is picked up from system properties or environment variables.
+ // Arguments: <output-file> [<log-level>]
+ public static void main(String[] args) throws IOException {
+ DebugWriter.writeForDebugging(args, LocaleDistanceMapper::process);
+ }
+}
--- /dev/null
+// © 2017 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package org.unicode.icu.tool.cldrtoicu.localedistance;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkState;
+
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.function.BiConsumer;
+import java.util.function.Consumer;
+import java.util.function.Function;
+import java.util.logging.Logger;
+
+import com.google.common.base.CharMatcher;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.ImmutableSetMultimap;
+import com.google.common.collect.SetMultimap;
+import com.google.common.collect.Sets;
+import com.google.common.collect.SortedSetMultimap;
+import com.google.common.collect.TreeMultimap;
+import com.ibm.icu.impl.locale.LSR;
+
+/**
+ * Provides mapping arrays to quickly lookup partition information for any region
+ * code in client libraries.
+ *
+ * <p>A region's partition is defined by the set of region variables (e.g. "$enUS")
+ * in the CLDR data. Each unique combination of variables forms a partition, and
+ * groups of partitions uniquely define language distance groupings. In slightly
+ * mathematical terms, partition groups form an "equivalence class" for regions
+ * with respect to language distance.
+ *
+ * <p>So by determining the minimum set of partitions and partition groups, and
+ * assigning short IDs to them, it's possible to create data structures which
+ * support all region pairings while being small and fast to access in client code.
+ */
+final class PartitionInfo {
+ private static final Logger logger = Logger.getLogger(PartitionInfo.class.getName());
+
+ /**
+ * A builder, to which region variables are added in order to define partitions
+ * and partition groups based on territory containment.
+ */
+ static final class Builder {
+ // Possible operations to parse from a region expression (e.g. "US+005-BR").
+ private static final CharMatcher REGION_OPS = CharMatcher.anyOf("+-");
+
+ private final TerritoryContainment territories;
+ private final Set<String> variables = new HashSet<>();
+ private final SortedSetMultimap<String, String> regionToVariables = TreeMultimap.create();
+
+ // Creates a builder which resolves region expressions against the given territory
+ // containment data (instances are obtained via PartitionInfo.builder()).
+ private Builder(TerritoryContainment territories) {
+ this.territories = territories;
+ }
+
+        // Tests whether the given string is the wildcard token ("*") or a variable which
+        // has already been registered with this builder. Other strings (e.g. plain region
+        // codes) are permitted and simply yield false.
+        private boolean isKnownVariableOrWildcard(String variable) {
+            if (variable.equals("*")) {
+                return true;
+            }
+            return variables.contains(variable);
+        }
+
+ /**
+ * Adds a variable expression (e.g. "$foo = "US+005-BR") from CLDR data and
+ * fully resolves all macro regions to their contained leaf regions.
+ *
+ * <p>The syntax is simple for now:
+ * <pre>
+ * regionSet := region ([-+] region)*
+ * </pre>
+ * There is no precedence, so "x+y-y+z" is "(((x+y)-y)+z)", and <em>not</em>
+ * "(x+y)-(y+z)".
+ */
+ public void addVariableExpression(String variable, String expr) {
+ checkState(variable.startsWith("$") && !variable.startsWith("$!"),
+ "invalid variable: %s", variable);
+ checkState(!isKnownVariableOrWildcard(variable),
+ "duplicate variable: %s", variable);
+ // Parsing also flattens the list to the corresponding leaf regions,
+ // so there should be no macro regions here.
+ Set<String> regions = parseAndFlattenRegionExpression(expr, territories);
+ // Add the mappings ("$foo" -> X) and the inverse ("$!foo" -> not(X)).
+ //
+ // The reason that the inverse mapping is needed is because some rules use
+ // the negated form of a variable (e.g. "$!enUS") and we must be able to
+ // resolve the set of associated partition IDs for it.
+ //
+ // If we only wanted the set of regions for the negated variable, that
+ // would be trivial (and there would be no need to store the negated values)
+ // but because the set of partition IDs for a negated variable is NOT always
+ // the negated set of parition IDs for the original variable (due to the way
+ // partitions overlap) it's not straightforward.
+ //
+ // In other words:
+ // regions-for("$!foo") == !regions-for("$foo))
+ // but:
+ // partition-ids-for("$!foo") != !partition-ids-for("$foo")
+ addVariable(variable, regions);
+ addVariable(
+ "$!" + variable.substring(1),
+ Sets.difference(territories.getLeafRegions(), regions));
+ }
+
+        // Registers a new variable and records it against every region it includes. Fails
+        // if the variable was already registered or if any of the regions is empty.
+        private void addVariable(String variable, Iterable<String> regions) {
+            boolean isNew = variables.add(variable);
+            checkArgument(isNew, "variable '%s' already present in: %s", variable, regions);
+            regions.forEach(region -> {
+                checkArgument(!region.isEmpty(), "%s", regions);
+                regionToVariables.put(region, variable);
+            });
+        }
+
+        // Evaluates a region expression (e.g. "US+005-BR") from left to right and returns
+        // the sorted set of resolved "leaf" regions.
+        private static Set<String> parseAndFlattenRegionExpression(
+                String expr, TerritoryContainment territories) {
+            Set<String> result = new TreeSet<>();
+            // The first region in the expression is always added.
+            Consumer<String> pending = result::add;
+            int start = 0;
+            int opIndex = REGION_OPS.indexIn(expr);
+            while (opIndex != -1) {
+                applyOperation(pending, expr.substring(start, opIndex), territories);
+                // The separator char ('+' or '-') decides what happens to the next region.
+                if (expr.charAt(opIndex) == '+') {
+                    pending = result::add;
+                } else {
+                    pending = result::remove;
+                }
+                start = opIndex + 1;
+                opIndex = REGION_OPS.indexIn(expr, start);
+            }
+            applyOperation(pending, expr.substring(start), territories);
+            return result;
+        }
+
+        // Applies the operation to the given region, or (for a macro region) to each of the
+        // leaf regions it contains. Fails if the region is empty.
+        private static void applyOperation(
+                Consumer<String> operation, String region, TerritoryContainment territories) {
+            checkArgument(!region.isEmpty(), "invalid region expresson (missing region)");
+            ImmutableSet<String> leaves = territories.getLeafRegionsOf(region);
+            if (leaves.isEmpty()) {
+                // Nothing is contained by this region, so it is used directly as a leaf region.
+                operation.accept(region);
+                return;
+            }
+            // Macro region: use all of its contained leaf regions (direct or indirect).
+            for (String leaf : leaves) {
+                operation.accept(leaf);
+            }
+        }
+
+        /**
+         * Ensures that a variable exists for the given region code or variable reference,
+         * registering an implicit variable for a region code if necessary, and returns the
+         * variable name.
+         *
+         * <p>This is needed because the {@code <languageMatch>} syntax allows regions to be
+         * referenced directly (e.g. "en_*_GB") as well as via pre-defined variables. Such
+         * regions are still tracked as variables since they may interact with macro-regions.
+         *
+         * @param regionOrVariable a region or an existing variable reference.
+         * @return the name of the registered variable (including '$' prefix).
+         */
+        public String ensureVariable(String regionOrVariable) {
+            if (!isKnownVariableOrWildcard(regionOrVariable)) {
+                // This is either a "raw" region (e.g. "GB") or an unknown variable (e.g.
+                // "$foo"). All explicit variables should have been registered by now, so a
+                // leading '$' is an error.
+                checkArgument(
+                        !regionOrVariable.startsWith("$"), "unregistered variable: %s", regionOrVariable);
+
+                // An implicit variable is referenced by its region code, so it can never be
+                // referenced in the negated form (i.e. "$!GB"), and no inverse mapping is
+                // needed (unlike explicitly defined variables).
+                //
+                // Implicit variables may appear more than once in the list of match rules,
+                // so addVariable() is not called here (it prohibits repeated addition).
+                // Since 'regionToVariables' is a _set_ multimap, adding an implicit variable
+                // is idempotent and can safely be repeated.
+                String implicitVariable = "$" + regionOrVariable;
+                variables.add(implicitVariable);
+                regionToVariables.put(regionOrVariable, implicitVariable);
+                return implicitVariable;
+            }
+            return regionOrVariable;
+        }
+
+ /**
+ * Builds the partition information from the variables registered so far. This fails
+ * if the registered variables do not cover every leaf region of the territory
+ * containment data.
+ */
+ public PartitionInfo build() {
+ // Step 1: Map regions to a unique "partition" ID.
+ //
+ // A region's partition is the set of variables which include it, and
+ // variables can be explicit (e.g. "$enUS"), implicit (e.g. "$GB") or
+ // negated (e.g. "$!enUS").
+ //
+ // For example, region "US" is included in the variables "$americas" and
+ // "$enUS", but is also referenced in the "negated" variables "$!cnsar"
+ // and "$!maghreb", so the "partition" of "US" is:
+ // { $americas, $enUS, $!cnsar, $!maghreb }
+ //
+ // A partition ID is a token associated with each unique variable partition.
+ //
+ // Since other regions, such as "PR" (Puerto Rico) and "VI" (U.S. Virgin
+ // Islands), are also "in" the same partition as "US", they will share the
+ // same partition ID.
+ //
+ // However, while "CA" is also included in "$americas", it's NOT defined as
+ // an "$enUS" (American English) region, so its partition is:
+ // { $americas, $!enUS, $!cnsar, $!maghreb }
+ // and it will have a different partition ID.
+
+ // Check that the region-to-partition map covers every leaf region (this
+ // is important to ensure partitions form a disjoint covering).
+ checkArgument(regionToVariables.keySet().equals(territories.getLeafRegions()),
+ "unexpected variable grouping (should cover all leaf regions): %s",
+ regionToVariables);
+ ImmutableMap<String, String> regionToPartitionId =
+ mapLeafRegionsToPartitionIds(regionToVariables);
+ logger.fine(() -> String.format("region to partition ID: %s", regionToPartitionId));
+
+ // Step 2: Construct mappings to and from partition IDs, to group regions
+ // by the variables that define them.
+
+ // A sorted mapping from every variable ("$foo" or "$!foo") to the IDs of
+ // the partitions it exists in.
+ //
+ // For example, "$americas" exists in partitions for both "$enUS" (American
+ // English) and "$!enUS" (non-American English) regions, so will be mapped
+ // to (at least) two unique partition IDs (e.g. X & Y).
+ // "$americas" -> { X, Y }
+ ImmutableSetMultimap<String, String> variableToPartitionIds =
+ mapVariablesToPartitionIds(regionToPartitionId, regionToVariables);
+ logger.fine(() -> String.format("variable to partition IDs: %s", variableToPartitionIds));
+
+ // A sorted mapping of each macro region to the partitions it intersects
+ // with. Unlike leaf regions, macro regions can map to groups of partitions
+ // rather than just a single one.
+ //
+ // For example, the macro region "419" (Latin America) intersects with
+ // both partitions:
+ // X = {$americas, $enUS, ...} (i.e. "Americas + American English")
+ // and:
+ // Y = {$americas, $!enUS, ...} (i.e. "Americas + non-American English")
+ // so this map would contain:
+ // "419" -> { X, Y }
+ ImmutableSetMultimap<String, String> macroRegionToPartitionIds =
+ mapMacroRegionsToPartitionIds(regionToPartitionId, territories);
+
+ // Step 3: Write the sparse "region index to partition group index" lookup
+ // array. This is the fast lookup array used to go from LSR region index to
+ // the partition group IDs for that region.
+ //
+ // Note that most entries in the array are zero, since the array maps from
+ // all possible regions, not just ones which exist. This is a space/time
+ // trade-off (and the array is compressed in the ICU data files anyway).
+ byte[] partitionLookupArray = new byte[LSR.REGION_INDEX_LIMIT];
+ String[] partitionStrings = writePartitionLookupTable(
+ partitionLookupArray, regionToPartitionId, macroRegionToPartitionIds);
+
+ return new PartitionInfo(variableToPartitionIds, partitionLookupArray, partitionStrings);
+ }
+
+        // Assigns a partition ID to every region, where regions which are included by
+        // exactly the same combination of variables share the same ID.
+        private static ImmutableMap<String, String> mapLeafRegionsToPartitionIds(
+                SetMultimap<String, String> regionToVariables) {
+            // Generates one single-character ID per unique partition.
+            //
+            // Partition IDs are emitted into the ICU data, so they must be small and
+            // compatible with the ICU data file format.
+            Function<Collection<String>, String> idForPartition =
+                    Indexer.create(index -> {
+                        // The ID must be a single 7-bit ASCII value and not '*'. It is NOT
+                        // used as a numeric value anywhere and could end up being a non
+                        // digit character if the number of unique partitions is > 10.
+                        // As of June 2020, there are only 7 unique partitions.
+                        char id = (char) ('0' + index);
+                        checkState(id < 0x7f, "too many partitions: %s", index);
+                        return String.valueOf(id);
+                    });
+
+            // Look up (or create) the ID for the combination of variables of each region.
+            ImmutableMap.Builder<String, String> ids = ImmutableMap.builder();
+            for (Map.Entry<String, Collection<String>> entry : regionToVariables.asMap().entrySet()) {
+                ids.put(entry.getKey(), idForPartition.apply(entry.getValue()));
+            }
+            return ids.build();
+        }
+
+        // Inverts the region-to-variables mapping into a mapping from each variable to the
+        // IDs of the partitions of the regions it includes.
+        private static ImmutableSetMultimap<String, String> mapVariablesToPartitionIds(
+                ImmutableMap<String, String> regionToPartitionId,
+                SortedSetMultimap<String, String> regionToVariables) {
+
+            // This must be a sorted multimap (of values as well as keys) because the values
+            // are later indexed and turned into partition strings, which requires the order
+            // of IDs in the values to be stable.
+            SortedSetMultimap<String, String> idsByVariable = TreeMultimap.create();
+            for (Map.Entry<String, String> entry : regionToVariables.entries()) {
+                String region = entry.getKey();
+                String variable = entry.getValue();
+                idsByVariable.put(variable, regionToPartitionId.get(region));
+            }
+            return ImmutableSetMultimap.copyOf(idsByVariable);
+        }
+
+        // Maps each macro region to the IDs of all the partitions which contain at least
+        // one of its leaf regions.
+        private static ImmutableSetMultimap<String, String> mapMacroRegionsToPartitionIds(
+                ImmutableMap<String, String> regionToPartitionId,
+                TerritoryContainment territories) {
+
+            // The inverse of the "region to partition ID" map, i.e. a mapping from each
+            // unique partition ID to the regions it contains. This forms a disjoint
+            // covering of all (non-macro) regions.
+            //
+            // For example, if:
+            //     X = {$americas, $enUS, ...}
+            // and:
+            //     Y = {$americas, $!enUS, ...}
+            //
+            // then this would contain something like:
+            //     X -> {"PR", "US", "VI", ...}
+            //     Y -> {"CA", ...}
+            Map<String, Collection<String>> regionsByPartition =
+                    regionToPartitionId.asMultimap().inverse().asMap();
+
+            // A macro region overlaps a partition if any of its leaf regions belongs to
+            // that partition.
+            SortedSetMultimap<String, String> partitionsByMacro = TreeMultimap.create();
+            for (String macro : territories.getMacroRegions()) {
+                ImmutableSet<String> leaves = territories.getLeafRegionsOf(macro);
+                for (Map.Entry<String, Collection<String>> entry : regionsByPartition.entrySet()) {
+                    boolean overlaps = !Collections.disjoint(leaves, entry.getValue());
+                    if (overlaps) {
+                        partitionsByMacro.put(macro, entry.getKey());
+                    }
+                }
+            }
+            return ImmutableSetMultimap.copyOf(partitionsByMacro);
+        }
+
+ // Fills the given lookup array (indexed by LSR region index) with partition group
+ // indices for all leaf and macro regions, and returns the partition groups as strings
+ // of concatenated partition IDs (in partition group index order, as reported by the
+ // indexer).
+ private static String[] writePartitionLookupTable(
+ byte[] partitionLookupArray,
+ ImmutableMap<String, String> regionToPartitionId,
+ ImmutableSetMultimap<String, String> macroRegionToPartitionIds) {
+
+ // A generator for indices of partition groups, based on partition IDs.
+ //
+ // For leaf regions this generates a one-to-one mapping with the single
+ // partition ID, but macro regions can overlap multiple partitions.
+ Indexer<Collection<String>, Byte> partitionGroupIndexer =
+ Indexer.create(i -> {
+ // The partition group index must fit in a byte.
+ // For Java code simplicity, we want it to also be non-negative.
+ // As of June 2020, there are 15 partition groups.
+ checkState(i <= 0x7f, "too many partition groups: %s", i);
+ return (byte) i.intValue();
+ });
+
+ // The default value in the partition lookup array (index 0) is mapped to by
+ // any unsupported region (since "LSR.indexForRegion(<invalid region>)" is 0).
+ // We must therefore reserve a special partition group index for these cases
+ // before adding the rest of the partitions.
+ partitionGroupIndexer.apply(ImmutableSet.of("."));
+
+ // Populate the radix-based sparse index array, where each region is converted
+ // to the LSR region index (which must correspond to how regions are indexed in
+ // the client side code).
+ BiConsumer<String, Collection<String>> writePartitionIndex =
+ (region, ids) -> partitionLookupArray[LSR.indexForRegion(region)] =
+ partitionGroupIndexer.apply(ids);
+
+ // Write leaf regions first (mostly to match the original code behaviour)
+ // and then macro regions.
+ //
+ // Convert the Map<String, String> to a Map<String, Collection<String>>
+ // to match the macro regions (even though each collection is a singleton).
+ regionToPartitionId.asMultimap().asMap().forEach(writePartitionIndex);
+ macroRegionToPartitionIds.asMap().forEach(writePartitionIndex);
+
+ // Check invalid regions will map to the special "missing partition" value.
+ checkState(partitionLookupArray[0] == 0);
+
+ // Return the unique partition groups (sets of partition IDs) as strings
+ // (as a sequence of single letter partition IDs). Leaf regions will always
+ // have a single partition ID, but macro regions can overlap with multiple
+ // partitions.
+ return partitionGroupIndexer.getValues().stream()
+ .map(ids -> String.join("", ids)).toArray(String[]::new);
+ }
+ }
+
+ /**
+ * Returns a builder to which variable mappings are added, from which partition
+ * information is derived.
+ *
+ * @param territories the territory containment data used by the builder to resolve
+ * macro regions to their leaf regions.
+ */
+ public static Builder builder(TerritoryContainment territories) {
+ return new Builder(territories);
+ }
+
+ private final ImmutableSetMultimap<String, String> variableToPartitionIds;
+ private final byte[] partitionLookupArray;
+ private final String[] partitionStrings;
+
+ // Creates the partition information produced by Builder.build(). Note that the given
+ // arrays are stored directly (they are not copied).
+ private PartitionInfo(
+ ImmutableSetMultimap<String, String> variableToPartitionIds,
+ byte[] partitionLookupArray,
+ String[] partitionStrings) {
+ this.variableToPartitionIds = ImmutableSetMultimap.copyOf(variableToPartitionIds);
+ this.partitionLookupArray = partitionLookupArray;
+ this.partitionStrings = partitionStrings;
+ }
+
+    /**
+     * Returns the set of partition IDs for the given variable, or {@code {"*"}} if the
+     * special '*' variable was given. The returned set is never empty, since a variable
+     * without any partition IDs is rejected as undefined.
+     */
+    public ImmutableSet<String> getPartitionIds(String variable) {
+        if (!variable.equals("*")) {
+            ImmutableSet<String> ids = variableToPartitionIds.get(variable);
+            checkArgument(!ids.isEmpty(), "variable not defined: %s", variable);
+            return ids;
+        }
+        return ImmutableSet.of("*");
+    }
+
+ /**
+ * Returns the sparse lookup array from LSR region index to partition group index.
+ * Note that the internal array is returned directly (it is not copied).
+ */
+ public byte[] getPartitionLookupArray() {
+ return partitionLookupArray;
+ }
+
+ /**
+ * Returns the partition group lookup array from partition group index to partition
+ * ID string. Note that the internal array is returned directly (it is not copied).
+ */
+ public String[] getPartitionStrings() {
+ return partitionStrings;
+ }
+}
--- /dev/null
+// © 2017 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package org.unicode.icu.tool.cldrtoicu.localedistance;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.collect.ImmutableSetMultimap.toImmutableSetMultimap;
+
+import java.util.Map.Entry;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import org.unicode.cldr.api.AttributeKey;
+import org.unicode.cldr.api.CldrData;
+import org.unicode.cldr.api.CldrPath;
+import org.unicode.cldr.api.PathMatcher;
+
+import com.google.common.base.Splitter;
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.ImmutableSetMultimap;
+import com.google.common.collect.SetMultimap;
+import com.google.common.collect.Sets;
+import com.google.common.collect.SortedSetMultimap;
+import com.google.common.collect.TreeMultimap;
+
+/**
+ * The graph of territory containment, built from CLDR supplemental data. It covers
+ * all territories and their containment, including macro regions such as
+ * {@code "016"}, and its root node is {@code "001"}.
+ */
+final class TerritoryContainment {
+    // CLDR paths for containment data.
+    private static final PathMatcher CONTAINMENT_PATH =
+            PathMatcher.of("//supplementalData/territoryContainment/group[@type=*]");
+    private static final AttributeKey TYPE = AttributeKey.keyOf("group", "type");
+    private static final AttributeKey CONTAINS = AttributeKey.keyOf("group", "contains");
+
+    // Standard CLDR list values are separated by spaces.
+    // NOTE: Empty strings must be omitted, since otherwise " foo " becomes ("", "foo", "").
+    private static final Splitter LIST_SPLITTER =
+            Splitter.on(' ').trimResults().omitEmptyStrings();
+    // The world region, which must be the only root in the graph.
+    private static final String WORLD = "001";
+    // Region codes are either two upper-case ASCII letters or three ASCII digits.
+    private static final Pattern REGION = Pattern.compile("[A-Z]{2}|[0-9]{3}");
+
+    /**
+     * Returns the territory containment information described by the given CLDR
+     * supplemental data.
+     */
+    public static TerritoryContainment getContainment(CldrData supplementalData) {
+        // A directed, acyclic graph mapping each territory to its direct contents. Since
+        // things like deprecated regions are included here, a sub-region can have more
+        // than one parent.
+        SortedSetMultimap<String, String> directContents = TreeMultimap.create();
+        supplementalData.accept(CldrData.PathOrder.DTD, value -> {
+            CldrPath valuePath = value.getPath();
+            if (CONTAINMENT_PATH.matches(valuePath)) {
+                String parent = value.get(TYPE);
+                directContents.putAll(parent, LIST_SPLITTER.split(value.get(CONTAINS)));
+            }
+        });
+        return new TerritoryContainment(ImmutableSetMultimap.copyOf(directContents));
+    }
+
+    /** Maps each macro-region to all its leaf contents (direct and indirect). */
+    private final ImmutableSetMultimap<String, String> macroToLeafRegions;
+
+    private TerritoryContainment(ImmutableSetMultimap<String, String> graph) {
+        // Sanity check the CLDR data.
+        for (String region : graph.values()) {
+            checkArgument(REGION.matcher(region).matches(), "bad region '%s' in: %s", region, graph);
+        }
+        checkArgument(graph.containsKey(WORLD), "missing world region '%s'", WORLD);
+        // The graph must have exactly one "root", so every region other than the world
+        // region must be contained by something.
+        Set<String> contained = ImmutableSet.copyOf(graph.values());
+        Set<String> roots = ImmutableSet.copyOf(Sets.difference(graph.keySet(), contained));
+        checkArgument(roots.equals(ImmutableSet.of(WORLD)),
+                "world region '%s' must be the only containment graph root (was %s)", WORLD, roots);
+
+        // Resolution starts from a mutable copy of the direct containment graph, while
+        // the (immutable) direct graph is the one being iterated, which avoids issues
+        // with concurrent modification.
+        // Note that a cyclic graph would make this recursion never terminate (which is
+        // probably fine for a build-time tool).
+        SortedSetMultimap<String, String> resolved = TreeMultimap.create(graph);
+        resolve(WORLD, graph, resolved);
+        // Keep only the entries which map a macro region (something with child regions)
+        // to a leaf region (something without child regions).
+        this.macroToLeafRegions = resolved.entries().stream()
+                .filter(e -> !graph.get(e.getKey()).isEmpty() && graph.get(e.getValue()).isEmpty())
+                .collect(toImmutableSetMultimap(Entry::getKey, Entry::getValue));
+    }
+
+    // Recursively resolves the given region, adding the resolved contents of each of its
+    // child regions to its own contents.
+    private static Set<String> resolve(
+            String region, SetMultimap<String, String> graph, SetMultimap<String, String> resolved) {
+        for (String child : graph.get(region)) {
+            resolved.putAll(region, resolve(child, graph, resolved));
+        }
+        return resolved.get(region);
+    }
+
+    /**
+     * Returns the leaf regions contained in the given region (the empty set is returned
+     * if the given region is itself a leaf region).
+     */
+    public ImmutableSet<String> getLeafRegionsOf(String region) {
+        return macroToLeafRegions.get(region);
+    }
+
+    /** Returns all leaf regions (i.e. the leaf regions of the world region). */
+    public ImmutableSet<String> getLeafRegions() {
+        return macroToLeafRegions.get(WORLD);
+    }
+
+    /** Returns all macro regions. */
+    public ImmutableSet<String> getMacroRegions() {
+        return macroToLeafRegions.keySet();
+    }
+}
--- /dev/null
+// © 2017 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package org.unicode.icu.tool.cldrtoicu.localedistance;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkState;
+
+import java.nio.ByteBuffer;
+import java.util.function.Consumer;
+
+import com.ibm.icu.impl.locale.LocaleDistance;
+import com.ibm.icu.util.BytesTrieBuilder;
+
+/**
+ * Trie constructed by adding "spans" of data representing prefix
+ * sequences with mapped values.
+ *
+ * <p>When a prefix needs to be added to a Trie, a new span is created to
+ * represent the additional data. If a final value is added to a span, then
+ * the current prefix data is committed to the underlying Trie as its key.
+ *
+ * <p>Typical use might look like:
+ * <pre>{@code
+ * Trie trie = new Trie();
+ * mappedData.forEach(
+ *     (prefix, subValues) -> trie.root().with(prefix, subSpan -> process(subSpan, subValues)));
+ * byte[] bytes = trie.toByteArray();
+ * }</pre>
+ * where the {@code process} method may create more sub-spans, and eventually
+ * calls {@link Span#putPrefixAndValue(int)} to commit the current sequence
+ * of prefixes and the given value to the Trie.
+ *
+ * <p>Since spans share a common buffer for prefix data, it is important
+ * that extended spans are consumed before the parent span is used again.
+ * This is one reason why the API requires a consumer to be passed when a
+ * span is extended.
+ */
+final class Trie {
+ private final BytesTrieBuilder trieBuilder = new BytesTrieBuilder();
+ // The buffer for prefix data which is shared by all spans of this Trie. Its length
+ // limits the total length of any sequence of prefixes (see the Span constructor).
+ private final byte[] spanBytes = new byte[24];
+
+ /**
+ * Represents a sequence of prefixes to be added to the underlying Trie
+ * when a value is specified.
+ *
+ * <p>The position of a span cannot be modified, but they are not thread
+ * safe (since they share the same underlying buffer).
+ */
+ final class Span {
+ // The index *after* the last prefix was added.
+ private final int index;
+
+ // The root span.
+ private Span() {
+ this.index = 0;
+ }
+
+ // An extended span with the given prefix included. The prefix is written to the
+ // shared buffer starting at the given index (overwriting any data from previously
+ // created spans at or after that position).
+ private Span(int index, String prefix) {
+ checkArgument(index >= 0, "bad index: %s", index);
+ checkState(!prefix.isEmpty(), "invalid subtag: %s", prefix);
+ checkState(index + prefix.length() <= spanBytes.length, "span too long");
+ if (prefix.equals("*")) {
+ // The wildcard is written as-is (without the terminator bits set below).
+ spanBytes[index++] = '*';
+ } else {
+ checkArgument(!prefix.contains("*"), "prefix must not contain '*': %s", prefix);
+ for (int i = 0; i < prefix.length(); i++) {
+ char c = prefix.charAt(i);
+ checkArgument(c < LocaleDistance.END_OF_SUBTAG, "invalid trie character: %s", c);
+ spanBytes[index++] = (byte) c;
+ }
+ // Mark the final character as a terminator to avoid overlap matches.
+ spanBytes[index - 1] |= (byte) LocaleDistance.END_OF_SUBTAG;
+ }
+ this.index = index;
+ }
+
+ /**
+ * Extends the current span by creating a new span with the given ASCII
+ * prefix data, and passing it to the given consumer. The original span is
+ * not modified, but must not be used again until the consumer is finished.
+ *
+ * <p>The prefix string must contain only 7-bit ASCII characters.
+ */
+ public void with(String prefix, Consumer<Span> withFn) {
+ withFn.accept(new Span(index, prefix));
+ }
+
+ /**
+ * Commits the current prefix data and the given value to the underlying Trie.
+ * The value must not be negative, and this must not be called on the root span
+ * (which has no prefix data).
+ */
+ public void putPrefixAndValue(int value) {
+ checkArgument(value >= 0, "bad trie value: %s", value);
+ checkState(index > 0, "missing prefix for value: %s", value);
+ trieBuilder.add(spanBytes, index, value);
+ }
+ }
+
+ /** Returns the root span with no current prefix data. */
+ public Span root() {
+ return new Span();
+ }
+
+ /** Serializes the underlying Trie data to a byte array (see also {@link BytesTrieBuilder}). */
+ public byte[] toByteArray() {
+ ByteBuffer buffer = trieBuilder.buildByteBuffer(BytesTrieBuilder.Option.SMALL);
+ // Copy out only the bytes remaining in the returned buffer.
+ byte[] bytes = new byte[buffer.remaining()];
+ buffer.get(bytes);
+ return bytes;
+ }
+}
final void addIcuData(IcuData icuData) {
// This subclass mostly exists to control the fact that results need to be added in one go
// to the IcuData because of how referenced paths are handled. If results could be added in
- // multiple passes, you could have confusing situations in which values has path references
- // in them but the referenced paths have not been transformed yet. Forcing the subclass to
+ // multiple passes, you could have confusing situations in which values have path references
+ // in them, but the referenced paths have not been transformed yet. Forcing the subclass to
// implement a single method to generate all results at once ensures that we control the
// lifecycle of the data and how results are processed as they are added to the IcuData.
checkState(resultsByRbPath.isEmpty(),
--- /dev/null
+// © 2020 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package org.unicode.icu.tool.cldrtoicu.localedistance;
+
+import static com.google.common.truth.Truth.assertThat;
+import static org.unicode.icu.tool.cldrtoicu.testing.AssertUtils.assertThrows;
+
+import org.junit.Test;
+
+import com.google.common.collect.ImmutableMap;
+import com.ibm.icu.util.BytesTrie;
+
+/**
+ * Tests for {@link DistanceTable} and its builder, checking which mappings are present in
+ * the serialized trie, what the default distance array contains, and which errors are
+ * reported for invalid input.
+ *
+ * <p>NOTE: Remember that here, "region" is synonymous with a "partition group ID".
+ */
+public class DistanceTableTest {
+    // A single explicit mapping is the only trie entry; the defaults registered by
+    // defaultTable() only show up in the default distance array.
+    @Test
+    public void testSimpleMapping() {
+        DistanceTable.Builder builder = defaultTable();
+        // You need at least one non default mapping.
+        builder.addDistance(23, true, "en", "en");
+        DistanceTable table = builder.build();
+        assertThat(getTrieTable(table)).containsExactly("en-en", 23);
+        assertThat(table.getDefaultDistances()).asList().containsExactly(80, 50, 4, 4).inOrder();
+    }
+
+    // When the boolean flag is false (presumably "one way" - confirm against the builder),
+    // the mapping is expected in the trie in both directions.
+    @Test
+    public void testReverseMapping() {
+        DistanceTable.Builder builder = defaultTable();
+        // You need at least one non default mapping.
+        builder.addDistance(1, false, "no", "nb");
+        DistanceTable table = builder.build();
+        assertThat(getTrieTable(table))
+            .containsExactly(
+                "nb-no", 1,
+                "no-nb", 1)
+            .inOrder();
+    }
+
+    // The smallest explicit region distance is expected as the last element of the
+    // default distance array.
+    @Test
+    public void testMinRegionDistance() {
+        DistanceTable.Builder builder = defaultTable();
+        // You need at least one non default mapping.
+        builder.addDistance(2, true, "zh", "zh", "Hant", "Hant", "1", "1");
+        builder.addDistance(4, true, "zh", "zh", "Hant", "Hant", "2", "2");
+        builder.addDistance(6, true, "zh", "zh", "Hant", "Hant", "*", "*");
+        DistanceTable table = builder.build();
+        assertThat(getTrieTable(table))
+            .containsExactly(
+                // Inferred mappings for "parent" locales.
+                "zh-zh", 0, // Equal locales have zero distance.
+                "zh-zh-*-*", 50, // Default unknown script distance
+                "zh-zh-Hant-Hant", 0,
+                // Trie ordering prefers "*" mapping at the front.
+                "zh-zh-Hant-Hant-*-*", 6,
+                "zh-zh-Hant-Hant-1-1", 2,
+                "zh-zh-Hant-Hant-2-2", 4)
+            .inOrder();
+        // Minimum region distance is recorded successfully (last value).
+        assertThat(table.getDefaultDistances()).asList().containsExactly(80, 50, 4, 2).inOrder();
+    }
+
+    // Mappings which only use "*" for the script are expected to collapse to
+    // "language-region" keys, with the language entry carrying the marker value 128.
+    @Test
+    public void testSkipScript() {
+        DistanceTable.Builder builder = defaultTable();
+        // You need at least one non default mapping.
+        builder.addDistance(2, true, "en", "en", "*", "*", "1", "1");
+        builder.addDistance(4, true, "en", "en", "*", "*", "*", "*");
+        DistanceTable table = builder.build();
+        assertThat(getTrieTable(table))
+            .containsExactly(
+                // "en-en" is marked for "skip script" so the remaining "en-en-..."
+                // mappings are correctly interpreted as "language-region".
+                "en-en", 128,
+                "en-en-*-*", 4,
+                "en-en-1-1", 2)
+            .inOrder();
+    }
+
+    // Adding the same mapping twice keeps the distance of the first call.
+    @Test
+    public void testFirstOneWins() {
+        DistanceTable.Builder builder = defaultTable();
+        // Duplicate mappings are only expected for "region" where different rules can
+        // produce duplicate mappings by virtue of having non-disjoint region partitions.
+        builder.addDistance(2, true, "en", "en", "*", "*", "1", "1");
+        builder.addDistance(4, true, "en", "en", "*", "*", "1", "1"); // ignored
+        builder.addDistance(6, true, "en", "en", "*", "*", "*", "*");
+        DistanceTable table = builder.build();
+        assertThat(getTrieTable(table))
+            .containsExactly(
+                "en-en", 128,
+                "en-en-*-*", 6,
+                "en-en-1-1", 2)
+            .inOrder();
+    }
+
+    // A distance of 123 is rejected, and the error message mentions the bad value.
+    @Test
+    public void testBadDistance() {
+        IllegalArgumentException e = assertThrows(
+            IllegalArgumentException.class,
+            () -> defaultTable().addDistance(123, true, "en", "fr"));
+        assertThat(e).hasMessageThat().contains("distance");
+        assertThat(e).hasMessageThat().contains("123");
+    }
+
+    // An odd number of subtag arguments (three here) is rejected.
+    @Test
+    public void testBadParameters() {
+        IllegalArgumentException e = assertThrows(
+            IllegalArgumentException.class,
+            () -> defaultTable().addDistance(1, true, "en", "en", "*"));
+        assertThat(e).hasMessageThat().contains("invalid number of arguments");
+    }
+
+    // Pairing a real subtag ("en") with the "*" wildcard is rejected.
+    @Test
+    public void testBadKeys() {
+        IllegalArgumentException e = assertThrows(
+            IllegalArgumentException.class,
+            () -> defaultTable().addDistance(1, true, "en", "*"));
+        assertThat(e).hasMessageThat().contains("invalid mapping key");
+        assertThat(e).hasMessageThat().contains("en");
+        // NOTE(review): the literal below is U+FFFD as it appears in this patch; confirm it
+        // is the character the builder really reports for "*" and not an encoding artifact.
+        assertThat(e).hasMessageThat().contains("�");
+    }
+
+    // Returns a new builder which already has the three default "*" mappings
+    // (language, script and region) registered.
+    private static DistanceTable.Builder defaultTable() {
+        DistanceTable.Builder table = DistanceTable.builder();
+        // Defaults (which are necessary to add, but should always be trimmed from results).
+        // The actual distances don't matter (and are copied to the distance array).
+        table.addDistance(80, false, "*", "*");
+        table.addDistance(50, false, "*", "*", "*", "*");
+        table.addDistance(4, false, "*", "*", "*", "*", "*", "*");
+        return table;
+    }
+
+    // Building without any default mapping fails on the missing language default.
+    @Test
+    public void testNoDefaultLanguage() {
+        // Don't get the default table, since we need to test without defaults.
+        DistanceTable.Builder builder = DistanceTable.builder();
+        IllegalStateException e = assertThrows(IllegalStateException.class, builder::build);
+        assertThat(e).hasMessageThat().contains("missing default language");
+    }
+
+    // Building with only the language default fails on the missing script default.
+    @Test
+    public void testNoDefaultScript() {
+        // Don't get the default table, since we need to test without defaults.
+        DistanceTable.Builder builder = DistanceTable.builder();
+        builder.addDistance(80, false, "*", "*");
+        IllegalStateException e = assertThrows(IllegalStateException.class, builder::build);
+        assertThat(e).hasMessageThat().contains("missing default script");
+    }
+
+    // Building with language and script defaults fails on the missing region default.
+    @Test
+    public void testNoDefaultRegion() {
+        // Don't get the default table, since we need to test without defaults.
+        DistanceTable.Builder builder = DistanceTable.builder();
+        builder.addDistance(80, false, "*", "*");
+        builder.addDistance(50, false, "*", "*", "*", "*");
+        IllegalStateException e = assertThrows(IllegalStateException.class, builder::build);
+        assertThat(e).hasMessageThat().contains("missing default region");
+    }
+
+    // VisibleForTesting
+    // Decodes the table's trie into a (key -> distance) map via the shared TestData helper,
+    // passing "*-*" as the text used for the trie wildcard.
+    public ImmutableMap<String, Integer> getTrieTable(DistanceTable table) {
+        // We rebuild the Trie from the byte[] data.
+        return TestData.getTrieTable(new BytesTrie(table.getTrie().toByteArray(), 0), "*-*", i -> i);
+    }
+}
--- /dev/null
+// © 2020 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package org.unicode.icu.tool.cldrtoicu.localedistance;
+
+import static com.google.common.truth.Truth.assertThat;
+
+import org.junit.Test;
+
+import com.google.common.collect.ImmutableList;
+
+/**
+ * Tests for {@link Indexer}, checking that values are assigned indices in the order in which
+ * they are first seen, and that repeated values get the same result again.
+ */
+public class IndexerTest {
+    // Each new value gets the next index (starting at zero); a repeated value gets the
+    // index it was given the first time.
+    @Test
+    public void testSimple() {
+        Indexer<String, Integer> indexer = Indexer.create();
+        assertThat(indexer.apply("foo")).isEqualTo(0);
+        assertThat(indexer.apply("bar")).isEqualTo(1);
+        assertThat(indexer.apply("baz")).isEqualTo(2);
+        assertThat(indexer.apply("foo")).isEqualTo(0);
+    }
+
+    // With a transform function, the result is the transformed index (here, the word at
+    // that position in the list) rather than the index itself.
+    @Test
+    public void testWithTransform() {
+        ImmutableList<String> words = ImmutableList.of("ONE", "TWO", "THREE");
+        Indexer<String, String> indexer = Indexer.create(words::get);
+        assertThat(indexer.apply("foo")).isEqualTo("ONE");
+        assertThat(indexer.apply("bar")).isEqualTo("TWO");
+        assertThat(indexer.apply("baz")).isEqualTo("THREE");
+        assertThat(indexer.apply("foo")).isEqualTo("ONE");
+
+    }
+
+    // The indexed values are reported once each, in first-seen order.
+    @Test
+    public void getValues() {
+        Indexer<String, Integer> indexer = Indexer.create();
+        indexer.apply("foo");
+        indexer.apply("bar");
+        indexer.apply("baz");
+        indexer.apply("bar");
+        assertThat(indexer.getValues()).containsExactly("foo", "bar", "baz").inOrder();
+    }
+}
--- /dev/null
+// © 2020 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package org.unicode.icu.tool.cldrtoicu.localedistance;
+
+import static com.google.common.truth.Truth.assertThat;
+import static java.util.Arrays.asList;
+import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.AliasReason.DEPRECATED;
+import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.AliasReason.LEGACY;
+import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.AliasReason.OVERLONG;
+import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.AliasType.LANGUAGE;
+import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.AliasType.TERRITORY;
+import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.alias;
+import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.likelySubtag;
+import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.lsr;
+
+import org.junit.Test;
+import org.unicode.cldr.api.CldrData;
+import org.unicode.cldr.api.CldrDataSupplier;
+import org.unicode.cldr.api.CldrValue;
+
+import com.google.common.collect.ImmutableMap;
+import com.ibm.icu.impl.locale.LSR;
+import com.ibm.icu.impl.locale.XLikelySubtags;
+import com.ibm.icu.util.BytesTrie;
+
+/**
+ * Tests for {@link LikelySubtagsBuilder}, checking the language and region alias tables,
+ * the LSR list and the lookup trie which are built from a small set of CLDR test values.
+ */
+public class LikelySubtagsBuilderTest {
+
+    // Only simple (language-only) aliases which are not "overlong" are expected in the
+    // language alias table.
+    @Test
+    public void testLanguageAliases() {
+        XLikelySubtags.Data subtags = LikelySubtagsBuilder.build(getTestData(
+            // Minimum mapping (or else code complains).
+            likelySubtag("und", "en_Latn_US"),
+
+            alias(LANGUAGE, DEPRECATED, "in", "id"),
+            alias(LANGUAGE, DEPRECATED, "mo", "ro"),
+            // Overlong languages are ignored.
+            alias(LANGUAGE, OVERLONG, "eng", "en"),
+            // Non-simple languages with script, region or other extensions are ignored.
+            alias(LANGUAGE, LEGACY, "zh_TW", "zh_Hant_TW"),
+            alias(LANGUAGE, LEGACY, "i-default", "en-x-i-default")));
+
+        assertThat(subtags.languageAliases).containsExactly("in", "id", "mo", "ro");
+    }
+
+    // Territory aliases keep only the first replacement, and "overlong" aliases are
+    // expected to be absent from the region alias table.
+    @Test
+    public void testTerritoryAliases() {
+        XLikelySubtags.Data subtags = LikelySubtagsBuilder.build(getTestData(
+            // Minimum mapping (or else code complains).
+            likelySubtag("und", "en_Latn_US"),
+
+            // When more than one replacement exists, take the first.
+            alias(TERRITORY, DEPRECATED, "CS", "RS ME"),
+            alias(TERRITORY, DEPRECATED, "UK", "GB"),
+            // Overlong territories are ignored.
+            alias(TERRITORY, OVERLONG, "eng", "en"),
+            alias(TERRITORY, OVERLONG, "999", "ZZ")));
+
+        assertThat(subtags.regionAliases).containsExactly("CS", "RS", "UK", "GB");
+    }
+
+    // Checks both the list of distinct target locales (LSRs) and the trie which maps
+    // subtag keys onto entries of that list.
+    @Test
+    public void testLikelySubtags() {
+        XLikelySubtags.Data subtags = LikelySubtagsBuilder.build(getTestData(
+            likelySubtag("und", "en_Latn_US"),
+            likelySubtag("en", "en_Latn_US"),
+            likelySubtag("pt", "pt_Latn_BR"),
+            likelySubtag("und_BR", "pt_Latn_BR"),
+            likelySubtag("zh", "zh_Hans_CN"),
+            likelySubtag("zh_TW", "zh_Hant_TW"),
+            likelySubtag("zh_Hant", "zh_Hant_TW")));
+
+        assertThat(subtags.lsrs).asList()
+            .containsExactly(
+                // Special cases (these should never change).
+                lsr(""),
+                lsr("skip-script"),
+                // Locales mapped to by the likely subtags mappings (in order).
+                lsr("en-Latn-US"),
+                lsr("pt-Latn-BR"),
+                lsr("zh-Hans-CN"),
+                lsr("zh-Hant-TW"))
+            .inOrder();
+
+        // Order is by "subtag" (left-to-right) with lexicographical order of tags (other
+        // than '*' which is always sorted first).
+        // Results are mapped to their corresponding value in the LSRs list.
+        assertThat(getTrieTable(subtags))
+            .containsExactly(
+                "*-*-*", lsr("en-Latn-US"),
+                "*-*-BR", lsr("pt-Latn-BR"),
+                "*-Latn-*", lsr("en-Latn-US"),
+                "*-Latn-BR", lsr("pt-Latn-BR"),
+                "*-Latn-US", lsr("en-Latn-US"),
+                "en", lsr("en-Latn-US"),
+                "pt", lsr("pt-Latn-BR"),
+                "zh-*-*", lsr("zh-Hans-CN"),
+                "zh-*-TW", lsr("zh-Hant-TW"),
+                "zh-Hant", lsr("zh-Hant-TW"))
+            .inOrder();
+    }
+
+    // Decodes the serialized trie into a (key -> LSR) map, using each trie value as an
+    // index into the LSR array of the same data instance.
+    private static ImmutableMap<String, LSR> getTrieTable(XLikelySubtags.Data subtags) {
+        // We rebuild the Trie from the byte[] data.
+        return TestData.getTrieTable(new BytesTrie(subtags.trie, 0), "*", i -> subtags.lsrs[i]);
+    }
+
+    // Wraps the given values as a CldrData instance for the builder under test.
+    private static CldrData getTestData(CldrValue... values) {
+        return CldrDataSupplier.forValues(asList(values));
+    }
+}
--- /dev/null
+// © 2020 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package org.unicode.icu.tool.cldrtoicu.localedistance;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.truth.Truth.assertThat;
+import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.AliasReason.DEPRECATED;
+import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.AliasReason.LEGACY;
+import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.AliasReason.MACRO;
+import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.AliasType.LANGUAGE;
+import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.AliasType.TERRITORY;
+import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.alias;
+import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.cldrData;
+import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.deprecatedTerritory;
+import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.languageMatch;
+import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.likelySubtag;
+import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.matchVariable;
+import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.paradigms;
+import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.territoryGroup;
+import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.territoryGrouping;
+import static org.unicode.icu.tool.cldrtoicu.testing.IcuDataSubjectFactory.assertThat;
+
+import java.io.ByteArrayOutputStream;
+import java.util.List;
+
+import org.junit.Test;
+import org.unicode.cldr.api.CldrData;
+import org.unicode.icu.tool.cldrtoicu.IcuData;
+import org.unicode.icu.tool.cldrtoicu.RbPath;
+import org.unicode.icu.tool.cldrtoicu.RbValue;
+
+import com.google.common.base.CharMatcher;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.ImmutableSetMultimap;
+import com.ibm.icu.impl.locale.LSR;
+import com.ibm.icu.util.BytesTrie;
+
+/**
+ * Higher level tests for {@link LocaleDistanceMapper} to demonstrate that CLDR values
+ * are matched and processed, and the IcuData is written as expected.
+ *
+ * <p>Most of the separate parts which make up this mapper are already tested at a
+ * lower level in the other tests in this package.
+ */
+public class LocaleDistanceMapperTest {
+    // Runs the mapper over a small but representative CLDR data set and checks every
+    // resource it writes: aliases, LSRs, the likely subtags trie, paradigms, partitions,
+    // the match trie and the default distances.
+    @Test
+    public void testEndToEnd() {
+        // Language match elements are ordered, so need an incrementing sort index.
+        int idx = 0;
+
+        // A representative subset of CLDR data needed to generate the locale distance.
+        // This focuses on two distinct cases:
+        // 1: American vs non-American and British English
+        // This demonstrates the way that special case mappings are handled.
+        // 2: Chinese, Simplified and Traditional
+        // This demonstrates languages with multiple scripts.
+        CldrData testData = cldrData(
+            paradigms("en", "en_GB", "es", "es_419"),
+            matchVariable("$enUS", "PR+US+VI"),
+            matchVariable("$cnsar", "HK+MO"),
+
+            // The <languageMatch> element is marked "ORDERED" in the DTD, so
+            // ordering of match rules can affect output (when paths are
+            // otherwise equal). DTD ordering will not re-order this data.
+            languageMatch("yue", "zh", 10, true, ++idx),
+            languageMatch("*", "*", 80, false, ++idx),
+
+            languageMatch("zh_Hans", "zh_Hant", 15, true, ++idx),
+            languageMatch("zh_Hant", "zh_Hans", 19, true, ++idx),
+            languageMatch("zh_Latn", "zh_Hans", 20, true, ++idx),
+            languageMatch("*_*", "*_*", 50, false, ++idx),
+
+            languageMatch("en_*_$enUS", "en_*_$enUS", 4, false, ++idx),
+            languageMatch("en_*_$!enUS", "en_*_GB", 3, false, ++idx),
+            languageMatch("en_*_$!enUS", "en_*_$!enUS", 4, false, ++idx),
+            languageMatch("en_*_*", "en_*_*", 5, false, ++idx),
+
+            languageMatch("zh_Hant_$cnsar", "zh_Hant_$cnsar", 4, false, ++idx),
+            languageMatch("zh_Hant_$!cnsar", "zh_Hant_$!cnsar", 4, false, ++idx),
+            languageMatch("zh_Hant_*", "zh_Hant_*", 5, false, ++idx),
+            languageMatch("*_*_*", "*_*_*", 4, false, ++idx),
+
+            // NOTE: This is deliberately NOT in DTD order to demonstrate that the
+            // mapper will reorder these (putting "und" last), which means that the
+            // ICU data here is NOT affected by changes in the likely subtag order.
+            likelySubtag("und", "en_Latn_US"),
+            likelySubtag("und_HK", "zh_Hant_HK"),
+            likelySubtag("und_MO", "zh_Hant_MO"),
+            likelySubtag("und_TW", "zh_Hant_TW"),
+            likelySubtag("und_030", "zh_Hans_CN"),
+            likelySubtag("und_142", "zh_Hans_CN"),
+            likelySubtag("und_CN", "zh_Hans_CN"),
+            likelySubtag("und_Hans", "zh_Hans_CN"),
+            likelySubtag("und_Hant", "zh_Hant_TW"),
+            likelySubtag("zh", "zh_Hans_CN"),
+            likelySubtag("zh_Hant", "zh_Hant_TW"),
+            likelySubtag("zh_TW", "zh_Hant_TW"),
+
+            // NOT in DTD order (to demonstrate order invariance later).
+            alias(LANGUAGE, LEGACY, "zh_SG", "zh_Hans_SG"),
+            alias(LANGUAGE, LEGACY, "zh_HK", "zh_Hant_HK"),
+            alias(LANGUAGE, LEGACY, "zh_TW", "zh_Hant_TW"),
+            alias(LANGUAGE, LEGACY, "zh_MO", "zh_Hant_MO"),
+            alias(LANGUAGE, LEGACY, "zh_CN", "zh_Hans_CN"),
+            alias(LANGUAGE, MACRO, "cmn", "zh"),
+
+            // NOT in DTD order (to demonstrate order invariance later).
+            alias(TERRITORY, DEPRECATED, "UK", "GB"),
+            alias(TERRITORY, DEPRECATED, "AN", "CW", "SX", "BQ"),
+
+            // Rather trimmed down containment hierarchy. It still retains macro
+            // regions and grouping to demonstrate that these work as expected.
+            territoryGroup("001", "019", "142", "150"), // World
+            territoryGrouping("001", "EU"),
+            territoryGroup("019", "021", "419"), // Americas
+            territoryGroup("142", "030", "035"), // Asia
+            territoryGroup("150", "154", "155"), // Europe
+            territoryGrouping("EU", "DE", "FR", "IE"), // European Union (no CH or GB)
+            territoryGroup("021", "CA", "PM", "US"), // Northern America
+            territoryGroup("419", "013", "029"), // Latin America and the Caribbean
+            territoryGroup("030", "CN", "HK", "MO", "TW"), // Eastern Asia
+            territoryGroup("035", "PH", "SG", "TH", "VN"), // South-Eastern Asia
+            territoryGroup("154", "GB", "IE"), // Northern Europe
+            territoryGroup("155", "CH", "DE", "FR"), // Western Europe
+            territoryGroup("013", "CR", "MX", "PA"), // Central America
+            territoryGroup("029", "BQ", "CW", "PR", "SX", "VI"), // Caribbean
+            deprecatedTerritory("029", "AN")); // Antilles (=> BQ, CW, SX)
+
+        IcuData icuData = LocaleDistanceMapper.process(testData);
+        // Aliases come in (deprecated, replacement) pairs.
+        assertThat(icuData).hasValuesFor("likely/languageAliases", "cmn", "zh");
+        assertThat(icuData).hasValuesFor("likely/regionAliases", "AN", "CW", "UK", "GB");
+
+        // LSR values come in (language, script, region) tuples. They are the mapped-to
+        // values for the likely subtag mappings, ordered by the DTD order in which the
+        // mapping keys were encountered.
+        assertThat(icuData).hasValuesFor("likely/lsrs",
+            "", "", "",
+            "skip", "script", "",
+            "zh", "Hans", "CN",
+            "zh", "Hant", "TW",
+            "en", "Latn", "US",
+            "zh", "Hant", "HK",
+            "zh", "Hant", "MO");
+
+        // It's a bit easier to see how match keys are grouped against the partitions.
+        ImmutableSetMultimap<Integer, String> likelyTrie =
+            getTrieMap(icuData, "likely/trie:bin", "*").asMultimap().inverse();
+
+        // Special values in the lookup table don't map from any locales directly.
+        assertThat(likelyTrie).valuesForKey(0).isEmpty();
+        assertThat(likelyTrie).valuesForKey(1).isEmpty();
+
+        // Index 4: en-Latn-US (the general default and default for Latn).
+        assertThat(likelyTrie).valuesForKey(4).containsExactly("*-Latn-*", "*-Latn-US", "*-*-*");
+
+        // Index 2: zh-Hans-CN (default for zh, Hans and CN separately).
+        assertThat(likelyTrie).valuesForKey(2).containsExactly(
+            "*-*-030", "*-*-142", // macro regions
+            "*-*-CN", "*-Hans-*", "*-Hans-CN", // unknown language match
+            "cmn-*-*", // language alias
+            "zh-*-*"); // default for language
+
+        // Index 3: zh-Hant-TW (default for zh if Hant or TW is given).
+        assertThat(likelyTrie).valuesForKey(3).containsExactly(
+            "*-*-TW", "*-Hant-*", "*-Hant-TW", // unknown language match
+            "cmn-*-TW", "cmn-Hant", // language alias with specific script/region
+            "zh-*-TW", "zh-Hant"); // default for script/region
+
+        // Other zh languages (zh-Hant-HK, zh-Hant-MO) require an explicit region match.
+        assertThat(likelyTrie).valuesForKey(5).containsExactly("*-*-HK", "*-Hant-HK");
+        assertThat(likelyTrie).valuesForKey(6).containsExactly("*-*-MO", "*-Hant-MO");
+
+        // Pairs of expanded paradigm locales (using LSR tuples) in declaration order.
+        // This is just the list from the CLDR data with no processing.
+        assertThat(icuData).hasValuesFor("match/paradigms",
+            "en", "Latn", "US",
+            "en", "Latn", "GB",
+            "es", "Latn", "ES",
+            "es", "Latn", "419");
+
+        // See PartitionInfoTest for a description of the ordering of these strings.
+        assertThat(icuData).hasValuesFor("match/partitions",
+            ".", "0", "1", "2", "3", "0123", "03", "02", "01");
+
+        ImmutableMap<String, Integer> matchTrie = getTrieMap(icuData, "match/trie:bin", "*-*");
+        byte[] regionLookup = getBytes(icuData, "match/regionToPartitions:bin");
+        ImmutableList<String> partitions =
+            icuData.get(RbPath.parse("match/partitions")).get(0).getElements();
+
+        // Test defaults have been trimmed.
+        assertThat(matchTrie).doesNotContainKey("*-*");
+        assertThat(matchTrie).doesNotContainKey("*-*-*-*");
+        assertThat(matchTrie).doesNotContainKey("*-*-*-*-*-*");
+
+        // Some zh specific tests.
+        assertThat(matchTrie).containsEntry("yue-zh", 10); // Encapsulated language
+        assertThat(matchTrie).containsEntry("zh-zh-Hant-Hant-*-*", 5);
+
+        // Special marker that means "en-en" matches don't use script information.
+        // This is assumed in the distance tests below, so it's important to check.
+        assertThat(matchTrie).containsEntry("en-en", 128);
+
+        // British English is a slightly better match against non-American English.
+        assertEnDistanceForRegions(matchTrie, regionLookup, partitions, "CA", "GB", 3);
+        assertEnDistanceForRegions(matchTrie, regionLookup, partitions, "GB", "GB", 3);
+        // "EU" works here because while it's a macro region, in this data it only
+        // covers a single partition.
+        assertEnDistanceForRegions(matchTrie, regionLookup, partitions, "GB", "EU", 3);
+
+        // Pairs of non-American or American English languages get a larger distance.
+        assertEnDistanceForRegions(matchTrie, regionLookup, partitions, "CA", "DE", 4);
+        assertEnDistanceForRegions(matchTrie, regionLookup, partitions, "US", "PR", 4);
+        // Deprecated regions (AN) are still mapped to partitions and get real distances.
+        assertEnDistanceForRegions(matchTrie, regionLookup, partitions, "AN", "TW", 4);
+
+        // Mixing American and non-American English gets the default "en-en-*-*" distance.
+        assertEnDistanceForRegions(matchTrie, regionLookup, partitions, "GB", "US", 5);
+        assertEnDistanceForRegions(matchTrie, regionLookup, partitions, "CA", "US", 5);
+        assertEnDistanceForRegions(matchTrie, regionLookup, partitions, "US", "AN", 5);
+
+        // Default distances for language, script and region, plus minimum region distance.
+        // Minimum region distance is "en_*_$!enUS" -> "en_*_GB" (as seen above).
+        assertThat(icuData).hasValuesFor("match/distances:intvector", "80", "50", "4", "3");
+    }
+
+    // Helper to make assertions about language distance a bit more readable.
+    // PartitionInfoTest includes more low level tests for precise ordering etc.
+    // It asserts the distance stored for the "en-en-<partitionA>-<partitionB>" key, or for
+    // the "en-en-*-*" fallback key when the trie has no entry for that specific pair.
+    // NOTE(review): the parameter name "paritions" looks like a typo for "partitions"; it
+    // is left unchanged here because renaming it is a code change.
+    private static void assertEnDistanceForRegions(
+        ImmutableMap<String, Integer> matchTrie,
+        byte[] regionLookup,
+        ImmutableList<String> paritions,
+        String regionA, String regionB,
+        int distance) {
+        // Three step lookup for each region:
+        // 1: Find LSR index from region string.
+        // 2: Lookup partition group index from region lookup table.
+        // 3: Lookup partition group string from partitions table.
+        String partitionA = paritions.get(regionLookup[LSR.indexForRegion(regionA)]);
+        String partitionB = paritions.get(regionLookup[LSR.indexForRegion(regionB)]);
+
+        // For now only support cases where there's a single partition ID associated
+        // with the region (this is all non-macro regions and *some* macro regions).
+        checkArgument(partitionA.length() == 1 && partitionB.length() == 1,
+            "multiple partitions unsupported in test: %s %s", regionA, regionB);
+
+        // This is a depth 2 key because we know that "en" skips scripts. This will
+        // not work the same for "zh" because that needs scripts information.
+        String key = String.format("en-en-%s-%s", partitionA, partitionB);
+        if (matchTrie.containsKey(key)) {
+            assertThat(matchTrie).containsEntry(key, distance);
+        } else {
+            assertThat(matchTrie).containsEntry("en-en-*-*", distance);
+        }
+    }
+
+    // Returns the mapping for a Trie from a ":bin" suffixed resource value.
+    // "star" defines what the Trie wildcard should be expanded to (for readability).
+    private static ImmutableMap<String, Integer> getTrieMap(IcuData icuData, String path, String star) {
+        return TestData.getTrieTable(getTrie(icuData, path), star, i -> i);
+    }
+
+    // Reads a Trie from a ":bin" suffixed resource value.
+    private static BytesTrie getTrie(IcuData icuData, String path) {
+        return new BytesTrie(getBytes(icuData, path), 0);
+    }
+
+    // Reads a byte array from a ":bin" suffixed resource value.
+    // Fails with an IllegalArgumentException if the path is not a binary path, if it has
+    // no value, or if it has more than one RbValue.
+    private static byte[] getBytes(IcuData icuData, String path) {
+        RbPath rbPath = RbPath.parse(path);
+        checkArgument(rbPath.isBinPath(), "only binary paths (:bin) should have binary data: %s", path);
+        List<RbValue> rbValues = icuData.get(rbPath);
+        checkArgument(rbValues != null, "missing value for: %s", rbPath);
+        checkArgument(rbValues.size() == 1, "expect single RbValue: %s", rbValues);
+        // Take a sequence of hex-strings, convert each to a byte[] and collect them.
+        // The decoded chunks are appended, in element order, to one output stream.
+        return rbValues.get(0).getElements().stream()
+            .map(LocaleDistanceMapperTest::decodeHex)
+            .collect(
+                ByteArrayOutputStream::new,
+                (out, b) -> out.write(b, 0, b.length),
+                (out, b) -> out.write(b.toByteArray(), 0, b.size()))
+            .toByteArray();
+    }
+
+    // Hex chars to byte array (2 chars per byte, decoded in string order).
+    // Fails with an IllegalArgumentException for odd-length or non-hex input.
+    private static byte[] decodeHex(String s) {
+        checkArgument(s.length() % 2 == 0, "binary hex strings must have an even length: %s", s);
+        checkArgument(HEX.matchesAllOf(s), "invalid binary hex string: %s", s);
+        byte[] bytes = new byte[s.length() / 2];
+        for (int n = 0; n < bytes.length; n++) {
+            bytes[n] = (byte) Integer.parseUnsignedInt(s.substring(2 * n, 2 * (n + 1)), 16);
+        }
+        return bytes;
+    }
+
+    // Matches the characters which are permitted in a binary hex string (either case).
+    private static final CharMatcher HEX = CharMatcher.anyOf("0123456789abcdefABCDEF");
+}
--- /dev/null
+// © 2020 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package org.unicode.icu.tool.cldrtoicu.localedistance;
+
+import static com.google.common.truth.Truth.assertThat;
+import static java.util.Arrays.asList;
+
+import org.junit.Test;
+import org.unicode.cldr.api.CldrDataSupplier;
+import org.unicode.cldr.api.CldrValue;
+
+import com.ibm.icu.impl.locale.LSR;
+
+/**
+ * Tests for {@link PartitionInfo}, checking how regions are grouped into partitions from
+ * the registered variables, and how the partition strings and the region lookup array are
+ * derived from those partitions.
+ */
+public class PartitionInfoTest {
+    // Registers two variable expressions plus one single-region variable over a small
+    // containment hierarchy, then checks the partition IDs, strings and lookup indices.
+    @Test
+    public void testPartitionInfo() {
+        TerritoryContainment territories = territories(
+            TestData.territoryGroup("001", "019", "150"),
+            // Americas (simplified): North America + Caribbean
+            TestData.territoryGroup("019", "003", "029"),
+            TestData.territoryGroup("003", "CA", "US"),
+            TestData.territoryGroup("029", "PR", "VI"),
+            // Sort of Europe
+            TestData.territoryGroup("150", "DE", "FR", "GB"));
+        PartitionInfo.Builder builder = PartitionInfo.builder(territories);
+        // "American English" associated with U.S.A and Puerto Rico.
+        builder.addVariableExpression("$enUS", "US+PR");
+        // The "Americas" form a different language grouping.
+        builder.addVariableExpression("$americas", "019");
+        // Also register a separate variable for just the GB region code.
+        builder.ensureVariable("GB");
+
+        // In terms of "partitions" (which are assigned in sorted region code order)
+        // we should now have:
+        //
+        // CA, VI -> { $americas, $!enUS } == "0"
+        // DE, FR -> { $!americas, $!enUS } == "1"
+        // GB -> { $!americas, $!enUS, $GB } == "2"
+        // PR, US -> { $americas, $enUS } == "3"
+        //
+        // So reversing this to map variables to the partitions they overlap with:
+        // "$enUS" -> { "3" }
+        // "$!enUS" -> { "0", "1", "2" }
+        // "$americas" -> { "0", "3" }
+        // "$!americas" -> { "1", "2" }
+        // "$GB" -> { "2" }
+        PartitionInfo info = builder.build();
+        assertThat(info.getPartitionIds("$enUS")).containsExactly("3");
+        assertThat(info.getPartitionIds("$!enUS")).containsExactly("0", "1", "2");
+        assertThat(info.getPartitionIds("$americas")).containsExactly("0", "3");
+        assertThat(info.getPartitionIds("$!americas")).containsExactly("1", "2");
+        assertThat(info.getPartitionIds("$GB")).containsExactly("2");
+
+        // Partition strings are made up of the explicit partition IDs.
+        // Indices are also assigned in first encountered region code order.
+        assertThat(info.getPartitionStrings()).asList().containsExactly(
+            // Default (unmapped) special case must be first.
+            ".", // ?? : index=0
+            // Partitions IDs for "leaf" regions (only one partition per region).
+            "0", // CA, VI : index=1
+            "1", // DE, FR : index=2
+            "2", // GB : index=3
+            "3", // PR, US : index=4
+            // Macro regions include partitions of all overlapping regions.
+            "0123", // 001 : index=5
+            "03", // 003, 019, 029 : index=6
+            "12") // 150 : index=7
+            .inOrder();
+
+        // The partition lookup array maps regions to the index of their partition string.
+        byte[] lookup = info.getPartitionLookupArray();
+        assertThat(lookup[LSR.indexForRegion("CA")]).isEqualTo(1);
+        assertThat(lookup[LSR.indexForRegion("VI")]).isEqualTo(1);
+        assertThat(lookup[LSR.indexForRegion("DE")]).isEqualTo(2);
+        assertThat(lookup[LSR.indexForRegion("FR")]).isEqualTo(2);
+        assertThat(lookup[LSR.indexForRegion("GB")]).isEqualTo(3);
+        assertThat(lookup[LSR.indexForRegion("PR")]).isEqualTo(4);
+        assertThat(lookup[LSR.indexForRegion("US")]).isEqualTo(4);
+        assertThat(lookup[LSR.indexForRegion("001")]).isEqualTo(5);
+        assertThat(lookup[LSR.indexForRegion("003")]).isEqualTo(6);
+        assertThat(lookup[LSR.indexForRegion("019")]).isEqualTo(6);
+        assertThat(lookup[LSR.indexForRegion("029")]).isEqualTo(6);
+        assertThat(lookup[LSR.indexForRegion("150")]).isEqualTo(7);
+        // Unknown regions map to index 0.
+        assertThat(lookup[LSR.indexForRegion("JP")]).isEqualTo(0);
+    }
+
+    // Builds the territory containment for the given containment values.
+    private static TerritoryContainment territories(CldrValue... tcs) {
+        return TerritoryContainment.getContainment(CldrDataSupplier.forValues(asList(tcs)));
+    }
+}
--- /dev/null
+// © 2020 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package org.unicode.icu.tool.cldrtoicu.localedistance;
+
+import static com.google.common.truth.Truth.assertThat;
+import static java.util.Arrays.asList;
+import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.territoryGroup;
+import static org.unicode.icu.tool.cldrtoicu.testing.AssertUtils.assertThrows;
+
+import org.junit.Test;
+import org.unicode.cldr.api.CldrData;
+import org.unicode.cldr.api.CldrDataSupplier;
+import org.unicode.cldr.api.CldrValue;
+
+/**
+ * Tests for {@link TerritoryContainment}, checking the macro and leaf regions derived from
+ * territory groups, and the errors reported for containment data which is not well formed.
+ */
+public class TerritoryContainmentTest {
+
+    // Groups with children are reported as macro regions, everything else as leaf regions;
+    // a leaf region has no leaf regions of its own.
+    @Test
+    public void testSimple() {
+        CldrData testData = getTestData(
+            territoryGroup("001", "002", "003"),
+            territoryGroup("002", "GB", "FR"),
+            territoryGroup("003", "US", "CA"));
+        TerritoryContainment containment = TerritoryContainment.getContainment(testData);
+        assertThat(containment.getMacroRegions()).containsExactly("001", "002", "003").inOrder();
+        assertThat(containment.getLeafRegions()).containsExactly("CA", "FR", "GB", "US").inOrder();
+        assertThat(containment.getLeafRegionsOf("002")).containsExactly("FR", "GB").inOrder();
+        assertThat(containment.getLeafRegionsOf("GB")).isEmpty();
+    }
+
+    // A leaf region may belong to more than one group (CA and GB are each in two here)
+    // while still being listed only once among the leaf regions.
+    @Test
+    public void testOverlappingContainment() {
+        CldrData testData = getTestData(
+            territoryGroup("001", "002", "003", "004"),
+            territoryGroup("002", "GB", "FR"),
+            territoryGroup("003", "US", "CA"),
+            territoryGroup("004", "CA", "GB"));
+        TerritoryContainment containment = TerritoryContainment.getContainment(testData);
+        assertThat(containment.getLeafRegions()).containsExactly("CA", "FR", "GB", "US").inOrder();
+        assertThat(containment.getLeafRegionsOf("002")).containsExactly("FR", "GB").inOrder();
+        assertThat(containment.getLeafRegionsOf("004")).containsExactly("CA", "GB").inOrder();
+    }
+
+    // Group 003 is not contained by anything, so the data has two roots (001 and 003);
+    // the error message is expected to name both of them, but not the non-root 002.
+    @Test
+    public void testMultipleRootsFails() {
+        CldrData testData = getTestData(
+            territoryGroup("001", "002"),
+            territoryGroup("002", "GB", "FR"),
+            territoryGroup("003", "US", "CA"));
+        IllegalArgumentException err =
+            assertThrows(IllegalArgumentException.class, () -> TerritoryContainment.getContainment(testData));
+        assertThat(err).hasMessageThat().contains("001");
+        assertThat(err).hasMessageThat().contains("003");
+        assertThat(err).hasMessageThat().doesNotContain("002");
+    }
+
+    // Two groups which contain each other are rejected with an error which mentions the
+    // world region (001).
+    @Test
+    public void testCyclicGraphFails() {
+        CldrData testData = getTestData(
+            territoryGroup("001", "002"),
+            territoryGroup("002", "001"));
+        IllegalArgumentException err =
+            assertThrows(IllegalArgumentException.class, () -> TerritoryContainment.getContainment(testData));
+        assertThat(err).hasMessageThat().contains("world region");
+        assertThat(err).hasMessageThat().contains("001");
+    }
+
+    // Wraps the given values as a CldrData instance for the class under test.
+    private static CldrData getTestData(CldrValue... values) {
+        return CldrDataSupplier.forValues(asList(values));
+    }
+}
--- /dev/null
+// © 2020 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package org.unicode.icu.tool.cldrtoicu.localedistance;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static java.util.Arrays.asList;
+
+import java.util.List;
+
+import org.unicode.cldr.api.CldrData;
+import org.unicode.cldr.api.CldrDataSupplier;
+import org.unicode.cldr.api.CldrValue;
+
+import com.google.common.base.Ascii;
+import com.google.common.base.Function;
+import com.google.common.base.Splitter;
+import com.google.common.collect.ImmutableMap;
+import com.ibm.icu.impl.locale.LSR;
+import com.ibm.icu.util.BytesTrie;
+
+/**
+ * Utilities for easily generating test data for the LocaleDistanceMapper tests.
+ *
+ * <p>The {@code CldrValue} factory methods format a supplemental data path from their
+ * arguments and parse it into a value with an empty value string.
+ */
+final class TestData {
+    // Path prefix shared by all the language matching elements generated below.
+    private static final String LANGUAGE_MATCHES =
+        "languageMatching/languageMatches[@type=\"written_new\"]/";
+
+    /**
+     * Returns an LSR from a locale ID pattern (e.g. "und", "zh-Hant", "en-*-GB").
+     * This is definitely not a general locale parser!
+     *
+     * <p>Missing script or region parts are represented by the empty string.
+     *
+     * @param s the '-' separated pattern with at most three parts.
+     * @throws IllegalArgumentException if the pattern has more than three parts.
+     */
+    static LSR lsr(String s) {
+        List<String> parts = Splitter.on('-').splitToList(s);
+        checkArgument(parts.size() <= 3, "too many subtags in locale ID pattern: %s", s);
+        return new LSR(
+            parts.get(0),
+            parts.size() > 1 ? parts.get(1) : "",
+            parts.size() > 2 ? parts.get(2) : "",
+            LSR.DONT_CARE_FLAGS);
+    }
+
+    /** The kind of alias element generated by {@link #alias}. */
+    enum AliasType { LANGUAGE, TERRITORY }
+
+    /** The value of the {@code reason} attribute generated by {@link #alias}. */
+    enum AliasReason { DEPRECATED, OVERLONG, LEGACY, MACRO }
+
+    /** Returns CLDR data for the given values. */
+    static CldrData cldrData(CldrValue... values) {
+        return CldrDataSupplier.forValues(asList(values));
+    }
+
+    /** Returns a CldrValue for a {@code <paradigmLocales>} element. */
+    static CldrValue paradigms(String... values) {
+        return supplemental(
+            LANGUAGE_MATCHES + "paradigmLocales[@locales=\"%s\"]",
+            String.join(" ", values));
+    }
+
+    /** Returns a CldrValue for a {@code <matchVariable>} element. */
+    static CldrValue matchVariable(String id, String value) {
+        return supplemental(
+            LANGUAGE_MATCHES + "matchVariable[@id=\"%s\"][@value=\"%s\"]",
+            id, value);
+    }
+
+    /**
+     * Returns a CldrValue for a {@code <languageMatch>} element.
+     *
+     * @param oneway whether to append the {@code oneway="true"} attribute.
+     * @param sort the value used for the {@code _q} attribute of the path.
+     */
+    static CldrValue languageMatch(
+        String desired, String supported, int distance, boolean oneway, int sort) {
+        return supplemental(
+            LANGUAGE_MATCHES
+                + "languageMatch[@_q=\"%d\"][@desired=\"%s\"][@supported=\"%s\"][@distance=\"%d\"]%s",
+            sort, desired, supported, distance, oneway ? "[@oneway=\"true\"]" : "");
+    }
+
+    /** Returns a CldrValue for either a {@code <languageAlias>} or {@code <territoryAlias>} element. */
+    static CldrValue alias(AliasType type, AliasReason reason, String value, String... replacement) {
+        return supplemental(
+            "metadata/alias/%sAlias[@type=\"%s\"][@replacement=\"%s\"][@reason=\"%s\"]",
+            lower(type), value, String.join(" ", replacement), lower(reason));
+    }
+
+    /** Returns a CldrValue for a {@code <likelySubtag>} element. */
+    static CldrValue likelySubtag(String from, String to) {
+        return supplemental(
+            "likelySubtags/likelySubtag[@from=\"%s\"][@to=\"%s\"]",
+            from, to);
+    }
+
+    /** Returns a CldrValue for a {@code <territoryContainment>} group element. */
+    static CldrValue territoryGroup(String region, String... subregions) {
+        return supplemental(
+            "territoryContainment/group[@type=\"%s\"][@contains=\"%s\"]",
+            region, String.join(" ", subregions));
+    }
+
+    /**
+     * Returns a CldrValue for a {@code <territoryContainment>} group element where
+     * {@code @status="group"}.
+     */
+    static CldrValue territoryGrouping(String region, String... subregions) {
+        return supplemental(
+            "territoryContainment/group[@type=\"%s\"][@contains=\"%s\"][@status=\"group\"]",
+            region, String.join(" ", subregions));
+    }
+
+    /**
+     * Returns a CldrValue for a {@code <territoryContainment>} group element where
+     * {@code @status="deprecated"}.
+     */
+    static CldrValue deprecatedTerritory(String region, String... subregions) {
+        return supplemental(
+            "territoryContainment/group[@type=\"%s\"][@contains=\"%s\"][@status=\"deprecated\"]",
+            region, String.join(" ", subregions));
+    }
+
+    /**
+     * Returns a map from expanded Trie keys to mapped value. This is useful in allowing
+     * tests to use human readable data when testing Tries.
+     *
+     * <p>Entries appear in the returned map in the Trie's iteration order.
+     *
+     * @param trie the Trie to expand; every key must end with a subtag terminator (either
+     *             the '*' wildcard or a byte with its high bit set).
+     * @param star a string representing the Trie wildcard in the output keys, which for
+     *             readability differs between use cases (e.g. "*" for subtags and "*-*"
+     *             for match rules).
+     * @param fn a function to map the actual Trie value to a more readable value for
+     *           testing.
+     * @throws IllegalArgumentException if a Trie key is empty or does not end with a
+     *         subtag terminator.
+     */
+    static <T> ImmutableMap<String, T> getTrieTable(BytesTrie trie, String star, Function<Integer, T> fn) {
+        // Mostly copied from LocaleDistance (since the necessary constructor is private).
+        // Main change is that this no longer uses a TreeMap, since we want to test order.
+        ImmutableMap.Builder<String, T> map = ImmutableMap.builder();
+        StringBuilder sb = new StringBuilder();
+        for (BytesTrie.Entry entry : trie) {
+            sb.setLength(0);
+            int length = entry.bytesLength();
+            for (int i = 0; i < length; ++i) {
+                byte b = entry.byteAt(i);
+                if (b == '*') {
+                    sb.append(star).append('-');
+                } else if (b >= 0) {
+                    sb.append((char) b);
+                } else { // end of subtag (high bit set)
+                    sb.append((char) (b & 0x7f)).append('-');
+                }
+            }
+            // Checked unconditionally (rather than via 'assert') so that a malformed key is
+            // reported clearly even when JVM assertions are disabled, instead of silently
+            // losing a character or failing inside setLength().
+            int end = sb.length() - 1;
+            checkArgument(end >= 0 && sb.charAt(end) == '-',
+                "trie key must end with a subtag terminator: %s", sb);
+            // Strip the trailing '-' separator.
+            sb.setLength(end);
+            map.put(sb.toString(), fn.apply(entry.value));
+        }
+        return map.build();
+    }
+
+    // Formats the given path (relative to //supplementalData/) and parses it into a value
+    // with an empty value string.
+    private static CldrValue supplemental(String path, Object... args) {
+        return CldrValue.parseValue(String.format("//supplementalData/" + path, args), "");
+    }
+
+    // Returns the ASCII lower-case name of the given enum constant.
+    private static String lower(Enum<?> value) {
+        return Ascii.toLowerCase(value.name());
+    }
+
+    private TestData() {}
+}
--- /dev/null
+// © 2020 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package org.unicode.icu.tool.cldrtoicu.localedistance;
+
+import static com.google.common.base.Preconditions.checkState;
+import static com.google.common.truth.Truth.assertThat;
+import static org.unicode.icu.tool.cldrtoicu.testing.AssertUtils.assertThrows;
+
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+import org.junit.Test;
+
+import com.google.common.collect.ImmutableMap;
+import com.ibm.icu.util.BytesTrie;
+
+public class TrieTest {
+    /** A single prefix with a value yields a single table entry. */
+    @Test
+    public void testSimple() {
+        Trie trie = new Trie();
+        trie.root().with("answer", span -> span.putPrefixAndValue(42));
+
+        ImmutableMap<String, Integer> table = getRawTrieTable(trie.toByteArray());
+        assertThat(table).containsExactly("answer", 42);
+    }
+
+    /** Nested spans are joined into one key, shown with a '-' separator by the test helper. */
+    @Test
+    public void testSubSpan() {
+        Trie trie = new Trie();
+        trie.root().with("foo", outer -> outer.with("bar", inner -> inner.putPrefixAndValue(42)));
+
+        ImmutableMap<String, Integer> table = getRawTrieTable(trie.toByteArray());
+        assertThat(table).containsExactly("foo-bar", 42);
+    }
+
+    /** Entries come back ordered by key, regardless of the order they were added in. */
+    @Test
+    public void testHierarchyAndOrdering() {
+        Trie trie = new Trie();
+        // Deliberately added in the reverse of the expected output order.
+        trie.root().with("foo", foo -> {
+            foo.with("two", leaf -> leaf.putPrefixAndValue(3));
+            foo.with("one", leaf -> leaf.putPrefixAndValue(2));
+            foo.with("*", leaf -> leaf.putPrefixAndValue(1));
+        });
+        trie.root().with(
+            "bar",
+            bar -> bar.with("baz", baz -> baz.with("quux", quux -> quux.putPrefixAndValue(0))));
+
+        // Order is by "subtag" (left-to-right) with lexicographical order of tags (other
+        // than '*' which is always sorted first).
+        ImmutableMap<String, Integer> table = getRawTrieTable(trie.toByteArray());
+        assertThat(table)
+            .containsExactly(
+                "bar-baz-quux", 0,
+                "foo-*", 1,
+                "foo-one", 2,
+                "foo-two", 3)
+            .inOrder();
+    }
+
+    /** Demonstrates that '*' sorts before other subtags, even ones with a lower ASCII value. */
+    @Test
+    public void testStarOrdering() {
+        Trie trie = new Trie();
+        // Use '$' which has a lower byte value than '*' in ASCII, but when it terminates a
+        // prefix, it has bit-7 set which makes it sort higher than '*'.
+        // In other tests it's not clear that '*' is sorted specially since '*' < [a-z] anyway.
+        trie.root().with("$", dollar -> {
+            // A single '$' sorts after '*' because '$' will have bit-7 set, and '*' will not.
+            dollar.with("$", leaf -> leaf.putPrefixAndValue(5));
+            // '$$' sorts below '*' because the leading '$' won't have bit-7 set.
+            dollar.with("$$", leaf -> leaf.putPrefixAndValue(3));
+            dollar.with("*", leaf -> leaf.putPrefixAndValue(4));
+        });
+        trie.root().with("*", star -> {
+            star.with("$", leaf -> leaf.putPrefixAndValue(2));
+            star.with("*", leaf -> leaf.putPrefixAndValue(1));
+        });
+        trie.root().with("*", star -> star.putPrefixAndValue(0));
+
+        // Star is definitely sorted before other entries.
+        ImmutableMap<String, Integer> table = getRawTrieTable(trie.toByteArray());
+        assertThat(table)
+            .containsExactly(
+                "*", 0,
+                "*-*", 1,
+                "*-$", 2,
+                "$-$$", 3,
+                "$-*", 4,
+                "$-$", 5)
+            .inOrder();
+    }
+
+    /** A negative value is rejected with a message naming the value. */
+    @Test
+    public void testBadTrie_BadValue() {
+        Trie trie = new Trie();
+        IllegalArgumentException thrown = assertThrows(
+            IllegalArgumentException.class,
+            () -> trie.root().with("foo", span -> span.putPrefixAndValue(-1)));
+        assertThat(thrown).hasMessageThat().contains("bad trie value");
+        assertThat(thrown).hasMessageThat().contains("-1");
+    }
+
+    /** Putting a value directly on the root span (no prefix) is rejected. */
+    @Test
+    public void testBadTrie_NoPrefix() {
+        Trie trie = new Trie();
+        IllegalStateException thrown = assertThrows(
+            IllegalStateException.class,
+            () -> trie.root().putPrefixAndValue(23));
+        assertThat(thrown).hasMessageThat().contains("missing prefix");
+        assertThat(thrown).hasMessageThat().contains("23");
+    }
+
+    /** A prefix containing a non-ASCII character is rejected, naming the character. */
+    @Test
+    public void testBadTrie_BadPrefix() {
+        Trie trie = new Trie();
+        IllegalArgumentException thrown = assertThrows(
+            IllegalArgumentException.class,
+            () -> trie.root().with("ümlaut", span -> span.putPrefixAndValue(0)));
+        assertThat(thrown).hasMessageThat().contains("invalid trie character");
+        assertThat(thrown).hasMessageThat().contains("ü");
+    }
+
+    /** A '*' embedded in a longer prefix is rejected, naming the prefix. */
+    @Test
+    public void testBadTrie_NoStarInPrefix() {
+        Trie trie = new Trie();
+        IllegalArgumentException thrown = assertThrows(
+            IllegalArgumentException.class,
+            () -> trie.root().with("foo*bar", span -> span.putPrefixAndValue(0)));
+        assertThat(thrown).hasMessageThat().contains("must not contain '*'");
+        assertThat(thrown).hasMessageThat().contains("foo*bar");
+    }
+
+    /** Unbounded nesting of spans is expected to fail with "span too long". */
+    @Test
+    public void testBadTrie_TooLong() {
+        Trie trie = new Trie();
+        IllegalStateException thrown = assertThrows(
+            IllegalStateException.class,
+            () -> infiniteRecursion(trie.root()));
+        assertThat(thrown).hasMessageThat().contains("span too long");
+    }
+
+    // Keeps nesting "!" spans inside each other; only ends when the Trie code throws.
+    private static void infiniteRecursion(Trie.Span span) {
+        span.with("!", TrieTest::infiniteRecursion);
+    }
+
+    /**
+     * Expands serialized Trie data into a map of readable keys to values, preserving the
+     * Trie's iteration order. Subtags in a key are separated by '-'.
+     */
+    private static ImmutableMap<String, Integer> getRawTrieTable(byte[] data) {
+        // We rebuild the Trie from the byte[] data.
+        BytesTrie trie = new BytesTrie(data, 0);
+
+        // Mostly copied from XLikelySubtags (since the necessary constructor is private).
+        // Main change is that this no longer uses a TreeMap, since we want to test order.
+        Map<String, Integer> table = new LinkedHashMap<>();
+        for (BytesTrie.Entry entry : trie) {
+            StringBuilder key = new StringBuilder();
+            for (int i = 0, n = entry.bytesLength(); i < n; i++) {
+                byte b = entry.byteAt(i);
+                // End of subtag is marked by the high bit, or by the special '*' wildcard.
+                boolean endOfSubtag = b == '*' || b < 0;
+                key.append((char) (b & 0x7f));
+                if (endOfSubtag) {
+                    key.append('-');
+                }
+            }
+            // Every key must end with a subtag separator, which is then dropped.
+            int last = key.length() - 1;
+            checkState(last >= 0 && key.charAt(last) == '-');
+            key.setLength(last);
+            table.put(key.toString(), entry.value);
+        }
+        return ImmutableMap.copyOf(table);
+    }
+}