From: David Beaumont Date: Wed, 19 Aug 2020 11:05:12 +0000 (+0000) Subject: ICU-21149 Integrating LocaleDistanceBuilder functionality into LDML tooling X-Git-Tag: cldr/2020-09-22~119 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=9c965ff7570fa8ee209ee628133c9146a4f78722;p=icu ICU-21149 Integrating LocaleDistanceBuilder functionality into LDML tooling See #1165 --- diff --git a/tools/cldr/cldr-to-icu/build-icu-data.xml b/tools/cldr/cldr-to-icu/build-icu-data.xml index 2d36103a8d5..048b72a74ef 100644 --- a/tools/cldr/cldr-to-icu/build-icu-data.xml +++ b/tools/cldr/cldr-to-icu/build-icu-data.xml @@ -388,7 +388,6 @@ - diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/IcuTextWriter.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/IcuTextWriter.java index 54de15b3cf3..93f23524c90 100644 --- a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/IcuTextWriter.java +++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/IcuTextWriter.java @@ -2,6 +2,7 @@ // License & terms of use: http://www.unicode.org/copyright.html package org.unicode.icu.tool.cldrtoicu; +import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkNotNull; import static java.nio.charset.StandardCharsets.UTF_8; import static java.nio.file.StandardOpenOption.CREATE; @@ -132,7 +133,7 @@ final class IcuTextWriter { } private void open(String label, PrintWriter out) { - newLineAndIndent(out); + newLineAndIndent(out, FormatOptions.PATH_FORMAT); depth++; // This handles the "magic" pseudo indexing paths that are added by RegexTransformer. 
// These take the form of "" and are used to ensure that path order can be @@ -145,14 +146,16 @@ final class IcuTextWriter { private void close(PrintWriter out) { depth--; - newLineAndIndent(out); + newLineAndIndent(out, FormatOptions.PATH_FORMAT); out.print('}'); } - private void newLineAndIndent(PrintWriter out) { + private void newLineAndIndent(PrintWriter out, FormatOptions format) { out.println(); - for (int i = 0; i < depth; i++) { - out.print(INDENT); + if (format.shouldIndent) { + for (int i = 0; i < depth; i++) { + out.print(INDENT); + } } } @@ -169,20 +172,42 @@ final class IcuTextWriter { } } + private static final class FormatOptions { + // Only the indent flag is used + final static FormatOptions PATH_FORMAT = new FormatOptions(true, true, true); + + static FormatOptions forPath(RbPath rbPath) { + return new FormatOptions( + !rbPath.isIntPath() && !rbPath.isBinPath(), + !rbPath.endsWith(RB_SEQUENCE) && !rbPath.isBinPath(), + !rbPath.isBinPath()); + } + + final boolean shouldQuote; + final boolean shouldUseComma; + final boolean shouldIndent; + + private FormatOptions(boolean shouldQuote, boolean shouldUseComma, boolean shouldIndent) { + this.shouldQuote = shouldQuote; + this.shouldUseComma = shouldUseComma; + this.shouldIndent = shouldIndent; + } + } + /** Inserts padding and values between braces. */ + // TODO: Get rid of the need for icuDataName by adding type information to RbPath. 
private boolean appendValues( - String name, RbPath rbPath, List values, PrintWriter out) { + String icuDataName, RbPath rbPath, List values, PrintWriter out) { RbValue onlyValue; boolean wasSingular = false; - boolean quote = !rbPath.isIntPath(); - boolean isSequence = rbPath.endsWith(RB_SEQUENCE); - if (values.size() == 1 && !mustBeArray(true, name, rbPath)) { + FormatOptions format = FormatOptions.forPath(rbPath); + if (values.size() == 1 && !mustBeArray(true, icuDataName, rbPath)) { onlyValue = values.get(0); - if (onlyValue.isSingleton() && !mustBeArray(false, name, rbPath)) { + if (onlyValue.isSingleton() && !mustBeArray(false, icuDataName, rbPath)) { // Value has a single element and is not being forced to be an array. String onlyElement = Iterables.getOnlyElement(onlyValue.getElements()); - if (quote) { + if (format.shouldQuote) { onlyElement = quoteInside(onlyElement); } // The numbers below are simply tuned to match the line wrapping in the original @@ -192,7 +217,7 @@ final class IcuTextWriter { int maxWidth = Math.max(68, 80 - Math.min(4, rbPath.length()) * INDENT.length()); if (onlyElement.length() <= maxWidth) { // Single element for path: don't add newlines. - printValue(out, onlyElement, quote); + printValue(out, onlyElement, format); wasSingular = true; } else { // Element too long to fit in one line, so wrap. @@ -200,23 +225,23 @@ final class IcuTextWriter { for (int i = 0; i < onlyElement.length(); i = end) { end = goodBreak(onlyElement, i + maxWidth); String part = onlyElement.substring(i, end); - newLineAndIndent(out); - printValue(out, part, quote); + newLineAndIndent(out, format); + printValue(out, part, format); } } } else { // Only one array for the rbPath, so don't add an extra set of braces. - printArray(onlyValue, quote, isSequence, out); + printElements(out, onlyValue, format); } } else { for (RbValue value : values) { if (value.isSingleton()) { // Single-value array: print normally. 
- printArray(value, quote, isSequence, out); + printElements(out, value, format); } else { // Enclose this array in braces to separate it from other values. open("", out); - printArray(value, quote, isSequence, out); + printElements(out, value, format); close(out); } } @@ -252,18 +277,32 @@ final class IcuTextWriter { || rbPath.startsWith(RB_METAZONE_INFO); } - private void printArray(RbValue rbValue, boolean quote, boolean isSequence, PrintWriter out) { - for (String v : rbValue.getElements()) { - newLineAndIndent(out); - printValue(out, quoteInside(v), quote); - if (!isSequence) { - out.print(","); + private void printElements(PrintWriter out, RbValue rbValue, FormatOptions format) { + // TODO: If "shouldUseComma" is made obsolete, just use the "else" block always. + if (rbValue.getElementsPerLine() == 1) { + for (String v : rbValue.getElements()) { + newLineAndIndent(out, format); + printValue(out, quoteInside(v), format); + if (format.shouldUseComma) { + out.print(","); + } + } + } else { + checkArgument(format.shouldUseComma, "cannot group non-sequence values"); + Iterable> partitions = + Iterables.partition(rbValue.getElements(), rbValue.getElementsPerLine()); + for (List tuple : partitions) { + newLineAndIndent(out, format); + for (String v : tuple) { + printValue(out, quoteInside(v), format); + out.print(","); + } } } } - private static void printValue(PrintWriter out, String value, boolean quote) { - if (quote) { + private static void printValue(PrintWriter out, String value, FormatOptions format) { + if (format.shouldQuote) { out.append('"').append(value).append('"'); } else { out.append(value); diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/LdmlConverter.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/LdmlConverter.java index 37d44750b6d..d4c2227905c 100644 --- a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/LdmlConverter.java +++ 
b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/LdmlConverter.java @@ -47,6 +47,7 @@ import org.unicode.cldr.api.CldrDataType; import org.unicode.cldr.api.CldrPath; import org.unicode.cldr.api.PathMatcher; import org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir; +import org.unicode.icu.tool.cldrtoicu.localedistance.LocaleDistanceMapper; import org.unicode.icu.tool.cldrtoicu.mapper.Bcp47Mapper; import org.unicode.icu.tool.cldrtoicu.mapper.BreakIteratorMapper; import org.unicode.icu.tool.cldrtoicu.mapper.CollationMapper; @@ -167,6 +168,7 @@ public final class LdmlConverter { PLURAL_RANGES(SUPPLEMENTAL), WINDOWS_ZONES(SUPPLEMENTAL), TRANSFORMS(SUPPLEMENTAL), + LOCALE_DISTANCE(SUPPLEMENTAL), KEY_TYPE_DATA(BCP47); public static final ImmutableSet ALL = ImmutableSet.copyOf(OutputType.values()); @@ -506,6 +508,10 @@ public final class LdmlConverter { write(PluralRangesMapper.process(src), "misc"); break; + case LOCALE_DISTANCE: + write(LocaleDistanceMapper.process(src), "misc"); + break; + case WINDOWS_ZONES: processSupplemental("windowsZones", WINDOWS_ZONES_PATHS, "misc", false); break; diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/RbPath.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/RbPath.java index 071b04fb376..832acb6efd9 100644 --- a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/RbPath.java +++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/RbPath.java @@ -191,13 +191,26 @@ public final class RbPath implements Comparable { } // TODO: Remove this and isAlias() in favour of having properly typed paths. 
- boolean isIntPath() { - String lastElement = segments.get(segments.size() - 1); - return lastElement.endsWith(":int") || lastElement.endsWith(":intvector"); + public boolean isIntPath() { + return typeSuffixIsAnyOf(":int", ":intvector"); + } + + public boolean isBinPath() { + return typeSuffixIsAnyOf(":bin"); } public boolean isAlias() { - return getSegment(length() - 1).endsWith(":alias"); + return typeSuffixIsAnyOf(":alias"); + } + + private boolean typeSuffixIsAnyOf(String... types) { + String lastElement = getSegment(length() - 1); + for (String type : types) { + if (lastElement.endsWith(type)) { + return true; + } + } + return false; } @Override public int compareTo(RbPath other) { diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/RbValue.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/RbValue.java index 886ff18a266..19ef77856c6 100644 --- a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/RbValue.java +++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/RbValue.java @@ -3,9 +3,10 @@ package org.unicode.icu.tool.cldrtoicu; import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.collect.ImmutableList.toImmutableList; -import java.util.Arrays; import java.util.Objects; +import java.util.stream.Stream; import com.google.common.collect.ImmutableList; @@ -17,20 +18,32 @@ import com.google.common.collect.ImmutableList; */ public final class RbValue { private final ImmutableList elements; + private final int elementsPerLine; /** Returns a resource bundle value of the given elements. */ public static RbValue of(String... elements) { - return of(Arrays.asList(elements)); + return new RbValue(ImmutableList.copyOf(elements), 1); } /** Returns a resource bundle value of the given elements. 
*/ public static RbValue of(Iterable elements) { - return new RbValue(elements); + return new RbValue(ImmutableList.copyOf(elements), 1); } - private RbValue(Iterable elements) { - this.elements = ImmutableList.copyOf(elements); - checkArgument(!this.elements.isEmpty(), "Resource bundle values cannot be empty"); + /** Returns a resource bundle value of the given elements by consuming the given stream. */ + public static RbValue of(Stream elements) { + return new RbValue(elements.collect(toImmutableList()), 1); + } + + private RbValue(ImmutableList elements, int elementsPerLine) { + checkArgument(!elements.isEmpty(), "Resource bundle values cannot be empty"); + checkArgument(elementsPerLine > 0, "invalid elements per line: %s", elementsPerLine); + this.elements = elements; + this.elementsPerLine = elementsPerLine; + } + + public RbValue elementsPerLine(int n) { + return new RbValue(elements, n); } /** Returns the non-empty list of value elements. */ @@ -42,10 +55,14 @@ public final class RbValue { * Returns whether this is a single element value. Singleton values are treated different when * writing out ICU data files. */ - public boolean isSingleton() { + boolean isSingleton() { return elements.size() == 1; } + int getElementsPerLine() { + return elementsPerLine; + } + @Override public int hashCode() { return Objects.hashCode(elements); } diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/localedistance/DistanceTable.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/localedistance/DistanceTable.java new file mode 100644 index 00000000000..c82cfa23e1b --- /dev/null +++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/localedistance/DistanceTable.java @@ -0,0 +1,554 @@ +// © 2017 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html#License +package org.unicode.icu.tool.cldrtoicu.localedistance; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; +import static com.ibm.icu.impl.locale.LocaleDistance.DISTANCE_SKIP_SCRIPT; +import static com.ibm.icu.impl.locale.LocaleDistance.IX_DEF_LANG_DISTANCE; +import static com.ibm.icu.impl.locale.LocaleDistance.IX_DEF_REGION_DISTANCE; +import static com.ibm.icu.impl.locale.LocaleDistance.IX_DEF_SCRIPT_DISTANCE; +import static com.ibm.icu.impl.locale.LocaleDistance.IX_LIMIT; +import static com.ibm.icu.impl.locale.LocaleDistance.IX_MIN_REGION_DISTANCE; +import static java.util.Arrays.asList; + +import java.util.Arrays; +import java.util.Map; +import java.util.logging.Logger; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Table; +import com.google.common.collect.TreeBasedTable; + +/** + * Represents the conceptual distance between pairs of language specifications. + * + *

Mappings for {@code (desired, supported)} pairs are added at one of three + * levels in the table: language, script and region. Distances can be resolved at + * any level in the table (e.g. {@code ("en","fr")}, {@code ("en_Latn","ru_Cyrl")} + * or {@code ("en_Latn_GB", "en_Latn_AU")}). + * + *

However in reality the "regions" in the table are actually "partition IDs" + * representing groups of regions with the same language characteristics. For more + * information on partitions and how they are generated, see {@link PartitionInfo}. + * + *

This is mentioned here because anyone debugging this code might be surprised + * to see values like {@code "5"} for a "region" in the code. Using the term + * "region" matches the conceptual level of the data and is more familiar to most + * people, whereas "partition ID" would probably be jarring. + * + *

The builder class is not reusable, and once a table is built, the builder is + * invalid. Furthermore, since the table data itself is mutable, care must be taken + * to avoid modifying either the Trie or the returned distance array. + * + *

Note that internally the {@code '*'} character used as a wildcard for subtags + * is replaced by the {@code '�'} character (a.k.a ANY), whenever a subtag is + * passed into the API. This is because the underlying Trie structure generated by + * the distance table reserves {@code '*'} for a different purpose. This difference + * is encapsulated within this class and the {@link Trie} class only. + */ +final class DistanceTable { + private static final Logger logger = Logger.getLogger(DistanceTable.class.getName()); + + // Represents a wildcard match in the data table (the equivalent of '*' in + // locale subtag). Any incoming subtags are normalized to + // convert '*' to this character by the builder. + private static final String ANY = "�"; + + // Distances must be in the range [0-127] because bit 7 of the distance value + // is used for a special flag (DISTANCE_SKIP_SCRIPT). Setting the explicit max + // to 100 is just a more human readable maximum that satisfies that constraint. + private static final int MAX_REGION_DISTANCE = 100; + + static final class Builder { + private final Node rootNode = new Node(-1); + private int minRegionDistance = MAX_REGION_DISTANCE; + + private Builder() {} + + /** + * Adds a distance to the table between the specified and desired tuples. + * This method takes 1, 2 or 3 sequential {@code (desired, supported)} pairs + * of values corresponding to language subtags, script subtags and regions + * (partition IDs). All values can be the wildcard '*'. + */ + public void addDistance(int distance, boolean oneway, String... args) { + MappingKey key = MappingKey.fromSubtags(args, distance); + logger.fine(key::toString); + // Minimum region distance needs to be tracked specially. 
+ if (key.getDepth() == 3 && distance < minRegionDistance) { + minRegionDistance = distance; + } + addMapping(key); + if (!oneway && !key.isSymmetrical()) { + addMapping(key.reverse()); + } + } + + private void addMapping(MappingKey key) { + rootNode.addExplicitMapping(key); + if (key.hasWildcardMappings()) { + rootNode.addWildcardMappings(key); + } + } + + /** Returns the final minimized distance table information. */ + public DistanceTable build() { + Node defLangNode = rootNode.getAnyNode(); + checkState(defLangNode != null, "missing default language mapping: %s", rootNode); + Node defScriptNode = defLangNode.getAnyNode(); + checkState(defScriptNode != null, "missing default script mapping: %s", rootNode); + Node defRegionNode = defScriptNode.getAnyNode(); + checkState(defRegionNode != null, "missing default region mapping: %s", rootNode); + + // Because we prune the data table, it's important to store the default + // distance values separately. + int[] distances = new int[IX_LIMIT]; + distances[IX_DEF_LANG_DISTANCE] = defLangNode.distance; + distances[IX_DEF_SCRIPT_DISTANCE] = defScriptNode.distance; + distances[IX_DEF_REGION_DISTANCE] = defRegionNode.distance; + distances[IX_MIN_REGION_DISTANCE] = minRegionDistance; + + // Having determined the distances, prune the Trie to remove any sub-tables + // where distances could only be determined to be the default value (i.e. + // where the existence of that sub-table has no effect). 
+ pruneDefaultDistances(defScriptNode.distance, defRegionNode.distance); + return new DistanceTable(rootNode, distances); + } + + @Override + public String toString() { + return String.format("minimum region distance: %d\n%s\n", minRegionDistance, rootNode); + } + + private void pruneDefaultDistances(int defScriptDistance, int defRegionDistance) { + logger.fine("==== pruning subtables ===="); + rootNode.subtables.values().forEach(langNode -> { + langNode.subtables.values().forEach(scriptNode -> { + if (scriptNode.subtables.size() == 1) { + // If a script node *only* contains region data with the default + // region distance, that region data can be removed. Since region + // is the lowest level, there's no need to worry about "skipping" + // anything during lookup (unlike the case below). + Node defRegionNode = scriptNode.getAnyNode(); + checkState(defRegionNode != null, + "missing default region node for script: %s", scriptNode); + if (defRegionNode.distance == defRegionDistance) { + scriptNode.subtables.clear(); + } + } + }); + // Do the pruning in the "upwards" phase of visitation (after recursion) so + // if script subtables are pruned, it's visible here. + if (langNode.subtables.size() == 1) { + // If a language node *only* contains script data with the default + // script distance, we can't just remove it (because it might contain + // region data). + Node defScriptNode = langNode.getAnyNode(); + if (defScriptNode.distance == defScriptDistance) { + checkState(defScriptNode != null, + "missing default script node for language: %s", langNode); + if (defScriptNode.subtables.isEmpty()) { + // If the default script node has no region data, remove it. + langNode.subtables.clear(); + } else { + // Otherwise mark script data as "skippable", which indicates + // it should be written in a compact form in the Trie (while + // retaining any region data as normal). 
+ langNode.distance |= DISTANCE_SKIP_SCRIPT; + } + } + } + }); + // After pruning we don't expect any data in the top-level default table. + checkState(rootNode.getAnyNode().subtables.isEmpty(), + "invalid table state: %s", rootNode.getAnyNode()); + rootNode.subtables.rowMap().remove(ANY); + } + } + + public static Builder builder() { + return new Builder(); + } + + private final Node rootNode; + private final int[] distances; + + private DistanceTable(Node rootNode, int[] distances) { + this.rootNode = rootNode; + this.distances = distances; + } + + public Trie getTrie() { + Trie trie = new Trie(); + rootNode.writeTo(trie.root()); + return trie; + } + + public int[] getDefaultDistances() { + return distances; + } + + @Override + public String toString() { + return String.format("default distances: %s\n%s\n", Arrays.toString(distances), rootNode); + } + + private static final class Node { + private final Table subtables = TreeBasedTable.create(); + // Distance for the lookup so far (-1 for top level nodes). + private int distance; + + Node(int distance) { + checkArgument(distance >= -1, "invalid distance: %s", distance); + this.distance = distance; + } + + /** Returns the subtable node for the top-level mapping of a key. */ + private Node getNode(MappingKey key) { + return subtables.get(key.getDesired(), key.getSupported()); + } + + /** Returns the subtable node for the {@code } mapping. */ + Node getAnyNode() { + return subtables.get(ANY, ANY); + } + + void addExplicitMapping(MappingKey key) { + if (key.isLeaf()) { + if (!putIfAbsent(key)) { + logger.fine(() -> String.format("Ignore existing mapping: %s", key)); + } + } else { + getIntermediateNode(key).addExplicitMapping(key.getSuffix()); + } + } + + void addWildcardMappings(MappingKey key) { + if (key.isLeaf()) { + putIfAbsent(key); + } else if (key.isWildcard()) { + // An intermediate wildcard mapping is applied to all existing sub-nodes. 
+ // NOTE: This will need to change if we want to support "mixed" wildcard mappings. + for (Node node : subtables.values()) { + node.addWildcardMappings(key.getSuffix()); + } + } else { + // An explicit intermediate mapping only affects an existing exact match. + Node node = getNode(key); + if (node != null) { + node.addWildcardMappings(key.getSuffix()); + } + } + } + + /** + * Adds a new mapping to this node with the specified distance if it didn't already + * exist. + * + *

Note: If a mapping already exists, then this method has no effect (even if the + * existing distance differs from the given distance). This is necessary for two + * reasons: + *

    + *
  1. An earlier match rule may have set an explicit value for the mapping, + * and we subsequently try to set a default value (via a wildcard mapping). + * This should be ignored, since we want the non-default value to win. + * This means it's important to always have explicit {@code <languageMatch>} + * rules before any related wildcard rules in the CLDR data. + * + *
  2. A preferential {@code <languageMatch>} rule appears earlier in CLDR data. + * This occurs because of the way partitions are defined and allows for two + * distinct {@code <languageMatch>} rules to generate the same mapping (with + * different distances). This is because region variables reference sets of + * partition IDs and these are not always disjoint (e.g. "en_*_$!enUS" and + * "en_*_GB" both contain the partition ID for "GB"). + *
+ * + * @return true if a new mapping was added, or if the distances were equal (i.e. + * the operation was idempotent). + */ + private boolean putIfAbsent(MappingKey key) { + Node node = getNode(key); + if (node == null) { + logger.fine(() -> String.format("add: %s", key)); + subtables.put(key.getDesired(), key.getSupported(), new Node(key.getDistance())); + return true; + } + return (key.getDistance() == node.distance); + } + + /** + * Returns a sub-node corresponding to the given {@code (desired, supported)} mapping. + * If the node already exists, it is simply returned, otherwise a new node is created + * and any existing wildcard mappings are copied into it. + */ + private Node getIntermediateNode(MappingKey key) { + Node node = getNode(key); + if (node == null) { + // This is expected to succeed because match rules are given in length + // order (i.e. language only before language+script etc.) and we always + // expect each group to end with an mapping for the default + // distance. Thus, for any longer match rule, we should find (at least) + // the node when looking for intermediate nodes. + // + // NOTE: Currently (desired==ANY) if-and-only-if (supported=ANY), so the + // only non-exact match we can get here is the node. If we ever + // allow a mix of wildcard/non-wildcard keys, replace the getAnyNode() call + // with something like the line below: + // ---- + // Node wildcardMatch = Iterables.find( + // asList(getNode(desired, ANY), getNode(ANY, supported), getNode(ANY,ANY)), + // Objects::nonNull); + // ---- + Node wildcardMatch = getAnyNode(); + checkState(wildcardMatch != null, "missing mapping: %s", this); + // Default distances are the distance between any two *different* unknown + // subtags (so if the subtags are the same, the distance is zero). + int distance = key.getDesired().equals(key.getSupported()) ? 
0 : wildcardMatch.distance; + node = new Node(distance); + node.copySubtablesFrom(wildcardMatch); + subtables.put(key.getDesired(), key.getSupported(), node); + } + return node; + } + + /** Copies all subtable mappings from the given node into this one. */ + private void copySubtablesFrom(Node src) { + checkState(subtables.isEmpty()); + src.subtables.cellSet().forEach( + c -> subtables.put(c.getRowKey(), c.getColumnKey(), new Node(c.getValue().distance))); + } + + /** + * Writes all the mappings in the distance table sequentially to given Trie in sorted + * table order. + * + *

Mappings are written in a top-down recursive visitation with sub-tables inheriting + * the current prefix from parent tables via the given Trie span. At each level any + * mapped distances are written before recursing into the sub-tables. + */ + private void writeTo(Trie.Span trieSpan) { + if (distance >= 0 && (distance & DISTANCE_SKIP_SCRIPT) != 0) { + // If a node has a distance set and has been explicitly marked as "skippable", + // then write the "default" subtable using the current Trie prefix (effectively + // having an "empty" prefix for this case). + getAnyNode().writeTo(trieSpan); + } else { + // In the normal case, just write the mappings explicitly. + subtables.rowMap().forEach( + (desired, supportedNodes) -> writeSupported(trieSpan, desired, supportedNodes)); + } + } + + private void writeSupported(Trie.Span trieSpan, String desired, Map supportedNodes) { + // Collapse any (desired=ANY, supported=ANY) mappings into a single '*' in the trie. + if (desired.equals(ANY)) { + // If desired is ANY, the only supported subtag must also be ANY. + Node node = supportedNodes.get(ANY); + checkState(node != null && supportedNodes.size() == 1, + "invalid supported subtags for desired='ANY': %s", supportedNodes); + // Remember that ANY != "*", even though it corresponds to "*" in the original + // language match rules. Putting "*" in a Trie means something different (but + // similar enough to be a bit confusing). + trieSpan.with("*", node::writeDistancePlusSubtables); + } else { + // In the general case, just write the distance mapping. + trieSpan.with(desired, withDesiredSpan -> + supportedNodes.forEach((supported, node) -> { + checkState(!supported.equals(ANY), + "unexpected supported='ANY' subtag: %s", supported); + withDesiredSpan.with(supported, node::writeDistancePlusSubtables); + }) + ); + } + } + + // Writes the distance of this node to the given trie, then recursively writes any + // subtable information. 
+ private void writeDistancePlusSubtables(Trie.Span trieSpan) { + trieSpan.putPrefixAndValue(distance); + writeTo(trieSpan); + } + + @Override + public String toString() { + StringBuilder buffer = new StringBuilder("distance: ").append(distance).append('\n'); + return appendToString("", buffer).toString(); + } + + private StringBuilder appendToString(String indent, StringBuilder buffer) { + // Top level values are not padded with tabs. + String rowIndent = indent.isEmpty() ? "" : "\t"; + for (Map.Entry> row : subtables.rowMap().entrySet()) { + buffer.append(rowIndent).append(row.getKey()); + // First column extends the current row, so single tab indent. + String colIndent = "\t"; + for (Map.Entry col : row.getValue().entrySet()) { + buffer.append(colIndent).append(col.getKey()); + Node subnode = col.getValue(); + buffer.append('\t').append(subnode.distance); + // Append any sub-nodes (starting on the same line). + subnode.appendToString(indent + "\t\t\t", buffer).append('\n'); + // Later columns need full indent (including skipping row key). + colIndent = indent + '\t'; + } + // Later rows need full indent. + rowIndent = indent; + } + return buffer; + } + } + + /** + * Excapsulates a sequence of {@code } pairwise mappings over + * language, script and region, with an associated distance. This is an alternate + * way to represent a mapping of desired and supported language match rules. + * + *

For example: + *

{@code
+     *   
+     * }
+ * results in a set of keys of the form: + *
{@code
+     *    ->  ->  = 3
+     * }
+ * where the "region" part {@code } is constructed from all the possible + * combinations of partition IDs associated with the original region variables. + * + *

Mapping keys have several useful properties: + *

    + *
  • They can be reversed (e.g. {@code -> = N} becomes + * {@code -> = N}). + *
  • They can be symmetrical (e.g. {@code -> = N}), in which + * case the reversed key is the same as the original. + *
  • They can have wildcard mappings (i.e. {@code }). + *
  • They can produce "suffix" keys (e.g. the suffix of + * {@code -> = N} is {@code = N}). + *
+ */ + private static final class MappingKey { + /** + * Returns a new key from the specified subtag pairs, converting {@code '*'} + * subtags to the special {@code ANY} string and performing consistency checks. + * + * @param subtagPairs a sequence of {@code } pairs. + * @param distance the distance associated with the subtag mapping. + */ + static MappingKey fromSubtags(String[] subtagPairs, int distance) { + int pairCount = subtagPairs.length; + checkArgument(pairCount == 2 || pairCount == 4 || pairCount == 6, + "invalid number of arguments (expected 1, 2 or 3 pairs): %s", asList(subtagPairs)); + ImmutableList.Builder keyPairs = ImmutableList.builder(); + for (String subtag : subtagPairs) { + keyPairs.add(fixAny(subtag)); + } + return new MappingKey(keyPairs.build(), distance, false); + } + + // Converts a '*' (from a subtag) into the wildcard match character used by the Trie. + // The Trie uses '*' to mean something else, so we convert it at the boundary. + private static String fixAny(String subtag) { + return subtag.equals("*") ? ANY : subtag; + } + + private final ImmutableList pairs; + private final int distance; + private final boolean isReversed; + private final boolean isSymmetrical; + private final boolean hasWildcardMappings; + + private MappingKey(ImmutableList pairs, int distance, boolean isReversed) { + this.pairs = pairs; + this.distance = distance; + this.isReversed = isReversed; + checkArgument(distance >= 0 && distance <= MAX_REGION_DISTANCE, + "invalid mapping key distance: %s", distance); + // Check that if a key has "ANY" mappings, it is consistent. We expect to only + // get pairs (e.g. not or ). 
+ boolean isSymmetrical = true; + boolean hasWildcardMappings = false; + for (int i = 0; i < pairs.size(); i += 2) { + String desired = pairs.get(i); + String supported = pairs.get(i + 1); + checkArgument(desired.equals(ANY) == supported.equals(ANY), + "invalid mapping key pairs: %s", pairs); + hasWildcardMappings |= desired.equals(ANY); + isSymmetrical &= desired.equals(supported); + } + this.isSymmetrical = isSymmetrical; + this.hasWildcardMappings = hasWildcardMappings; + } + + /** Returns the "desired" value of the current (top-level) mapping. */ + String getDesired() { + return pairs.get(isReversed ? 1 : 0); + } + + /** Returns the "supported" value of the current (top-level) mapping. */ + String getSupported() { + return pairs.get(isReversed ? 0 : 1); + } + + /** Returns the non-negative distance mapped to by this key. */ + int getDistance() { + return distance; + } + + /** + * Returns the number of {@code } mappings in this key; this is + * either 1 (language-only), 2 (language & script) or 3 (language, script & region). + */ + int getDepth() { + return pairs.size() / 2; + } + + /** Returns true if this key does not have a suffix. */ + boolean isLeaf() { + return getDepth() == 1; + } + + /** + * Returns if any of the {@code } mappings are {@code }. + */ + boolean hasWildcardMappings() { + return hasWildcardMappings; + } + + /** + * Returns if the top-level {@code } mapping is {@code }. + */ + boolean isWildcard() { + return getDesired().equals(ANY); + } + + /** + * Returns if this key is pair-wise symmetrical (e.g. {@code " -> = N"}). + * Symmetrical mappings don't need to be added in reverse. + */ + boolean isSymmetrical() { + return isSymmetrical; + } + + /** Returns a new key where each {@code } mapping is reversed. */ + MappingKey reverse() { + checkState(!isReversed, "cannot revese a reversed key"); + return new MappingKey(pairs, distance, true); + } + + /** + * Returns the suffix of this non-leaf key with the top-level mapping removed. 
For + * example, the suffix of {@code " -> = N"} is {@code " = N"}). + */ + MappingKey getSuffix() { + checkState(!isLeaf(), "cannot get 'next' for an empty key"); + return new MappingKey(pairs.subList(2, pairs.size()), distance, isReversed); + } + + @Override + public String toString() { + return isLeaf() + ? String.format("<%s, %s> = %d", getDesired(), getSupported(), getDistance()) + : String.format("<%s, %s> -> %s", getDesired(), getSupported(), getSuffix()); + } + } +} diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/localedistance/Indexer.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/localedistance/Indexer.java new file mode 100644 index 00000000000..abf1b817763 --- /dev/null +++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/localedistance/Indexer.java @@ -0,0 +1,46 @@ +// © 2020 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html#License +package org.unicode.icu.tool.cldrtoicu.localedistance; + +import static com.google.common.base.Preconditions.checkNotNull; + +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Set; +import java.util.function.Function; + +/** + * Returns a canonicalized value for each unique value encountered, the memoized value is + * created using the zero-based index of the value and the given transformation function. + */ +final class Indexer implements Function { + /** Returns a plain indexer which returns the index directly. */ + public static Indexer create() { + return create(Function.identity()); + } + + /** Returns an indexer which transforms the returned index by the given function. 
*/ + public static Indexer create(Function convertIndexFn) { + return new Indexer<>(convertIndexFn); + } + + private final Map indexMap = new LinkedHashMap<>(); + private final Function convertIndexFn; + + private Indexer(Function convertIndexFn) { + this.convertIndexFn = checkNotNull(convertIndexFn); + } + + /** Memoizes the given value and returns the derived value. */ + @Override + public R apply(T value) { + indexMap.putIfAbsent(checkNotNull(value), indexMap.size()); + return convertIndexFn.apply(indexMap.get(value)); + } + + /** Returns a set of the indexed values, in the order they were first encountered. */ + public Set getValues() { + return Collections.unmodifiableSet(indexMap.keySet()); + } +} diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/localedistance/LikelySubtagsBuilder.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/localedistance/LikelySubtagsBuilder.java new file mode 100644 index 00000000000..e81684e4f77 --- /dev/null +++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/localedistance/LikelySubtagsBuilder.java @@ -0,0 +1,343 @@ +// © 2017 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html#License +package org.unicode.icu.tool.cldrtoicu.localedistance; + +import com.google.common.base.CharMatcher; +import com.google.common.base.Splitter; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.ImmutableSetMultimap; +import com.google.common.collect.ImmutableSortedMap; +import com.ibm.icu.impl.locale.LSR; +import com.ibm.icu.impl.locale.XLikelySubtags; +import org.unicode.cldr.api.AttributeKey; +import org.unicode.cldr.api.CldrData; +import org.unicode.cldr.api.CldrPath; +import org.unicode.cldr.api.PathMatcher; + +import java.util.Comparator; +import java.util.Map; +import java.util.TreeMap; +import java.util.logging.Logger; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import static com.google.common.base.Preconditions.*; +import static com.google.common.base.Strings.nullToEmpty; +import static org.unicode.cldr.api.CldrData.PathOrder.DTD; + +/** + * Generates likely subtag information from CLDR supplemental data. + * + *

Likely subtag information and language aliases are combined to produce a + * Trie table of lookup data to canonicalize any incoming language ID to its + * most likely fully qualified form. + */ +final class LikelySubtagsBuilder { + private static final Logger logger = Logger.getLogger(LikelySubtagsBuilder.class.getName()); + + private static final PathMatcher ALIAS = + PathMatcher.of("//supplementalData/metadata/alias/*[@type=*]"); + + private static final PathMatcher LIKELY_SUBTAG = + PathMatcher.of("//supplementalData/likelySubtags/likelySubtag[@from=*]"); + private static final AttributeKey SUBTAG_FROM = AttributeKey.keyOf("likelySubtag", "from"); + private static final AttributeKey SUBTAG_TO = AttributeKey.keyOf("likelySubtag", "to"); + + // NOTE: You must omit empty strings, since otherwise " foo " becomes ("", "foo", ""). + private static final Splitter LIST_SPLITTER = + Splitter.on(' ').trimResults().omitEmptyStrings(); + + // A language identifier is "xx", "xx_Yyyy", "xx_ZZ" or "xx_Yyyy_ZZ". + private static final Pattern LOCALE_ID = + Pattern.compile("([a-z]{2,3})(?:_([A-Z][a-z]{3}))?(?:_([A-Z]{2}|[0-9]{3}))?"); + + // While likely subtags are only separated by '_', language aliases can use '-' for + // legacy values. E.g.: + // + // Territory aliases never have a separator, so are always "simple". + private static final CharMatcher ALIAS_SEPARATOR = CharMatcher.anyOf("-_"); + + // This is a bit of a hack to let this newer implementation behave exactly like the original + // ICU4J version of the code. In particular, this version of the code normalizes the keys of + // the LSR table to "*" earlier than before (previously the "special" keys were "und" for + // the top-level language subtags and "" for script or region). By normalizing earlier, + // there's no longer any reason to have special case code in the Trie logic, but if we just + // do that, the table keys are now sorted differently. 
+ // + // Normally sort order wouldn't matter, when writing the Trie, but in order to demonstrate + // that this code produces the same binary output as before, the old ordering is replicated. + // + // TODO: When the dust settles, consider moving this to a star-first or star-last ordering?? + private static Comparator sortingStarLike(String t) { + return Comparator.comparing(x -> x.equals("*") ? t : x); + } + + private static final Comparator LSR_TABLE_ORDER = sortingStarLike("und"); + private static final Comparator SUBTABLE_ORDER = sortingStarLike(""); + + /** Possible alias types. */ + private enum AliasType { + LANGUAGE("languageAlias"), + TERRITORY("territoryAlias"); + + private final String elementName; + private final AttributeKey typeKey; + private final AttributeKey reasonKey; + private final AttributeKey replacementKey; + + AliasType(String elementName) { + this.elementName = elementName; + this.typeKey = AttributeKey.keyOf(elementName, "type"); + this.reasonKey = AttributeKey.keyOf(elementName, "reason"); + this.replacementKey = AttributeKey.keyOf(elementName, "replacement"); + } + } + + /** Alias mappings for base languages and territories. */ + private static final class Aliases { + /** + * Returns the alias mapping for the given type. Note that for language aliases, + * only "simple" aliases (between base languages) are mapped. + */ + public static Aliases getAliases(CldrData supplementalData, AliasType type) { + ImmutableSortedMap.Builder canonicalMap = + ImmutableSortedMap.naturalOrder(); + supplementalData.accept(DTD, v -> { + CldrPath path = v.getPath(); + if (ALIAS.matches(path) && path.getName().equals(type.elementName)) { + // TODO: Find out why we ignore "overlong" aliases? + String aliasFrom = v.get(type.typeKey); + if (isSimpleAlias(aliasFrom) && !v.get(type.reasonKey).equals("overlong")) { + // Replacement locale IDs must be non-empty (but can be a list) and we + // use only the first (default) mapping. 
+ String aliasTo = LIST_SPLITTER.splitToList(v.get(type.replacementKey)).get(0); + if (isSimpleAlias(aliasTo)) { + canonicalMap.put(aliasFrom, aliasTo); + } + } + } + }); + return new Aliases(canonicalMap.build()); + } + + // A simple language alias references only a base language (territory alias are + // always "simple" so this check is harmless). + private static boolean isSimpleAlias(String localeId) { + return ALIAS_SEPARATOR.matchesNoneOf(localeId); + } + + private final ImmutableSortedMap toCanonical; + private final ImmutableSetMultimap toAliases; + + private Aliases(ImmutableSortedMap toCanonical) { + this.toCanonical = checkNotNull(toCanonical); + this.toAliases = toCanonical.asMultimap().inverse(); + } + + /** Returns the alias-to-canonical-value mapping. */ + public ImmutableSortedMap getCanonicalMap() { + return toCanonical; + } + + /** + * Returns the aliases for a given canonical value (if there are no aliases + * then a singleton set containing the given canonical value is returned). + */ + public ImmutableSet getAliases(String canonical) { + ImmutableSet aliases = toAliases.get(canonical); + return aliases.isEmpty() ? ImmutableSet.of(canonical) : aliases; + } + } + + public static XLikelySubtags.Data build(CldrData supplementalData) { + // Build the table of LSR data from CLDR aliases and likely subtag information. + Aliases languageAliases = Aliases.getAliases(supplementalData, AliasType.LANGUAGE); + Aliases regionAliases = Aliases.getAliases(supplementalData, AliasType.TERRITORY); + Map>> lsrTable = + makeTable(languageAliases, regionAliases, supplementalData); + + // In the output Trie we must reference LSR instance by their special index + // (which is calculated by client code in order to lookup values). + // + // Note: We could pre-load this indexer with common locales to give them small + // indices, and see if that improves performance a little. 
+ Indexer lsrToIndex = Indexer.create(); + + // Reserve index 0 as "no value": + // The runtime lookup returns 0 for an intermediate match with no value, so we + // need that index to be reserved by something (but the value is arbitrary). + lsrToIndex.apply(lsr("", "", "")); + // Reserve index 1 for SKIP_SCRIPT: + // The runtime lookup returns 1 for an intermediate match with a value. + // This value is also arbitrary so use a value that is easy to debug. + lsrToIndex.apply(lsr("skip", "script", "")); + + // Build the Trie of the LSR table data. + Trie trie = writeLsrTable(lsrTable, lsrToIndex); + + // Note: Using XLikelySubtags as a fairly "dumb" container for the return values + // requires us to do slightly awkward things like passing mutable arrays around, but + // the advantage it has is that this data structure is also what's used in client code, + // so if the likely subtags data changes, it will be a forcing function to change this + // code. + return new XLikelySubtags.Data( + languageAliases.getCanonicalMap(), + regionAliases.getCanonicalMap(), + trie.toByteArray(), + lsrToIndex.getValues().toArray(new LSR[0])); + } + + private static Trie writeLsrTable( + Map>> languages, + Indexer lsrToIndex) { + + Trie trie = new Trie(); + Trie.Span rootSpan = trie.root(); + languages.forEach( + (language, scripts) -> rootSpan.with( + language, + span -> writeScripts(span, scripts, lsrToIndex))); + return trie; + } + + private static void writeScripts( + Trie.Span languageSpan, Map> scripts, Indexer lsrToIndex) { + checkArgument(!scripts.isEmpty(), "invalid script table: %s", scripts); + // If we only have '*' for scripts, but there is more than one region then we can prune + // the Trie at the script level and just write ":". However in + // order to let the lookup code know that it should not expect a script prefix for the + // following entries, we must add the special "skip" value before writing the regions. 
+ // + // However if there is also only one region, we can just write ":" and + // must avoid adding the "skip" value. + if (scripts.size() == 1) { + // We already checked '*' is in every scripts table. + Map regions = scripts.get("*"); + if (regions.size() > 1) { + languageSpan.putPrefixAndValue(XLikelySubtags.SKIP_SCRIPT); + } + writeRegions(languageSpan, regions, lsrToIndex); + } else { + scripts.forEach( + (script, regions) -> languageSpan.with( + script, + span -> writeRegions(span, regions, lsrToIndex))); + } + } + + private static void writeRegions( + Trie.Span languageOrScriptSpan, Map regions, Indexer lsrToIndex) { + checkArgument(!regions.isEmpty(), "invalid region table: %s", regions); + // Prune anything ending with '*' (either or ) + // by writing the value immediately and omitting the '*' from the Trie. + if (regions.size() == 1) { + // We already checked '*' is in every region table. + languageOrScriptSpan.putPrefixAndValue(lsrToIndex.apply(regions.get("*"))); + } else { + regions.forEach( + (region, lsr) -> languageOrScriptSpan.with( + region, + span -> span.putPrefixAndValue(lsrToIndex.apply(lsr)))); + } + } + + private static Map>> makeTable( + Aliases languageAliases, Aliases regionAliases, CldrData supplementalData) { + + Map>> lsrTable = new TreeMap<>(LSR_TABLE_ORDER); + + // set the base data + supplementalData.accept(DTD, v -> { + CldrPath path = v.getPath(); + if (LIKELY_SUBTAG.matches(path)) { + // Add the canonical subtag mapping. + LSR source = lsrFromLocaleID(v.get(SUBTAG_FROM)); + LSR target = lsrFromLocaleID(v.get(SUBTAG_TO)); + set(lsrTable, source, target); + + // Add all combinations of language and region aliases. This lets the + // matcher process aliases in locales in a single step. 
+ for (String languageAlias : languageAliases.getAliases(source.language)) { + for (String regionAlias : regionAliases.getAliases(source.region)) { + if (languageAlias.equals(source.language) && regionAlias.equals(source.region)) { + continue; + } + set(lsrTable, languageAlias, source.script, regionAlias, target); + } + } + } + }); + + // Add the special case for "und-Latn" => "en-Latn-US" (which is a bit of a + // hack for language matching). + // TODO: Find out the history of this line and document it better. + set(lsrTable, "und", "Latn", "", lsr("en", "Latn", "US")); + logger.fine(lsrTable::toString); + + // Ensure that if "und-RR" => "ll-Ssss-RR", then we also add "Ssss" => "RR". + // For example, given: + // + // we add an additional mapping for "und-Latn-GH" => "ak-Latn-GH" since there + // will be cases where the language subtag is just missing in data, but given + // the script and region we can at least make a best guess. + // + // Note: We can't move this code after the checks below because it might add + // more mappings which then need to be checked. However realistically, the only + // time the mapping "*" -> "*" would not appear is if the likely subtag data was + // completely broken (since it implies no region-only mappings). + checkState(lsrTable.containsKey("*") && lsrTable.get("*").containsKey("*"), + "missing likely subtag data (no default region mappings): %s", lsrTable); + lsrTable.get("*").get("*").forEach((key, lsr) -> set(lsrTable, "und", lsr.script, lsr.region, lsr)); + + // Check that every level has "*" (mapped from "und" or ""). + lsrTable.forEach((lang, scripts) -> { + checkArgument(scripts.containsKey("*"), "missing likely subtag mapping for: %s", asLocale(lang)); + scripts.forEach( + (script, regions) -> checkArgument(regions.containsKey("*"), + "missing likely subtag mapping for: %s", asLocale(lang, script))); + }); + return lsrTable; + } + + // Converts subtable key sequence into original locale ID (for debugging). 
+    // asLocale("*", "*", "GB") -> "und_GB"
+    private static String asLocale(String... parts) {
+        return String.format("%s%s%s",
+            !parts[0].equals("*") ? parts[0] : "und",
+            parts.length > 1 && !parts[1].equals("*") ? "_" + parts[1] : "",
+            parts.length > 2 && !parts[2].equals("*") ? "_" + parts[2] : "");
+    }
+
+    // Convenience overload which takes the (language, script, region) keys from an LSR.
+    private static void set(
+        Map<String, Map<String, Map<String, LSR>>> langTable, LSR key, LSR newValue) {
+        set(langTable, key.language, key.script, key.region, newValue);
+    }
+
+    // Stores the LSR under (language, script, region), normalizing each key via
+    // subtagOrStar() and creating the intermediate tables on first use.
+    private static void set(Map<String, Map<String, Map<String, LSR>>> langTable,
+        String language, String script, String region, LSR lsr) {
+        Map<String, Map<String, LSR>> scriptTable = getSubtable(langTable, subtagOrStar(language));
+        Map<String, LSR> regionTable = getSubtable(scriptTable, subtagOrStar(script));
+        regionTable.put(subtagOrStar(region), lsr);
+    }
+
+    // Returns the subtable for the given subtag, adding an empty one (ordered by
+    // SUBTABLE_ORDER) if it is not yet present.
+    private static <T> Map<String, T> getSubtable(Map<String, Map<String, T>> table, String subtag) {
+        return table.computeIfAbsent(subtag, k -> new TreeMap<>(SUBTABLE_ORDER));
+    }
+
+    // Maps "und" and the empty string to "*"; a literal "*" in the input is rejected.
+    private static String subtagOrStar(String s) {
+        checkArgument(!s.equals("*"), "language subtags should not be '*'");
+        return s.equals("und") || s.isEmpty() ? "*" : s;
+    }
+
+    // Parses simple locale IDs in the data, not arbitrary language tags.
+    private static LSR lsrFromLocaleID(String languageIdentifier) {
+        Matcher m = LOCALE_ID.matcher(languageIdentifier);
+        checkArgument(m.matches(), "invalid language identifier: %s", languageIdentifier);
+        return lsr(m.group(1), m.group(2), m.group(3));
+    }
+
+    // Lenient factory method which accepts null for missing script or region (but not language).
+ private static LSR lsr(String language, String script, String region) { + return new LSR(checkNotNull(language), nullToEmpty(script), nullToEmpty(region), LSR.DONT_CARE_FLAGS); + } +} diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/localedistance/LocaleDistanceMapper.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/localedistance/LocaleDistanceMapper.java new file mode 100644 index 00000000000..5151e8023c9 --- /dev/null +++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/localedistance/LocaleDistanceMapper.java @@ -0,0 +1,491 @@ +// © 2017 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html#License +package org.unicode.icu.tool.cldrtoicu.localedistance; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkNotNull; +import static com.google.common.base.Preconditions.checkState; +import static java.util.Arrays.asList; +import static org.unicode.cldr.api.CldrData.PathOrder.DTD; +import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.logging.Logger; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import org.unicode.cldr.api.AttributeKey; +import org.unicode.cldr.api.CldrData; +import org.unicode.cldr.api.CldrDataSupplier; +import org.unicode.cldr.api.CldrPath; +import org.unicode.cldr.api.CldrValue; +import org.unicode.cldr.api.PathMatcher; +import org.unicode.icu.tool.cldrtoicu.DebugWriter; +import org.unicode.icu.tool.cldrtoicu.IcuData; +import org.unicode.icu.tool.cldrtoicu.RbPath; +import org.unicode.icu.tool.cldrtoicu.RbValue; + +import 
com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Splitter; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Iterables; +import com.google.common.primitives.Bytes; +import com.ibm.icu.impl.locale.LSR; +import com.ibm.icu.impl.locale.LocaleDistance; +import com.ibm.icu.impl.locale.XLikelySubtags; +import com.ibm.icu.util.ULocale; + +/** + * Mapper for generating locale distance tables from CLDR language data. + * + *

Note that this is an atypical mapper which does a lot more processing than other + * ICU mapper classes and relies on several auxiliary classes (which is why it's in a + * different package). Conceptually it's still a "mapper" though, just not a simple one. + * + *

This mapper was converted from the LocaleDistanceBuilder code in the ICU4J project. + */ +public final class LocaleDistanceMapper { + private static final Logger logger = Logger.getLogger(LocaleDistanceMapper.class.getName()); + + // All the language matching data comes from the "written_new" language data in + // "common/supplemental/languageInfo.xml". + private static final PathMatcher WRITTEN_LANGUAGE_PREFIX = + PathMatcher.of("//supplementalData/languageMatching/languageMatches[@type=\"written_new\"]"); + + // Definitions of region containment variables used when expressing match distances. E.g.: + // + private static final PathMatcher VARIABLE_PATH = + WRITTEN_LANGUAGE_PREFIX.withSuffix("matchVariable[@id=*]"); + private static final AttributeKey VARIABLE_ID = AttributeKey.keyOf("matchVariable", "id"); + private static final AttributeKey VARIABLE_VALUE = AttributeKey.keyOf("matchVariable", "value"); + + // Language distance data, including wildcards and variable references (possibly negated). E.g.: + // + // + // + private static final PathMatcher LANGUAGE_MATCH_PATH = + WRITTEN_LANGUAGE_PREFIX.withSuffix("languageMatch[@desired=*][@supported=*]"); + private static final AttributeKey MATCH_DESIRED = + AttributeKey.keyOf("languageMatch", "desired"); + private static final AttributeKey MATCH_SUPPORTED = + AttributeKey.keyOf("languageMatch", "supported"); + private static final AttributeKey MATCH_DISTANCE = + AttributeKey.keyOf("languageMatch", "distance"); + // Optional, assume false if not present. + private static final AttributeKey MATCH_ONEWAY = + AttributeKey.keyOf("languageMatch", "oneway"); + + // Singleton element containing the list of special case "paradigm" locales, which should + // always be preferred if there is a tie. E.g.: + // + // + // Since there are no distinguishing attributes for this path, there can only be one + // instance which we can just lookup directly. 
+ private static final CldrPath PARADIGM_LOCALES_PATH = CldrPath.parseDistinguishingPath( + "//supplementalData/languageMatching/languageMatches[@type=\"written_new\"]/paradigmLocales"); + private static final AttributeKey PARADIGM_LOCALES = + AttributeKey.keyOf("paradigmLocales", "locales"); + + // NOTE: You must omit empty strings, since otherwise " foo " becomes ("", "foo", ""). + private static final Splitter LIST_SPLITTER = + Splitter.on(' ').trimResults().omitEmptyStrings(); + + // Output resource bundle paths, split into two basic groups for likely locale mappings + // and match data. + private static final RbPath LIKELY_LANGUAGES = RbPath.of("likely", "languageAliases"); + private static final RbPath LIKELY_REGIONS = RbPath.of("likely", "regionAliases"); + private static final RbPath LIKELY_TRIE = RbPath.of("likely", "trie:bin"); + private static final RbPath LIKELY_LSRS = RbPath.of("likely", "lsrs"); + + private static final RbPath MATCH_TRIE = RbPath.of("match", "trie:bin"); + private static final RbPath MATCH_REGION_TO_PARTITIONS = RbPath.of("match", "regionToPartitions:bin"); + private static final RbPath MATCH_PARTITIONS = RbPath.of("match", "partitions"); + private static final RbPath MATCH_PARADIGMS = RbPath.of("match", "paradigms"); + private static final RbPath MATCH_DISTANCES = RbPath.of("match", "distances:intvector"); + + // To split locale specifications (e.g. "ja_Latn" or "en_*_$!enUS"). + private static final Splitter UNDERSCORE = Splitter.on('_'); + + /** + * Processes data from the given supplier to generate locale matcher ICU data. + * + * @param src the CLDR data supplier to process. + * @return the IcuData instance to be written to a file. + */ + public static IcuData process(CldrDataSupplier src) { + return process(src.getDataForType(SUPPLEMENTAL)); + } + + @VisibleForTesting // It's easier to supply a fake data instance than a fake supplier. 
+ static IcuData process(CldrData data) { + IcuData icuData = new IcuData("langInfo", false); + + XLikelySubtags.Data likelyData = LikelySubtagsBuilder.build(data); + icuData.add(LIKELY_LANGUAGES, ofMapEntries(likelyData.languageAliases)); + icuData.add(LIKELY_REGIONS, ofMapEntries(likelyData.regionAliases)); + icuData.add(LIKELY_TRIE, ofBytes(likelyData.trie)); + icuData.add(LIKELY_LSRS, ofLsrs(asList(likelyData.lsrs))); + + LocaleDistance.Data distanceData = buildDistanceData(data); + icuData.add(MATCH_TRIE, ofBytes(distanceData.trie)); + icuData.add(MATCH_REGION_TO_PARTITIONS, ofBytes(distanceData.regionToPartitionsIndex)); + icuData.add(MATCH_PARTITIONS, RbValue.of(distanceData.partitionArrays)); + icuData.add(MATCH_PARADIGMS, ofLsrs(distanceData.paradigmLSRs)); + icuData.add(MATCH_DISTANCES, RbValue.of(Arrays.stream(distanceData.distances).mapToObj(Integer::toString))); + return icuData; + } + + /** + * A simple holder for language, script and region which allows for wildcards (i.e. "*") + * and variables to represent partitions of regions (e.g. "$enUS"). Minimal additional + * validation is done on incoming fields as data is assumed to be correct. + */ + private static final class LsrSpec { + /** + * Parse a raw specification string (e.g. "en", "ja_Latn", "*_*_*", "ar_*_$maghreb" + * or "en_*_GB") into a structured spec. Note that if the specification string + * contains a "bare" region (e.g. "en_*_GB") then it is registered as a variable in + * the given RegionMapper builder, so the returned {@code LsrSpec} will be + * {@code "en_*_$GB"}. + */ + public static LsrSpec parse(String rawSpec, PartitionInfo.Builder rmb) { + List parts = UNDERSCORE.splitToList(rawSpec); + checkArgument(parts.size() <= 3, "invalid raw LSR specification: %s", rawSpec); + String language = parts.get(0); + Optional script = parts.size() > 1 ? 
Optional.of(parts.get(1)) : Optional.empty(); + // While parsing the region part, ensure any "bare" region subtags are converted + // to variables (e.g. "GB" -> "$GB") and registered with the parition map. + Optional region = + parts.size() > 2 ? Optional.of(rmb.ensureVariable(parts.get(2))) : Optional.empty(); + return new LsrSpec(language, script, region); + } + + // A language subtag (e.g. "en") or "*". + private final String language; + // If present, a script subtag (e.g. "Latn") or "*". + private final Optional script; + // If present, a registered variable with '$' prefix (e.g. "$foo" or "$GB") or "*". + private final Optional regionVariable; + + private LsrSpec(String language, Optional script, Optional regionVariable) { + this.language = language; + this.script = script; + this.regionVariable = regionVariable; + // Implementation shortcuts assume: + // - If the language subtags are '*', the other-level subtags must also be '*' (if present). + // If there are rules that do not fit these constraints, we need to revise the implementation. + if (isAny(language)) { + script.ifPresent( + s -> checkArgument(isAny(s), "expected wildcard script, got: %s", script)); + regionVariable.ifPresent( + r -> checkArgument(isAny(r), "expected wildcard region, got: %s", regionVariable)); + } + } + + public String getLanguage() { + return language; + } + + public String getScript() { + return script.orElseThrow(() -> new IllegalArgumentException("no script available: " + this)); + } + + public String getRegionVariable() { + return regionVariable.orElseThrow(() -> new IllegalArgumentException("no region available: " + this)); + } + + public int size() { + return regionVariable.isPresent() ? 3 : script.isPresent() ? 2 : 1; + } + + @Override + public String toString() { + return language + script.map(s -> "_" + s).orElse("") + regionVariable.map(r -> "_" + r).orElse(""); + } + } + + /** + * Represents a {@code } rule derived from supplemental data, such as: + *

{@code
+     *   
+     * }
+ * or: + *
{@code
+     *   
+     * }
+ * + *

The job of a {@code Rule} is to provide a mechanism for capturing the data in + * {@code } elements and subsequently adding that information to a + * {@link DistanceTable.Builder} in a structured way. + */ + private static final class LanguageMatchRule { + private final LsrSpec desired; + private final LsrSpec supported; + private final int distance; + private final boolean oneway; + + public LanguageMatchRule(LsrSpec desired, LsrSpec supported, int distance, boolean oneway) { + this.desired = checkNotNull(desired); + this.supported = checkNotNull(supported); + this.distance = distance; + this.oneway = oneway; + // Implementation shortcuts assume: + // - At any level, either both or neither spec subtags are *. + // If there are rules that do not fit these constraints, we need to revise the implementation. + checkArgument(desired.size() == supported.size(), + "mismatched rule specifications in: %s, %s", desired, supported); + checkArgument(isAny(desired.language) == isAny(supported.language), + "wildcard mismatch for languages in: %s, %s", desired, supported); + checkArgument(isAny(desired.script) == isAny(supported.script), + "wildcard mismatch for scripts in: %s, %s", desired, supported); + checkArgument(isAny(desired.regionVariable) == isAny(supported.regionVariable), + "wildcard mismatch for languages in: %s, %s", desired, supported); + } + + int size() { + return desired.size(); + } + + boolean isDefaultRule() { + // We already know that in LsrSpec, if the language is "*" then all subtags are too. + return isAny(desired.language); + } + + /** + * Adds this rule to the given distance table, using the given partition map to + * resolve any region variables present in the desired or supported specs. + */ + void addTo(DistanceTable.Builder distanceTable, PartitionInfo partitions) { + // Note that rather than using the rule's "size" to mediate the different + // cases, we could have had 3 distinct sub-types of a common rule API (e.g. 
+ // "LanguageRule", "ScriptRule" and "RegionRule"), each with a different + // addTo() callback. However this would have been quite a lot more code + // for not much real gain. + switch (size()) { + case 1: // Language only. + distanceTable.addDistance(distance, oneway, + desired.getLanguage(), supported.getLanguage()); + break; + + case 2: // Language and script present. + distanceTable.addDistance(distance, oneway, + desired.getLanguage(), supported.getLanguage(), + desired.getScript(), supported.getScript()); + break; + + case 3: // Language, script and region variable present. + // Add the rule distance for every combination of desired/supported + // partition IDs for the region variables. This is important for + // variables like "$americas" which overlap with multiple paritions. + // + // Note that in this case (because region variables map to sets of + // partition IDs) we can get situations where "shouldReverse" is true, + // but the desired/supported pairs being passed in are identical (e.g. + // different region variables map to distinct partition groups which + // share some common elements). + // + // This is fine, providing that the distance table is going to ignore + // identical mappings (which it does). Alternatively we could just + // re-calculate "shouldReverse" inside this loop to account for partition + // IDs rather than region variables. 
+ ImmutableSet desiredPartitionIds = + partitions.getPartitionIds(desired.getRegionVariable()); + ImmutableSet supportedPartitionIds = + partitions.getPartitionIds(supported.getRegionVariable()); + for (String desiredPartitionId : desiredPartitionIds) { + for (String supportedPartitionId : supportedPartitionIds) { + distanceTable.addDistance(distance, oneway, + desired.getLanguage(), supported.getLanguage(), + desired.getScript(), supported.getScript(), + desiredPartitionId, supportedPartitionId); + } + } + break; + + default: + throw new IllegalStateException("invalid size for LsrSpec: " + this); + } + } + + @Override + public String toString() { + return String.format( + "Rule{ desired=%s, supported=%s, distance=%d, oneway=%b }", + desired, supported, distance, oneway); + } + } + + private static LocaleDistance.Data buildDistanceData(CldrData supplementalData) { + // Resolve any explicitly declared region variables into the partition map. + // Territory containment information is used to recursively resolve region + // variables (e.g. "$enUS") into a collection of non-macro regions. + PartitionInfo.Builder partitionBuilder = + PartitionInfo.builder(TerritoryContainment.getContainment(supplementalData)); + supplementalData.accept(DTD, v -> { + CldrPath path = v.getPath(); + if (VARIABLE_PATH.matches(path)) { + partitionBuilder.addVariableExpression(v.get(VARIABLE_ID), v.get(VARIABLE_VALUE)); + } + }); + + // Parse the rules from elements. Note that the + // element is marked as "ORDERED" in the DTD, which means the elements always + // appear in the same order is in the CLDR XML file (even when using DTD order). + // + // This is one of the relatively rare situations in which using DTD order will + // not isolate the ICU data from reordering of the CLDR data. In particular this + // matters when specifying language matcher preferences (such as "en_*_GB" vs + // "en_*_!enUS"). 
+ // + // We could almost process the rules while reading them from the source data, but + // rules may contain region codes rather than variables, and we need to create a + // variable for each such region code before the RegionMapper is built, and + // before processing the rules (this happens when the LsrSpec is parsed). + List rules = new ArrayList<>(); + supplementalData.accept(DTD, v -> { + CldrPath path = v.getPath(); + if (LANGUAGE_MATCH_PATH.matches(path)) { + int distance = Integer.parseInt(v.get(MATCH_DISTANCE)); + // Lenient against there being no "oneway" attribute. + boolean oneway = "true".equalsIgnoreCase(v.get(MATCH_ONEWAY)); + LsrSpec desired = LsrSpec.parse(v.get(MATCH_DESIRED), partitionBuilder); + LsrSpec supported = LsrSpec.parse(v.get(MATCH_SUPPORTED), partitionBuilder); + LanguageMatchRule rule = new LanguageMatchRule(desired, supported, distance, oneway); + logger.fine(() -> String.format("rule: %s", rule)); + rules.add(rule); + } + }); + // Check that the rules are in the expected order. Rule order is important in ensuring + // data correctness and incorrect order may violate business logic assumptions later. + // TODO: Consider what other ordering/sanity checks make sense here. + for (int n = 0, prevSize = 1; n < rules.size(); n++) { + LanguageMatchRule rule = rules.get(n); + checkArgument(rule.size() >= prevSize, " elements out of order at: %s", rule); + checkArgument(rule.size() == prevSize || (n > 0 && rules.get(n - 1).isDefaultRule()), + "missing default rule before: %s", rule); + prevSize = rule.size(); + } + checkState(rules.stream().distinct().count() == rules.size(), "duplicated rule in: %s", rules); + + // Build region partition data after all the variables have been accounted for + // (including the implicit variables found while processing LsrSpecs). + PartitionInfo partitions = partitionBuilder.build(); + + // Add all the rules (in order) to the distance table. 
+ DistanceTable.Builder distanceTableBuilder = DistanceTable.builder(); + rules.forEach(r -> r.addTo(distanceTableBuilder, partitions)); + DistanceTable distanceTable = distanceTableBuilder.build(); + + // Note: Using LocaleDistance.Data as a fairly "dumb" container for the return values + // requires us to do slightly awkward things, like passing mutable arrays and LSR + // instances around, but the advantage it has is that this data structure is also what's + // used in client code, so if the likely subtags data changes, it will be a forcing + // function to change this code. + return new LocaleDistance.Data( + distanceTable.getTrie().toByteArray(), + partitions.getPartitionLookupArray(), + partitions.getPartitionStrings(), + getParadigmLsrs(supplementalData), + distanceTable.getDefaultDistances()); + } + + private static Set getParadigmLsrs(CldrData supplementalData) { + // LinkedHashSet for stable order; otherwise a unit test is flaky. + CldrValue cldrValue = supplementalData.get(PARADIGM_LOCALES_PATH); + checkState(cldrValue != null, + " element was missing: %s", PARADIGM_LOCALES_PATH); + String localesList = cldrValue.get(PARADIGM_LOCALES); + checkState(localesList != null, + " 'locales' attribute was missing: %s", cldrValue); + + Set paradigmLSRs = new LinkedHashSet<>(); + for (String paradigm : LIST_SPLITTER.split(localesList)) { + LSR max = XLikelySubtags.INSTANCE.makeMaximizedLsrFrom(new ULocale(paradigm)); + // Clear the LSR flags to make the data equality test in LocaleDistanceTest happy. + paradigmLSRs.add(new LSR(max.language, max.script, max.region, LSR.DONT_CARE_FLAGS)); + } + checkArgument(paradigmLSRs.size() % 2 == 0, "unpaired paradigm locales: %s", paradigmLSRs); + return paradigmLSRs; + } + + // Returns an RbValue serialized from a map as a sequence of alternating (key, value) + // pairs (formatted as one pair per line in the IcuData file). + // + // E.g. + // foo{ + // key1, value1, + // ... 
+ // keyN, valueN, + // } + private static RbValue ofMapEntries(Map map) { + return RbValue.of( + map.entrySet().stream() + .flatMap(e -> Stream.of(e.getKey(), e.getValue())) + .collect(Collectors.toList())) + .elementsPerLine(2); + } + + // Returns an RbValue serialized from a sequence of LSR instances as a sequence of repeating + // (language, script, region) tuples (formatted as one tuple per line in the IcuData file). + // + // E.g. + // foo{ + // lang1, script1, region1, + // ... + // langN, scriptN, regionN, + // } + private static RbValue ofLsrs(Collection lsrs) { + return RbValue.of( + lsrs.stream() + .flatMap(lsr -> Stream.of(lsr.language, lsr.script, lsr.region)) + .collect(Collectors.toList())) + .elementsPerLine(3); + } + + // Returns an RbValue serialized from a byte array, as a concatenated sequence of rows of + // hex values. This is intended only for RbPaths using the ":bin" suffix. + // + // E.g. + // foo{ + // 0123456789abcdef0123456789abcdef + // ... + // 1c0de4c0ffee + // } + // + // Note that typically no indentation is used when writing this binary "blob". + private static RbValue ofBytes(byte[] data) { + ImmutableList.Builder hexValues = ImmutableList.builder(); + List bytes = Bytes.asList(data); + for (List line : Iterables.partition(bytes, 16)) { + hexValues.add(line.stream().map(b -> String.format("%02x", b)).collect(Collectors.joining())); + } + return RbValue.of(hexValues.build()); + } + + // Returns if the subtag is the '*' wildcard. This is not to be confused with the + // "ANY" character used in DistanceTable. + private static boolean isAny(String subtag) { + return subtag.equals("*"); + } + + // Returns if the subtag exists and is the '*' wildcard. + private static boolean isAny(Optional subtag) { + return subtag.map(LocaleDistanceMapper::isAny).orElse(false); + } + + // Main method for running this mapper directly with logging enabled. + // CLDR_DIR is picked up from system properties or environment variables. 
+ // Arguments: [] + public static void main(String[] args) throws IOException { + DebugWriter.writeForDebugging(args, LocaleDistanceMapper::process); + } +} diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/localedistance/PartitionInfo.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/localedistance/PartitionInfo.java new file mode 100644 index 00000000000..d5cbfe7f76d --- /dev/null +++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/localedistance/PartitionInfo.java @@ -0,0 +1,432 @@ +// © 2017 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html#License +package org.unicode.icu.tool.cldrtoicu.localedistance; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; + +import java.util.Collection; +import java.util.Collections; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; +import java.util.function.BiConsumer; +import java.util.function.Consumer; +import java.util.function.Function; +import java.util.logging.Logger; + +import com.google.common.base.CharMatcher; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.ImmutableSetMultimap; +import com.google.common.collect.SetMultimap; +import com.google.common.collect.Sets; +import com.google.common.collect.SortedSetMultimap; +import com.google.common.collect.TreeMultimap; +import com.ibm.icu.impl.locale.LSR; + +/** + * Provides mapping arrays to quickly lookup partition information for any region + * code in client libraries. + * + *

A region's partition is defined by the set of region variables (e.g. "$enUS") + * in the CLDR data. Each unique combination of variables forms a partition, and + * groups of partitions uniquely define language distance groupings. In slightly + * mathematical terms, partition groups form an "equivalence class" for regions + * with respect to language distance. + * + *

So by determining the minimum set of partitions and partition groups, and + * assigning short IDs to them, it's possible to create data structures which + * support all region pairings while being small and fast to access in client code. + */ +final class PartitionInfo { + private static final Logger logger = Logger.getLogger(PartitionInfo.class.getName()); + + /** + * A builder, to which region variables are added in order to define partitions + * and partition groups based on territory containment. + */ + static final class Builder { + // Possible operations to parse from a region expression (e.g. "US+005-BR"). + private static final CharMatcher REGION_OPS = CharMatcher.anyOf("+-"); + + private final TerritoryContainment territories; + private final Set variables = new HashSet<>(); + private final SortedSetMultimap regionToVariables = TreeMultimap.create(); + + private Builder(TerritoryContainment territories) { + this.territories = territories; + } + + // Returns whether the given string is a known variable or the wildcard token. + // Non variable strings (e.g. plain region codes) can be passed in and simply + // return false. + private boolean isKnownVariableOrWildcard(String variable) { + return variables.contains(variable) || variable.equals("*"); + } + + /** + * Adds a variable expression (e.g. "$foo" = "US+005-BR") from CLDR data and + * fully resolves all macro regions to their contained leaf regions. + * + *

The syntax is simple for now: + *

+         *     regionSet := region ([-+] region)*
+         * 
+ * There is no precedence, so "x+y-y+z" is "(((x+y)-y)+z)", and not + * "(x+y)-(y+z)". + */ + public void addVariableExpression(String variable, String expr) { + checkState(variable.startsWith("$") && !variable.startsWith("$!"), + "invalid variable: %s", variable); + checkState(!isKnownVariableOrWildcard(variable), + "duplicate variable: %s", variable); + // Parsing also flattens the list to the corresponding leaf regions, + // so there should be no macro regions here. + Set regions = parseAndFlattenRegionExpression(expr, territories); + // Add the mappings ("$foo" -> X) and the inverse ("$!foo" -> not(X)). + // + // The reason that the inverse mapping is needed is because some rules use + // the negated form of a variable (e.g. "$!enUS") and we must be able to + // resolve the set of associated partition IDs for it. + // + // If we only wanted the set of regions for the negated variable, that + // would be trivial (and there would be no need to store the negated values) + // but because the set of partition IDs for a negated variable is NOT always + // the negated set of partition IDs for the original variable (due to the way + // partitions overlap) it's not straightforward. + // + // In other words: + // regions-for("$!foo") == !regions-for("$foo") + // but: + // partition-ids-for("$!foo") != !partition-ids-for("$foo") + addVariable(variable, regions); + addVariable( + "$!" + variable.substring(1), + Sets.difference(territories.getLeafRegions(), regions)); + } + + private void addVariable(String variable, Iterable regions) { + checkArgument(variables.add(variable), + "variable '%s' already present in: %s", variable, regions); + for (String region : regions) { + checkArgument(!region.isEmpty(), "%s", regions); + regionToVariables.put(region, variable); + } + } + + // Parses a region expression (e.g. "US+005-BR") to a set of resolved "leaf" + // regions. 
+ private static Set parseAndFlattenRegionExpression( + String expr, TerritoryContainment territories) { + Set regions = new TreeSet<>(); + Consumer operation = regions::add; + int last = 0; + for (int i = REGION_OPS.indexIn(expr); i != -1; i = REGION_OPS.indexIn(expr, last)) { + applyOperation(operation, expr.substring(last, i), territories); + // Set up the next operation based on the separator char ('+' or '-'). + operation = (expr.charAt(i) == '+') ? regions::add : regions::remove; + last = i + 1; + } + applyOperation(operation, expr.substring(last), territories); + return regions; + } + + private static void applyOperation( + Consumer operation, String region, TerritoryContainment territories) { + checkArgument(!region.isEmpty(), "invalid region expresson (missing region)"); + ImmutableSet contained = territories.getLeafRegionsOf(region); + if (!contained.isEmpty()) { + // For macro regions, add all their contained leaf regions (direct or indirect). + contained.forEach(operation); + } else { + // Leaf regions are just added directly. + operation.accept(region); + } + } + + /** + * Registers an implicit variable defined by a region code, and returns the new variable + * name. + * + *

This method exists because the {@code } syntax supports referencing + * regions directly, rather than just as pre-defined variables (e.g. "en_*_GB"). We still + * want to track these variables however since they may interact with macro-regions. + * + * @param regionOrVariable a region or an existing variable reference. + * @return the name of the registered variable (including '$' prefix). + */ + public String ensureVariable(String regionOrVariable) { + if (isKnownVariableOrWildcard(regionOrVariable)) { + return regionOrVariable; + } + // Here we either have a "raw" region (e.g. "GB") or an unknown variable (e.g. "$foo"). + // However all explicit variables should have already been registered, so if this does + // start with '$', then it's an error. + checkArgument(!regionOrVariable.startsWith("$"), "unregistered variable: %s", regionOrVariable); + + // This is an implicit variable, referenced by its region code, so we know that it + // can never be referenced in the negated form (i.e. "$!GB"), so we don't need to add + // the inverse mapping in the same way we do for explicitly defined variables. + // + // We also allow implicit variables to appear more than once in the list of match + // rules, so don't call addVariable() here, since that prohibits repeated addition. + // Since 'regionToVariables' is a _set_ multimap, adding implicit variables is an + // idempotent operation, so it's okay if it's done more than once. + String variable = "$" + regionOrVariable; + variables.add(variable); + regionToVariables.put(regionOrVariable, variable); + return variable; + } + + public PartitionInfo build() { + // Step 1: Map regions to a unique "partition" ID. + // + // A region's partition is the set of variables which include it, and + // variables can be explicit (e.g. "$enUS"), implicit (e.g. "$GB") or + // negated (e.g. "$!enUS). 
+ // + // For example, region "US" is included in the variables "$americas" and + // "$enUS", but is also referenced in the "negated" variables "$!cnsar" + // and "$!maghreb", so the "partition" of "US" is: + // { $americas, $enUS, $!cnsar, $!maghreb } + // + // A partition ID is a token associated with each unique variable partition. + // + // Since other regions, such as "PR" (Puerto Rico) and "VI" (U.S. Virgin + // Islands), are also "in" the same partition as "US", they will share the + // same partition ID. + // + // However, while "CA" is also included in "$americas", it's NOT defined as + // an "$enUS" (American English) region, so its partition is: + // { $americas, $!enUS, $!cnsar, $!maghreb } + // and it will have a different partition ID. + + // Check that the region-to-partition map covers every leaf region (this + // is important to ensure partitions form a disjoint covering). + checkArgument(regionToVariables.keySet().equals(territories.getLeafRegions()), + "unexpected variable grouping (should cover all leaf regions): %s", + regionToVariables); + ImmutableMap regionToPartitionId = + mapLeafRegionsToPartitionIds(regionToVariables); + logger.fine(() -> String.format("region to partition ID: %s", regionToPartitionId)); + + // Step 2: Construct mappings to and from partition IDs, to group regions + // by the variables that define them. + + // A sorted mapping from every variable ("$foo" or "$!foo") to the IDs of + // the partitions it exists in. + // + // For example, "$americas" exists in partitions for both "$enUS" (American + // English) and "$!enUS" (non-American English) regions, so will be mapped + // to (at least) two unique partition IDs (e.g. X & Y). 
+ // "$americas" -> { X, Y } + ImmutableSetMultimap variableToPartitionIds = + mapVariablesToPartitionIds(regionToPartitionId, regionToVariables); + logger.fine(() -> String.format("variable to partition IDs: %s", variableToPartitionIds)); + + // A sorted mapping of each macro region to the partitions it intersects + // with. Unlike leaf regions, macro regions can map to groups of partitions + // rather than just a single one. + // + // For example, the macro region "419" (Latin America) intersects with + // both partitions: + // X = {$americas, $enUS, ...} (i.e. "Americas + American English") + // and: + // Y = {$americas, $!enUS, ...} (i.e. "Americas + non-American English") + // so this map would contain: + // "419" -> { X, Y } + ImmutableSetMultimap macroRegionToPartitionIds = + mapMacroRegionsToPartitionIds(regionToPartitionId, territories); + + // Step 3: Write the sparse "region index to partition group index" lookup + // array. This is the fast lookup array used to go from LSR region index to + // the partition group IDs for that region. + // + // Note that most entries in the array are zero, since the array maps from + // all possible regions, not just ones which exist. This is a space/time + // trade-off (and the array is compressed in the ICU data files anyway). + byte[] partitionLookupArray = new byte[LSR.REGION_INDEX_LIMIT]; + String[] partitionStrings = writePartitionLookupTable( + partitionLookupArray, regionToPartitionId, macroRegionToPartitionIds); + + return new PartitionInfo(variableToPartitionIds, partitionLookupArray, partitionStrings); + } + + private static ImmutableMap mapLeafRegionsToPartitionIds( + SetMultimap regionToVariables) { + // A generator for partition IDs which returns a single ASCII character for + // each unique partition. + // + // Partition IDs are emitted into the ICU data, so it's important they are + // small and compatible with the ICU data file format. 
+ Function, String> partitionToId = + Indexer.create(i -> { + // Must be a single 7-bit ASCII value and not '*'. This is NOT + // used as a numeric value anywhere and could end up being a non + // digit character if the number of unique partitions is > 10. + // As of June 2020, there are only 7 unique partitions. + char partitionChar = (char) ('0' + i); + checkState(partitionChar < 0x7f, "too many partitions: %s", i); + return String.valueOf(partitionChar); + }); + + // For each region, find its partition ID (based on the unique combination + // of variables that define it). + ImmutableMap.Builder regionToId = ImmutableMap.builder(); + regionToVariables.asMap().forEach( + (region, variables) -> regionToId.put(region, partitionToId.apply(variables))); + return regionToId.build(); + } + + private static ImmutableSetMultimap mapVariablesToPartitionIds( + ImmutableMap regionToPartitionId, + SortedSetMultimap regionToVariables) { + + // It's vital that this is a sorted multimap (of values as well as keys) + // since the values are later indexed and turned into partition strings + // (so stability of ID order in values is necessary). + SortedSetMultimap variableToPartitionIds = TreeMultimap.create(); + regionToVariables.asMap().forEach((region, variables) -> { + String partitionId = regionToPartitionId.get(region); + for (String variable : variables) { + variableToPartitionIds.put(variable, partitionId); + } + }); + return ImmutableSetMultimap.copyOf(variableToPartitionIds); + } + + private static ImmutableSetMultimap mapMacroRegionsToPartitionIds( + ImmutableMap regionToPartitionId, + TerritoryContainment territories) { + + // A mapping from each unique partition ID to the regions it contains. + // This mapping forms a disjoint covering of all (non-macro) regions and + // is just the "inverse" of the initial "region to partition ID" map. 
+ // + // For example, following the examples above where: + // X = {$americas, $enUS, ...} + // and: + // Y = {$americas, $!enUS, ...} + // + // We would get something like: + // X -> {"PR", "US", "VI", ...} + // Y -> {"CA", ...} + Map> partitionToRegions = + regionToPartitionId.asMultimap().inverse().asMap(); + + // Each macro region can then be decomposed to a mapping to the unique set + // of partitions it overlaps with based on its leaf regions and the regions + // of all known partitions. + SortedSetMultimap macroToPartitions = TreeMultimap.create(); + for (String macro : territories.getMacroRegions()) { + ImmutableSet leaves = territories.getLeafRegionsOf(macro); + partitionToRegions.forEach((partition, regions) -> { + if (!Collections.disjoint(leaves, regions)) { + macroToPartitions.put(macro, partition); + } + }); + } + return ImmutableSetMultimap.copyOf(macroToPartitions); + } + + private static String[] writePartitionLookupTable( + byte[] partitionLookupArray, + ImmutableMap regionToPartitionId, + ImmutableSetMultimap macroRegionToPartitionIds) { + + // A generator for indices of partition groups, based on partition IDs. + // + // For leaf regions this generates a one-to-one mapping with the single + // partition ID, but macro regions can overlap multiple partitions. + Indexer, Byte> partitionGroupIndexer = + Indexer.create(i -> { + // The partition group index must fit in a byte. + // For Java code simplicity, we want it to also be non-negative. + // As of June 2020, there are 15 partition groups. + checkState(i <= 0x7f, "too many partition groups: %s", i); + return (byte) i.intValue(); + }); + + // The default value in the partition lookup array (index 0) is mapped to by + // any unsupported region (since "LSR.indexForRegion()" is 0). + // We must therefore reserve a special partition group index for these cases + // before adding the rest of the partitions. 
+ partitionGroupIndexer.apply(ImmutableSet.of(".")); + + // Populate the radix-based sparse index array, where each region is converted + // to the LSR region index (which must correspond to how regions are indexed in + // the client side code). + BiConsumer> writePartitionIndex = + (region, ids) -> partitionLookupArray[LSR.indexForRegion(region)] = + partitionGroupIndexer.apply(ids); + + // Write leaf regions first (mostly to match the original code behaviour) + // and then macro regions. + // + // Convert the Map to a Map> + // to match the macro regions (even though each collection is a singleton). + regionToPartitionId.asMultimap().asMap().forEach(writePartitionIndex); + macroRegionToPartitionIds.asMap().forEach(writePartitionIndex); + + // Check invalid regions will map to the special "missing partition" value. + checkState(partitionLookupArray[0] == 0); + + // Return the unique partition groups (sets of partition IDs) as strings + // (as a sequence of single letter partition IDs). Leaf regions will always + // have a single partition ID, but macro regions can overlap with multiple + // partitions. + return partitionGroupIndexer.getValues().stream() + .map(ids -> String.join("", ids)).toArray(String[]::new); + } + } + + /** + * Returns a builder to which variable mappings are added, from which partition + * information is derived. 
+ */ + public static Builder builder(TerritoryContainment territories) { + return new Builder(territories); + } + + private final ImmutableSetMultimap variableToPartitionIds; + private final byte[] partitionLookupArray; + private final String[] partitionStrings; + + private PartitionInfo( + ImmutableSetMultimap variableToPartitionIds, + byte[] partitionLookupArray, + String[] partitionStrings) { + this.variableToPartitionIds = ImmutableSetMultimap.copyOf(variableToPartitionIds); + this.partitionLookupArray = partitionLookupArray; + this.partitionStrings = partitionStrings; + } + + /** + * Returns the set of partition IDs for the given variable, or {@code {"*"}} if the + * special '*' variable was given. The returned set must be non-empty because every + * variable includes at least one region, and all regions map to a partition ID. + */ + public ImmutableSet getPartitionIds(String variable) { + if (variable.equals("*")) { + return ImmutableSet.of("*"); + } + ImmutableSet result = variableToPartitionIds.get(variable); + checkArgument(!result.isEmpty(), "variable not defined: %s", variable); + return result; + } + + /** Returns the sparse lookup array from LSR region index to partition group index. */ + public byte[] getPartitionLookupArray() { + return partitionLookupArray; + } + + /** + * Returns the partition group lookup array from partition group index to partition + * ID string. + */ + public String[] getPartitionStrings() { + return partitionStrings; + } +} diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/localedistance/TerritoryContainment.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/localedistance/TerritoryContainment.java new file mode 100644 index 00000000000..522f6e5c0a2 --- /dev/null +++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/localedistance/TerritoryContainment.java @@ -0,0 +1,118 @@ +// © 2017 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html#License +package org.unicode.icu.tool.cldrtoicu.localedistance; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.collect.ImmutableSetMultimap.toImmutableSetMultimap; + +import java.util.Map.Entry; +import java.util.Set; +import java.util.regex.Pattern; + +import org.unicode.cldr.api.AttributeKey; +import org.unicode.cldr.api.CldrData; +import org.unicode.cldr.api.CldrPath; +import org.unicode.cldr.api.PathMatcher; + +import com.google.common.base.Splitter; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.ImmutableSetMultimap; +import com.google.common.collect.SetMultimap; +import com.google.common.collect.Sets; +import com.google.common.collect.SortedSetMultimap; +import com.google.common.collect.TreeMultimap; + +/** + * Territory containment graph. This is built from CLDR supplemental data and + * represents all territories and their containment, including macro regions + * such as {@code "016"}. The root node of the graph is {@code "001"}. + */ +final class TerritoryContainment { + // CLDR paths for containment data. + private static final PathMatcher CONTAINMENT_PATH = + PathMatcher.of("//supplementalData/territoryContainment/group[@type=*]"); + private static final AttributeKey TYPE = AttributeKey.keyOf("group", "type"); + private static final AttributeKey CONTAINS = AttributeKey.keyOf("group", "contains"); + + // Standard CLDR list values are split by space. + // NOTE: You must omit empty strings, since otherwise " foo " becomes ("", "foo", ""). + private static final Splitter LIST_SPLITTER = + Splitter.on(' ').trimResults().omitEmptyStrings(); + // The world region must be the only root in the graph. 
+ private static final String WORLD = "001"; + private static final Pattern REGION = Pattern.compile("[A-Z]{2}|[0-9]{3}"); + + /** + * Returns the territory containment information described by the given CLDR + * supplemental data. + */ + public static TerritoryContainment getContainment(CldrData supplementalData) { + // Directed, acyclic containment graph. Maps each territory to its direct contents. + // Note that since things like deprecated regions are included here, this allows + // sub-regions to have more than one parent. + SortedSetMultimap graph = TreeMultimap.create(); + supplementalData.accept(CldrData.PathOrder.DTD, v -> { + CldrPath path = v.getPath(); + if (CONTAINMENT_PATH.matches(path)) { + graph.putAll(v.get(TYPE), LIST_SPLITTER.split(v.get(CONTAINS))); + } + }); + return new TerritoryContainment(ImmutableSetMultimap.copyOf(graph)); + } + + /** Maps each macro-region to all its leaf contents (direct and indirect). */ + private final ImmutableSetMultimap macroToLeafRegions; + + private TerritoryContainment(ImmutableSetMultimap graph) { + // Do some double checking of the CLDR data. + graph.values().forEach( + r -> checkArgument(REGION.matcher(r).matches(), "bad region '%s' in: %s", r, graph)); + checkArgument(graph.containsKey(WORLD), "missing world region '%s'", WORLD); + // There should be only one "root" in the graph, so every other region should be + // contained by something. + Set allContained = ImmutableSet.copyOf(graph.values()); + Set roots = ImmutableSet.copyOf(Sets.difference(graph.keySet(), allContained)); + checkArgument(roots.equals(ImmutableSet.of(WORLD)), + "world region '%s' must be the only containment graph root (was %s)", WORLD, roots); + + // Start with a copy of the direct containment graph (but still pass in the direct + // graph to avoid issues with concurrent modification). + // If the graph is cyclic, this step will never terminate and run out of memory + // (and since this is a build-time tool, that's probably fine). 
+ SortedSetMultimap resolved = TreeMultimap.create(graph); + resolve(WORLD, graph, resolved); + // For leaf regions (direct or indirect) just retain any sub-regions which don't + // have child regions from the resolved graph. + this.macroToLeafRegions = resolved.entries().stream() + // Only keep macro regions (leaf regions don't have child regions by definition). + .filter(e -> !graph.get(e.getKey()).isEmpty()) + // Only keep the single-region e.getValue() if it is a leaf region. + .filter(e -> graph.get(e.getValue()).isEmpty()) + .collect(toImmutableSetMultimap(Entry::getKey, Entry::getValue)); + } + + // Recursively resolve the region and its child regions. + private static Set resolve( + String region, SetMultimap graph, SetMultimap resolved) { + graph.get(region).forEach(sub -> resolved.putAll(region, resolve(sub, graph, resolved))); + return resolved.get(region); + } + + /** + * Returns the leaf regions contained in the given region (if the given region is a + * leaf region, then the empty set is returned). + */ + public ImmutableSet getLeafRegionsOf(String region) { + return macroToLeafRegions.get(region); + } + + /** Returns all leaf regions. */ + public ImmutableSet getLeafRegions() { + return macroToLeafRegions.get(WORLD); + } + + /** Returns all macro regions. */ + public ImmutableSet getMacroRegions() { + return macroToLeafRegions.keySet(); + } +} diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/localedistance/Trie.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/localedistance/Trie.java new file mode 100644 index 00000000000..2e04141f9a2 --- /dev/null +++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/localedistance/Trie.java @@ -0,0 +1,112 @@ +// © 2017 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html#License +package org.unicode.icu.tool.cldrtoicu.localedistance; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; + +import java.nio.ByteBuffer; +import java.util.function.Consumer; + +import com.ibm.icu.impl.locale.LocaleDistance; +import com.ibm.icu.util.BytesTrieBuilder; + +/** + * Trie constructed by adding "spans" of data representing prefix + * sequences with mapped values. + * + *

When a prefix needs to be added to a Trie, a new span is created to + * represent the additional data. If a final value is added to a span, then + * the current prefix data is committed to the underlying Trie as its key. + * + *

Typical use might look like: + *

{@code
+ * Trie trie = new Trie();
+ * mappedData.forEach(
+ *     (prefix, subValues) -> trie.root().with(prefix, subSpan -> process(subSpan, subValues)));
+ * byte[] bytes = trie.toByteArray();
+ * }
+ * }
+ * where the {@code process} method may create more sub-spans, and eventually + * calls {@link Span#putPrefixAndValue(int)} to commit the current sequence + * of prefixes and the given value to the Trie. + * + *

Since spans share a common buffer for prefix data, it is important + * that extended spans are consumed before the parent span is used again. + * This is one reason why the API requires a consumer to be passed when a + * span is extended. + */ +final class Trie { + private final BytesTrieBuilder trieBuilder = new BytesTrieBuilder(); + private final byte[] spanBytes = new byte[24]; + + /** + * Represents a sequence of prefixes to be added to the underlying Trie + * when a value is specified. + * + *

The position of a span cannot be modified, but they are not thread + * safe (since they share the same underlying buffer). + */ + final class Span { + // The index *after* the last prefix was added. + private final int index; + + // The root span. + private Span() { + this.index = 0; + } + + // An extended span with the given prefix included. + private Span(int index, String prefix) { + checkArgument(index >= 0, "bad index: %s", index); + checkState(!prefix.isEmpty(), "invalid subtag: %s", prefix); + checkState(index + prefix.length() <= spanBytes.length, "span too long"); + if (prefix.equals("*")) { + spanBytes[index++] = '*'; + } else { + checkArgument(!prefix.contains("*"), "prefix must not contain '*': %s", prefix); + for (int i = 0; i < prefix.length(); i++) { + char c = prefix.charAt(i); + checkArgument(c < LocaleDistance.END_OF_SUBTAG, "invalid trie character: %s", c); + spanBytes[index++] = (byte) c; + } + // Mark the final character as a terminator to avoid overlap matches. + spanBytes[index - 1] |= (byte) LocaleDistance.END_OF_SUBTAG; + } + this.index = index; + } + + /** + * Extends the current span by creating a new span with the given ASCII + * prefix data, and passing it to the given consumer. The original span is + * not modified, but must not be used again until the consumer is finished. + * + *

The prefix string must contain only 7-bit ASCII characters. + */ + public void with(String prefix, Consumer withFn) { + withFn.accept(new Span(index, prefix)); + } + + /** + * Commits the current prefix data and the given value to the underlying Trie. + */ + public void putPrefixAndValue(int value) { + checkArgument(value >= 0, "bad trie value: %s", value); + checkState(index > 0, "missing prefix for value: %s", value); + trieBuilder.add(spanBytes, index, value); + } + } + + /** Returns the root span with no current prefix data. */ + public Span root() { + return new Span(); + } + + /** Serializes the underlying Trie data to a byte array (see also {@link BytesTrieBuilder}). */ + public byte[] toByteArray() { + ByteBuffer buffer = trieBuilder.buildByteBuffer(BytesTrieBuilder.Option.SMALL); + byte[] bytes = new byte[buffer.remaining()]; + buffer.get(bytes); + return bytes; + } +} diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/AbstractPathValueMapper.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/AbstractPathValueMapper.java index b66b7d1c9bf..becf9d1d830 100644 --- a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/AbstractPathValueMapper.java +++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/AbstractPathValueMapper.java @@ -62,8 +62,8 @@ abstract class AbstractPathValueMapper { final void addIcuData(IcuData icuData) { // This subclass mostly exists to control the fact that results need to be added in one go // to the IcuData because of how referenced paths are handled. If results could be added in - // multiple passes, you could have confusing situations in which values has path references - // in them but the referenced paths have not been transformed yet. 
Forcing the subclass to + // multiple passes, you could have confusing situations in which values have path references + // in them, but the referenced paths have not been transformed yet. Forcing the subclass to // implement a single method to generate all results at once ensures that we control the // lifecycle of the data and how results are processed as they are added to the IcuData. checkState(resultsByRbPath.isEmpty(), diff --git a/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/localedistance/DistanceTableTest.java b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/localedistance/DistanceTableTest.java new file mode 100644 index 00000000000..a9d6bd2d4d9 --- /dev/null +++ b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/localedistance/DistanceTableTest.java @@ -0,0 +1,164 @@ +// © 2020 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html#License +package org.unicode.icu.tool.cldrtoicu.localedistance; + +import static com.google.common.truth.Truth.assertThat; +import static org.unicode.icu.tool.cldrtoicu.testing.AssertUtils.assertThrows; + +import org.junit.Test; + +import com.google.common.collect.ImmutableMap; +import com.ibm.icu.util.BytesTrie; + +// NOTE: Remember that here, "region" is synonymous with a "partition group ID". +public class DistanceTableTest { + @Test + public void testSimpleMapping() { + DistanceTable.Builder builder = defaultTable(); + // You need at least one non default mapping. + builder.addDistance(23, true, "en", "en"); + DistanceTable table = builder.build(); + assertThat(getTrieTable(table)).containsExactly("en-en", 23); + assertThat(table.getDefaultDistances()).asList().containsExactly(80, 50, 4, 4).inOrder(); + } + + @Test + public void testReverseMapping() { + DistanceTable.Builder builder = defaultTable(); + // You need at least one non default mapping. 
+ builder.addDistance(1, false, "no", "nb"); + DistanceTable table = builder.build(); + assertThat(getTrieTable(table)) + .containsExactly( + "nb-no", 1, + "no-nb", 1) + .inOrder(); + } + + @Test + public void testMinRegionDistance() { + DistanceTable.Builder builder = defaultTable(); + // You need at least one non default mapping. + builder.addDistance(2, true, "zh", "zh", "Hant", "Hant", "1", "1"); + builder.addDistance(4, true, "zh", "zh", "Hant", "Hant", "2", "2"); + builder.addDistance(6, true, "zh", "zh", "Hant", "Hant", "*", "*"); + DistanceTable table = builder.build(); + assertThat(getTrieTable(table)) + .containsExactly( + // Inferred mappings for "parent" locales. + "zh-zh", 0, // Equal locales have zero distance. + "zh-zh-*-*", 50, // Default unknown script distance + "zh-zh-Hant-Hant", 0, + // Trie ordering prefers "*" mapping at the front. + "zh-zh-Hant-Hant-*-*", 6, + "zh-zh-Hant-Hant-1-1", 2, + "zh-zh-Hant-Hant-2-2", 4) + .inOrder(); + // Minimum region distance is recorded successfully (last value). + assertThat(table.getDefaultDistances()).asList().containsExactly(80, 50, 4, 2).inOrder(); + } + + @Test + public void testSkipScript() { + DistanceTable.Builder builder = defaultTable(); + // You need at least one non default mapping. + builder.addDistance(2, true, "en", "en", "*", "*", "1", "1"); + builder.addDistance(4, true, "en", "en", "*", "*", "*", "*"); + DistanceTable table = builder.build(); + assertThat(getTrieTable(table)) + .containsExactly( + // "en-en" is marked for "skip script" so the remaining "en-en-..." + // mappings are correctly interpretted as "language-region". + "en-en", 128, + "en-en-*-*", 4, + "en-en-1-1", 2) + .inOrder(); + } + + @Test + public void testFirstOneWins() { + DistanceTable.Builder builder = defaultTable(); + // Duplicate mappings are only expected for "region" where different rules can + // produce duplicate mappings by virtue of having non-disjoint region partitions. 
+ builder.addDistance(2, true, "en", "en", "*", "*", "1", "1"); + builder.addDistance(4, true, "en", "en", "*", "*", "1", "1"); // ignored + builder.addDistance(6, true, "en", "en", "*", "*", "*", "*"); + DistanceTable table = builder.build(); + assertThat(getTrieTable(table)) + .containsExactly( + "en-en", 128, + "en-en-*-*", 6, + "en-en-1-1", 2) + .inOrder(); + } + + @Test + public void testBadDistance() { + IllegalArgumentException e = assertThrows( + IllegalArgumentException.class, + () -> defaultTable().addDistance(123, true, "en", "fr")); + assertThat(e).hasMessageThat().contains("distance"); + assertThat(e).hasMessageThat().contains("123"); + } + + @Test + public void testBadParameters() { + IllegalArgumentException e = assertThrows( + IllegalArgumentException.class, + () -> defaultTable().addDistance(1, true, "en", "en", "*")); + assertThat(e).hasMessageThat().contains("invalid number of arguments"); + } + + @Test + public void testBadKeys() { + IllegalArgumentException e = assertThrows( + IllegalArgumentException.class, + () -> defaultTable().addDistance(1, true, "en", "*")); + assertThat(e).hasMessageThat().contains("invalid mapping key"); + assertThat(e).hasMessageThat().contains("en"); + assertThat(e).hasMessageThat().contains("�"); + } + + private static DistanceTable.Builder defaultTable() { + DistanceTable.Builder table = DistanceTable.builder(); + // Defaults (which are necessary to add, but should always be trimmed from results). + // The actual distances don't matter (and are copied to the distance array). + table.addDistance(80, false, "*", "*"); + table.addDistance(50, false, "*", "*", "*", "*"); + table.addDistance(4, false, "*", "*", "*", "*", "*", "*"); + return table; + } + + @Test + public void testNoDefaultLanguage() { + // Don't get the default table, since we need to test without defaults. 
+ DistanceTable.Builder builder = DistanceTable.builder(); + IllegalStateException e = assertThrows(IllegalStateException.class, builder::build); + assertThat(e).hasMessageThat().contains("missing default language"); + } + + @Test + public void testNoDefaultScript() { + // Don't get the default table, since we need to test without defaults. + DistanceTable.Builder builder = DistanceTable.builder(); + builder.addDistance(80, false, "*", "*"); + IllegalStateException e = assertThrows(IllegalStateException.class, builder::build); + assertThat(e).hasMessageThat().contains("missing default script"); + } + + @Test + public void testNoDefaultRegion() { + // Don't get the default table, since we need to test without defaults. + DistanceTable.Builder builder = DistanceTable.builder(); + builder.addDistance(80, false, "*", "*"); + builder.addDistance(50, false, "*", "*", "*", "*"); + IllegalStateException e = assertThrows(IllegalStateException.class, builder::build); + assertThat(e).hasMessageThat().contains("missing default region"); + } + + // VisibleForTesting + public ImmutableMap getTrieTable(DistanceTable table) { + // We rebuild the Trie from the byte[] data. + return TestData.getTrieTable(new BytesTrie(table.getTrie().toByteArray(), 0), "*-*", i -> i); + } +} diff --git a/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/localedistance/IndexerTest.java b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/localedistance/IndexerTest.java new file mode 100644 index 00000000000..668348632cf --- /dev/null +++ b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/localedistance/IndexerTest.java @@ -0,0 +1,41 @@ +// © 2020 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html#License +package org.unicode.icu.tool.cldrtoicu.localedistance; + +import static com.google.common.truth.Truth.assertThat; + +import org.junit.Test; + +import com.google.common.collect.ImmutableList; + +public class IndexerTest { + @Test + public void testSimple() { + Indexer indexer = Indexer.create(); + assertThat(indexer.apply("foo")).isEqualTo(0); + assertThat(indexer.apply("bar")).isEqualTo(1); + assertThat(indexer.apply("baz")).isEqualTo(2); + assertThat(indexer.apply("foo")).isEqualTo(0); + } + + @Test + public void testWithTransform() { + ImmutableList words = ImmutableList.of("ONE", "TWO", "THREE"); + Indexer indexer = Indexer.create(words::get); + assertThat(indexer.apply("foo")).isEqualTo("ONE"); + assertThat(indexer.apply("bar")).isEqualTo("TWO"); + assertThat(indexer.apply("baz")).isEqualTo("THREE"); + assertThat(indexer.apply("foo")).isEqualTo("ONE"); + + } + + @Test + public void getValues() { + Indexer indexer = Indexer.create(); + indexer.apply("foo"); + indexer.apply("bar"); + indexer.apply("baz"); + indexer.apply("bar"); + assertThat(indexer.getValues()).containsExactly("foo", "bar", "baz").inOrder(); + } +} diff --git a/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/localedistance/LikelySubtagsBuilderTest.java b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/localedistance/LikelySubtagsBuilderTest.java new file mode 100644 index 00000000000..8a83308f0ef --- /dev/null +++ b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/localedistance/LikelySubtagsBuilderTest.java @@ -0,0 +1,110 @@ +// © 2020 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html#License +package org.unicode.icu.tool.cldrtoicu.localedistance; + +import static com.google.common.truth.Truth.assertThat; +import static java.util.Arrays.asList; +import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.AliasReason.DEPRECATED; +import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.AliasReason.LEGACY; +import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.AliasReason.OVERLONG; +import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.AliasType.LANGUAGE; +import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.AliasType.TERRITORY; +import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.alias; +import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.likelySubtag; +import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.lsr; + +import org.junit.Test; +import org.unicode.cldr.api.CldrData; +import org.unicode.cldr.api.CldrDataSupplier; +import org.unicode.cldr.api.CldrValue; + +import com.google.common.collect.ImmutableMap; +import com.ibm.icu.impl.locale.LSR; +import com.ibm.icu.impl.locale.XLikelySubtags; +import com.ibm.icu.util.BytesTrie; + +public class LikelySubtagsBuilderTest { + + @Test + public void testLanguageAliases() { + XLikelySubtags.Data subtags = LikelySubtagsBuilder.build(getTestData( + // Minimum mapping (or else code complains). + likelySubtag("und", "en_Latn_US"), + + alias(LANGUAGE, DEPRECATED, "in", "id"), + alias(LANGUAGE, DEPRECATED, "mo", "ro"), + // Overlong languages are ignored. + alias(LANGUAGE, OVERLONG, "eng", "en"), + // Non-simple languages with script, region or other extensions are ignored. 
+ alias(LANGUAGE, LEGACY, "zh_TW", "zh_Hant_TW"), + alias(LANGUAGE, LEGACY, "i-default", "en-x-i-default"))); + + assertThat(subtags.languageAliases).containsExactly("in", "id", "mo", "ro"); + } + + @Test + public void testTerritoryAliases() { + XLikelySubtags.Data subtags = LikelySubtagsBuilder.build(getTestData( + // Minimum mapping (or else code complains). + likelySubtag("und", "en_Latn_US"), + + // When more than one replacement exists, take the first. + alias(TERRITORY, DEPRECATED, "CS", "RS ME"), + alias(TERRITORY, DEPRECATED, "UK", "GB"), + // Overlong territories are ignored. + alias(TERRITORY, OVERLONG, "eng", "en"), + alias(TERRITORY, OVERLONG, "999", "ZZ"))); + + assertThat(subtags.regionAliases).containsExactly("CS", "RS", "UK", "GB"); + } + + @Test + public void testLikelySubtags() { + XLikelySubtags.Data subtags = LikelySubtagsBuilder.build(getTestData( + likelySubtag("und", "en_Latn_US"), + likelySubtag("en", "en_Latn_US"), + likelySubtag("pt", "pt_Latn_BR"), + likelySubtag("und_BR", "pt_Latn_BR"), + likelySubtag("zh", "zh_Hans_CN"), + likelySubtag("zh_TW", "zh_Hant_TW"), + likelySubtag("zh_Hant", "zh_Hant_TW"))); + + assertThat(subtags.lsrs).asList() + .containsExactly( + // Special cases (these should never change). + lsr(""), + lsr("skip-script"), + // Locales mapped to by the likely subtags mappings (in order). + lsr("en-Latn-US"), + lsr("pt-Latn-BR"), + lsr("zh-Hans-CN"), + lsr("zh-Hant-TW")) + .inOrder(); + + // Order is by "subtag" (left-to-right) with lexicographical order of tags (other + // than '*' which is always sorted first). + // Results are mapped to their corresponding value in the LSRs list. 
+ assertThat(getTrieTable(subtags)) + .containsExactly( + "*-*-*", lsr("en-Latn-US"), + "*-*-BR", lsr("pt-Latn-BR"), + "*-Latn-*", lsr("en-Latn-US"), + "*-Latn-BR", lsr("pt-Latn-BR"), + "*-Latn-US", lsr("en-Latn-US"), + "en", lsr("en-Latn-US"), + "pt", lsr("pt-Latn-BR"), + "zh-*-*", lsr("zh-Hans-CN"), + "zh-*-TW", lsr("zh-Hant-TW"), + "zh-Hant", lsr("zh-Hant-TW")) + .inOrder(); + } + + private static ImmutableMap getTrieTable(XLikelySubtags.Data subtags) { + // We rebuild the Trie from the byte[] data. + return TestData.getTrieTable(new BytesTrie(subtags.trie, 0), "*", i -> subtags.lsrs[i]); + } + + private static CldrData getTestData(CldrValue... values) { + return CldrDataSupplier.forValues(asList(values)); + } +} diff --git a/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/localedistance/LocaleDistanceMapperTest.java b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/localedistance/LocaleDistanceMapperTest.java new file mode 100644 index 00000000000..e7e833cacaa --- /dev/null +++ b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/localedistance/LocaleDistanceMapperTest.java @@ -0,0 +1,298 @@ +// © 2020 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html#License +package org.unicode.icu.tool.cldrtoicu.localedistance; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.truth.Truth.assertThat; +import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.AliasReason.DEPRECATED; +import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.AliasReason.LEGACY; +import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.AliasReason.MACRO; +import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.AliasType.LANGUAGE; +import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.AliasType.TERRITORY; +import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.alias; +import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.cldrData; +import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.deprecatedTerritory; +import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.languageMatch; +import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.likelySubtag; +import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.matchVariable; +import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.paradigms; +import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.territoryGroup; +import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.territoryGrouping; +import static org.unicode.icu.tool.cldrtoicu.testing.IcuDataSubjectFactory.assertThat; + +import java.io.ByteArrayOutputStream; +import java.util.List; + +import org.junit.Test; +import org.unicode.cldr.api.CldrData; +import org.unicode.icu.tool.cldrtoicu.IcuData; +import org.unicode.icu.tool.cldrtoicu.RbPath; +import org.unicode.icu.tool.cldrtoicu.RbValue; + +import com.google.common.base.CharMatcher; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import 
com.google.common.collect.ImmutableSetMultimap; +import com.ibm.icu.impl.locale.LSR; +import com.ibm.icu.util.BytesTrie; + +/** + * Higher level tests for {@link LocaleDistanceMapper} to demonstrate that CLDR values + * are matched and processed, and the IcuData is written as expected. + * + *

Most of the separate parts which make up this mapper are already tested at a + * lower level in the other tests in this package. + */ +public class LocaleDistanceMapperTest { + @Test + public void testEndToEnd() { + // Language match elements are ordered, so need an incrementing sort index. + int idx = 0; + + // A representative subset of CLDR data needed to generate the locale distance. + // This focuses on two distinct cases: + // 1: American vs non-American and British English + // This demonstrates the way that special case mappings are handled. + // 2: Chinese, Simplified and Traditional + // This demonstrates languages with multiple scripts. + CldrData testData = cldrData( + paradigms("en", "en_GB", "es", "es_419"), + matchVariable("$enUS", "PR+US+VI"), + matchVariable("$cnsar", "HK+MO"), + + // The element is marked "ORDERED" in the DTD, so + // ordering of match rules can can affect output (when paths are + // otherwise equal). DTD ordering will not re-order this data. + languageMatch("yue", "zh", 10, true, ++idx), + languageMatch("*", "*", 80, false, ++idx), + + languageMatch("zh_Hans", "zh_Hant", 15, true, ++idx), + languageMatch("zh_Hant", "zh_Hans", 19, true, ++idx), + languageMatch("zh_Latn", "zh_Hans", 20, true, ++idx), + languageMatch("*_*", "*_*", 50, false, ++idx), + + languageMatch("en_*_$enUS", "en_*_$enUS", 4, false, ++idx), + languageMatch("en_*_$!enUS", "en_*_GB", 3, false, ++idx), + languageMatch("en_*_$!enUS", "en_*_$!enUS", 4, false, ++idx), + languageMatch("en_*_*", "en_*_*", 5, false, ++idx), + + languageMatch("zh_Hant_$cnsar", "zh_Hant_$cnsar", 4, false, ++idx), + languageMatch("zh_Hant_$!cnsar", "zh_Hant_$!cnsar", 4, false, ++idx), + languageMatch("zh_Hant_*", "zh_Hant_*", 5, false, ++idx), + languageMatch("*_*_*", "*_*_*", 4, false, ++idx), + + // NOTE: This is deliberately NOT in DTD order to demonstrate that the + // mapper will reorder these (putting "und" last) which means that the + // ICU data here is NOT affected by changes in 
the likely subtag order). + likelySubtag("und", "en_Latn_US"), + likelySubtag("und_HK", "zh_Hant_HK"), + likelySubtag("und_MO", "zh_Hant_MO"), + likelySubtag("und_TW", "zh_Hant_TW"), + likelySubtag("und_030", "zh_Hans_CN"), + likelySubtag("und_142", "zh_Hans_CN"), + likelySubtag("und_CN", "zh_Hans_CN"), + likelySubtag("und_Hans", "zh_Hans_CN"), + likelySubtag("und_Hant", "zh_Hant_TW"), + likelySubtag("zh", "zh_Hans_CN"), + likelySubtag("zh_Hant", "zh_Hant_TW"), + likelySubtag("zh_TW", "zh_Hant_TW"), + + // NOT in DTD order (to demonstrate order invariance later). + alias(LANGUAGE, LEGACY, "zh_SG", "zh_Hans_SG"), + alias(LANGUAGE, LEGACY, "zh_HK", "zh_Hant_HK"), + alias(LANGUAGE, LEGACY, "zh_TW", "zh_Hant_TW"), + alias(LANGUAGE, LEGACY, "zh_MO", "zh_Hant_MO"), + alias(LANGUAGE, LEGACY, "zh_CN", "zh_Hans_CN"), + alias(LANGUAGE, MACRO, "cmn", "zh"), + + // NOT in DTD order (to demonstrate order invariance later). + alias(TERRITORY, DEPRECATED, "UK", "GB"), + alias(TERRITORY, DEPRECATED, "AN", "CW", "SX", "BQ"), + + // Rather trimmed down containment hierarchy. It still retains macro + // regions and grouping to demonstrate that these work as expected. 
+ territoryGroup("001", "019", "142", "150"), // World + territoryGrouping("001", "EU"), + territoryGroup("019", "021", "419"), // Americas + territoryGroup("142", "030", "035"), // Asia + territoryGroup("150", "154", "155"), // Europe + territoryGrouping("EU", "DE", "FR", "IE"), // European Union (no CH or GB) + territoryGroup("021", "CA", "PM", "US"), // Northern America + territoryGroup("419", "013", "029"), // Latin America and the Caribbean + territoryGroup("030", "CN", "HK", "MO", "TW"), // Eastern Asia + territoryGroup("035", "PH", "SG", "TH", "VN"), // South-Eastern Asia + territoryGroup("154", "GB", "IE"), // Northern Europe + territoryGroup("155", "CH", "DE", "FR"), // Western Europe + territoryGroup("013", "CR", "MX", "PA"), // Central America + territoryGroup("029", "BQ", "CW", "PR", "SX", "VI"), // Caribbean + deprecatedTerritory("029", "AN")); // Antilles (=> BQ, CW, SX) + + IcuData icuData = LocaleDistanceMapper.process(testData); + // Aliases come in (deprecated, replacement) pairs. + assertThat(icuData).hasValuesFor("likely/languageAliases", "cmn", "zh"); + assertThat(icuData).hasValuesFor("likely/regionAliases", "AN", "CW", "UK", "GB"); + + // LSR values come in (language, script, region) tuples. They are the mapped-to + // values for the likely subtag mappings, ordered by the DTD order in which the + // mapping keys were encountered. + assertThat(icuData).hasValuesFor("likely/lsrs", + "", "", "", + "skip", "script", "", + "zh", "Hans", "CN", + "zh", "Hant", "TW", + "en", "Latn", "US", + "zh", "Hant", "HK", + "zh", "Hant", "MO"); + + // It's a bit easier to see how match keys are grouped against the partitions. + ImmutableSetMultimap likelyTrie = + getTrieMap(icuData, "likely/trie:bin", "*").asMultimap().inverse(); + + // Special values in the lookup table don't map from any locales directly. 
+ assertThat(likelyTrie).valuesForKey(0).isEmpty(); + assertThat(likelyTrie).valuesForKey(1).isEmpty(); + + // Index 4: en-Latn-US (the general default and default for Latn). + assertThat(likelyTrie).valuesForKey(4).containsExactly("*-Latn-*", "*-Latn-US", "*-*-*"); + + // Index 2: zh-Hans-CN (default for zh, Hans and CN separately). + assertThat(likelyTrie).valuesForKey(2).containsExactly( + "*-*-030", "*-*-142", // macro regions + "*-*-CN", "*-Hans-*", "*-Hans-CN", // unknown language match + "cmn-*-*", // language alias + "zh-*-*"); // default for language + + // Index 3: zh-Hant-TW (default for zh if Hant or TW is given). + assertThat(likelyTrie).valuesForKey(3).containsExactly( + "*-*-TW", "*-Hant-*", "*-Hant-TW", // unknown language match + "cmn-*-TW", "cmn-Hant", // language alias with specific script/region + "zh-*-TW", "zh-Hant"); // default for script/region + + // Other zh languages (zh-Hant-HK, zh-Hant-MO) require an explicit region match. + assertThat(likelyTrie).valuesForKey(5).containsExactly("*-*-HK", "*-Hant-HK"); + assertThat(likelyTrie).valuesForKey(6).containsExactly("*-*-MO", "*-Hant-MO"); + + // Pairs of expanded paradigm locales (using LSR tuples) in declaration order. + // This is just the list from the CLDR data with no processing. + assertThat(icuData).hasValuesFor("match/paradigms", + "en", "Latn", "US", + "en", "Latn", "GB", + "es", "Latn", "ES", + "es", "Latn", "419"); + + // See PartitionInfoTest for a description of the ordering of these strings. + assertThat(icuData).hasValuesFor("match/partitions", + ".", "0", "1", "2", "3", "0123", "03", "02", "01"); + + ImmutableMap matchTrie = getTrieMap(icuData, "match/trie:bin", "*-*"); + byte[] regionLookup = getBytes(icuData, "match/regionToPartitions:bin"); + ImmutableList partitions = + icuData.get(RbPath.parse("match/partitions")).get(0).getElements(); + + // Test defaults have been trimmed. 
+ assertThat(matchTrie).doesNotContainKey("*-*"); + assertThat(matchTrie).doesNotContainKey("*-*-*-*"); + assertThat(matchTrie).doesNotContainKey("*-*-*-*-*-*"); + + // Some zh specific tests. + assertThat(matchTrie).containsEntry("yue-zh", 10); // Encapsulated language + assertThat(matchTrie).containsEntry("zh-zh-Hant-Hant-*-*", 5); + + // Special marker that means "en-en" matches don't use script information. + // This is assumed in the distance tests below, so it's important to check. + assertThat(matchTrie).containsEntry("en-en", 128); + + // British English is a slightly better match against non-American English. + assertEnDistanceForRegions(matchTrie, regionLookup, partitions, "CA", "GB", 3); + assertEnDistanceForRegions(matchTrie, regionLookup, partitions, "GB", "GB", 3); + // "EU" works here because while it's a macro region, in this data it only + // covers a single partition. + assertEnDistanceForRegions(matchTrie, regionLookup, partitions, "GB", "EU", 3); + + // Pairs of non-American or American English languages get a larger distance. + assertEnDistanceForRegions(matchTrie, regionLookup, partitions, "CA", "DE", 4); + assertEnDistanceForRegions(matchTrie, regionLookup, partitions, "US", "PR", 4); + // Deprecated regions (AN) are still mapped to partitions and get real distances. + assertEnDistanceForRegions(matchTrie, regionLookup, partitions, "AN", "TW", 4); + + // Mixing American and non-American English gets the default "en-en-*-*" distance. + assertEnDistanceForRegions(matchTrie, regionLookup, partitions, "GB", "US", 5); + assertEnDistanceForRegions(matchTrie, regionLookup, partitions, "CA", "US", 5); + assertEnDistanceForRegions(matchTrie, regionLookup, partitions, "US", "AN", 5); + + // Default distances for language, script and region, plus minimum region distance. + // Minimum region distance is "en_*_$!enUS" -> "en_*_GB" (as seen above). 
+ assertThat(icuData).hasValuesFor("match/distances:intvector", "80", "50", "4", "3"); + } + + // Helper to make assertions about language distance a bit more readable. + // PartitionInfoTest includes more low level tests for precise ordering etc. + private static void assertEnDistanceForRegions( + ImmutableMap matchTrie, + byte[] regionLookup, + ImmutableList paritions, + String regionA, String regionB, + int distance) { + // Three step lookup for each region: + // 1: Find LSR index from region string. + // 2: Lookup partition group index from region lookup table. + // 3: Lookup partition group string from partitions table. + String partitionA = paritions.get(regionLookup[LSR.indexForRegion(regionA)]); + String partitionB = paritions.get(regionLookup[LSR.indexForRegion(regionB)]); + + // For now only support cases where there's a single partition ID associated + // with the region (this is all non-macro regions and *some* macro regions). + checkArgument(partitionA.length() == 1 && partitionB.length() == 1, + "multiple partitions unsupported in test: %s %s", regionA, regionB); + + // This is a depth 2 key because we know that "en" skips scripts. This will + // not work the same for "zh" because that needs scripts information. + String key = String.format("en-en-%s-%s", partitionA, partitionB); + if (matchTrie.containsKey(key)) { + assertThat(matchTrie).containsEntry(key, distance); + } else { + assertThat(matchTrie).containsEntry("en-en-*-*", distance); + } + } + + // Returns the mapping for a Trie from a ":bin" suffixed resource value. + // "star" defines what the Trie wildcard should be expanded to (for readability). + private static ImmutableMap getTrieMap(IcuData icuData, String path, String star) { + return TestData.getTrieTable(getTrie(icuData, path), star, i -> i); + } + + // Reads a Trie from a ":bin" suffixed resource value. 
+ private static BytesTrie getTrie(IcuData icuData, String path) { + return new BytesTrie(getBytes(icuData, path), 0); + } + + // Reads a byte array from a ":bin" suffixed resource value. + private static byte[] getBytes(IcuData icuData, String path) { + RbPath rbPath = RbPath.parse(path); + checkArgument(rbPath.isBinPath(), "only binary paths (:bin) should have binary data: %s", path); + List rbValues = icuData.get(rbPath); + checkArgument(rbValues != null, "missing value for: %s", rbPath); + checkArgument(rbValues.size() == 1, "expect single RbValue: %s", rbValues); + // Take a sequence of hex-strings, convert each to a byte[] and collect them. + return rbValues.get(0).getElements().stream() + .map(LocaleDistanceMapperTest::decodeHex) + .collect( + ByteArrayOutputStream::new, + (out, b) -> out.write(b, 0, b.length), + (out, b) -> out.write(b.toByteArray(), 0, b.size())) + .toByteArray(); + } + + // Hex chars to byte array (2 chars per byte, little endian). + private static byte[] decodeHex(String s) { + checkArgument(s.length() % 2 == 0, "binary hex strings must have an even length: %s", s); + checkArgument(HEX.matchesAllOf(s), "invalid binary hex string: %s", s); + byte[] bytes = new byte[s.length() / 2]; + for (int n = 0; n < bytes.length; n++) { + bytes[n] = (byte) Integer.parseUnsignedInt(s.substring(2 * n, 2 * (n + 1)), 16); + } + return bytes; + } + + private static final CharMatcher HEX = CharMatcher.anyOf("0123456789abcdefABCDEF"); +} diff --git a/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/localedistance/PartitionInfoTest.java b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/localedistance/PartitionInfoTest.java new file mode 100644 index 00000000000..010aa49c0a0 --- /dev/null +++ b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/localedistance/PartitionInfoTest.java @@ -0,0 +1,91 @@ +// © 2020 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package org.unicode.icu.tool.cldrtoicu.localedistance;
+
+import static com.google.common.truth.Truth.assertThat;
+import static java.util.Arrays.asList;
+
+import org.junit.Test;
+import org.unicode.cldr.api.CldrDataSupplier;
+import org.unicode.cldr.api.CldrValue;
+
+import com.ibm.icu.impl.locale.LSR;
+
+public class PartitionInfoTest {
+    @Test
+    public void testPartitionInfo() {
+        TerritoryContainment territories = territories(
+                TestData.territoryGroup("001", "019", "150"),
+                // Americas (simplified): North America + Caribbean
+                TestData.territoryGroup("019", "003", "029"),
+                TestData.territoryGroup("003", "CA", "US"),
+                TestData.territoryGroup("029", "PR", "VI"),
+                // Sort of Europe
+                TestData.territoryGroup("150", "DE", "FR", "GB"));
+        PartitionInfo.Builder builder = PartitionInfo.builder(territories);
+        // "American English" associated with U.S.A and Puerto Rico.
+        builder.addVariableExpression("$enUS", "US+PR");
+        // The "Americas" form a different language grouping.
+        builder.addVariableExpression("$americas", "019");
+        // Also register a separate variable for just the GB region code.
+        builder.ensureVariable("GB");
+
+        // In terms of "partitions" (which are assigned in sorted region code order)
+        // we should now have:
+        //
+        // CA, VI -> { $americas, $!enUS } == "0"
+        // DE, FR -> { $!americas, $!enUS } == "1"
+        // GB -> { $!americas, $!enUS, $GB } == "2"
+        // PR, US -> { $americas, $enUS } == "3"
+        //
+        // So reversing this to map variables to the partitions they overlap with:
+        // "$enUS" -> { "3" }
+        // "$!enUS" -> { "0", "1", "2" }
+        // "$americas" -> { "0", "3" }
+        // "$!americas" -> { "1", "2" }
+        // "$GB" -> { "2" }
+        PartitionInfo info = builder.build();
+        assertThat(info.getPartitionIds("$enUS")).containsExactly("3");
+        assertThat(info.getPartitionIds("$!enUS")).containsExactly("0", "1", "2");
+        assertThat(info.getPartitionIds("$americas")).containsExactly("0", "3");
+        assertThat(info.getPartitionIds("$!americas")).containsExactly("1", "2");
+        assertThat(info.getPartitionIds("$GB")).containsExactly("2");
+
+        // Partition strings are made up of the explicit partition IDs.
+        // Indices are also assigned in first encountered region code order.
+        assertThat(info.getPartitionStrings()).asList().containsExactly(
+                // Default (unmapped) special case must be first.
+                ".", // ?? : index=0
+                // Partition IDs for "leaf" regions (only one partition per region).
+                "0", // CA, VI : index=1
+                "1", // DE, FR : index=2
+                "2", // GB : index=3
+                "3", // PR, US : index=4
+                // Macro regions include the partitions of all overlapping regions.
+                "0123", // 001 : index=5
+                "03", // 003, 019, 029 : index=6
+                "12") // 150 : index=7
+                .inOrder();
+
+        // The partition lookup array maps regions to the index of their partition string.
+        byte[] lookup = info.getPartitionLookupArray();
+        assertThat(lookup[LSR.indexForRegion("CA")]).isEqualTo(1);
+        assertThat(lookup[LSR.indexForRegion("VI")]).isEqualTo(1);
+        assertThat(lookup[LSR.indexForRegion("DE")]).isEqualTo(2);
+        assertThat(lookup[LSR.indexForRegion("FR")]).isEqualTo(2);
+        assertThat(lookup[LSR.indexForRegion("GB")]).isEqualTo(3);
+        assertThat(lookup[LSR.indexForRegion("PR")]).isEqualTo(4);
+        assertThat(lookup[LSR.indexForRegion("US")]).isEqualTo(4);
+        assertThat(lookup[LSR.indexForRegion("001")]).isEqualTo(5);
+        assertThat(lookup[LSR.indexForRegion("003")]).isEqualTo(6);
+        assertThat(lookup[LSR.indexForRegion("019")]).isEqualTo(6);
+        assertThat(lookup[LSR.indexForRegion("029")]).isEqualTo(6);
+        assertThat(lookup[LSR.indexForRegion("150")]).isEqualTo(7);
+        // Unknown regions map to index 0.
+        assertThat(lookup[LSR.indexForRegion("JP")]).isEqualTo(0);
+    }
+
+    private static TerritoryContainment territories(CldrValue... tcs) {
+        return TerritoryContainment.getContainment(CldrDataSupplier.forValues(asList(tcs)));
+    }
+}
diff --git a/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/localedistance/TerritoryContainmentTest.java b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/localedistance/TerritoryContainmentTest.java
new file mode 100644
index 00000000000..940cad85a8e
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/localedistance/TerritoryContainmentTest.java
@@ -0,0 +1,70 @@
+// © 2020 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package org.unicode.icu.tool.cldrtoicu.localedistance;
+
+import static com.google.common.truth.Truth.assertThat;
+import static java.util.Arrays.asList;
+import static org.unicode.icu.tool.cldrtoicu.localedistance.TestData.territoryGroup;
+import static org.unicode.icu.tool.cldrtoicu.testing.AssertUtils.assertThrows;
+
+import org.junit.Test;
+import org.unicode.cldr.api.CldrData;
+import org.unicode.cldr.api.CldrDataSupplier;
+import org.unicode.cldr.api.CldrValue;
+
+public class TerritoryContainmentTest {
+
+    @Test
+    public void testSimple() {
+        CldrData testData = getTestData(
+                territoryGroup("001", "002", "003"),
+                territoryGroup("002", "GB", "FR"),
+                territoryGroup("003", "US", "CA"));
+        TerritoryContainment containment = TerritoryContainment.getContainment(testData);
+        assertThat(containment.getMacroRegions()).containsExactly("001", "002", "003").inOrder();
+        assertThat(containment.getLeafRegions()).containsExactly("CA", "FR", "GB", "US").inOrder();
+        assertThat(containment.getLeafRegionsOf("002")).containsExactly("FR", "GB").inOrder();
+        assertThat(containment.getLeafRegionsOf("GB")).isEmpty();
+    }
+
+    @Test
+    public void testOverlappingContainment() {
+        CldrData testData = getTestData(
+                territoryGroup("001", "002", "003", "004"),
+                territoryGroup("002", "GB", "FR"),
+                territoryGroup("003", "US", "CA"),
+                territoryGroup("004", "CA", "GB"));
+        TerritoryContainment containment = TerritoryContainment.getContainment(testData);
+        assertThat(containment.getLeafRegions()).containsExactly("CA", "FR", "GB", "US").inOrder();
+        assertThat(containment.getLeafRegionsOf("002")).containsExactly("FR", "GB").inOrder();
+        assertThat(containment.getLeafRegionsOf("004")).containsExactly("CA", "GB").inOrder();
+    }
+
+    @Test
+    public void testMultipleRootsFails() {
+        CldrData testData = getTestData(
+                territoryGroup("001", "002"),
+                territoryGroup("002", "GB", "FR"),
+                territoryGroup("003", "US", "CA"));
+        IllegalArgumentException err =
+                assertThrows(IllegalArgumentException.class, () -> TerritoryContainment.getContainment(testData));
+        assertThat(err).hasMessageThat().contains("001");
+        assertThat(err).hasMessageThat().contains("003");
+        assertThat(err).hasMessageThat().doesNotContain("002");
+    }
+
+    @Test
+    public void testCyclicGraphFails() {
+        CldrData testData = getTestData(
+                territoryGroup("001", "002"),
+                territoryGroup("002", "001"));
+        IllegalArgumentException err =
+                assertThrows(IllegalArgumentException.class, () -> TerritoryContainment.getContainment(testData));
+        assertThat(err).hasMessageThat().contains("world region");
+        assertThat(err).hasMessageThat().contains("001");
+    }
+
+    private static CldrData getTestData(CldrValue... values) {
+        return CldrDataSupplier.forValues(asList(values));
+    }
+}
diff --git a/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/localedistance/TestData.java b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/localedistance/TestData.java
new file mode 100644
index 00000000000..53b41d47dab
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/localedistance/TestData.java
@@ -0,0 +1,158 @@
+// © 2020 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package org.unicode.icu.tool.cldrtoicu.localedistance;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static java.util.Arrays.asList;
+
+import java.util.List;
+
+import org.unicode.cldr.api.CldrData;
+import org.unicode.cldr.api.CldrDataSupplier;
+import org.unicode.cldr.api.CldrValue;
+
+import com.google.common.base.Ascii;
+import com.google.common.base.Function;
+import com.google.common.base.Splitter;
+import com.google.common.collect.ImmutableMap;
+import com.ibm.icu.impl.locale.LSR;
+import com.ibm.icu.util.BytesTrie;
+
+/**
+ * Utilities for easily generating test data for the LocaleDistanceMapper tests.
+ */
+final class TestData {
+    /**
+     * Returns an LSR from a locale ID pattern (e.g. "und", "zh-Hant", "en-*-GB").
+     * This is definitely not a general locale parser!
+     */
+    static LSR lsr(String s) {
+        List<String> parts = Splitter.on('-').splitToList(s);
+        checkArgument(parts.size() <= 3);
+        return new LSR(
+                parts.get(0),
+                parts.size() > 1 ? parts.get(1) : "",
+                parts.size() > 2 ? parts.get(2) : "",
+                LSR.DONT_CARE_FLAGS);
+    }
+
+    enum AliasType { LANGUAGE, TERRITORY }
+
+    enum AliasReason { DEPRECATED, OVERLONG, LEGACY, MACRO }
+
+    /** Returns CLDR data for the given values. */
+    static CldrData cldrData(CldrValue... values) {
+        return CldrDataSupplier.forValues(asList(values));
+    }
+
+    /** Returns a CldrValue for a {@code <paradigmLocales>} element. */
+    static CldrValue paradigms(String... values) {
+        return supplemental(
+                "languageMatching/languageMatches[@type=\"written_new\"]/"
+                        + "paradigmLocales[@locales=\"%s\"]",
+                String.join(" ", values));
+    }
+
+    /** Returns a CldrValue for a {@code <matchVariable>} element. */
+    static CldrValue matchVariable(String id, String value) {
+        return supplemental(
+                "languageMatching/languageMatches[@type=\"written_new\"]/"
+                        + "matchVariable[@id=\"%s\"][@value=\"%s\"]",
+                id, value);
+    }
+
+    /** Returns a CldrValue for a {@code <languageMatch>} element. */
+    static CldrValue languageMatch(
+            String desired, String supported, int distance, boolean oneway, int sort) {
+        return supplemental(
+                "languageMatching/languageMatches[@type=\"written_new\"]/"
+                        + "languageMatch[@_q=\"%d\"][@desired=\"%s\"][@supported=\"%s\"][@distance=\"%d\"]%s",
+                sort, desired, supported, distance, oneway ? "[@oneway=\"true\"]" : "");
+    }
+
+    /** Returns a CldrValue for either a {@code <languageAlias>} or {@code <territoryAlias>} element. */
+    static CldrValue alias(AliasType type, AliasReason reason, String value, String... replacement) {
+        return supplemental(
+                "metadata/alias/%sAlias[@type=\"%s\"][@replacement=\"%s\"][@reason=\"%s\"]",
+                lower(type), value, String.join(" ", replacement), lower(reason));
+    }
+
+    /** Returns a CldrValue for a {@code <likelySubtag>} element. */
+    static CldrValue likelySubtag(String from, String to) {
+        return supplemental(
+                "likelySubtags/likelySubtag[@from=\"%s\"][@to=\"%s\"]",
+                from, to);
+    }
+
+    /** Returns a CldrValue for a {@code <territoryContainment>} group element. */
+    static CldrValue territoryGroup(String region, String... subregions) {
+        return supplemental(
+                "territoryContainment/group[@type=\"%s\"][@contains=\"%s\"]",
+                region, String.join(" ", subregions));
+    }
+
+    /**
+     * Returns a CldrValue for a {@code <territoryContainment>} group element where
+     * {@code @status="group"}.
+     */
+    static CldrValue territoryGrouping(String region, String... subregions) {
+        return supplemental(
+                "territoryContainment/group[@type=\"%s\"][@contains=\"%s\"][@status=\"group\"]",
+                region, String.join(" ", subregions));
+    }
+
+    /**
+     * Returns a CldrValue for a {@code <territoryContainment>} group element where
+     * {@code @status="deprecated"}.
+     */
+    static CldrValue deprecatedTerritory(String region, String... subregions) {
+        return supplemental(
+                "territoryContainment/group[@type=\"%s\"][@contains=\"%s\"][@status=\"deprecated\"]",
+                region, String.join(" ", subregions));
+    }
+
+    /**
+     * Returns a map from expanded Trie keys to mapped value. This is useful in allowing
+     * tests to use human readable data when testing Tries.
+     *
+     * @param star a string representing the Trie wildcard in the output keys, which for
+     *     readability differs between use cases (e.g. "*" for subtags and "*-*"
+     *     for match rules).
+     * @param fn a function to map the actual Trie value to a more readable value for
+     *     testing.
+     */
+    static <T> ImmutableMap<String, T> getTrieTable(BytesTrie trie, String star, Function<Integer, T> fn) {
+        // Mostly copied from LocaleDistance (since the necessary constructor is private).
+        // Main change is that this no longer uses a TreeMap, since we want to test order.
+        ImmutableMap.Builder<String, T> map = ImmutableMap.builder();
+        StringBuilder sb = new StringBuilder();
+        for (BytesTrie.Entry entry : trie) {
+            sb.setLength(0);
+            int length = entry.bytesLength();
+            for (int i = 0; i < length; ++i) {
+                byte b = entry.byteAt(i);
+                if (b == '*') {
+                    sb.append(star).append('-');
+                } else if (b >= 0) {
+                    sb.append((char) b);
+                } else { // end of subtag (high bit set)
+                    sb.append((char) (b & 0x7f)).append('-');
+                }
+            }
+            assert sb.length() > 0 && sb.charAt(sb.length() - 1) == '-';
+            sb.setLength(sb.length() - 1);
+            map.put(sb.toString(), fn.apply(entry.value));
+        }
+        return map.build();
+    }
+
+    private static CldrValue supplemental(String path, Object... args) {
+        return CldrValue.parseValue(String.format("//supplementalData/" + path, args), "");
+    }
+
+    private static String lower(Enum<?> value) {
+        return Ascii.toLowerCase(value.name());
+    }
+
+    private TestData() {}
+}
diff --git a/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/localedistance/TrieTest.java b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/localedistance/TrieTest.java
new file mode 100644
index 00000000000..7efcdbdaaf0
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/localedistance/TrieTest.java
@@ -0,0 +1,163 @@
+// © 2020 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package org.unicode.icu.tool.cldrtoicu.localedistance;
+
+import static com.google.common.base.Preconditions.checkState;
+import static com.google.common.truth.Truth.assertThat;
+import static org.unicode.icu.tool.cldrtoicu.testing.AssertUtils.assertThrows;
+
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+import org.junit.Test;
+
+import com.google.common.collect.ImmutableMap;
+import com.ibm.icu.util.BytesTrie;
+
+public class TrieTest {
+    @Test
+    public void testSimple() {
+        Trie trie = new Trie();
+        trie.root().with("answer", t -> t.putPrefixAndValue(42));
+        assertThat(getRawTrieTable(trie.toByteArray())).containsExactly("answer", 42);
+    }
+
+    @Test
+    public void testSubSpan() {
+        Trie trie = new Trie();
+        trie.root().with("foo", foo -> foo.with("bar", fooBar -> fooBar.putPrefixAndValue(42)));
+        assertThat(getRawTrieTable(trie.toByteArray())).containsExactly("foo-bar", 42);
+    }
+
+    @Test
+    public void testHierarchyAndOrdering() {
+        Trie trie = new Trie();
+        trie.root().with("foo", foo -> {
+            foo.with("two", sub -> sub.putPrefixAndValue(3));
+            foo.with("one", sub -> sub.putPrefixAndValue(2));
+            foo.with("*", sub -> sub.putPrefixAndValue(1));
+        });
+        trie.root().with("bar", bar -> bar.with("baz", baz -> baz.with("quux", quux -> quux.putPrefixAndValue(0))));
+
+        // Order is by "subtag" (left-to-right) with lexicographical order of tags (other
+        // than '*' which is always sorted first).
+        assertThat(getRawTrieTable(trie.toByteArray()))
+                .containsExactly(
+                        "bar-baz-quux", 0,
+                        "foo-*", 1,
+                        "foo-one", 2,
+                        "foo-two", 3)
+                .inOrder();
+    }
+
+    @Test
+    public void testStarOrdering() {
+        Trie trie = new Trie();
+        // Use '$' which has a lower byte value than '*' in ASCII, but when it terminates a prefix,
+        // it has bit-7 set which makes it sort higher than '*'.
+        // In other tests it's not clear that '*' is sorted specially since '*' < [a-z] anyway.
+        trie.root().with("$", foo -> {
+            // A single '$' sorts after '*' because '$' will have bit-7 set, and '*' will not.
+            foo.with("$", sub -> sub.putPrefixAndValue(5));
+            // '$$' sorts below '*' because the leading '$' won't have bit-7 set.
+            foo.with("$$", sub -> sub.putPrefixAndValue(3));
+            foo.with("*", sub -> sub.putPrefixAndValue(4));
+        });
+        trie.root().with("*", foo -> {
+            foo.with("$", sub -> sub.putPrefixAndValue(2));
+            foo.with("*", sub -> sub.putPrefixAndValue(1));
+        });
+        trie.root().with("*", sub -> sub.putPrefixAndValue(0));
+
+        // Star is definitely sorted before other entries.
+        assertThat(getRawTrieTable(trie.toByteArray()))
+                .containsExactly(
+                        "*", 0,
+                        "*-*", 1,
+                        "*-$", 2,
+                        "$-$$", 3,
+                        "$-*", 4,
+                        "$-$", 5)
+                .inOrder();
+    }
+
+    @Test
+    public void testBadTrie_BadValue() {
+        Trie trie = new Trie();
+        IllegalArgumentException e =
+                assertThrows(
+                        IllegalArgumentException.class,
+                        () -> trie.root().with("foo", t -> t.putPrefixAndValue(-1)));
+        assertThat(e).hasMessageThat().contains("bad trie value");
+        assertThat(e).hasMessageThat().contains("-1");
+    }
+
+    @Test
+    public void testBadTrie_NoPrefix() {
+        Trie trie = new Trie();
+        IllegalStateException e =
+                assertThrows(IllegalStateException.class, () -> trie.root().putPrefixAndValue(23));
+        assertThat(e).hasMessageThat().contains("missing prefix");
+        assertThat(e).hasMessageThat().contains("23");
+    }
+
+    @Test
+    public void testBadTrie_BadPrefix() {
+        Trie trie = new Trie();
+        IllegalArgumentException e =
+                assertThrows(
+                        IllegalArgumentException.class,
+                        () -> trie.root().with("ümlaut", t -> t.putPrefixAndValue(0)));
+        assertThat(e).hasMessageThat().contains("invalid trie character");
+        assertThat(e).hasMessageThat().contains("ü");
+    }
+
+    @Test
+    public void testBadTrie_NoStarInPrefix() {
+        Trie trie = new Trie();
+        IllegalArgumentException e =
+                assertThrows(
+                        IllegalArgumentException.class,
+                        () -> trie.root().with("foo*bar", t -> t.putPrefixAndValue(0)));
+        assertThat(e).hasMessageThat().contains("must not contain '*'");
+        assertThat(e).hasMessageThat().contains("foo*bar");
+    }
+
+    @Test
+    public void testBadTrie_TooLong() {
+        Trie trie = new Trie();
+        IllegalStateException e =
+                assertThrows(IllegalStateException.class, () -> infiniteRecursion(trie.root()));
+        assertThat(e).hasMessageThat().contains("span too long");
+    }
+
+    private static void infiniteRecursion(Trie.Span span) {
+        span.with("!", TrieTest::infiniteRecursion);
+    }
+
+    private static ImmutableMap<String, Integer> getRawTrieTable(byte[] data) {
+        // We rebuild the Trie from the byte[] data.
+        BytesTrie trie = new BytesTrie(data, 0);
+
+        // Mostly copied from XLikelySubtags (since the necessary constructor is private).
+        // Main change is that this no longer uses a TreeMap, since we want to test order.
+        Map<String, Integer> map = new LinkedHashMap<>();
+        StringBuilder sb = new StringBuilder();
+        for (BytesTrie.Entry entry : trie) {
+            sb.setLength(0);
+            int length = entry.bytesLength();
+            for (int i = 0; i < length; i++) {
+                byte b = entry.byteAt(i);
+                sb.append((char) (b & 0x7f));
+                if (b < 0 || b == '*') {
+                    // end of subtag (high bit set or special '*' wildcard)
+                    sb.append("-");
+                }
+            }
+            checkState(sb.length() > 0 && sb.charAt(sb.length() - 1) == '-');
+            sb.setLength(sb.length() - 1);
+            map.put(sb.toString(), entry.value);
+        }
+        return ImmutableMap.copyOf(map);
+    }
+}