ICU-20693 New LDML to ICU tooling.

author David Beaumont <dbeaumont@google.com>

Sat, 24 Aug 2019 15:14:52 +0000 (15:14 +0000)

committer Markus Scherer <markus.icu@gmail.com>

Tue, 27 Aug 2019 17:28:01 +0000 (10:28 -0700)
author David Beaumont <dbeaumont@google.com>
Sat, 24 Aug 2019 15:14:52 +0000 (15:14 +0000)
committer Markus Scherer <markus.icu@gmail.com>
Tue, 27 Aug 2019 17:28:01 +0000 (10:28 -0700)
diff --git a/tools/cldr/cldr-to-icu/.gitignore b/tools/cldr/cldr-to-icu/.gitignore

new file mode 100644 (file)

index 0000000..8d31dd9
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/.gitignore
@@ -0,0 +1,7 @@
+# Exclude the Maven local repository but keep the lib directory and the top-level readme.
+/lib/**
+!/lib/README.txt
+
+# Ignore the default Maven target directory.
+/target
+
diff --git a/tools/cldr/cldr-to-icu/README.txt b/tools/cldr/cldr-to-icu/README.txt

new file mode 100644 (file)

index 0000000..647bf99
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/README.txt
@@ -0,0 +1,55 @@
+*********************************************************************
+*** © 2019 and later: Unicode, Inc. and others.                   ***
+*** License & terms of use: http://www.unicode.org/copyright.html ***
+*********************************************************************
+
+Basic instructions for running the LdmlConverter via Maven
+==========================================================
+
+Note that these instructions do not currently support configuration of the converter for things
+such as limiting the set of files produced. That is supported in code and could be easily added
+to the binary, or encapsulated via an Ant task, but currently it is not directly supported.
+See the IcuConverterConfig class for the API by which this can be supported.
+
+
+Important directories
+---------------------
+
+<CLDR_DIR>  = The root directory of the CLDR release.
+
+<ICU_DIR>   = The root directory of the ICU release (probably a parent directory of where
+              this README file is located). This is an optional property; when run via
+              Maven it defaults to three directories above this project (see pom.xml).
+
+<DTD_CACHE> = The temporary cache directory in which DTD files are downloaded (this is the
+              same directory as would be used when running tools from the CLDR project).
+              Note that the need to specify this directory is scheduled to be removed after
+              ICU release 65.
+
+<OUT_DIR>   = The output directory into which ICU data files should be written.
+
+
+Generating all ICU data
+-----------------------
+
+$ mvn exec:java \
+  -DCLDR_DIR='<CLDR_DIR>' \
+  -DCLDR_DTD_CACHE='<DTD_CACHE>' \
+  -Dexec.args='<OUT_DIR>'
+
+
+Running unit tests
+------------------
+
+$ mvn test \
+  -DCLDR_DIR='<CLDR_DIR>' \
+  -DCLDR_DTD_CACHE='<DTD_CACHE>'
+
+
+Importing and running from an IDE
+---------------------------------
+
+This project should be easy to import into an IDE which supports Maven development, such
+as IntelliJ or Eclipse. It uses a local Maven repository directory for the unpublished
+CLDR libraries (which are included in the project), but otherwise gets all dependencies
+via Maven's public repositories.
+\ No newline at end of file
diff --git a/tools/cldr/cldr-to-icu/lib/README.txt b/tools/cldr/cldr-to-icu/lib/README.txt

new file mode 100644 (file)

index 0000000..3e1db8e
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/lib/README.txt
@@ -0,0 +1,61 @@
+*********************************************************************
+*** © 2019 and later: Unicode, Inc. and others.                   ***
+*** License & terms of use: http://www.unicode.org/copyright.html ***
+*********************************************************************
+
+What is this directory and why is it empty?
+-------------------------------------------
+
+This is the root of a local Maven repository which needs to be populated before the
+code in this project can be executed.
+
+To do this, you need to have a local copy of the CLDR project configured on your
+computer and be able to build the API jar file and copy an existing utility
+jar file. In the examples below it is assumed that <CLDR_ROOT> references this CLDR
+release.
+
+
+Regenerating the CLDR API jar
+-----------------------------
+
+To regenerate the CLDR API jar you need to build the "jar" target using the Ant
+build.xml file in the "tools/java" directory of the CLDR project:
+
+$ cd <CLDR_ROOT>/tools/java
+$ ant clean jar
+
+This should result in the cldr.jar file being built into that directory, which can then
+be installed as a Maven dependency as described below.
+
+
+Updating local Maven repository
+-------------------------------
+
+To update the local Maven repository (e.g. to install the CLDR jar), run the following from
+this directory (lib/):
+
+$ mvn install:install-file \
+  -DgroupId=org.unicode.cldr \
+  -DartifactId=cldr-api \
+  -Dversion=0.1-SNAPSHOT \
+  -Dpackaging=jar \
+  -DgeneratePom=true \
+  -DlocalRepositoryPath=. \
+  -Dfile=<CLDR_ROOT>/tools/java/cldr.jar
+
+And also (for the utility jar):
+
+$ mvn install:install-file \
+  -DgroupId=com.ibm.icu \
+  -DartifactId=icu-utilities \
+  -Dversion=0.1-SNAPSHOT \
+  -Dpackaging=jar \
+  -DgeneratePom=true \
+  -DlocalRepositoryPath=. \
+  -Dfile=<CLDR_ROOT>/tools/java/libs/utilities.jar
+
+And if you have updated one of these libraries, run:
+
+$ mvn dependency:purge-local-repository -DsnapshotsOnly=true
+
+If you choose to update the version number, then remember to update the root pom.xml.
diff --git a/tools/cldr/cldr-to-icu/pom.xml b/tools/cldr/cldr-to-icu/pom.xml

new file mode 100644 (file)

index 0000000..3c78843
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/pom.xml
@@ -0,0 +1,83 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- © 2019 and later: Unicode, Inc. and others.
+     License & terms of use: http://www.unicode.org/copyright.html
+     See lib/README.txt for instructions on updating the local repository.
+     -->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <groupId>org.unicode.icu</groupId>
+    <artifactId>cldr-to-icu</artifactId>
+    <version>1.0-SNAPSHOT</version>
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <version>3.5.1</version>
+                <configuration>
+                    <source>8</source>
+                    <target>8</target>
+                </configuration>
+            </plugin>
+            <plugin>
+                <groupId>org.codehaus.mojo</groupId>
+                <artifactId>exec-maven-plugin</artifactId>
+                <configuration>
+                    <mainClass>org.unicode.icu.tool.cldrtoicu.LdmlConverter</mainClass>
+                    <systemProperties>
+                        <property>
+                            <key>ICU_DIR</key>
+                            <value>${project.basedir}/../../..</value>
+                        </property>
+                    </systemProperties>
+                </configuration>
+            </plugin>
+        </plugins>
+    </build>
+
+    <!-- This is where the snapshots of the CLDR API and additional auxiliary jars are held. -->
+    <repositories>
+        <repository>
+            <id>local-maven-repo</id>
+            <url>file:///${project.basedir}/lib</url>
+        </repository>
+    </repositories>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.unicode.cldr</groupId>
+            <artifactId>cldr-api</artifactId>
+            <version>0.1-SNAPSHOT</version>
+        </dependency>
+        <dependency>
+            <groupId>com.ibm.icu</groupId>
+            <artifactId>icu-utilities</artifactId>
+            <version>0.1-SNAPSHOT</version>
+        </dependency>
+        <dependency>
+            <groupId>com.ibm.icu</groupId>
+            <artifactId>icu4j</artifactId>
+            <version>64.2</version>
+        </dependency>
+        <dependency>
+            <groupId>com.google.guava</groupId>
+            <artifactId>guava</artifactId>
+            <version>27.1-jre</version>
+        </dependency>
+        <dependency>
+            <groupId>com.google.truth</groupId>
+            <artifactId>truth</artifactId>
+            <version>1.0</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>com.google.truth.extensions</groupId>
+            <artifactId>truth-java8-extension</artifactId>
+            <version>1.0</version>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+</project>
+\ No newline at end of file
diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/IcuConverterConfig.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/IcuConverterConfig.java

new file mode 100644 (file)

index 0000000..f85c201
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/IcuConverterConfig.java
@@ -0,0 +1,381 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkNotNull;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Arrays;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Set;
+
+import org.unicode.cldr.api.CldrDraftStatus;
+
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.ImmutableSet;
+import org.unicode.icu.tool.cldrtoicu.LdmlConverter.OutputType;
+
+/**
+ * The converter config intended to generate the standard ICU data files. This used to be something
+ * that was configured by text files such as "icu-locale-deprecates.xml" and "icu-config.xml".
+ */
+public final class IcuConverterConfig implements LdmlConverterConfig {
+
+    private static final Optional<Path> DEFAULT_CLDR_DIR =
+        Optional.ofNullable(System.getProperty("CLDR_DIR", null))
+            .map(d -> Paths.get(d).toAbsolutePath());
+
+    private static final Optional<Path> DEFAULT_ICU_DIR =
+        Optional.ofNullable(System.getProperty("ICU_DIR", null))
+            .map(d -> Paths.get(d).toAbsolutePath());
+
+    /** The builder with which to specify configuration for the {@link LdmlConverter}. */
+    public static final class Builder {
+        private Path cldrDir = DEFAULT_CLDR_DIR.orElse(null);
+        private Path outputDir =
+            DEFAULT_ICU_DIR.map(d -> d.resolve("icu4c/source/data")).orElse(null);
+        private Path specialsDir =
+            DEFAULT_ICU_DIR.map(d -> d.resolve("icu4c/source/data/xml")).orElse(null);
+        private ImmutableSet<OutputType> outputTypes = OutputType.ALL;
+        private CldrDraftStatus minimalDraftStatus = CldrDraftStatus.CONTRIBUTED;
+        private boolean emitReport = false;
+
+        /**
+         * Sets the CLDR base directory from which to load all CLDR data. This is optional if the
+         * {@code CLDR_DIR} system property is set, which will be used instead.
+         */
+        public Builder setCldrDir(Path cldrDir) {
+            this.cldrDir = checkNotNull(cldrDir).toAbsolutePath();
+            return this;
+        }
+
+        /**
+         * Sets the output directory in which the ICU data directories and files will go. This is
+         * optional if the {@code ICU_DIR} system property is set, which will be used to generate
+         * the path instead (i.e. {@code "icu4c/source/data"} inside the ICU release directory).
+         */
+        public Builder setOutputDir(Path outputDir) {
+            this.outputDir = checkNotNull(outputDir);
+            return this;
+        }
+
+        /**
+         * Sets the "specials" directory containing additional ICU specific data to be processed.
+         * This is optional if the {@code ICU_DIR} system property is set, which will be used to
+         * generate the path instead (i.e. {@code "icu4c/source/data/xml"} inside the ICU release
+         * directory).
+         */
+        public Builder setSpecialsDir(Path specialsDir) {
+            this.specialsDir = checkNotNull(specialsDir);
+            return this;
+        }
+
+        /**
+         * Sets the output types which will be converted. This is optional and defaults to {@link
+         * OutputType#ALL}.
+         */
+        public Builder setOutputTypes(Iterable<OutputType> types) {
+            this.outputTypes = ImmutableSet.copyOf(types);
+            return this;
+        }
+
+        /**
+         * Sets the minimum draft status for CLDR data to be converted (paths below this status are
+         * ignored during conversion). This is optional and defaults to {@link
+         * CldrDraftStatus#CONTRIBUTED}.
+         */
+        public Builder setMinimalDraftStatus(CldrDraftStatus minimalDraftStatus) {
+            this.minimalDraftStatus = checkNotNull(minimalDraftStatus);
+            return this;
+        }
+
+        public Builder setEmitReport(boolean emitReport) {
+            this.emitReport = emitReport;
+            return this;
+        }
+
+        /** Returns a converter config from the current builder state. */
+        public LdmlConverterConfig build() {
+            return new IcuConverterConfig(this);
+        }
+    }
+
+    private final Path cldrDir;
+    private final Path outputDir;
+    private final Path specialsDir;
+    private final ImmutableSet<OutputType> outputTypes;
+    private final CldrDraftStatus minimalDraftStatus;
+    private final boolean emitReport;
+
+    private IcuConverterConfig(Builder builder) {
+        this.cldrDir = checkNotNull(builder.cldrDir,
+            "must set a CLDR directory, or the CLDR_DIR system property");
+        if (DEFAULT_CLDR_DIR.isPresent() && !this.cldrDir.equals(DEFAULT_CLDR_DIR.get())) {
+            System.err.format(
+                "Warning: Specified CLDR base directory does not appear to match the"
+                    + " directory inferred by the 'CLDR_DIR' system property.\n"
+                    + "Specified: %s\n"
+                    + "Inferred: %s\n",
+                this.cldrDir, DEFAULT_CLDR_DIR.get());
+        }
+        this.outputDir = checkNotNull(builder.outputDir);
+        checkArgument(!Files.isRegularFile(outputDir),
+            "specified output directory is not a directory: %s", outputDir);
+        this.specialsDir = checkNotNull(builder.specialsDir,
+            "must specify a 'specials' XML directory");
+        checkArgument(Files.isDirectory(specialsDir),
+            "specified specials directory does not exist: %s", specialsDir);
+        this.outputTypes = builder.outputTypes;
+        checkArgument(!this.outputTypes.isEmpty(),
+            "must specify at least one output type to be generated (possible values are: %s)",
+            Arrays.asList(OutputType.values()));
+        this.minimalDraftStatus = builder.minimalDraftStatus;
+        this.emitReport = builder.emitReport;
+    }
+
+    public static Builder builder() {
+        return new Builder();
+    }
+
+    @Override public Path getCldrDirectory() {
+        return cldrDir;
+    }
+
+    @Override public Path getOutputDir() {
+        return outputDir;
+    }
+
+    @Override public Set<OutputType> getOutputTypes() {
+        return outputTypes;
+    }
+
+    @Override public CldrDraftStatus getMinimumDraftStatus() {
+        return minimalDraftStatus;
+    }
+
+    @Override public Path getSpecialsDir() {
+        return specialsDir;
+    }
+
+    @Override public boolean emitReport() {
+        return emitReport;
+    }
+
+    // Currently hard-coded "hacks" which could be encoded via the builder if wanted.
+
+    @Override public Map<String, String> getForcedAliases(IcuLocaleDir dir) {
+        switch (dir) {
+        case COLL:
+            return ImmutableMap.<String, String>builder()
+                // It is not at all clear why this is being done (we expect "sr_Latn_ME" normally).
+                // TODO: Find out and document this properly.
+                .put("sr_ME", "sr_Cyrl_ME")
+
+                // This appears to be a hack to avoid needing to copy and maintain the same "zh"
+                // data for "yue". The files for "yue" in this directory should be empty otherwise.
+                //
+                // The maximized version of "yue_Hans" is "yue_Hans_CN" (vs "zh_Hans_CN"), and for
+                // "yue" it's "yue_Hant_HK" (vs "zh_Hant_HK"), so the aliases are effectively just
+                // rewriting the base language.
+                .put("yue_Hans", "zh_Hans")
+                .put("yue", "zh_Hant")
+                .build();
+        case RBNF:
+            // It is not at all clear why this is being done. It's certainly not exactly the same
+            // as above, since (a) the alias is reversed (b) "zh_Hant" does exist, with different
+            // data than "yue", so this alias is not just rewriting the base language.
+            // TODO: Find out and document this properly.
+            return ImmutableMap.of("zh_Hant_HK", "yue");
+        default:
+            return ImmutableMap.of();
+        }
+    }
+
+    // This set of locale files in each directory denotes the supported/available locales for that
+    // API. In most cases, it's the same set, but a few directories support only a subset of IDs.
+    @Override public ImmutableSet<String> getTargetLocaleIds(IcuLocaleDir dir) {
+        switch (dir) {
+        case COLL:
+            return COLL_LOCALE_IDS;
+        case BRKITR:
+            return BRKITR_LOCALE_IDS;
+        case RBNF:
+            return RBNF_LOCALE_IDS;
+        default:
+            return ICU_LOCALE_IDS;
+        }
+    }
+
+    // The primary set of locale IDs to be generated. Other, directory specific, sets should be
+    // subsets of this. Some of these IDs are aliases, so XML files may not exist for all of them.
+    //
+    // This was further modified (in order to better match the set of generated ICU files) by:
+    // * Removing "es_003" (which just seems to be ignored in current code)
+    // * Adding:  "en_NH", "sr_XK", "yue_CN", "yue_HK" (deprecated locale IDs in the manual config)
+    // * Adding: "no_NO_NY" (a not even structurally valid ID that exists for very legacy reasons)
+    private static final ImmutableSet<String> ICU_LOCALE_IDS = ImmutableSet.of(
+        "root",
+        // A
+        "af", "af_NA", "af_ZA", "agq", "agq_CM", "ak", "ak_GH", "am", "am_ET", "ar", "ar_001",
+        "ar_AE", "ar_BH", "ar_DJ", "ar_DZ", "ar_EG", "ar_EH", "ar_ER", "ar_IL", "ar_IQ",
+        "ar_JO", "ar_KM", "ar_KW", "ar_LB", "ar_LY", "ar_MA", "ar_MR", "ar_OM", "ar_PS",
+        "ar_QA", "ar_SA", "ar_SD", "ar_SO", "ar_SS", "ar_SY", "ar_TD", "ar_TN", "ar_YE", "ars",
+        "as", "as_IN", "asa", "asa_TZ", "ast", "ast_ES", "az", "az_AZ", "az_Cyrl", "az_Cyrl_AZ",
+        "az_Latn", "az_Latn_AZ",
+        // B
+        "bas", "bas_CM", "be", "be_BY", "bem", "bem_ZM", "bez", "bez_TZ", "bg", "bg_BG", "bm",
+        "bm_ML", "bn", "bn_BD", "bn_IN", "bo", "bo_CN", "bo_IN", "br", "br_FR", "brx", "brx_IN",
+        "bs", "bs_Cyrl", "bs_Cyrl_BA", "bs_Latn", "bs_Latn_BA", "bs_BA",
+        // C
+        "ca", "ca_AD", "ca_ES", "ca_FR", "ca_IT", "ccp", "ccp_BD", "ccp_IN", "ce", "ce_RU",
+        "ceb", "ceb_PH", "cgg", "cgg_UG", "chr", "chr_US", "ckb", "ckb_IQ", "ckb_IR", "cs",
+        "cs_CZ", "cy", "cy_GB",
+        // D
+        "da", "da_DK", "da_GL", "dav", "dav_KE", "de", "de_AT", "de_BE", "de_CH", "de_DE",
+        "de_IT", "de_LI", "de_LU", "dje", "dje_NE", "dsb", "dsb_DE", "dua", "dua_CM", "dyo",
+        "dyo_SN", "dz", "dz_BT",
+        // E
+        "ebu", "ebu_KE", "ee", "ee_GH", "ee_TG", "el", "el_CY", "el_GR", "en", "en_001",
+        "en_150", "en_AE", "en_AG", "en_AI", "en_AS", "en_AT", "en_AU", "en_BB", "en_BE",
+        "en_BI", "en_BM", "en_BS", "en_BW", "en_BZ", "en_CA", "en_CC", "en_CH", "en_CK",
+        "en_CM", "en_CX", "en_CY", "en_DE", "en_DG", "en_DK", "en_DM", "en_ER", "en_FI",
+        "en_FJ", "en_FK", "en_FM", "en_GB", "en_GD", "en_GG", "en_GH", "en_GI", "en_GM",
+        "en_GU", "en_GY", "en_HK", "en_IE", "en_IL", "en_IM", "en_IN", "en_IO", "en_JE",
+        "en_JM", "en_KE", "en_KI", "en_KN", "en_KY", "en_LC", "en_LR", "en_LS", "en_MG",
+        "en_MH", "en_MO", "en_MP", "en_MS", "en_MT", "en_MU", "en_MW", "en_MY", "en_NA",
+        "en_NF", "en_NG", "en_NH", "en_NL", "en_NR", "en_NU", "en_NZ", "en_PG", "en_PH",
+        "en_PK", "en_PN", "en_PR", "en_PW", "en_RH", "en_RW", "en_SB", "en_SC", "en_SD",
+        "en_SE", "en_SG", "en_SH", "en_SI", "en_SL", "en_SS", "en_SX", "en_SZ", "en_TC",
+        "en_TK", "en_TO", "en_TT", "en_TV", "en_TZ", "en_UG", "en_UM", "en_US", "en_US_POSIX",
+        "en_VC", "en_VG", "en_VI", "en_VU", "en_WS", "en_ZA", "en_ZM", "en_ZW", "eo",
+        "eo_001", "es", "es_419", "es_AR", "es_BO", "es_BR", "es_BZ", "es_CL", "es_CO",
+        "es_CR", "es_CU", "es_DO", "es_EA", "es_EC", "es_ES", "es_GQ", "es_GT", "es_HN",
+        "es_IC", "es_MX", "es_NI", "es_PA", "es_PE", "es_PH", "es_PR", "es_PY", "es_SV",
+        "es_US", "es_UY", "es_VE", "et", "et_EE", "eu", "eu_ES", "ewo", "ewo_CM",
+        // F
+        "fa", "fa_AF", "fa_IR", "ff", "ff_CM", "ff_GN", "ff_Latn", "ff_Latn_BF", "ff_Latn_CM",
+        "ff_Latn_GH", "ff_Latn_GM", "ff_Latn_GN", "ff_Latn_GW", "ff_Latn_LR", "ff_Latn_MR",
+        "ff_Latn_NE", "ff_Latn_NG", "ff_Latn_SL", "ff_Latn_SN", "ff_MR", "ff_SN", "fi",
+        "fi_FI", "fil", "fil_PH", "fo", "fo_DK", "fo_FO", "fr", "fr_BE", "fr_BF", "fr_BI",
+        "fr_BJ", "fr_BL", "fr_CA", "fr_CD", "fr_CF", "fr_CG", "fr_CH", "fr_CI", "fr_CM",
+        "fr_DJ", "fr_DZ", "fr_FR", "fr_GA", "fr_GF", "fr_GN", "fr_GP", "fr_GQ", "fr_HT",
+        "fr_KM", "fr_LU", "fr_MA", "fr_MC", "fr_MF", "fr_MG", "fr_ML", "fr_MQ", "fr_MR",
+        "fr_MU", "fr_NC", "fr_NE", "fr_PF", "fr_PM", "fr_RE", "fr_RW", "fr_SC", "fr_SN",
+        "fr_SY", "fr_TD", "fr_TG", "fr_TN", "fr_VU", "fr_WF", "fr_YT", "fur", "fur_IT",
+        "fy", "fy_NL",
+        // G
+        "ga", "ga_IE", "gd", "gd_GB", "gl", "gl_ES", "gsw", "gsw_CH", "gsw_FR", "gsw_LI",
+        "gu", "gu_IN", "guz", "guz_KE", "gv", "gv_IM",
+        // H
+        "ha", "ha_GH", "ha_NE", "ha_NG", "haw", "haw_US", "he", "he_IL", "hi", "hi_IN",
+        "hr", "hr_BA", "hr_HR", "hsb", "hsb_DE", "hu", "hu_HU", "hy", "hy_AM",
+        // I
+        "ia", "ia_001", "id", "id_ID", "ig", "ig_NG", "ii", "ii_CN", "in", "in_ID", "is",
+        "is_IS", "it", "it_CH", "it_IT", "it_SM", "it_VA", "iw", "iw_IL",
+        // J
+        "ja", "ja_JP", "ja_JP_TRADITIONAL", "jgo", "jgo_CM", "jmc", "jmc_TZ", "jv", "jv_ID",
+        // K
+        "ka", "ka_GE", "kab", "kab_DZ", "kam", "kam_KE", "kde", "kde_TZ", "kea", "kea_CV",
+        "khq", "khq_ML", "ki", "ki_KE", "kk", "kk_KZ", "kkj", "kkj_CM", "kl", "kl_GL", "kln",
+        "kln_KE", "km", "km_KH", "kn", "kn_IN", "ko", "ko_KP", "ko_KR", "kok", "kok_IN",
+        "ks", "ks_IN", "ksb", "ksb_TZ", "ksf", "ksf_CM", "ksh", "ksh_DE", "ku", "ku_TR",
+        "kw", "kw_GB", "ky", "ky_KG",
+        // L
+        "lag", "lag_TZ", "lb", "lb_LU", "lg", "lg_UG", "lkt", "lkt_US", "ln", "ln_AO",
+        "ln_CD", "ln_CF", "ln_CG", "lo", "lo_LA", "lrc", "lrc_IQ", "lrc_IR", "lt", "lt_LT",
+        "lu", "lu_CD", "luo", "luo_KE", "luy", "luy_KE", "lv", "lv_LV",
+        // M
+        "mas", "mas_KE", "mas_TZ", "mer", "mer_KE", "mfe", "mfe_MU", "mg", "mg_MG", "mgh",
+        "mgh_MZ", "mgo", "mgo_CM", "mi", "mi_NZ", "mk", "mk_MK", "ml", "ml_IN", "mn",
+        "mn_MN", "mo", "mr", "mr_IN", "ms", "ms_BN", "ms_MY", "ms_SG", "mt", "mt_MT", "mua",
+        "mua_CM", "my", "my_MM", "mzn", "mzn_IR",
+        // N
+        "naq", "naq_NA", "nb", "nb_NO", "nb_SJ", "nd", "nd_ZW", "nds", "nds_DE", "nds_NL",
+        "ne", "ne_IN", "ne_NP", "nl", "nl_AW", "nl_BE", "nl_BQ", "nl_CW", "nl_NL", "nl_SR",
+        "nl_SX", "nmg", "nmg_CM", "nn", "nn_NO", "nnh", "nnh_CM", "no", "no_NO", "no_NO_NY",
+        "nus", "nus_SS", "nyn", "nyn_UG",
+        // O
+        "om", "om_ET", "om_KE", "or", "or_IN", "os", "os_GE", "os_RU",
+        // P
+        "pa", "pa_Arab", "pa_Arab_PK", "pa_Guru", "pa_Guru_IN", "pa_IN", "pa_PK", "pl",
+        "pl_PL", "ps", "ps_AF", "ps_PK", "pt", "pt_AO", "pt_BR", "pt_CH", "pt_CV", "pt_GQ",
+        "pt_GW", "pt_LU", "pt_MO", "pt_MZ", "pt_PT", "pt_ST", "pt_TL",
+        // Q
+        "qu", "qu_BO", "qu_EC", "qu_PE",
+        // R
+        "rm", "rm_CH", "rn", "rn_BI", "ro", "ro_MD", "ro_RO", "rof", "rof_TZ", "ru",
+        "ru_BY", "ru_KG", "ru_KZ", "ru_MD", "ru_RU", "ru_UA", "rw", "rw_RW", "rwk", "rwk_TZ",
+        // S
+        "sah", "sah_RU", "saq", "saq_KE", "sbp", "sbp_TZ", "sd", "sd_PK", "se", "se_FI",
+        "se_NO", "se_SE", "seh", "seh_MZ", "ses", "ses_ML", "sg", "sg_CF", "sh", "sh_BA",
+        "sh_CS", "sh_YU", "shi", "shi_Latn", "shi_Latn_MA", "shi_Tfng", "shi_Tfng_MA",
+        "shi_MA", "si", "si_LK", "sk", "sk_SK", "sl", "sl_SI", "smn", "smn_FI", "sn",
+        "sn_ZW", "so", "so_DJ", "so_ET", "so_KE", "so_SO", "sq", "sq_AL", "sq_MK", "sq_XK",
+        "sr", "sr_Cyrl", "sr_Cyrl_BA", "sr_Cyrl_ME", "sr_Cyrl_RS", "sr_Cyrl_CS", "sr_Cyrl_XK",
+        "sr_Cyrl_YU", "sr_Latn", "sr_Latn_BA", "sr_Latn_ME", "sr_Latn_RS", "sr_Latn_CS",
+        "sr_Latn_XK", "sr_Latn_YU", "sr_BA", "sr_ME", "sr_RS", "sr_CS", "sr_XK", "sr_YU",
+        "sv", "sv_AX", "sv_FI", "sv_SE", "sw", "sw_CD", "sw_KE", "sw_TZ", "sw_UG",
+        // T
+        "ta", "ta_IN", "ta_LK", "ta_MY", "ta_SG", "te", "te_IN", "teo", "teo_KE", "teo_UG",
+        "tg", "tg_TJ", "th", "th_TH", "th_TH_TRADITIONAL", "ti", "ti_ER", "ti_ET", "tk",
+        "tk_TM", "tl", "tl_PH", "to", "to_TO", "tr", "tr_CY", "tr_TR", "tt", "tt_RU",
+        "twq", "twq_NE", "tzm", "tzm_MA",
+        // U
+        "ug", "ug_CN", "uk", "uk_UA", "ur", "ur_IN", "ur_PK", "uz", "uz_AF", "uz_Arab",
+        "uz_Arab_AF", "uz_Cyrl", "uz_Cyrl_UZ", "uz_Latn", "uz_Latn_UZ", "uz_UZ",
+        // V
+        "vai", "vai_Latn", "vai_Latn_LR", "vai_LR", "vai_Vaii", "vai_Vaii_LR", "vi",
+        "vi_VN", "vun", "vun_TZ",
+        // W
+        "wae", "wae_CH", "wo", "wo_SN",
+        // X
+        "xh", "xh_ZA", "xog", "xog_UG",
+        // Y
+        "yav", "yav_CM", "yi", "yi_001", "yo", "yo_BJ", "yo_NG", "yue", "yue_CN", "yue_HK",
+        "yue_Hans", "yue_Hans_CN", "yue_Hant", "yue_Hant_HK",
+        // Z
+        "zgh", "zgh_MA", "zh", "zh_Hans", "zh_Hans_CN", "zh_Hans_HK", "zh_Hans_MO",
+        "zh_Hans_SG", "zh_Hant", "zh_Hant_HK", "zh_Hant_MO", "zh_Hant_TW", "zh_CN",
+        "zh_HK", "zh_MO", "zh_SG", "zh_TW", "zu", "zu_ZA");
+
+    private static final ImmutableSet<String> COLL_LOCALE_IDS = ImmutableSet.of(
+        "root",
+        // A-B
+        "af", "am", "ars", "ar", "as", "az", "be", "bg", "bn", "bo", "bs_Cyrl", "bs",
+        // C-F
+        "ca", "ceb", "chr", "cs", "cy", "da", "de_AT", "de", "dsb", "dz", "ee", "el", "en",
+        "en_US_POSIX", "en_US", "eo", "es", "et", "fa_AF", "fa", "fil", "fi", "fo", "fr_CA", "fr",
+        // G-J
+        "ga", "gl", "gu", "ha", "haw", "he", "hi", "hr", "hsb", "hu", "hy",
+        "id_ID", "id", "ig", "in", "in_ID", "is", "it", "iw_IL", "iw", "ja",
+        // K-P
+        "ka", "kk", "kl", "km", "kn", "kok", "ko", "ku", "ky", "lb", "lkt", "ln", "lo", "lt", "lv",
+        "mk", "ml", "mn", "mo", "mr", "ms", "mt", "my", "nb", "ne", "nl", "nn", "no_NO", "no",
+        "om", "or", "pa_IN", "pa", "pa_Guru", "pl", "ps", "pt",
+        // R-T
+        "ro", "ru", "se", "sh_BA", "sh_CS", "sh", "sh_YU", "si", "sk", "sl", "smn", "sq",
+        "sr_BA", "sr_Cyrl_ME", "sr_Latn", "sr_ME", "sr_RS", "sr", "sv", "sw",
+        "ta", "te", "th", "tk", "to", "tr",
+        // U-Z
+        "ug", "uk", "ur", "uz", "vi", "wae", "wo", "xh", "yi", "yo", "yue_CN", "yue_Hans",
+        "yue", "zh_CN", "zh_Hant", "zh_HK", "zh_MO", "zh_SG", "zh_TW", "zh", "zu");
+
+    private static final ImmutableSet<String> BRKITR_LOCALE_IDS = ImmutableSet.of(
+        "root", "de", "el", "en", "en_US_POSIX", "en_US", "es", "fr", "it", "ja", "pt", "ru",
+        "zh_Hant", "zh");
+
+    private static final ImmutableSet<String> RBNF_LOCALE_IDS = ImmutableSet.of(
+        "root", "af", "ak", "am", "ars", "ar", "az", "be", "bg", "bs", "ca", "ccp", "chr", "cs",
+        "cy", "da", "de_CH", "de", "ee", "el", "en_001", "en_IN", "en", "eo", "es_419", "es_DO",
+        "es_GT", "es_HN", "es_MX", "es_NI", "es_PA", "es_PR", "es_SV", "es", "es_US", "et",
+        "fa_AF", "fa", "ff", "fil", "fi", "fo", "fr_BE", "fr_CH", "fr", "ga", "he", "hi", "hr",
+        "hu", "hy", "id", "in", "is", "it", "iw", "ja", "ka", "kl", "km", "ko", "ky", "lb",
+        "lo", "lrc", "lt", "lv", "mk", "ms", "mt", "my", "nb", "nl", "nn", "no", "pl", "pt_PT",
+        "pt", "qu", "ro", "ru", "se", "sh", "sk", "sl", "sq", "sr_Latn", "sr", "sv",
+        "sw", "ta", "th", "tr", "uk", "vi", "yue_Hans", "yue", "zh_Hant_HK", "zh_Hant", "zh_HK",
+        "zh_MO", "zh_TW", "zh");
+}
diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/IcuData.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/IcuData.java

new file mode 100644 (file)

index 0000000..63959d7
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/IcuData.java
@@ -0,0 +1,165 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu;
+
+import static com.google.common.base.Preconditions.checkArgument;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.NavigableSet;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import com.google.common.collect.ArrayListMultimap;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ListMultimap;
+
+/**
+ * Mutable ICU data, represented as a mapping from resource bundle paths to a sequence of values.
+ */
+public final class IcuData {
+    private static final RbPath RB_VERSION = RbPath.of("Version");
+    private static final Pattern ARRAY_INDEX = Pattern.compile("(/[^\\[]++)(?:\\[(\\d++)\\])?$");
+
+    private final String name;
+    private final boolean hasFallback;
+    private final NavigableSet<RbPath> paths = new TreeSet<>();
+    private final ListMultimap<RbPath, RbValue> rbPathToValues = ArrayListMultimap.create();
+    private ImmutableList<String> commentLines = ImmutableList.of();
+
+    /**
+     * IcuData constructor.
+     *
+     * @param name The name of the IcuData object, used as the name of the root node in the output file
+     * @param hasFallback true if the output file has another ICU file as a fallback.
+     */
+    public IcuData(String name, boolean hasFallback) {
+        this.hasFallback = hasFallback;
+        this.name = name;
+    }
+
+    /** @return whether data should fallback on data in other ICU files. */
+    public boolean hasFallback() {
+        return hasFallback;
+    }
+
+    /**
+     * @return the name of this ICU data instance. Used in the output filename, and in comments.
+     */
+    public String getName() {
+        return name;
+    }
+
+    /** Sets additional comment lines for the top of the file. */
+    public void setFileComment(String... commentLines) {
+        setFileComment(Arrays.asList(commentLines));
+    }
+
+    public void setFileComment(Iterable<String> commentLines) {
+        this.commentLines = ImmutableList.copyOf(commentLines);
+    }
+
+    public List<String> getFileComment() {
+        return commentLines;
+    }
+
+    /** Adds a singleton resource bundle value for a given path. */
+    public void add(RbPath rbPath, String element) {
+        add(rbPath, RbValue.of(element));
+    }
+
+    /** Adds a single resource bundle value for a given path. */
+    public void add(RbPath rbPath, RbValue rbValue) {
+        rbPathToValues.put(rbPath, rbValue);
+        paths.add(rbPath);
+    }
+
+    /** Adds a sequence of resource bundle values for a given path. */
+    public void add(RbPath rbPath, Iterable<RbValue> rbValues) {
+        rbValues.forEach(v -> rbPathToValues.put(rbPath, v));
+        paths.add(rbPath);
+    }
+
+    /** Replaces all resource bundle values for a given path with the specified singleton value. */
+    public void replace(RbPath rbPath, String element) {
+        rbPathToValues.removeAll(rbPath);
+        rbPathToValues.put(rbPath, RbValue.of(element));
+        paths.add(rbPath);
+    }
+
+    /** Replaces all resource bundle values for a given path with the specified value. */
+    public void replace(RbPath rbPath, RbValue rbValue) {
+        rbPathToValues.removeAll(rbPath);
+        add(rbPath, rbValue);
+    }
+
+    public void setVersion(String versionString) {
+        add(RB_VERSION, versionString);
+    }
+
+    public void addResults(ListMultimap<RbPath, PathValueTransformer.Result> resultsByRbPath) {
+        for (RbPath rbPath : resultsByRbPath.keySet()) {
+            for (PathValueTransformer.Result r : resultsByRbPath.get(rbPath)) {
+                if (r.isGrouped()) {
+                    // Grouped results have all the values in a single value entry.
+                    add(rbPath, RbValue.of(r.getValues()));
+                } else {
+                    if (rbPath.getSegment(rbPath.length() - 1).endsWith(":alias")) {
+                        r.getValues().forEach(v -> add(rbPath, RbValue.of(v)));
+                    } else {
+                        // Ungrouped results are one value per entry, but might be expanded into
+                        // grouped results if they are a path referencing a grouped entry.
+                        r.getValues().forEach(v -> add(rbPath, replacePathValues(v)));
+                    }
+                }
+            }
+        }
+    }
+
+    /**
+     * Replaces an ungrouped CLDR value for the form "/foo/bar" or "/foo/bar[N]" which is assumed
+     * to be a reference to an existing value in a resource bundle. Note that the referenced bundle
+     * might be grouped (i.e. an array with more than one element).
+     */
+    private RbValue replacePathValues(String value) {
+        Matcher m = ARRAY_INDEX.matcher(value);
+        if (!m.matches()) {
+            return RbValue.of(value);
+        }
+        // The only constraint is that the "path" value starts with a leading '/', but parsing into
+        // the RbPath ignores this. We must use "parse()" here, rather than RbPath.of(), since the
+        // captured value contains '/' characters to represent path delimiters.
+        RbPath replacePath = RbPath.parse(m.group(1));
+        List<RbValue> replaceValues = get(replacePath);
+        checkArgument(replaceValues != null, "Path %s is missing from IcuData", replacePath);
+        // If no index is given (e.g. "/foo/bar") then treat it as index 0 (i.e. "/foo/bar[0]").
+        int replaceIndex = m.groupCount() > 1 ? Integer.parseInt(m.group(2)) : 0;
+        return replaceValues.get(replaceIndex);
+    }
+
+    /**
+     * Returns the mutable list of values associated with the given path (or null if there are no
+     * associated values).
+     */
+    public List<RbValue> get(RbPath rbPath) {
+        return paths.contains(rbPath) ? rbPathToValues.get(rbPath) : null;
+    }
+
+    /** Returns an unmodifiable view of the set of paths in this instance. */
+    public Set<RbPath> getPaths() {
+        return Collections.unmodifiableSet(paths);
+    }
+
+    /** Returns whether the given path is present in this instance. */
+    public boolean contains(RbPath rbPath) {
+        return paths.contains(rbPath);
+    }
+
+    /** Returns whether there are any paths in this instance. */
+    public boolean isEmpty() {
+        return paths.isEmpty();
+    }
+}
diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/IcuDataDumper.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/IcuDataDumper.java

new file mode 100644 (file)

index 0000000..13dbcd3
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/IcuDataDumper.java
@@ -0,0 +1,381 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu;
+
+import static com.google.common.base.CharMatcher.whitespace;
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkElementIndex;
+import static com.google.common.base.Preconditions.checkNotNull;
+import static com.google.common.base.Preconditions.checkState;
+import static com.google.common.collect.ImmutableList.toImmutableList;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Deque;
+import java.util.List;
+import java.util.Optional;
+import java.util.function.Function;
+import java.util.function.Predicate;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Stream;
+
+import com.google.common.base.Joiner;
+import com.google.common.collect.ArrayListMultimap;
+import com.google.common.collect.HashMultiset;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableSetMultimap;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.ListMultimap;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Multiset;
+
+/**
+ * Helper tool to dump the resource bundle paths and values from an IcuData instance in a stable
+ * ordering, to allow easy comparison in cases where ICU ordering changes. This could easily be
+ * extended to be a more fully featured "diff" tool or a proper ICU data file parser.
+ *
+ * <p>This is a temporary debugging tool and should not be relied upon during any part of the data
+ * generation process.
+ */
+final class IcuDataDumper {
+    private static final Joiner LIST_JOINER = Joiner.on(',');
+    private static final RbPath VERSION = RbPath.of("Version");
+
+    public static void main(String... args) throws IOException {
+        Path fileOrDir;
+        Optional<Pattern> name = Optional.empty();
+        switch (args.length) {
+        case 2:
+            name = Optional.of(Pattern.compile(args[1]));
+        case 1:
+            fileOrDir = Paths.get(args[0]);
+            break;
+        default:
+            throw new IllegalArgumentException("Usage: <file-or-dir> [<name-pattern>]");
+        }
+
+        if (Files.isDirectory(fileOrDir)) {
+            walkDirectory(fileOrDir, name);
+        } else {
+            checkArgument(!name.isPresent(),
+                "cannot specificy a name pattern for a non-directory file: %s", fileOrDir);
+            IcuDataParser parser = new IcuDataParser(fileOrDir);
+            parser.parse();
+            dump(parser.icuData);
+        }
+    }
+
+    private static void walkDirectory(Path fileOrDir, Optional<Pattern> name) throws IOException {
+        Predicate<Path> matchesName =
+            f -> name.map(n -> n.matcher(f.getFileName().toString()).matches()).orElse(true);
+        List<IcuDataParser> icuParsers;
+        try (Stream<Path> files = Files.walk(fileOrDir)) {
+            icuParsers = files
+                .filter(Files::isRegularFile)
+                .filter(matchesName)
+                .map(IcuDataParser::new)
+                .collect(toImmutableList());
+        }
+        ListMultimap<RbPath, RbValue> allPaths = ArrayListMultimap.create();
+        for (IcuDataParser p : icuParsers) {
+            p.parse();
+            for (RbPath k : p.icuData.keySet()) {
+                List<RbValue> values = p.icuData.get(k);
+                if (!allPaths.containsKey(k)) {
+                    allPaths.putAll(k, values);
+                } else if (!VERSION.equals(k)) {
+                    checkState(allPaths.get(k).equals(values), "inconsistent data for path: ", k);
+                }
+            }
+        }
+        dump(allPaths);
+    }
+
+    private static void dump(ListMultimap<RbPath, RbValue> allPaths) {
+        allPaths.keySet().stream()
+            .sorted()
+            .forEach(k -> System.out.println(k + " :: " + LIST_JOINER.join(allPaths.get(k))));
+    }
+
+    private static final class IcuDataParser {
+        // Path of file being parsed.
+        private final Path path;
+
+        // Comments in header (before data starts), without comment characters.
+        private final List<String> headerComment = new ArrayList<>();
+        // ICU data name (the name of the root element).
+        private String name = null;
+        // ICU data values.
+        private final ListMultimap<RbPath, RbValue> icuData = ArrayListMultimap.create();
+
+        // Current line number (1-indexed).
+        private int lineNumber = 0;
+        // The type of the previous line that was processed.
+        private LineType lastType = LineType.COMMENT;
+        // True when inside /* .. */ comments in the header.
+        private boolean inBlockComment = false;
+        // True when in the final top-level group at the end of parsing.
+        private boolean inFinalGroup = false;
+        // True when a partial (line wrapped) value has been read.
+        private boolean isLineContinuation = false;
+        // Current path while parsing (NOT including the root element).
+        private Deque<String> pathStack = new ArrayDeque<>();
+        // Current sequence of values for the path (as defined in the current path stack).
+        private List<String> currentValue = new ArrayList<>();
+        // Current partially read value of a multi-line value.
+        private String wrappedValue = "";
+        // Map of indices used to auto-generate names for anonymous path segments.
+        // TODO: Check if this is even needed and remove if not.
+        private Multiset<Integer> indices = HashMultiset.create();
+
+        IcuDataParser(Path path) {
+            this.path = checkNotNull(path);
+        }
+
+        public boolean parse() throws IOException {
+            List<String> lines = Files.readAllLines(path);
+            // Best approximation to a magic number be have (BOM plus inline comment). This stops
+            // use trying to parse the transliteration files, which are a different type.
+            if (!lines.get(0).startsWith("\uFEFF//")) {
+                return false;
+            }
+            lines.stream().map(whitespace()::trimFrom).forEach(this::processLineWithCheck);
+
+            // Sanity check for expected final state. Just checking the "lastType" should be enough
+            // to catch everything else (due to transition rules and how the code tidies up) but it
+            // seems prudent to sanity check everything just in case.
+            checkState(lastType == LineType.GROUP_END);
+            checkState(!inBlockComment);
+            checkState(name != null);
+            checkState(pathStack.isEmpty() && inFinalGroup);
+            checkState(wrappedValue.isEmpty() && currentValue.isEmpty());
+            return true;
+        }
+
+        void processLineWithCheck(String line) {
+            lineNumber++;
+            if (lineNumber == 1 && line.startsWith("\uFEFF")) {
+                line = line.substring(1);
+            }
+            try {
+                processLine(line);
+            } catch (RuntimeException e) {
+                throw new RuntimeException(
+                    String.format("[%s:%s] %s (%s)", path, lineNumber, e.getMessage(), line),
+                    e);
+            }
+        }
+
+        void processLine(String line) {
+            line = maybeTrimEndOfLineComment(line);
+            if (line.isEmpty()) {
+                return;
+            }
+            LineMatch match = LineType.match(line, inBlockComment);
+            checkState(match.getType().isValidTransitionFrom(lastType),
+                "invalid state transition: %s --//-> %s", lastType, match.getType());
+            boolean isEndOfWrappedValue = false;
+            switch (match.getType()) {
+            case COMMENT:
+                if (name != null) {
+                    // Comments in data are ignored since they cannot be properly associated with
+                    // paths or values in an IcuData instance (only legacy tooling emits these).
+                    break;
+                }
+                if (line.startsWith("/*")) {
+                    inBlockComment = true;
+                }
+                headerComment.add(match.get(0));
+                if (inBlockComment && line.contains("*/")) {
+                    checkState(line.indexOf("*/") == line.length() - 2,
+                        "unexpected end of comment block");
+                    inBlockComment = false;
+                }
+                break;
+
+            case INLINE_VALUE:
+                icuData.put(
+                    getPathFromStack().extendBy(getSegment(match.get(0))),
+                    RbValue.of(unquote(match.get(1))));
+                break;
+
+            case GROUP_START:
+                checkState(currentValue.isEmpty());
+                if (name == null) {
+                    name = match.get(0);
+                    checkState(name != null, "cannot have anonymous top-level group");
+                } else {
+                    pathStack.push(getSegment(match.get(0)));
+                }
+                wrappedValue = "";
+                isLineContinuation = false;
+                break;
+
+            case QUOTED_VALUE:
+                wrappedValue += unquote(match.get(0));
+                isLineContinuation = !line.endsWith(",");
+                if (!isLineContinuation) {
+                    currentValue.add(wrappedValue);
+                    wrappedValue = "";
+                }
+                break;
+
+            case VALUE:
+                checkState(!isLineContinuation, "unexpected unquoted value");
+                currentValue.add(match.get(0));
+                break;
+
+            case GROUP_END:
+                // Account for quoted values without trailing ',' just before group end.
+                if (isLineContinuation) {
+                    currentValue.add(wrappedValue);
+                    isLineContinuation = false;
+                }
+                // Emit the collection sequence of values for the current path as an RbValue.
+                if (!currentValue.isEmpty()) {
+                    icuData.put(getPathFromStack(), RbValue.of(currentValue));
+                    currentValue.clear();
+                }
+                // Annoyingly the name is outside the stack so the stack will empty before the last
+                // end group.
+                if (!pathStack.isEmpty()) {
+                    pathStack.pop();
+                    indices.setCount(pathStack.size(), 0);
+                } else {
+                    checkState(!inFinalGroup, "unexpected group end");
+                    inFinalGroup = true;
+                }
+                break;
+
+            case UNKNOWN:
+                throw new IllegalStateException("cannot parse line: " + match.get(0));
+            }
+            lastType = match.getType();
+        }
+
+        private RbPath getPathFromStack() {
+            if (pathStack.isEmpty()) {
+                return RbPath.empty();
+            }
+            List<String> segments = new ArrayList<>();
+            Iterables.addAll(segments, pathStack);
+            if (segments.get(0).matches("<[0-9]{4}>")) {
+                segments.remove(0);
+            }
+            return segments.isEmpty() ? RbPath.empty() : RbPath.of(Lists.reverse(segments));
+        }
+
+        private String getSegment(String segmentOrNull) {
+            if (segmentOrNull != null) {
+                return segmentOrNull;
+            }
+            int depth = pathStack.size();
+            int index = indices.count(depth);
+            indices.add(depth, 1);
+            return String.format("<%04d>", index);
+        }
+
+        private String maybeTrimEndOfLineComment(String line) {
+            // Once the name is set, we are past the header and into the data.
+            if (name != null) {
+                // Index to search for '//' from - must skip quoted values.
+                int startIdx = line.startsWith("\"") ? line.indexOf('"', 1) + 1 : 0;
+                int commentIdx = line.indexOf("//", startIdx);
+                if (commentIdx != -1) {
+                    line = whitespace().trimTrailingFrom(line.substring(0, commentIdx));
+                }
+            }
+            return line;
+        }
+
+        private static String unquote(String s) {
+            if (s.startsWith("\"") && s.endsWith("\"")) {
+                return s.substring(1, s.length() - 1).replaceAll("\\\\([\"\\\\])", "$1");
+            }
+            checkState(!s.contains("\""), "invalid unquoted value: %s", s);
+            return s;
+        }
+
+        private static final class LineMatch {
+            private final LineType type;
+            private final Function<Integer, String> args;
+
+            LineMatch(LineType type, Function<Integer, String> args) {
+                this.type = checkNotNull(type);
+                this.args = checkNotNull(args);
+            }
+
+            String get(int n) {
+                return args.apply(n);
+            }
+
+            LineType getType() {
+                return type;
+            }
+        }
+
+        private enum LineType {
+            // Comment _start_ with any comment value captured.
+            COMMENT("(?://|/\\*)\\s*(.*)"),
+            // A combination of GROUP_START, VALUE and GROUP_END with whitespace.
+            INLINE_VALUE("(?:(.*\\S)\\s*)?\\{\\s*((?:\".*\")|(?:[^\"{}]*\\S))\\s*\\}"),
+            // Allows for empty segment names (anonymous arrays) which match 'null'.
+            GROUP_START("(?:(.*\\S)\\s*)?\\{"),
+            GROUP_END("\\}"),
+            QUOTED_VALUE("(\".*\"),?"),
+            VALUE("([^\"{}]+),?"),
+            UNKNOWN(".*");
+
+            // Table of allowed transitions expected during parsing.
+            // key=current state, values=set of permitted previous states
+            private static ImmutableSetMultimap<LineType, LineType> TRANSITIONS =
+                ImmutableSetMultimap.<LineType, LineType>builder()
+                    .putAll(COMMENT, COMMENT)
+                    .putAll(INLINE_VALUE, COMMENT, INLINE_VALUE, GROUP_START, GROUP_END)
+                    .putAll(GROUP_START, COMMENT, GROUP_START, GROUP_END, INLINE_VALUE)
+                    .putAll(VALUE, GROUP_START, VALUE, QUOTED_VALUE)
+                    .putAll(QUOTED_VALUE, GROUP_START, VALUE, QUOTED_VALUE)
+                    .putAll(GROUP_END, GROUP_END, INLINE_VALUE, VALUE, QUOTED_VALUE)
+                    .build();
+
+            private final Pattern pattern;
+
+            LineType(String regex) {
+                this.pattern = Pattern.compile(regex);
+            }
+
+            boolean isValidTransitionFrom(LineType lastType) {
+                return TRANSITIONS.get(this).contains(lastType);
+            }
+
+            static LineMatch match(String line, boolean inBlockComment) {
+                // Block comments kinda suck and it'd be great if the ICU data only used '//' style
+                // comments (if would definitely simplify any parsers out there). Once the
+                // transition to the new transformation tools is complete, they can be changed to
+                // only emit '//' style comments.
+                if (inBlockComment) {
+                    if (line.startsWith("*")) {
+                        line = whitespace().trimLeadingFrom(line.substring(1));
+                    }
+                    return new LineMatch(COMMENT, ImmutableList.of(line)::get);
+                }
+                for (LineType type : TRANSITIONS.keySet()) {
+                    // Regex groups start at 1, but we want the getter function to be zero-indexed.
+                    Matcher m = type.pattern.matcher(line);
+                    if (m.matches()) {
+                        return new LineMatch(type, n -> {
+                            checkElementIndex(n, m.groupCount());
+                            return m.group(n + 1);
+                        });
+                    }
+                }
+                return new LineMatch(UNKNOWN, ImmutableList.of(line)::get);
+            }
+        }
+    }
+}
diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/IcuFunctions.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/IcuFunctions.java

new file mode 100644 (file)

index 0000000..a2f8313
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/IcuFunctions.java
@@ -0,0 +1,209 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkNotNull;
+import static java.lang.Integer.parseInt;
+
+import java.time.LocalDate;
+import java.time.LocalDateTime;
+import java.time.ZoneOffset;
+import java.util.function.Function;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import com.google.common.base.Ascii;
+import com.google.common.base.CharMatcher;
+import com.google.common.collect.ImmutableMap;
+import org.unicode.icu.tool.cldrtoicu.regex.NamedFunction;
+
+/**
+ * The named functions used by the {@code RegexTransformer} for {@code ldml2icu_supplemental.txt}.
+ */
+final class IcuFunctions {
+    /**
+     * Converts an ISO date string to a space-separated pair of integer values representing the top
+     * and bottom parts of a deconstructed millisecond epoch value (i.e. {@code
+     * "<hi32bits> <low32bits>"}).
+     *
+     * <p>Note that the values are formatted as <em>signed</em> decimal values, so it's entirely
+     * possible that the low bits value will be appear as a negative number (the high bits won't
+     * appear negative for many thousands of years).
+     *
+     * <ul>
+     *   <li>args[0] = ISO date string (e.g. "2019-05-23")
+     *   <li>args[1] = Date field type name (e.g. "from")
+     * </ul>
+     */
+    static final NamedFunction DATE_FN =
+        NamedFunction.create("date", 2, args -> {
+            long millis =
+                DateFieldType.toEnum(args.get(1)).toEpochMillis(LocalDate.parse(args.get(0)));
+            // Strictly speaking the masking is redundant and could be removed.
+            int hiBits = (int) ((millis >>> 32) & 0xFFFFFFFFL);
+            int loBits = (int) (millis & 0xFFFFFFFFL);
+            return hiBits + " " + loBits;
+        });
+
+    // TODO(dbeaumont): Improve this documentation (e.g. why is this being done, give examples?).
+    /**
+     * Inserts '%' into numberingSystems descriptions.
+     *
+     * <ul>
+     *   <li>args[0] = numbering system description (string)
+     * </ul>
+     */
+    static final NamedFunction ALGORITHM_FN =
+        NamedFunction.create("algorithm", 1, args -> {
+            String value = args.get(0);
+            int percentPos = value.lastIndexOf('/') + 1;
+            return value.substring(0, percentPos) + '%' + value.substring(percentPos);
+        });
+
+    /**
+     * Converts a number into a special integer that represents the number in normalized scientific
+     * notation for ICU's RB parser.
+     *
+     * <p>Resultant integers are in the form "xxyyyyyy", where "xx" is the exponent offset by 50
+     * and "yyyyyy" is the coefficient to 5 decimal places. Results may also have a leading '-' to
+     * denote negative values.
+     *
+     * <p>For example:
+     * <pre>{@code
+     * 14660000000000 -> 1.466E13    -> 63146600
+     * 0.0001         -> 1E-4        -> 46100000
+     * -123.456       -> -1.23456E-2 -> -48123456
+     * }</pre>
+     *
+     * <p>The additional exponent offset is applied directly to the calculated exponent and is used
+     * to do things like converting percentages into their decimal representation (i.e. by passing
+     * a value of "-2").
+     *
+     * <ul>
+     *   <li>args[0] = number to be converted (double)
+     *   <li>args[1] = additional exponent offset (integer)
+     * </ul>
+     */
+    static final NamedFunction EXP_FN =
+        NamedFunction.create("exp", 2, args -> {
+            double value = Double.parseDouble(args.get(0));
+            if (value == 0) {
+                return "0";
+            }
+            int exponent = 50;
+            if (args.size() == 2) {
+                exponent += Integer.parseInt(args.get(1));
+            }
+            String sign = value >= 0 ? "" : "-";
+            value = Math.abs(value);
+            while (value >= 10) {
+                value /= 10;
+                exponent++;
+            }
+            while (value < 1) {
+                value *= 10;
+                exponent--;
+            }
+            if (exponent < 0 || exponent > 99) {
+                throw new IllegalArgumentException("Exponent out of bounds: " + exponent);
+            }
+            return sign + exponent + Math.round(value * 100000);
+        });
+
+    // Allow for single digit values in any part and negative year values.
+    private static final Pattern YMD = Pattern.compile("(-?[0-9]+)-([0-9]{1,2})-([0-9]{1,2})");
+
+    /**
+     * Converts an ISO date string (i.e. "YYYY-MM-DD") into an ICU date string, which is
+     * the same but with spaces instead of hyphens. Since functions are expanded before the
+     * resulting value is split, this function will result in 3 separate values being created,
+     * unless the function call is enclosed in quotes.
+     *
+     * <p>Note that for some cases (e.g. "eras") the year part can be negative (e.g. "-2165-1-1")
+     * so this is not as simple as "split by hyphen".
+     *
+     * <ul>
+     *   <li>args[0] = ISO date string (e.g. "2019-05-23" or "-2165-1-1")
+     * </ul>
+     */
+    static final NamedFunction YMD_FN =
+        NamedFunction.create("ymd", 1, args -> {
+            Matcher m = YMD.matcher(args.get(0));
+            checkArgument(m.matches(), "invalid year-month-day string: %s", args.get(0));
+            // NOTE: Re-parsing is not optional since it removes leading zeros (needed for ICU).
+            return String.format("%s %s %s",
+                parseInt(m.group(1)), parseInt(m.group(2)), parseInt(m.group(3)));
+        });
+
+    // For transforming day-of-week identifiers.
+    private static final ImmutableMap<String, String> WEEKDAY_MAP_ID =
+        ImmutableMap.<String, String>builder()
+            .put("sun", "1")
+            .put("mon", "2")
+            .put("tues", "3")
+            .put("wed", "4")
+            .put("thu", "5")
+            .put("fri", "6")
+            .put("sat", "7")
+            .build();
+
+    /**
+     * Converts a day-of-week identifier into its ordinal value (e.g. "sun" --> 1, "mon" --> 2 ...).
+     */
+    static final NamedFunction DAY_NUMBER_FN =
+        NamedFunction.create("day_number", 1,
+            args -> {
+                String id = WEEKDAY_MAP_ID.get(args.get(0));
+                checkArgument(id != null, "unknown weekday: %s", args.get(0));
+                return id;
+            });
+
+    // For transform IDs in <contextTransform> elements.
+    private static final ImmutableMap<String, String> TRANSFORM_ID_MAP =
+        ImmutableMap.of("no-change", "0", "titlecase-firstword", "1");
+
+    /**
+     * Converts the transform type in the {@code <contextTransform>} element into its ICU index
+     * (e.g. "titlecase-firstword" --> 1).
+     */
+    static final NamedFunction CONTEXT_TRANSFORM_INDEX_FN =
+        NamedFunction.create("context_transform_index", 1,
+            args -> {
+                String id = TRANSFORM_ID_MAP.get(args.get(0));
+                checkArgument(id != null, "unknown contextTransform: %s", args.get(0));
+                return id;
+            });
+
+    // For DATE_FN only.
+    private enum DateFieldType {
+        from(LocalDate::atStartOfDay),
+        // Remember that atTime() takes nanoseconds, not micro or milli.
+        to(d -> d.atTime(23, 59, 59, 999_000_000));
+
+        private final Function<LocalDate, LocalDateTime> adjustFn;
+
+        DateFieldType(Function<LocalDate, LocalDateTime> adjustFn) {
+            this.adjustFn = adjustFn;
+        }
+
+        long toEpochMillis(LocalDate date) {
+            return adjustFn.apply(date).toInstant(ZoneOffset.UTC).toEpochMilli();
+        }
+
+        static DateFieldType toEnum(String value) {
+            switch (Ascii.toLowerCase(CharMatcher.whitespace().trimFrom(value))) {
+            case "from":
+            case "start":
+                return from;
+            case "to":
+            case "end":
+                return to;
+            default:
+                throw new IllegalArgumentException(value + " is not a valid date field type");
+            }
+        }
+    }
+
+    private IcuFunctions() {}
+}
diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/IcuTextWriter.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/IcuTextWriter.java

new file mode 100644 (file)

index 0000000..c5f2fe8
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/IcuTextWriter.java
@@ -0,0 +1,313 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu;
+
+import static com.google.common.base.Preconditions.checkNotNull;
+import static java.util.stream.Collectors.joining;
+
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.Writer;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Writes an IcuData object to a text file. A lot of this class was copied directly from the
+ * original {@code IcuTextWriter} in the CLDR project and has a number of very idiosyncratic
+ * behaviours. The behaviour of this class is currently tuned to produce perfect parity with
+ * the original conversion tools, but once migration of the tools is complete, it should
+ * probably be revisited and tidied up.
+ */
+// TODO: Link to a definitive specification for the ICU data files and remove the hacks!
+final class IcuTextWriter {
+    private static final String INDENT = "    "; // Printed once per nesting level.
+    // List of characters to escape in UnicodeSets
+    // ('\' followed by any of '\', '[', ']', '{', '}', '-', '&', ':', '^', '=').
+    private static final Pattern UNICODESET_ESCAPE =
+        Pattern.compile("\\\\[\\\\\\[\\]\\{\\}\\-&:^=]");
+    // Only escape \ and " from other strings. NOTE(review): the leading (?!') can never fail here.
+    private static final Pattern STRING_ESCAPE = Pattern.compile("(?!')\\\\\\\\(?!')");
+    private static final Pattern QUOTE_ESCAPE = Pattern.compile("\\\\?\""); // Optional '\' then '"'.
+
+    /**
+     * Writes a file in ICU data format with the specified header. The file is named after the
+     * ICU data (with a ".txt" extension) and is created in the given directory, which is itself
+     * created first if it does not exist.
+     *
+     * @param icuData the data to be written.
+     * @param outDir the directory in which the file is created.
+     * @param header lines written at the very top of the file.
+     * @throws RuntimeException if the directory cannot be created or the file cannot be written.
+     */
+    static void writeToFile(IcuData icuData, Path outDir, List<String> header) {
+        try {
+            Files.createDirectories(outDir);
+            try (Writer w = Files.newBufferedWriter(outDir.resolve(icuData.getName() + ".txt"));
+                PrintWriter out = new PrintWriter(w)) {
+                new IcuTextWriter(icuData).writeTo(out, header);
+                // A PrintWriter never throws on failure, it only records that something went
+                // wrong, so check explicitly rather than silently leaving a truncated file.
+                if (out.checkError()) {
+                    throw new IOException("error reported while writing output");
+                }
+            }
+        } catch (IOException e) {
+            throw new RuntimeException("cannot write ICU data file: " + icuData.getName(), e);
+        }
+    }
+
+    private final IcuData icuData;
+    private int depth = 0; // Current nesting level; also the indent count in newLineAndIndent().
+    private boolean valueWasInline = false; // Set from appendValues(), reset by closeLastPath().
+
+    IcuTextWriter(IcuData icuData) {
+        this.icuData = checkNotNull(icuData);
+    }
+
+    // TODO: Write a UTF-8 header (see https://unicode-org.atlassian.net/browse/ICU-10197).
+    // Writes a byte order mark, the header and file comments, and then every path and value of
+    // the ICU data nested inside a root block. The given writer is closed once this is done.
+    private void writeTo(PrintWriter out, List<String> header) throws IOException {
+        out.write('\uFEFF');
+        writeHeaderAndComments(out, header, icuData.getFileComment());
+
+        // The ICU data is written in the form:
+        // ----
+        // <name>{
+        //     foo{
+        //         bar{baz}
+        //     }
+        // }
+        // ----
+        // In other words every RbPath behaves as if it were prefixed by the IcuData name.
+        String name = icuData.getName();
+        String rootLabel = icuData.hasFallback() ? name : name + ":table(nofallback)";
+        // TODO: Replace with "open(root, out)" once happy with differences (it adds a blank line).
+        out.print(rootLabel);
+        out.print("{");
+        depth++;
+
+        RbPath previous = RbPath.empty();
+        for (RbPath current : icuData.getPaths()) {
+            // Paths are distinct, so the shared prefix is shorter than either path. One is added
+            // to account for the implicit root segment.
+            int sharedDepth = RbPath.getCommonPrefixLength(previous, current) + 1;
+            // The "cursor" starts at the end of the last value written; close blocks back down
+            // to the shared prefix and then open whatever the new path needs.
+            closeLastPath(previous, sharedDepth, out);
+            openNextPath(current, out);
+            // The cursor is now just after the newly opened brace, ready for the values.
+            valueWasInline = appendValues(name, current, icuData.get(current), out);
+            previous = current;
+        }
+        closeLastPath(previous, 0, out);
+        out.println();
+        out.close();
+    }
+
+    // Closes open blocks until only minDepth levels remain.
+    // Before: Cursor is at the end of the previous line.
+    // After: Cursor is positioned immediately after the last closed '}'
+    private void closeLastPath(RbPath lastPath, int minDepth, PrintWriter out) {
+        if (valueWasInline) {
+            // The value shares a line with its opening brace, so close that block in place.
+            valueWasInline = false;
+            out.print('}');
+            depth--;
+        }
+        // Each call to close() lowers the depth by exactly one.
+        for (int remaining = depth - minDepth; remaining > 0; remaining--) {
+            close(out);
+        }
+    }
+
+    // Opens one block for each segment of the path which is not yet open.
+    // Before: Cursor is at the end of the previous line.
+    // After: Cursor is positioned immediately after the newly opened '{'
+    private void openNextPath(RbPath path, PrintWriter out) {
+        int pathLength = path.length();
+        while (pathLength >= depth) {
+            // The implicit root element means the depth is always one more than the index of
+            // the segment being written (open() then increases the depth by one).
+            String segment = path.getSegment(depth - 1);
+            open(segment, out);
+        }
+    }
+
+    /**
+     * Starts a new line at the current indentation, writes the given label (unless it is a
+     * pseudo index) followed by an open brace, and increases the nesting depth by one.
+     *
+     * @param label the block label, which may be empty for an anonymous block.
+     * @param out the writer to print to.
+     */
+    private void open(String label, PrintWriter out) {
+        newLineAndIndent(out);
+        depth++;
+        // This handles the "magic" pseudo indexing paths that are added by RegexTransformer.
+        // These take the form of "<any-string>" and are used to ensure that path order can be
+        // well defined even for anonymous lists of items. Only a label with BOTH delimiters is
+        // a pseudo index; one which merely starts with '<' or ends with '>' is still printed.
+        boolean isPseudoIndex = label.startsWith("<") && label.endsWith(">");
+        if (!isPseudoIndex) {
+            out.print(label);
+        }
+        out.print('{');
+    }
+
+    // Ends the innermost block, writing its closing brace on a new line at the reduced depth.
+    private void close(PrintWriter out) {
+        depth -= 1;
+        out.println();
+        for (int level = 0; level < depth; level++) {
+            out.print(INDENT);
+        }
+        out.print('}');
+    }
+
+    // Ends the current line and indents the next one to the current depth.
+    private void newLineAndIndent(PrintWriter out) {
+        out.println();
+        int remaining = depth;
+        while (remaining-- > 0) {
+            out.print(INDENT);
+        }
+    }
+
+    // Writes each header line as given, followed (if there are any) by the comments wrapped
+    // up in a single block comment. Note that the "header" uses '//' line comments.
+    // TODO: Sort this out so there isn't a messy mix of comment styles in the data files.
+    private static void writeHeaderAndComments(
+        PrintWriter out, List<String> header, List<String> comments) {
+        for (String line : header) {
+            out.println(line);
+        }
+        if (comments.isEmpty()) {
+            return;
+        }
+        // TODO: Don't use /* */ block quotes, just use inline // quotes.
+        String commentBlock = comments.stream().collect(joining("\n * ", "/**\n * ", "\n */"));
+        out.println(commentBlock);
+    }
+
+    /**
+     * Inserts padding and values between braces.
+     *
+     * <p>If there is exactly one value with exactly one element (and no array is forced by
+     * {@code mustBeArray()}) the element is printed directly after the open brace when it is
+     * short enough, and is otherwise wrapped over several lines. A single value which does not
+     * qualify for that is printed one element per line. In the remaining case every value is
+     * printed one element per line, and any value whose size is not one is additionally
+     * enclosed in an anonymous block of its own.
+     *
+     * @param name the name of the ICU data being written (passed to {@code mustBeArray()}).
+     * @param rbPath the path whose values are being written.
+     * @param values the values of the path.
+     * @param out the writer to print to.
+     * @return true only if a single element was printed inline, without a preceding newline.
+     */
+    private boolean appendValues(
+        String name, RbPath rbPath, List<RbValue> values, PrintWriter out) {
+
+        RbValue onlyValue;
+        boolean wasSingular = false;
+        boolean quote = !rbPath.isIntPath(); // Elements of "int paths" are printed unquoted.
+        boolean isSequence = rbPath.endsWith(RB_SEQUENCE);
+        if (values.size() == 1 && !mustBeArray(true, name, rbPath)) {
+            onlyValue = values.get(0);
+            if (onlyValue.size() == 1 && !mustBeArray(false, name, rbPath)) {
+                // Value has a single element and is not being forced to be an array.
+                String onlyElement = onlyValue.getElement(0);
+                if (quote) {
+                    onlyElement = quoteInside(onlyElement);
+                }
+                // The numbers below are simply tuned to match the line wrapping in the original
+                // CLDR code. The behaviour it produces is sometimes strange (wrapping a line just
+                // for a single character) and could definitely be improved.
+                // TODO: Simplify this and add hysteresis to ensure less "jarring" line wrapping.
+                int maxWidth = Math.max(68, 80 - Math.min(4, rbPath.length()) * INDENT.length());
+                if (onlyElement.length() <= maxWidth) {
+                    // Single element for path: don't add newlines.
+                    printValue(out, onlyElement, quote);
+                    wasSingular = true;
+                } else {
+                    // Element too long to fit in one line, so wrap.
+                    int end;
+                    for (int i = 0; i < onlyElement.length(); i = end) {
+                        end = goodBreak(onlyElement, i + maxWidth);
+                        String part = onlyElement.substring(i, end);
+                        newLineAndIndent(out);
+                        printValue(out, part, quote);
+                    }
+                }
+            } else {
+                // Only one array for the rbPath, so don't add an extra set of braces.
+                printArray(onlyValue, quote, isSequence, out);
+            }
+        } else {
+            for (RbValue value : values) {
+                if (value.size() == 1) {
+                    // Single-value array: print normally.
+                    printArray(value, quote, isSequence, out);
+                } else {
+                    // Enclose this array in braces to separate it from other values.
+                    open("", out);
+                    printArray(value, quote, isSequence, out);
+                    close(out);
+                }
+            }
+        }
+        return wasSingular;
+    }
+
+    private static final RbPath RB_SEQUENCE = RbPath.of("Sequence"); // Used by appendValues().
+    private static final RbPath RB_RULES = RbPath.of("rules"); // This and below: mustBeArray().
+    private static final RbPath RB_LOCALE_SCRIPT = RbPath.of("LocaleScript");
+    private static final RbPath RB_ERAS = RbPath.of("eras");
+    private static final RbPath RB_NAMED = RbPath.of("named");
+    private static final RbPath RB_CALENDAR_PREFERENCE_DATA = RbPath.of("calendarPreferenceData");
+    private static final RbPath RB_METAZONE_INFO = RbPath.of("metazoneInfo");
+
+    /**
+     * Wrapper for a hack which decides whether the values of the given rb path must always be
+     * presented as an array. The "top values" check concerns the list of values as a whole,
+     * while the other check concerns the elements of a single value.
+     */
+    // TODO: Verify this is still needed, and either make it less hacky, or delete it.
+    private static boolean mustBeArray(boolean topValues, String name, RbPath rbPath) {
+        if (topValues) {
+            // matches "rules/setNN" (hence the mucking about with raw segments).
+            if (!name.equals("pluralRanges") || !rbPath.startsWith(RB_RULES)) {
+                return false;
+            }
+            return rbPath.getSegment(1).startsWith("set");
+        }
+        if (rbPath.equals(RB_LOCALE_SCRIPT)) {
+            return true;
+        }
+        if (rbPath.contains(RB_ERAS)) {
+            String lastSegment = rbPath.getSegment(rbPath.length() - 1);
+            if (!lastSegment.endsWith(":alias") && !rbPath.endsWith(RB_NAMED)) {
+                return true;
+            }
+        }
+        return rbPath.startsWith(RB_CALENDAR_PREFERENCE_DATA)
+            || rbPath.startsWith(RB_METAZONE_INFO);
+    }
+
+    // Prints every element of the value on a line of its own. Each element is followed by a
+    // comma, except when the value belongs to a sequence.
+    private void printArray(RbValue rbValue, boolean quote, boolean isSequence, PrintWriter out) {
+        int count = rbValue.size();
+        for (int index = 0; index < count; index++) {
+            newLineAndIndent(out);
+            String element = quoteInside(rbValue.getElement(index));
+            printValue(out, element, quote);
+            if (isSequence) {
+                continue;
+            }
+            out.print(",");
+        }
+    }
+
+    // Prints the value as given, enclosed in double quotes if requested.
+    private static void printValue(PrintWriter out, String value, boolean quote) {
+        if (!quote) {
+            out.append(value);
+            return;
+        }
+        out.append('"');
+        out.append(value);
+        out.append('"');
+    }
+
+    /**
+     * Returns a position at or before {@code end} at which the given string can be broken
+     * without splitting an escaped Unicode character, an escape sequence or a surrogate pair.
+     * If {@code end} lies beyond the string, the length of the string is returned.
+     *
+     * @param quoted the (already escaped) string to be broken.
+     * @param end the preferred break position.
+     * @return the position at which to break the string.
+     */
+    // Can a string be broken here? If not, backup until we can.
+    // TODO: Either don't bother line wrapping or look at making this use a line-break iterator.
+    private static int goodBreak(String quoted, int end) {
+        if (end > quoted.length()) {
+            return quoted.length();
+        }
+        // Don't break escaped Unicode characters.
+        // Need to handle both e.g. \u4E00 and \U00020000
+        // Look back over at most 9 characters, but never before the start of the string (which
+        // could otherwise happen for a small 'end' preceded only by hex digits).
+        int limit = Math.max(end - 10, -1);
+        for (int i = end - 1; i > limit;) {
+            char current = quoted.charAt(i--);
+            if (!isAsciiHexDigit(current)) {
+                if ((current == 'u' || current == 'U') && i > limit
+                    && quoted.charAt(i) == '\\') {
+                    // Break before the backslash which starts the escaped character.
+                    return i;
+                }
+                break;
+            }
+        }
+        // Back up over any trailing backslashes and surrogates.
+        while (end > 0) {
+            char ch = quoted.charAt(end - 1);
+            if (ch != '\\' && (ch < '\uD800' || ch > '\uDFFF')) {
+                break;
+            }
+            --end;
+        }
+        return end;
+    }
+
+    // Tests for one of [0-9A-Fa-f] directly (avoids compiling a regular expression per char).
+    private static boolean isAsciiHexDigit(char c) {
+        return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f');
+    }
+
+    // Fix characters inside strings: double quotes are replaced by their Unicode escape and
+    // each match of the relevant escape pattern gets additional backslashes in front of it.
+    private static String quoteInside(String item) {
+        // Unicode-escape all quotes.
+        String text = QUOTE_ESCAPE.matcher(item).replaceAll("\\\\u0022");
+        // Double up on backslashes, ignoring Unicode-escaped characters.
+        boolean isBracketed = text.startsWith("[") && text.endsWith("]");
+        Matcher matcher = (isBracketed ? UNICODESET_ESCAPE : STRING_ESCAPE).matcher(text);
+
+        StringBuilder result = null;
+        int copiedUpTo = 0;
+        while (matcher.find()) {
+            if (result == null) {
+                result = new StringBuilder();
+            }
+            // Copy the text before the match, then one extra backslash (two if the match ends
+            // with a backslash), then the match itself.
+            result.append(text, copiedUpTo, matcher.start());
+            result.append("\\");
+            if (text.codePointAt(matcher.end() - 1) == '\\') {
+                result.append('\\');
+            }
+            result.append(matcher.group());
+            copiedUpTo = matcher.end();
+        }
+        if (result == null) {
+            // Nothing matched, so there is nothing more to escape.
+            return text;
+        }
+        result.append(text.substring(copiedUpTo));
+        return result.toString();
+    }
+}
diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/LdmlConverter.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/LdmlConverter.java

new file mode 100644 (file)

index 0000000..9d1aaa3
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/LdmlConverter.java
@@ -0,0 +1,618 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkNotNull;
+import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.BRKITR;
+import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.COLL;
+import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.CURR;
+import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.LANG;
+import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.LOCALES;
+import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.RBNF;
+import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.REGION;
+import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.UNIT;
+import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.ZONE;
+import static java.util.stream.Collectors.toList;
+import static org.unicode.cldr.api.CldrDataType.BCP47;
+import static org.unicode.cldr.api.CldrDataType.LDML;
+import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.function.Consumer;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import org.unicode.cldr.api.CldrData;
+import org.unicode.cldr.api.CldrDataSupplier;
+import org.unicode.cldr.api.CldrDataType;
+
+import com.google.common.base.CharMatcher;
+import com.google.common.collect.HashMultimap;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.LinkedListMultimap;
+import com.google.common.collect.ListMultimap;
+import com.google.common.collect.SetMultimap;
+import com.google.common.collect.Sets;
+import com.google.common.io.CharStreams;
+import org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir;
+import org.unicode.icu.tool.cldrtoicu.mapper.Bcp47Mapper;
+import org.unicode.icu.tool.cldrtoicu.mapper.BreakIteratorMapper;
+import org.unicode.icu.tool.cldrtoicu.mapper.CollationMapper;
+import org.unicode.icu.tool.cldrtoicu.mapper.DayPeriodsMapper;
+import org.unicode.icu.tool.cldrtoicu.mapper.LocaleMapper;
+import org.unicode.icu.tool.cldrtoicu.mapper.PluralRangesMapper;
+import org.unicode.icu.tool.cldrtoicu.mapper.PluralsMapper;
+import org.unicode.icu.tool.cldrtoicu.mapper.RbnfMapper;
+import org.unicode.icu.tool.cldrtoicu.mapper.SupplementalMapper;
+import org.unicode.icu.tool.cldrtoicu.mapper.TransformsMapper;
+import org.unicode.icu.tool.cldrtoicu.regex.RegexTransformer;
+
+/**
+ * The main converter tool for CLDR to ICU data. To run this tool, you need to supply a suitable
+ * {@link LdmlConverterConfig} instance. There is a simple {@code main()} method available in this
+ * class which can be invoked passing just the desired output directory and which relies on the
+ * presence of several system properties for the remainder of its parameters:
+ * <ul>
+ *     <li>CLDR_DIR: The root of the CLDR release from which CLDR data is read.
+ *     <li>ICU_DIR: The root of the ICU release from which additional "specials" XML data is read.
+ *     <li>CLDR_DTD_CACHE: A temporary directory with the various DTDs cached (this is a legacy
+ *         requirement from the underlying CLDR libraries and might go away one day).
+ * </ul>
+ */
+public final class LdmlConverter {
+    // TODO: Do all supplemental data in one go and split similarly to locale data (using RbPath).
+    private static final PathMatcher GENDER_LIST_PATHS =
+        supplementalMatcher("gender"); // Each spec is prefixed with "supplementalData/".
+    private static final PathMatcher LIKELY_SUBTAGS_PATHS =
+        supplementalMatcher("likelySubtags");
+    private static final PathMatcher METAZONE_PATHS =
+        supplementalMatcher("metaZones", "primaryZones");
+    private static final PathMatcher METADATA_PATHS =
+        supplementalMatcher("metadata");
+    private static final PathMatcher SUPPLEMENTAL_DATA_PATHS =
+        supplementalMatcher(
+            "calendarData",
+            "calendarPreferenceData",
+            "codeMappings",
+            "codeMappingsCurrency",
+            "idValidity",
+            "languageData",
+            "languageMatching",
+            "measurementData",
+            "parentLocales",
+            "subdivisionContainment",
+            "territoryContainment",
+            "territoryInfo",
+            "timeData",
+            "unitPreferenceData",
+            "weekData",
+            "weekOfPreference");
+    private static final PathMatcher CURRENCY_DATA_PATHS =
+        supplementalMatcher("currencyData");
+    private static final PathMatcher NUMBERING_SYSTEMS_PATHS =
+        supplementalMatcher("numberingSystems");
+    private static final PathMatcher WINDOWS_ZONES_PATHS =
+        supplementalMatcher("windowsZones");
+
+    // Special IDs which are not supported via CLDR, but for which synthetic data is injected.
+    // The "TRADITIONAL" variants are here because their calendar differs from the non-variant
+    // locale. However CLDR cannot represent this currently because calendar defaults are in
+    // supplemental data (rather than locale data) and are keyed only on territory.
+    private static final ImmutableSet<String> PHANTOM_LOCALE_IDS =
+        ImmutableSet.of("ja_JP_TRADITIONAL", "th_TH_TRADITIONAL");
+
+    // Special alias mapping which exists in ICU even though "no_NO_NY" is simply not a
+    // structurally valid locale ID. This is injected manually when creating the alias map.
+    // This does mean that nobody can ever parse the _keys_ of the alias map, but so far there
+    // has been no need for that.
+    // TODO: Get "ars" into CLDR and remove this hack.
+    private static final Map<String, String> PHANTOM_ALIASES =
+        ImmutableMap.of("ars", "ar_SA", "no_NO_NY", "nn_NO");
+
+    // Returns a matcher for the given specifications, each of which is first prefixed with
+    // "supplementalData/". At least one specification must be given.
+    private static PathMatcher supplementalMatcher(String... spec) {
+        checkArgument(spec.length > 0, "must supply at least one matcher spec");
+        PathMatcher[] matchers = new PathMatcher[spec.length];
+        for (int i = 0; i < spec.length; i++) {
+            matchers[i] = PathMatcher.of("supplementalData/" + spec[i]);
+        }
+        // A lone matcher is returned directly rather than being wrapped.
+        return matchers.length == 1 ? matchers[0] : PathMatcher.anyOf(matchers);
+    }
+
+    private static RbPath RB_PARENT = RbPath.of("%%Parent");
+    // The quotes below are only so we achieve parity with the manually written alias files.
+    // TODO: Remove unnecessary quotes once the migration to this code is complete.
+    private static RbPath RB_ALIAS = RbPath.of("\"%%ALIAS\"");
+    // Special path for adding to empty files which only exist to complete the parent chain.
+    // TODO: Confirm that this has no meaningful effect and unify "empty" file contents.
+    private static RbPath RB_EMPTY_ALIAS = RbPath.of("___");
+
+    /** Provisional entry point until better config support exists. */
+    public static void main(String... args) {
+        convert(IcuConverterConfig.builder()
+            .setOutputDir(Paths.get(args[0]))
+            .setEmitReport(true)
+            .build());
+    }
+
+    /**
+     * Output types defining specific subsets of the ICU data which can be converted separately.
+     * This closely mimics the original "NewLdml2IcuConverter" behaviour but could be simplified to
+     * hide what are essentially implementation specific data splits.
+     *
+     * <p>Each type pairs the CLDR data type it belongs to with the action run on the converter.
+     * The "DTD_" types are batches, which run every other type of the same CLDR data type.
+     */
+    public enum OutputType {
+        LOCALES(LDML, LdmlConverter::processLocales),
+        BRKITR(LDML, LdmlConverter::processBrkitr),
+        COLL(LDML, LdmlConverter::processCollation),
+        RBNF(LDML, LdmlConverter::processRbnf),
+
+        DAY_PERIODS(
+            SUPPLEMENTAL,
+            c -> c.processDayPeriods("misc")),
+        GENDER_LIST(
+            SUPPLEMENTAL,
+            c -> c.processSupplemental("genderList", GENDER_LIST_PATHS, "misc", false)),
+        LIKELY_SUBTAGS(
+            SUPPLEMENTAL,
+            c -> c.processSupplemental("likelySubtags", LIKELY_SUBTAGS_PATHS, "misc", false)),
+        SUPPLEMENTAL_DATA(
+            SUPPLEMENTAL,
+            c -> c.processSupplemental("supplementalData", SUPPLEMENTAL_DATA_PATHS, "misc", true)),
+        CURRENCY_DATA(
+            SUPPLEMENTAL,
+            c -> c.processSupplemental("supplementalData", CURRENCY_DATA_PATHS, "curr", true)),
+        METADATA(
+            SUPPLEMENTAL,
+            c -> c.processSupplemental("metadata", METADATA_PATHS, "misc", false)),
+        META_ZONES(
+            SUPPLEMENTAL,
+            c -> c.processSupplemental("metaZones", METAZONE_PATHS, "misc", false)),
+        NUMBERING_SYSTEMS(
+            SUPPLEMENTAL,
+            c -> c.processSupplemental("numberingSystems", NUMBERING_SYSTEMS_PATHS, "misc", false)),
+        PLURALS(
+            SUPPLEMENTAL,
+            c -> c.processPlurals("misc")),
+        PLURAL_RANGES(
+            SUPPLEMENTAL,
+            c -> c.processPluralRanges("misc")),
+        WINDOWS_ZONES(
+            SUPPLEMENTAL,
+            c -> c.processSupplemental("windowsZones", WINDOWS_ZONES_PATHS, "misc", false)),
+        TRANSFORMS(
+            SUPPLEMENTAL,
+            c -> c.processTransforms("translit")),
+        KEY_TYPE_DATA(
+            BCP47,
+            c -> c.processKeyTypeData("misc")),
+
+        // Batching by type.
+        DTD_LDML(LDML, c -> c.processAll(LDML)),
+        DTD_SUPPLEMENTAL(SUPPLEMENTAL, c -> c.processAll(SUPPLEMENTAL)),
+        DTD_BCP47(BCP47, c -> c.processAll(BCP47));
+
+        public static final ImmutableSet<OutputType> ALL =
+            ImmutableSet.of(DTD_BCP47, DTD_SUPPLEMENTAL, DTD_LDML); // The three batch types.
+
+        private final CldrDataType type;
+        private final Consumer<LdmlConverter> converterFn;
+
+        OutputType(CldrDataType type, Consumer<LdmlConverter> converterFn) {
+            this.type = checkNotNull(type);
+            this.converterFn = checkNotNull(converterFn);
+        }
+
+        void convert(LdmlConverter converter) { // Runs this type's action on the converter.
+            converterFn.accept(converter);
+        }
+
+        CldrDataType getCldrType() { // The CLDR data type given for this output type.
+            return type;
+        }
+    }
+
+    // Runs a complete conversion according to the given configuration, reading CLDR files from
+    // the configured directory and ignoring data below the configured minimum draft status.
+    private static void convert(LdmlConverterConfig config) {
+        CldrDataSupplier src =
+            CldrDataSupplier.forCldrFilesIn(config.getCldrDirectory())
+                .withDraftStatusAtLeast(config.getMinimumDraftStatus());
+        LdmlConverter converter = new LdmlConverter(config, src);
+        converter.convertAll(config);
+    }
+
+    // The configuration controlling conversion behaviour.
+    private final LdmlConverterConfig config;
+    // The supplier for all data to be converted.
+    private final CldrDataSupplier src;
+    // The set of available locale IDs.
+    // TODO: Make available IDs include specials files (or fail if specials are not available).
+    private final ImmutableSet<String> availableIds;
+    // Supplemental data available to mappers if needed.
+    private final SupplementalData supplementalData;
+    // Transformer for locale data.
+    private final PathValueTransformer localeTransformer;
+    // Transformer for supplemental data.
+    private final PathValueTransformer supplementalTransformer;
+    // Header string to go into every ICU data file.
+    private final ImmutableList<String> icuFileHeader;
+
+    private LdmlConverter(LdmlConverterConfig config, CldrDataSupplier src) {
+        this.config = checkNotNull(config);
+        this.src = checkNotNull(src);
+        this.supplementalData = SupplementalData.create(src.getDataForType(SUPPLEMENTAL));
+        // Sort the set of available locale IDs but add "root" at the front. This is the
+        // set of non-alias locale IDs to be processed.
+        Set<String> localeIds = new LinkedHashSet<>();
+        localeIds.add("root");
+        localeIds.addAll(
+            Sets.intersection(src.getAvailableLocaleIds(), config.getTargetLocaleIds(LOCALES)));
+        localeIds.addAll(PHANTOM_LOCALE_IDS);
+        this.availableIds = ImmutableSet.copyOf(localeIds);
+
+        // Load the remaining path value transformers.
+        this.supplementalTransformer =
+            RegexTransformer.fromConfigLines(readLinesFromResource("/ldml2icu_supplemental.txt"),
+                IcuFunctions.ALGORITHM_FN,
+                IcuFunctions.DATE_FN,
+                IcuFunctions.DAY_NUMBER_FN,
+                IcuFunctions.EXP_FN,
+                IcuFunctions.YMD_FN);
+        this.localeTransformer =
+            RegexTransformer.fromConfigLines(readLinesFromResource("/ldml2icu_locale.txt"),
+                IcuFunctions.CONTEXT_TRANSFORM_INDEX_FN);
+        this.icuFileHeader = ImmutableList.copyOf(readLinesFromResource("/ldml2icu_header.txt"));
+    }
+
+    // Converts every output type requested by the configuration, grouped by CLDR data type,
+    // and then prints both transformers if a report was requested.
+    private void convertAll(LdmlConverterConfig config) {
+        ListMultimap<CldrDataType, OutputType> typesByCldrType = LinkedListMultimap.create();
+        config.getOutputTypes()
+            .forEach(outputType -> typesByCldrType.put(outputType.getCldrType(), outputType));
+        for (CldrDataType cldrType : typesByCldrType.keySet()) {
+            typesByCldrType.get(cldrType).forEach(outputType -> outputType.convert(this));
+        }
+        if (!config.emitReport()) {
+            return;
+        }
+        System.out.println("Supplemental Data Transformer=" + supplementalTransformer);
+        System.out.println("Locale Data Transformer=" + localeTransformer);
+    }
+
+    private static List<String> readLinesFromResource(String name) {
+        try (InputStream in = LdmlConverter.class.getResourceAsStream(name)) {
+            return CharStreams.readLines(new InputStreamReader(in));
+        } catch (IOException e) {
+            throw new RuntimeException("cannot read resource: " + name, e);
+        }
+    }
+
+    private PathValueTransformer getLocaleTransformer() { // Loaded from "/ldml2icu_locale.txt".
+        return localeTransformer;
+    }
+
+    private PathValueTransformer getSupplementalTransformer() { // "/ldml2icu_supplemental.txt".
+        return supplementalTransformer;
+    }
+
+    // Runs every output type of the given CLDR data type, other than the "DTD_" batch types
+    // (which are what invoke this method in the first place).
+    private void processAll(CldrDataType cldrType) {
+        for (OutputType outputType : OutputType.values()) {
+            if (!outputType.getCldrType().equals(cldrType)) {
+                continue;
+            }
+            if (outputType.name().startsWith("DTD_")) {
+                continue;
+            }
+            outputType.convert(this);
+        }
+    }
+
+    // Searches the specials directory (recursively) for regular files called "<localeId>.xml"
+    // and returns the LDML data loaded from all of them, or empty if there are no such files.
+    private Optional<CldrData> loadSpecialsData(String localeId) {
+        String xmlFileName = localeId + ".xml";
+        try (Stream<Path> candidates = Files.walk(config.getSpecialsDir())) {
+            Set<Path> matches = candidates
+                .filter(Files::isRegularFile)
+                .filter(file -> file.getFileName().toString().equals(xmlFileName))
+                .collect(Collectors.toSet());
+            if (matches.isEmpty()) {
+                return Optional.empty();
+            }
+            return Optional.of(
+                CldrDataSupplier.forCldrFiles(LDML, config.getMinimumDraftStatus(), matches));
+        } catch (IOException e) {
+            throw new RuntimeException(
+                "error processing specials directory: " + config.getSpecialsDir(), e);
+        }
+    }
+
+    // Converts the main locale data, which is split over several output directories.
+    private void processLocales() {
+        // TODO: Pre-load specials files to avoid repeatedly re-loading them.
+        Function<String, IcuData> mapper =
+            id -> LocaleMapper.process(
+                id, src, loadSpecialsData(id), getLocaleTransformer(), supplementalData);
+        processAndSplitLocaleFiles(mapper, CURR, LANG, LOCALES, REGION, UNIT, ZONE);
+    }
+
+    // Converts the break iterator data of each locale.
+    private void processBrkitr() {
+        Function<String, IcuData> mapper =
+            id -> BreakIteratorMapper.process(id, src, loadSpecialsData(id));
+        processAndSplitLocaleFiles(mapper, BRKITR);
+    }
+
+    // Converts the collation data of each locale.
+    private void processCollation() {
+        Function<String, IcuData> mapper =
+            id -> CollationMapper.process(id, src, loadSpecialsData(id));
+        processAndSplitLocaleFiles(mapper, COLL);
+    }
+
+    // Converts the RBNF data of each locale.
+    private void processRbnf() {
+        Function<String, IcuData> mapper =
+            id -> RbnfMapper.process(id, src, loadSpecialsData(id));
+        processAndSplitLocaleFiles(mapper, RBNF);
+    }
+
+    private void processAndSplitLocaleFiles(
+        Function<String, IcuData> icuFn, IcuLocaleDir... splitDirs) { // Writes data per directory.
+
+        SetMultimap<IcuLocaleDir, String> writtenLocaleIds = HashMultimap.create();
+        Path baseDir = config.getOutputDir();
+
+        for (String id : config.getTargetLocaleIds(LOCALES)) {
+            // Skip "target" IDs that are aliases (they are handled later).
+            if (!availableIds.contains(id)) {
+                continue;
+            }
+            IcuData icuData = icuFn.apply(id); // All the data for this ID, before splitting.
+
+            ListMultimap<IcuLocaleDir, RbPath> splitPaths = LinkedListMultimap.create();
+            for (RbPath p : icuData.getPaths()) {
+                String rootName = getBaseSegmentName(p.getSegment(0)); // Split by first segment.
+                splitPaths.put(LOCALE_SPLIT_INFO.getOrDefault(rootName, LOCALES), p);
+            }
+
+            // We always write base languages (even if empty).
+            boolean isBaseLanguage = !id.contains("_");
+            // Run through all directories (not just the keySet() of the split path map) since we
+            // sometimes write empty files.
+            for (IcuLocaleDir dir : splitDirs) {
+                Set<String> targetIds = config.getTargetLocaleIds(dir);
+                if (!targetIds.contains(id)) {
+                    if (!splitPaths.get(dir).isEmpty()) {
+                        System.out.format(
+                            "target IDs for %s does not contain %s, but it has data: %s\n",
+                            dir, id, splitPaths.get(dir));
+                    }
+                    continue;
+                }
+                Path outDir = baseDir.resolve(dir.getOutputDir());
+                IcuData splitData = new IcuData(icuData.getName(), icuData.hasFallback());
+                // The split data can still be empty for this directory, but that's expected.
+                splitPaths.get(dir).forEach(p -> splitData.add(p, icuData.get(p)));
+                // Adding a parent locale makes the data non-empty and forces it to be written.
+                supplementalData.getExplicitParentLocaleOf(splitData.getName())
+                    .ifPresent(p -> splitData.add(RB_PARENT, p));
+                if (!splitData.isEmpty() || isBaseLanguage || dir.includeEmpty()) {
+                    splitData.setVersion(CldrDataSupplier.getCldrVersionString());
+                    write(splitData, outDir);
+                    writtenLocaleIds.put(dir, id);
+                }
+            }
+        }
+
+        for (IcuLocaleDir dir : splitDirs) { // Second pass: alias files and empty files.
+            Path outDir = baseDir.resolve(dir.getOutputDir());
+            Set<String> targetIds = config.getTargetLocaleIds(dir);
+
+            Map<String, String> aliasMap = getAliasMap(targetIds, dir);
+            aliasMap.forEach((s, t) -> {
+                // It's only important to record which alias files are written because of forced
+                // aliases, but since it's harmless otherwise, we just do it unconditionally.
+                // Normal alias files don't affect the empty file calculation, but forced ones can.
+                writtenLocaleIds.put(dir, s);
+                writeAliasFile(s, t, outDir);
+            });
+
+            calculateEmptyFiles(writtenLocaleIds.get(dir), aliasMap.values())
+                .forEach(id -> writeEmptyFile(id, outDir, aliasMap.values()));
+        }
+    }
+
+    /**
+     * Returns the alias mapping (source ID to target ID) for the given locale IDs in the given
+     * directory, with the configured forced aliases for that directory applied last.
+     *
+     * @throws IllegalArgumentException if a phantom alias is also an available locale, or if
+     *     an unavailable, canonical ID cannot be maximized to a different ID.
+     */
+    private Map<String, String> getAliasMap(Set<String> localeIds, IcuLocaleDir dir) {
+        // A locale ID is treated as an alias for one of four reasons:
+        // 1: It contains deprecated subtags (e.g. "sr_YU", which should be "sr_Cyrl_RS").
+        // 2: It has no CLDR data but is missing a script subtag.
+        // 3: It is one of the special "phantom" aliases which cannot be represented normally
+        //    and must be manually mapped (e.g. legacy locale IDs which don't even parse).
+        // 4: It is a "super special" forced alias, which might replace existing aliases in
+        //    some output directories.
+        Map<String, String> aliases = new LinkedHashMap<>();
+        for (String localeId : localeIds) {
+            if (PHANTOM_ALIASES.containsKey(localeId)) {
+                checkArgument(!availableIds.contains(localeId),
+                    "phantom aliases should never be otherwise supported: %s\n"
+                        + "(maybe the phantom alias can now be removed?)", localeId);
+                aliases.put(localeId, PHANTOM_ALIASES.get(localeId));
+                continue;
+            }
+            String canonicalId = supplementalData.replaceDeprecatedTags(localeId);
+            if (!canonicalId.equals(localeId)) {
+                // The canonical form differs from the requested ID, so the requested ID is an
+                // alias which just needs to point to the canonical ID.
+                aliases.put(localeId, canonicalId);
+            } else if (!availableIds.contains(localeId)) {
+                // Canonical but not supported: maximize it and alias to that. (An ID which is
+                // canonical and supported is not an alias and needs no entry.)
+                String maximizedId = supplementalData.maximize(localeId)
+                    .orElseThrow(
+                        () -> new IllegalArgumentException("unsupported locale ID: " + localeId));
+                // An ID cannot alias to itself, and we should not be here if the ID was already
+                // maximal.
+                checkArgument(
+                    !maximizedId.equals(localeId), "unsupported maximized locale ID: %s", localeId);
+                aliases.put(localeId, maximizedId);
+            }
+        }
+        // Forced aliases must overwrite any "natural" alias already calculated above, since the
+        // alias target affects how empty files are determined later.
+        aliases.putAll(config.getForcedAliases(dir));
+        return aliases;
+    }
+
+    // Matches the characters which introduce a modifier in a resource bundle path element.
+    private static final CharMatcher PATH_MODIFIER = CharMatcher.anyOf(":%");
+
+    // Resource bundle path elements can have variants (e.g. "Currencies%narrow") or type
+    // annotations (e.g. "languages:intvector"). Everything from the first modifier character
+    // onwards is stripped when considering the element name.
+    private static String getBaseSegmentName(String segment) {
+        int modifierStart = PATH_MODIFIER.indexIn(segment);
+        if (modifierStart < 0) {
+            // No modifier present, so the segment already is the base name.
+            return segment;
+        }
+        return segment.substring(0, modifierStart);
+    }
+
+    /** Writes the output of {@code DayPeriodsMapper} to {@code dir} under the output root. */
+    private void processDayPeriods(String dir) {
+        IcuData dayPeriods = DayPeriodsMapper.process(src);
+        write(dayPeriods, dir);
+    }
+
+    /** Writes the output of {@code PluralsMapper} to {@code dir} under the output root. */
+    private void processPlurals(String dir) {
+        IcuData plurals = PluralsMapper.process(src);
+        write(plurals, dir);
+    }
+
+    /** Writes the output of {@code PluralRangesMapper} to {@code dir} under the output root. */
+    private void processPluralRanges(String dir) {
+        IcuData pluralRanges = PluralRangesMapper.process(src);
+        write(pluralRanges, dir);
+    }
+
+    /** Writes each item produced by {@code Bcp47Mapper} to {@code dir} under the output root. */
+    private void processKeyTypeData(String dir) {
+        Bcp47Mapper.process(src)
+            .forEach(keyTypeData -> write(keyTypeData, dir));
+    }
+
+    /**
+     * Creates the transforms output directory, then writes the output of
+     * {@code TransformsMapper} (which is also given that directory) into it.
+     */
+    private void processTransforms(String dir) {
+        Path outputPath = config.getOutputDir().resolve(dir);
+        // The directory must exist before the mapper runs, since the mapper is handed the path.
+        Path transformDir = createDirectory(outputPath);
+        write(TransformsMapper.process(src, transformDir), transformDir);
+    }
+
+    // Resource bundle path under which the CLDR version string is added.
+    private static final RbPath RB_CLDR_VERSION = RbPath.of("cldrVersion");
+
+    /**
+     * Writes the output of {@code SupplementalMapper} for the given label and paths to
+     * {@code dir}, optionally adding the CLDR version string.
+     */
+    private void processSupplemental(
+        String label, PathMatcher paths, String dir, boolean addCldrVersion) {
+        IcuData supplemental =
+            SupplementalMapper.process(src, getSupplementalTransformer(), label, paths);
+        if (addCldrVersion) {
+            // A hack for "supplementalData.txt", since the "cldrVersion" value doesn't come
+            // from the supplemental data XML files.
+            supplemental.add(RB_CLDR_VERSION, CldrDataSupplier.getCldrVersionString());
+        }
+        write(supplemental, dir);
+    }
+
+    /** Writes a file for {@code srcId} whose only entry is an alias to {@code destId}. */
+    private void writeAliasFile(String srcId, String destId, Path dir) {
+        IcuData aliasData = new IcuData(srcId, true);
+        aliasData.add(RB_ALIAS, destId);
+        write(aliasData, dir);
+    }
+
+    /**
+     * Writes a placeholder file for {@code id}. What goes into the file depends on whether the
+     * ID is itself one of the given alias targets.
+     */
+    private void writeEmptyFile(String id, Path dir, Collection<String> aliasTargets) {
+        IcuData emptyData = new IcuData(id, true);
+        // TODO: Document the reason for this (i.e. why does it matter what goes into empty files?)
+        if (!aliasTargets.contains(id)) {
+            // An "indirect alias target": this file only exists because the target of an alias
+            // has a parent locale which is itself not in the set of written ICU files.
+            emptyData.setVersion(CldrDataSupplier.getCldrVersionString());
+        } else {
+            emptyData.setFileComment("generated alias target");
+            emptyData.add(RB_EMPTY_ALIAS, "");
+        }
+        write(emptyData, dir);
+    }
+
+    /** Writes the data into the named directory, resolved against the configured output root. */
+    private void write(IcuData icuData, String dir) {
+        Path outputPath = config.getOutputDir().resolve(dir);
+        write(icuData, outputPath);
+    }
+
+    /** Ensures the directory exists, then writes the data into it with the ICU file header. */
+    private void write(IcuData icuData, Path dir) {
+        // createDirectory() hands back the same path it was given, once the directory exists.
+        Path existingDir = createDirectory(dir);
+        IcuTextWriter.writeToFile(icuData, existingDir, icuFileHeader);
+    }
+
+    /**
+     * Creates the given directory (and any missing parents) and returns the path it was given.
+     *
+     * @throws RuntimeException wrapping the {@link IOException} if creation fails.
+     */
+    private Path createDirectory(Path dir) {
+        try {
+            Files.createDirectories(dir);
+            // Deliberately return the caller's own path instance.
+            return dir;
+        } catch (IOException e) {
+            throw new RuntimeException("cannot create directory: " + dir, e);
+        }
+    }
+
+    // Determines which locale IDs need an empty file generating for them.
+    //
+    // The seed set of IDs is:
+    // * any file that was written
+    // * any alias target (not written)
+    //
+    // The seeds are expanded to their complete "closure" under the "getParent()" function,
+    // stopping at "root". That closure holds every file (written or not) which needs to exist
+    // to complete the locale hierarchy.
+    //
+    // Removing the written files then leaves just the ones that need to be generated. This is
+    // a simple and robust approach that handles things like "gaps" in non-aliased locale IDs,
+    // where an intermediate parent is not present.
+    private ImmutableSet<String> calculateEmptyFiles(
+        Set<String> writtenIds, Collection<String> aliasTargetIds) {
+
+        Set<String> seedIds = new HashSet<>(writtenIds);
+        seedIds.addAll(aliasTargetIds);
+        // A sorted set keeps the output ordered, which makes debugging easier.
+        Set<String> closure = new TreeSet<>();
+        for (String seedId : seedIds) {
+            String current = seedId;
+            // add() returning false means this ID (and hence its ancestors) was already seen.
+            while (!current.equals("root") && closure.add(current)) {
+                current = supplementalData.getParent(current);
+            }
+        }
+        return ImmutableSet.copyOf(Sets.difference(closure, writtenIds));
+    }
+
+    // Maps the base name of a top-level resource bundle path element to the ICU locale
+    // directory used for its data when locale files are split (see
+    // processAndSplitLocaleFiles()). Element names not listed here are looked up with
+    // getOrDefault() and fall back to LOCALES.
+    private static final ImmutableMap<String, IcuLocaleDir> LOCALE_SPLIT_INFO =
+        ImmutableMap.<String, IcuLocaleDir>builder()
+            // BRKITR
+            .put("boundaries", BRKITR)
+            .put("dictionaries", BRKITR)
+            .put("exceptions", BRKITR)
+            // COLL
+            .put("collations", COLL)
+            .put("depends", COLL)
+            .put("UCARules", COLL)
+            // CURR
+            .put("Currencies", CURR)
+            .put("CurrencyPlurals", CURR)
+            .put("CurrencyUnitPatterns", CURR)
+            .put("currencySpacing", CURR)
+            // LANG
+            .put("Keys", LANG)
+            .put("Languages", LANG)
+            .put("Scripts", LANG)
+            .put("Types", LANG)
+            .put("Variants", LANG)
+            .put("characterLabelPattern", LANG)
+            .put("codePatterns", LANG)
+            .put("localeDisplayPattern", LANG)
+            // RBNF
+            .put("RBNFRules", RBNF)
+            // REGION
+            .put("Countries", REGION)
+            // UNIT
+            .put("durationUnits", UNIT)
+            .put("units", UNIT)
+            .put("unitsShort", UNIT)
+            .put("unitsNarrow", UNIT)
+            // ZONE
+            .put("zoneStrings", ZONE)
+            .build();
+}
diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/LdmlConverterConfig.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/LdmlConverterConfig.java

new file mode 100644 (file)

index 0000000..97b1048
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/LdmlConverterConfig.java
@@ -0,0 +1,106 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu;
+
+import java.nio.file.Path;
+import java.util.Map;
+import java.util.Set;
+
+import org.unicode.cldr.api.CldrDraftStatus;
+
+import com.google.common.base.Ascii;
+import org.unicode.icu.tool.cldrtoicu.LdmlConverter.OutputType;
+
+/** API for configuring the LDML converter. */
+public interface LdmlConverterConfig {
+    /** Output directories for ICU locale data (this is not used for supplemental data). */
+    enum IcuLocaleDir {
+        /** Data for the break-iterator library. */
+        BRKITR(true),
+        /** Data for the collations library. */
+        COLL(true),
+        /** Currency data. */
+        CURR(false),
+        /** Language data. */
+        LANG(false),
+        /** General locale data. */
+        LOCALES(true),
+        /** Rule-based number formatter data. */
+        RBNF(true),
+        /** Region data. */
+        REGION(false),
+        /** Measurement and units data. */
+        UNIT(false),
+        /** Timezone data. */
+        ZONE(false);
+
+        private final String dirName;
+        private final boolean includeEmpty;
+
+        IcuLocaleDir(boolean includeEmpty) {
+            // The directory name is simply the lower-cased name of the enum constant.
+            this.dirName = Ascii.toLowerCase(name());
+            this.includeEmpty = includeEmpty;
+        }
+
+        /** Returns the relative output directory name. */
+        String getOutputDir() {
+            return dirName;
+        }
+
+        /**
+         * Returns whether the directory is expected to contain empty data files (used to
+         * advertise the supported set of locales for the "service" provided by the data in
+         * that directory).
+         */
+        // TODO: Document why directories differ in whether they include empty files.
+        boolean includeEmpty() {
+            return includeEmpty;
+        }
+    }
+
+    /**
+     * Returns the set of output types to be converted. Use {@link OutputType#ALL} to convert
+     * everything.
+     */
+    Set<OutputType> getOutputTypes();
+
+    /** Returns the root directory in which the CLDR release is located. */
+    Path getCldrDirectory();
+
+    /**
+     * Returns an additional "specials" directory containing ICU specific XML files. This is
+     * where the converter finds any XML files using the "icu:" namespace.
+     */
+    Path getSpecialsDir();
+
+    /**
+     * Returns the root of the ICU output directory hierarchy into which ICU data files are
+     * written.
+     */
+    Path getOutputDir();
+
+    /** Returns the minimal draft status for CLDR data to be converted. */
+    CldrDraftStatus getMinimumDraftStatus();
+
+    /**
+     * Returns the set of locale IDs to be processed for the given directory.
+     *
+     * <p>This set can contain IDs which have no ICU data associated with them if they are
+     * suitable aliases (e.g. they are deprecated versions of locale IDs for which data does
+     * exist).
+     */
+    Set<String> getTargetLocaleIds(IcuLocaleDir dir);
+
+    /**
+     * Returns a map of locale IDs which specifies aliases that are applied to the given
+     * directory in contradiction to the natural alias or parent ID which would otherwise
+     * be generated. This is a mechanism for restructuring the parent chain and linking
+     * locales together in non-standard and unexpected ways.
+     */
+    Map<String, String> getForcedAliases(IcuLocaleDir dir);
+
+    /**
+     * Returns whether to emit a summary report for debug purposes after conversion is
+     * complete.
+     */
+    boolean emitReport();
+}
diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/PathMatcher.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/PathMatcher.java

new file mode 100644 (file)

index 0000000..e6e8e4d
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/PathMatcher.java
@@ -0,0 +1,259 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkNotNull;
+import static com.google.common.base.Preconditions.checkPositionIndex;
+import static com.google.common.base.Preconditions.checkState;
+import static com.google.common.collect.ImmutableMap.toImmutableMap;
+import static org.unicode.cldr.api.AttributeKey.keyOf;
+
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.function.Predicate;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.unicode.cldr.api.AttributeKey;
+import org.unicode.cldr.api.CldrPath;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
+
+/**
+ * An immutable matcher for {@link CldrPath} instances. A path matcher specification looks like
+ * {@code "foo/*[@x="z"]/bar[@y=*]"}, where element names and attribute values can be wildcards.
+ *
+ * <p>Note that the path fragment represented by the specification does not include either leading
+ * or trailing {@code '/'}. This is because matching can occur at any point in a {@code CldrPath}.
+ * The choice of where to match in the path is governed by the match method used (e.g.
+ * {@link PathMatcher#matchesSuffixOf(CldrPath)}).
+ */
+public abstract class PathMatcher {
+    /**
+     * Parses the path specification into a matcher.
+     *
+     * @param pathSpec the specification, e.g. {@code "a"}, {@code "a/b"} or {@code "a/b[@x=*]"}.
+     * @return a matcher for the given specification.
+     * @throws IllegalArgumentException if the specification is malformed.
+     */
+    public static PathMatcher of(String pathSpec) {
+        // Supported so far: "a", "a/b", "a/b[@x=*]"
+        return new BasicMatcher(parse(pathSpec));
+    }
+
+    /**
+     * Combines the given matchers into a single composite matcher which tests all the given
+     * matchers in order, and matches if any one of them matches. A single matcher is returned
+     * as-is.
+     *
+     * @throws IllegalArgumentException if no matchers are given.
+     */
+    public static PathMatcher anyOf(PathMatcher... matchers) {
+        checkArgument(matchers.length > 0, "must supply at least one matcher");
+        if (matchers.length == 1) {
+            return checkNotNull(matchers[0]);
+        }
+        return new CompositeMatcher(ImmutableList.copyOf(matchers));
+    }
+
+    /** Attempts a full match against a given path. */
+    public abstract boolean matches(CldrPath path);
+
+    /** Attempts a suffix match against a given path. */
+    public abstract boolean matchesSuffixOf(CldrPath path);
+
+    /** Attempts a prefix match against a given path. */
+    public abstract boolean matchesPrefixOf(CldrPath path);
+
+    // A matcher that simply combines a sequence of other matchers in order.
+    private static final class CompositeMatcher extends PathMatcher {
+        private final ImmutableList<PathMatcher> matchers;
+
+        private CompositeMatcher(ImmutableList<PathMatcher> matchers) {
+            // Null-check before the list is dereferenced by the size check.
+            this.matchers = checkNotNull(matchers);
+            checkArgument(matchers.size() > 1);
+        }
+
+        @Override
+        public boolean matches(CldrPath path) {
+            for (PathMatcher m : matchers) {
+                if (m.matches(path)) {
+                    return true;
+                }
+            }
+            return false;
+        }
+
+        @Override
+        public boolean matchesSuffixOf(CldrPath path) {
+            for (PathMatcher m : matchers) {
+                if (m.matchesSuffixOf(path)) {
+                    return true;
+                }
+            }
+            return false;
+        }
+
+        @Override
+        public boolean matchesPrefixOf(CldrPath path) {
+            for (PathMatcher m : matchers) {
+                if (m.matchesPrefixOf(path)) {
+                    return true;
+                }
+            }
+            return false;
+        }
+    }
+
+    // A matcher for a single parsed specification, holding one predicate per path element.
+    private static final class BasicMatcher extends PathMatcher {
+        private final ImmutableList<Predicate<CldrPath>> elementMatchers;
+
+        private BasicMatcher(List<Predicate<CldrPath>> elementMatchers) {
+            this.elementMatchers = ImmutableList.copyOf(elementMatchers);
+        }
+
+        @Override
+        public boolean matches(CldrPath path) {
+            return elementMatchers.size() == path.getLength() && matchRegion(path, 0);
+        }
+
+        @Override
+        public boolean matchesSuffixOf(CldrPath path) {
+            int start = path.getLength() - elementMatchers.size();
+            return start >= 0 && matchRegion(path, start);
+        }
+
+        @Override
+        public boolean matchesPrefixOf(CldrPath path) {
+            return path.getLength() >= elementMatchers.size() && matchRegion(path, 0);
+        }
+
+        private boolean matchRegion(CldrPath path, int offset) {
+            // offset is the path element corresponding to the "top most" element matcher; it
+            // must be in the range 0 ... (path.getLength() - elementMatchers.size()).
+            checkPositionIndex(offset, path.getLength() - elementMatchers.size());
+            // First jump over the path parents until we find the last matcher.
+            int matchPathLength = offset + elementMatchers.size();
+            while (path.getLength() > matchPathLength) {
+                path = path.getParent();
+            }
+            return matchForward(path, elementMatchers.size() - 1);
+        }
+
+        // Recurses up to the parent first, so element matchers are tested from the top-most
+        // element downwards, stopping at the first failure.
+        private boolean matchForward(CldrPath path, int matcherIndex) {
+            if (matcherIndex < 0) {
+                return true;
+            }
+            return matchForward(path.getParent(), matcherIndex - 1)
+                && elementMatchers.get(matcherIndex).test(path);
+        }
+    }
+
+    // Make a new, non-interned, unique instance here which we can test by reference to
+    // determine if the argument is to be captured (needed as ImmutableMap prohibits null).
+    // DO NOT change this code to assign "*" as the value directly, it MUST be a new instance.
+    private static final String WILDCARD = new String("*");
+
+    private static final Pattern ELEMENT_START_REGEX =
+        Pattern.compile("(\\*|[-:\\w]+)(?:/|\\[|$)");
+    private static final Pattern ATTRIBUTE_REGEX =
+        Pattern.compile("\\[@([-:\\w]+)=(?:\\*|\"([^\"]*)\")\\]");
+
+    // element := foo, foo[@bar="baz"], foo[@bar=*]
+    // pathspec := element{/element}*
+    private static List<Predicate<CldrPath>> parse(String pathSpec) {
+        List<Predicate<CldrPath>> specs = new ArrayList<>();
+        int pos = 0;
+        do {
+            pos = parse(pathSpec, pos, specs);
+        } while (pos >= 0);
+        return specs;
+    }
+
+    // Parses one element (with any attributes) starting at pos and appends its matcher to specs.
+    // Returns the next start index, or -1 if the end of the specification was reached. Any
+    // malformed input is reported as an IllegalArgumentException.
+    private static int parse(String pathSpec, int pos, List<Predicate<CldrPath>> specs) {
+        Matcher m = ELEMENT_START_REGEX.matcher(pathSpec).region(pos, pathSpec.length());
+        checkArgument(m.lookingAt(), "invalid path specification (index=%s): %s", pos, pathSpec);
+        String name = m.group(1);
+        Map<String, String> attributes = ImmutableMap.of();
+        // Continue from the end of the name (not the end of the match) so that any following
+        // '[' or '/' is still seen below.
+        pos = m.end(1);
+        if (pos < pathSpec.length() && pathSpec.charAt(pos) == '[') {
+            // We have attributes to add.
+            attributes = new LinkedHashMap<>();
+            do {
+                m = ATTRIBUTE_REGEX.matcher(pathSpec).region(pos, pathSpec.length());
+                checkArgument(m.lookingAt(),
+                    "invalid path specification (index=%s): %s", pos, pathSpec);
+                // Null if we matched the '*' wildcard.
+                String value = m.group(2);
+                attributes.put(m.group(1), value != null ? value : WILDCARD);
+                pos = m.end();
+            } while (pos < pathSpec.length() && pathSpec.charAt(pos) == '[');
+        }
+        // Wildcard matching is less efficient because attribute keys cannot be made in advance, so
+        // since it's also very rare, we special case it.
+        Predicate<CldrPath> matcher = name.equals(WILDCARD)
+            ? new WildcardElementMatcher(attributes)::match
+            : new ElementMatcher(name, attributes)::match;
+        specs.add(matcher);
+        if (pos == pathSpec.length()) {
+            return -1;
+        }
+        // Anything other than '/' here is trailing junk in the caller-supplied specification,
+        // i.e. a bad argument (not bad state), so report it like the other parse failures.
+        checkArgument(pathSpec.charAt(pos) == '/',
+            "invalid path specification (index=%s): %s", pos, pathSpec);
+        return pos + 1;
+    }
+
+    // Matcher for path elements like "foo[@bar=*]" where the name is known in advance.
+    private static final class ElementMatcher  {
+        private final String name;
+        private final ImmutableMap<AttributeKey, String> attributes;
+
+        private ElementMatcher(String name, Map<String, String> attributes) {
+            this.name = checkNotNull(name);
+            // The element name is fixed, so the attribute keys can be created once up front.
+            this.attributes = attributes.entrySet().stream()
+                .collect(toImmutableMap(e -> keyOf(name, e.getKey()), Entry::getValue));
+        }
+
+        boolean match(CldrPath path) {
+            if (!path.getName().equals(name)) {
+                return false;
+            }
+            for (Entry<AttributeKey, String> e : attributes.entrySet()) {
+                String actual = path.get(e.getKey());
+                if (actual == null) {
+                    return false;
+                }
+                String expected = e.getValue();
+                // DO NOT change this to use expected.equals(WILDCARD). The reference check is
+                // what distinguishes the wildcard from a literal "*" attribute value.
+                if (expected != WILDCARD && !expected.equals(actual)) {
+                    return false;
+                }
+            }
+            return true;
+        }
+    }
+
+    // Matcher for path elements like "*[@bar=*]", where the name isn't known until match time.
+    private static final class WildcardElementMatcher  {
+        private final ImmutableMap<String, String> attributes;
+
+        private WildcardElementMatcher(Map<String, String> attributes) {
+            this.attributes = ImmutableMap.copyOf(attributes);
+        }
+
+        private boolean match(CldrPath path) {
+            // The wildcard matcher never fails due to the element name but must create new key
+            // instances every time matching occurs (because the key name is dynamic). Since this
+            // is rare, it's worth making into a separate case.
+            for (Entry<String, String> attribute : attributes.entrySet()) {
+                String actual = path.get(keyOf(path.getName(), attribute.getKey()));
+                if (actual == null) {
+                    return false;
+                }
+                String expected = attribute.getValue();
+                // DO NOT change this to use expected.equals(WILDCARD). The reference check is
+                // what distinguishes the wildcard from a literal "*" attribute value.
+                if (expected != WILDCARD && !expected.equals(actual)) {
+                    return false;
+                }
+            }
+            return true;
+        }
+    }
+}
diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/PathValueTransformer.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/PathValueTransformer.java

new file mode 100644 (file)

index 0000000..d5075fa
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/PathValueTransformer.java
@@ -0,0 +1,130 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu;
+
+import static com.google.common.base.Preconditions.checkNotNull;
+
+import java.util.function.Function;
+
+import org.unicode.cldr.api.CldrPath;
+import org.unicode.cldr.api.CldrValue;
+
+import com.google.common.collect.ImmutableList;
+
+/**
+ * API for transforming CLDR path/value pairs. Transformed results support grouping by their key
+ * and the ability to generate default "fallback" values to account for missing values in a group.
+ *
+ * <p>To transform some set of CLDR path/values:
+ * <ol>
+ * <li>Transform all desired path/value pairs into a set of matched results, discarding duplicates
+ * (see {@link #transform(CldrValue)}).
+ * <li>Group the results by key (e.g. into a {@code ListMultimap}).
+ * <li>For each group, add any fallback values which don't yet exist for that key (see
+ * {@link #getFallbackResultsFor(RbPath, DynamicVars)} and {@link Result#isFallbackFor(Result)}).
+ * <li>Sort elements within each group and flatten result values (see {@link Result#isGrouped()}).
+ * </ol>
+ *
+ * <p>For each unique key, this should yield a correctly ordered sequence of values (according to
+ * the semantics of the chosen transformer implementation).
+ */
+public abstract class PathValueTransformer {
+    /**
+     * A result either obtained by transforming a path/value pair, or as a potential fallback for
+     * some known key (see {@link PathValueTransformer#transform(CldrValue)} and
+     * {@link PathValueTransformer#getFallbackResultsFor(RbPath, DynamicVars)}).
+     */
+    public abstract static class Result implements Comparable<Result> {
+        private final RbPath key;
+
+        /** Creates a result for the given (non-null) key. */
+        protected Result(RbPath key) {
+            this.key = checkNotNull(key);
+        }
+
+        /**
+         * Returns the key of this result, used to group results and determine fallback values
+         * according to the semantics of the chosen transformer.
+         */
+        public RbPath getKey() {
+            return key;
+        }
+
+        /**
+         * Returns whether the values in this result should be grouped or not. Un-grouped values
+         * should be considered as individual values in a sequence and might be joined with values
+         * from other results in the same group. Grouped values cannot be split and must appear
+         * as a single value.
+         *
+         * <p>For example for the ordered results:
+         * <pre>
+         * Result X = { key=K, values=[ "a", "b" ], grouped=false }
+         * Result Y = { key=K, values=[ "c", "d" ], grouped=false }
+         * Result Z = { key=K, values=[ "e" ], grouped=false }
+         * </pre>
+         * the values for key {@code K} are conceptually {@code [ "a", "b", "c", "d", "e" ]}.
+         *
+         * <p>However if result {@code Y} has {@code grouped=true} then there are now 4 values
+         * {@code [ "a", "b", ["c", "d"], "e" ]}, and if {@code X} is also grouped, then it is
+         * {@code [ ["a", "b"], ["c", "d"], "e" ]}, producing only 3 top-level values.
+         */
+        public abstract boolean isGrouped();
+
+        /**
+         * Returns the transformed values of this result, which may or may not be grouped
+         * according to {@link #isGrouped()}.
+         */
+        public abstract ImmutableList<String> getValues();
+
+        /**
+         * Returns whether this result is a fallback for some existing matched result. A fallback
+         * result should only be used when it is not a fallback for any existing result.
+         */
+        public abstract boolean isFallbackFor(Result r);
+
+        /** Debug only string representation. */
+        @Override
+        public final String toString() {
+            return "Result{ key='" + getKey()
+                + "', grouped=" + isGrouped()
+                + ", values=" + getValues() + " }";
+        }
+    }
+
+    /**
+     * A "typedef" for the function to do late binding of dynamic variables. This is used for edge
+     * cases where a %N variable in the rules config is bound to a CLDR path (e.g. "//foo/bar")
+     * which cannot be resolved until the rule is evaluated. Unfortunately the need to support late
+     * binding of variables incurs significant additional complexity in the code, despite being
+     * used in exactly one situation so far (the '%D' variable to represent the default numbering
+     * scheme).
+     */
+    // TODO: Figure out how to get rid of all of this mess.
+    public interface DynamicVars extends Function<CldrPath, String> {}
+
+    /**
+     * Transforms a CLDR value into a sequence of results (empty if the value was not matched by
+     * any rule).
+     *
+     * @param cldrValue the value to transform.
+     * @return the transformed result(s).
+     */
+    public abstract ImmutableList<Result> transform(CldrValue cldrValue);
+
+    /**
+     * Transforms a CLDR value into a sequence of results (empty if the value was not matched by
+     * any rule). The dynamic variable function provides any "late bound" CLDR path variables to be
+     * resolved from CLDR data during processing (e.g. "%D=//ldml/numbers/defaultNumberingSystem").
+     *
+     * @param cldrValue the value to transform.
+     * @param varFn a function for resolving "late bound" variables.
+     * @return the transformed result(s).
+     */
+    public abstract ImmutableList<Result> transform(CldrValue cldrValue, DynamicVars varFn);
+
+    /**
+     * Returns a possibly empty sequence of fallback results for a given key. A fallback result for
+     * a key should be used only if it is not a fallback for any other result with that key; see
+     * also {@link Result#isFallbackFor(Result)}.
+     */
+    public abstract ImmutableList<Result> getFallbackResultsFor(RbPath key, DynamicVars varFn);
+}
diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/RbPath.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/RbPath.java

new file mode 100644 (file)

index 0000000..3af37b5
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/RbPath.java
@@ -0,0 +1,232 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu;
+
+import static com.google.common.base.CharMatcher.whitespace;
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkState;
+import static com.google.common.collect.ImmutableList.toImmutableList;
+
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.Objects;
+import java.util.function.Function;
+
+import com.google.common.base.CharMatcher;
+import com.google.common.base.Splitter;
+import com.google.common.collect.Comparators;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Iterables;
+
+/**
+ * A resource bundle path, used to identify entries in ICU data.
+ *
+ * <p>Immutable and thread safe.
+ */
+public final class RbPath implements Comparable<RbPath> {
+    private static final Splitter PATH_SPLITTER = Splitter.on('/').trimResults();
+
+    // This defines ordering of paths in IcuData instances and thus the order in ICU data files.
+    // If there's ever a reason to have a different "natural" order for paths, this Comparator
+    // should be moved into the ICU file writer class(es).
+    private static final Comparator<RbPath> ORDERING =
+        Comparator.comparing(
+            p -> p.segments,
+            Comparators.lexicographical(Comparator.<String>naturalOrder()));
+
+    // Matches the definition of invariant characters in "uinvchar.cpp". We can make this all much
+    // faster if needed with a custom matcher (it's just a 128 way bit lookup via 2 longs).
+    private static final CharMatcher INVARIANT_CHARS =
+        CharMatcher.ascii().and(CharMatcher.anyOf("!#$@[\\]^`{|}~").negate());
+
+    // Note that we must also prohibit double-quote from appearing anywhere other than surrounding
+    // segment values. This is because some segment values can contain special ICU data characters
+    // (e.g. ':') but must be treated as literals. There is no proper "escaping" mechanism in ICU
+    // data for key values (since '\' is not an invariant, things like \\uxxxx are not possible).
+    //
+    // Ideally quoting would be done when the file is written, but that would require additional
+    // complexity in RbPath, since suffixes like ":intvector" must not be quoted and must somehow
+    // be distinguished from timezone "metazone" names which also contain ':'.
+    private static final CharMatcher QUOTED_SEGMENT_CHARS =
+        INVARIANT_CHARS
+            .and(CharMatcher.javaIsoControl().negate())
+            .and(CharMatcher.isNot('"'));
+    private static final CharMatcher UNQUOTED_SEGMENT_CHARS =
+        QUOTED_SEGMENT_CHARS.and(whitespace().negate());
+
+    // Characters allowed in path segments which separate the "base name" from any suffix (e.g.
+    // the base name of "Foo:intvector" is "Foo").
+    private static final CharMatcher SEGMENT_SEPARATORS = CharMatcher.anyOf("%:");
+
+    private static final RbPath EMPTY = new RbPath(ImmutableList.of());
+
+    public static RbPath empty() {
+        return EMPTY;
+    }
+
+    public static RbPath of(String... segments) {
+        return of(Arrays.asList(segments));
+    }
+
+    public static RbPath of(Iterable<String> segments) {
+        return new RbPath(segments);
+    }
+
+    public static RbPath parse(String path) {
+        checkArgument(!path.isEmpty(), "cannot parse an empty path string");
+        // Allow leading '/', but don't allow empty segments anywhere else.
+        if (path.startsWith("/")) {
+            path = path.substring(1);
+        }
+        return new RbPath(PATH_SPLITTER.split(path));
+    }
+
+    static int getCommonPrefixLength(RbPath lhs, RbPath rhs) {
+        int maxLength = Math.min(lhs.length(), rhs.length());
+        int n = 0;
+        while (n < maxLength && lhs.getSegment(n).equals(rhs.getSegment(n))) {
+            n++;
+        }
+        return n;
+    }
+
+    private final ImmutableList<String> segments;
+    private final int hashCode;
+
+    private RbPath(Iterable<String> segments) {
+        this.segments = ImmutableList.copyOf(segments);
+        this.hashCode = Objects.hash(this.segments);
+        for (String segment : this.segments) {
+            checkArgument(!segment.isEmpty(),
+                "empty path segments not permitted: %s", this.segments);
+            // Either the label is quoted (e.g. "foo") or it is bare (e.g. foo) but it can only
+            // contain double quotes at either end, or not at all. If the string is quoted, only
+            // validate the content, and not the quotes themselves.
+            switch (segment.charAt(0)) {
+            case '<':
+                // Allow anything in hidden labels, since they will be removed later and never
+                // appear in the final ICU data.
+                checkArgument(segment.endsWith(">"),
+                    "mismatched quoting for hidden label: %s", segment);
+                continue;
+
+            case '"':
+                // A lone '"' both starts and ends with a quote, so also require length > 1.
+                checkArgument(segment.length() > 1 && segment.endsWith("\""),
+                    "mismatched quoting for segment: %s", segment);
+                checkArgument(
+                    QUOTED_SEGMENT_CHARS.matchesAllOf(segment.substring(1, segment.length() - 1)),
+                    "invalid character in quoted resource bundle path segment: %s", segment);
+                break;
+
+            default:
+                checkArgument(
+                    UNQUOTED_SEGMENT_CHARS.matchesAllOf(segment),
+                    "invalid character in unquoted resource bundle path segment: %s", segment);
+                break;
+            }
+        }
+    }
+
+    public int length() {
+        return segments.size();
+    }
+
+    public String getSegment(int n) {
+        return segments.get(n);
+    }
+
+    public RbPath getParent() {
+        checkState(length() > 0, "cannot get parent of the empty path");
+        return length() > 1 ? new RbPath(segments.subList(0, length() - 1)) : EMPTY;
+    }
+
+    public boolean isAnonymous() {
+        return length() > 0 && segments.get(length() - 1).charAt(0) == '<';
+    }
+
+    public RbPath extendBy(String... parts) {
+        return new RbPath(Iterables.concat(segments, Arrays.asList(parts)));
+    }
+
+    public RbPath extendBy(RbPath suffix) {
+        return new RbPath(Iterables.concat(segments, suffix.segments));
+    }
+
+    public RbPath mapSegments(Function<? super String, String> fn) {
+        return new RbPath(segments.stream().map(fn).collect(toImmutableList()));
+    }
+
+    /**
+     * Returns whether the first element of this path is prefixed by the given "base name".
+     *
+     * <p>Resource bundle paths relating to semantically similar data are typically grouped by the
+     * same first path element. This is not as simple as just comparing the first element, as in
+     * {@code path.startsWith(prefix)} however, since path elements can have suffixes, such as
+     * {@code "Foo:alias"} or {@code "Foo%subtype"}.
+     *
+     * @param baseName the base name to test for.
+     * @return true if the "base name" of the first path element is the given prefix.
+     */
+    public boolean hasPrefix(String baseName) {
+        checkArgument(!baseName.isEmpty() && SEGMENT_SEPARATORS.matchesNoneOf(baseName));
+        if (length() == 0) {
+            return false;
+        }
+        String firstElement = getSegment(0);
+        // Slightly subtle (but safe) access to the separator character, since:
+        // (!a.equals(b) && a.startsWith(b)) ==> a.length() > b.length().
+        return firstElement.equals(baseName)
+            || (firstElement.startsWith(baseName)
+                && SEGMENT_SEPARATORS.matches(firstElement.charAt(baseName.length())));
+    }
+
+    public boolean startsWith(RbPath prefix) {
+        return prefix.length() <= length() && matchesSublist(prefix, 0);
+    }
+
+    public boolean endsWith(RbPath suffix) {
+        return suffix.length() <= length() && matchesSublist(suffix, length() - suffix.length());
+    }
+
+    public boolean contains(RbPath path) {
+        int maxOffset = length() - path.length();
+        for (int i = 0; i <= maxOffset; i++) {
+            if (matchesSublist(path, i)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    // Assume length check has been done.
+    private boolean matchesSublist(RbPath path, int offset) {
+        for (int i = 0; i < path.length(); i++) {
+            if (!path.getSegment(i).equals(getSegment(i + offset))) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    boolean isIntPath() {
+        String lastElement = segments.get(segments.size() - 1);
+        return lastElement.endsWith(":int") || lastElement.endsWith(":intvector");
+    }
+
+    @Override public int compareTo(RbPath other) {
+        return ORDERING.compare(this, other);
+    }
+
+    @Override public boolean equals(Object other) {
+        return (other instanceof RbPath) && segments.equals(((RbPath) other).segments);
+    }
+
+    @Override public int hashCode() {
+        return hashCode;
+    }
+
+    @Override public String toString() {
+        return String.join("/", segments);
+    }
+}
diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/RbValue.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/RbValue.java

new file mode 100644 (file)

index 0000000..84751d4
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/RbValue.java
@@ -0,0 +1,58 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu;
+
+import static com.google.common.base.Preconditions.checkArgument;
+
+import java.util.Arrays;
+import java.util.Objects;
+import java.util.function.Function;
+
+import com.google.common.collect.ImmutableList;
+
+/**
+ * A resource bundle value containing a sequence of elements. This is a very thin wrapper over an
+ * immutable list, with a few additional constraints (e.g. cannot be empty).
+ *
+ * <p>Immutable and thread safe.
+ */
+public final class RbValue {
+    private final ImmutableList<String> elements;
+
+    /** Returns a resource bundle value of the given elements. */
+    public static RbValue of(String... elements) {
+        return of(Arrays.asList(elements));
+    }
+
+    /** Returns a resource bundle value of the given elements. */
+    public static RbValue of(Iterable<String> elements) {
+        return new RbValue(elements);
+    }
+
+    private RbValue(Iterable<String> elements) {
+        this.elements = ImmutableList.copyOf(elements);
+        checkArgument(!this.elements.isEmpty(), "Resource bundle values cannot be empty");
+    }
+
+    /** Returns the (non zero) number of elements in this value. */
+    public int size() {
+        return elements.size();
+    }
+
+    /** Returns the Nth element of this value. */
+    public String getElement(int n) {
+        return elements.get(n);
+    }
+
+    @Override public int hashCode() {
+        return Objects.hashCode(elements);
+    }
+
+    @Override public boolean equals(Object obj) {
+        return obj instanceof  RbValue && elements.equals(((RbValue) obj).elements);
+    }
+
+    @Override public String toString() {
+        return elements.toString();
+    }
+}
diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/SupplementalData.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/SupplementalData.java

new file mode 100644 (file)

index 0000000..954ebe0
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/SupplementalData.java
@@ -0,0 +1,593 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu;
+
+import static com.google.common.base.CharMatcher.whitespace;
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkNotNull;
+import static com.google.common.base.Preconditions.checkState;
+import static com.google.common.collect.ImmutableMap.toImmutableMap;
+import static java.util.function.Function.identity;
+import static org.unicode.cldr.api.AttributeKey.keyOf;
+import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.function.Function;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Stream;
+
+import org.unicode.cldr.api.AttributeKey;
+import org.unicode.cldr.api.CldrData;
+
+import com.google.common.base.Ascii;
+import com.google.common.base.Splitter;
+import com.google.common.base.Strings;
+import com.google.common.collect.HashBasedTable;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.ImmutableTable;
+import com.google.common.collect.Table;
+
+/**
+ * Auxiliary APIs for processing locale IDs and other supplemental data needed by business logic
+ * in some mapper classes.
+ *
+ * When a {@link SupplementalData} instance is used in a mapper class, it is imperative that it is
+ * built using the same underlying CLDR data. The only reason mapper classes do not create their
+ * own instances directly is the relative cost of processing all the supplemental data each time.
+ */
+// TODO: This should be moved into the API and leverage some of the existing utility functions.
+public final class SupplementalData {
+    private static final Pattern SCRIPT_SUBTAG = Pattern.compile("[A-Z][a-z]{3}");
+
+    private static final PathMatcher ALIAS =
+        PathMatcher.of("supplementalData/metadata/alias/*[@type=*]");
+
+    private static final PathMatcher PARENT_LOCALE =
+        PathMatcher.of("supplementalData/parentLocales/parentLocale[@parent=*]");
+    private static final AttributeKey PARENT = keyOf("parentLocale", "parent");
+    private static final AttributeKey LOCALES = keyOf("parentLocale", "locales");
+
+    private static final PathMatcher CALENDER_PREFERENCE =
+        PathMatcher.of("supplementalData/calendarPreferenceData/calendarPreference[@territories=*]");
+    private static final AttributeKey CALENDER_TERRITORIES =
+        keyOf("calendarPreference", "territories");
+    private static final AttributeKey CALENDER_ORDERING =
+        keyOf("calendarPreference", "ordering");
+
+    private static final PathMatcher LIKELY_SUBTAGS =
+        PathMatcher.of("supplementalData/likelySubtags/likelySubtag[@from=*]");
+    private static final AttributeKey SUBTAG_FROM = keyOf("likelySubtag", "from");
+    private static final AttributeKey SUBTAG_TO = keyOf("likelySubtag", "to");
+
+    private static final Splitter LIST_SPLITTER =
+        Splitter.on(whitespace()).omitEmptyStrings();
+
+    // Aliases come in three flavours. Note that the TERRITORY aliases map to a _list_ rather than
+    // a single value (it's structurally always a list, but only territory aliases have a need for
+    // more than one value).
+    private enum Alias {
+        LANGUAGE, SCRIPT, TERRITORY;
+
+        private static final ImmutableMap<String, Alias> TYPE_MAP =
+            Arrays.stream(values())
+                .collect(toImmutableMap(a -> Ascii.toLowerCase(a.name()) + "Alias", identity()));
+
+        private final String elementName = Ascii.toLowerCase(name()) + "Alias";
+        final AttributeKey typeKey = AttributeKey.keyOf(elementName, "type");
+        final AttributeKey replacementKey = AttributeKey.keyOf(elementName, "replacement");
+
+        static Optional<Alias> forElementName(String name) {
+            return Optional.ofNullable(TYPE_MAP.get(name));
+        }
+    }
+
+    /**
+     * Creates a supplemental data API instance from the given CLDR data.
+     *
+     * @param supplementalData the raw CLDR supplemental data instance.
+     * @return the supplemental data API.
+     */
+    static SupplementalData create(CldrData supplementalData) {
+        Table<Alias, String, String> aliasTable = HashBasedTable.create();
+        Map<String, String> parentLocaleMap = new HashMap<>();
+        Map<String, String> defaultCalendarMap = new HashMap<>();
+        Map<String, String> likelySubtagMap = new HashMap<>();
+
+        supplementalData.accept(
+            ARBITRARY,
+            v -> {
+                if (ALIAS.matches(v.getPath())) {
+                    // Territory alias replacements can be a list of values (e.g. when countries
+                    // break up). We use the first (geo-politically most significant) value. This
+                    // doesn't happen for languages or scripts, but could in theory.
+                    Alias.forElementName(v.getPath().getName()).ifPresent(
+                        alias -> aliasTable.put(
+                            alias,
+                            alias.typeKey.valueFrom(v),
+                            alias.replacementKey.valueFrom(v)));
+                } else if (PARENT_LOCALE.matches(v.getPath())) {
+                    String p = PARENT.valueFrom(v);
+                    LOCALES.listOfValuesFrom(v).forEach(c -> parentLocaleMap.put(c, p));
+                } else if (CALENDER_PREFERENCE.matches(v.getPath())) {
+                    String c = CALENDER_ORDERING.listOfValuesFrom(v).get(0);
+                    CALENDER_TERRITORIES.listOfValuesFrom(v).forEach(t -> defaultCalendarMap.put(t, c));
+                } else if (LIKELY_SUBTAGS.matches(v.getPath())) {
+                    likelySubtagMap.put(SUBTAG_FROM.valueFrom(v), SUBTAG_TO.valueFrom(v));
+                }
+            });
+
+        // WARNING: The original mapper code determines the full set of deprecated territories and
+        // then removes the following hard-coded list without any explanation as to why. While this
+        // is presumably to "undeprecate" them for the purposes of the locale processing, there's
+        // no explanation of where this list comes from, and thus no way to maintain it.
+        //
+        // asList("062", "172", "200", "830", "AN", "CS", "QU")
+        //     .forEach(t -> aliasTable.remove(Alias.TERRITORY, t));
+        // TODO: Understand and document what on Earth this is all about or delete this comment.
+
+        return new SupplementalData(
+            aliasTable, parentLocaleMap, defaultCalendarMap, likelySubtagMap);
+    }
+
+    // A simple-as-possible, mutable, locale ID data "struct" to handle the IDs used during ICU
+    // data generation. Because this is mutable, it is thoroughly unsuitable for general use.
+    private static final class LocaleId {
+        // From: https://unicode.org/reports/tr35/#Identifiers
+        // Locale ID is:
+        //   (<language>(_<script>)?|<script>)(_<region>)?(_<variant>)*
+        //
+        // However in CLDR data, there's always a language (even if it's "und"), and never more
+        // than one variant, so this can be simplified to:
+        //   <language>(_<script>)?(_<region>)?(_<variant>)?
+        //
+        // * Required language is lowercase 2 or 3 letter language ID (e.g. "en", "gsw").
+        //   Note that the specification allows for languages 5-8 characters long, but in reality
+        //   this has never occurred yet, so it's ignored in this code.
+        //
+        // * Script is 4-letter Xxxx script identifier (e.g. "Latn").
+        //   The specification permits any casing for script subtags, but since all the data uses
+        //   the capitalized "Xxxx" form, that's what this code expects.
+        //
+        // * Region is the uppercase 2-letter CLDR region code ("GB") or the 3-digit numeric
+        //   identifier (e.g. "001").
+        //
+        // * Variants are a bit complex; either 5-8 length alphanumerics, or length 4 but starting
+        //   with a digit (this avoids any ambiguity with script subtags). However because ICU
+        //   violates this rule by using "TRADITIONAL" (11-letters) the length restriction is
+        //   merely "5 or longer".
+        //
+        // Finally, CLDR data only uses an '_' as the separator, whereas the specification allows
+        // for either '-' or '_'.
+        //
+        // The regex for unambiguously capturing the parts of a locale ID from the CLDR data is:
+        private static final Pattern LOCALE_ID =
+            Pattern.compile("([a-z]{2,3})"
+                + "(?:_([A-Z][a-z]{3}))?"
+                + "(?:_([A-Z]{2}|[0-9]{3}))?"
+                + "(?:_([a-zA-Z]{5,}|[0-9][a-zA-Z0-9]{3}))?");
+
+        static LocaleId parse(String localeId) {
+            Matcher m = LOCALE_ID.matcher(checkNotNull(localeId, "locale ID cannot be null"));
+            checkArgument(m.matches(), "invalid locale ID: %s", localeId);
+            return of(m.group(1), m.group(2), m.group(3)).setVariant(m.group(4));
+        }
+
+        static LocaleId of(String language, String script, String region) {
+            return new LocaleId().setLanguage(language).setScript(script).setRegion(region);
+        }
+
+        // Only the language subtag is non-nullable.
+        private String languageSubtag;
+        private String scriptSubtag;
+        private String regionSubtag;
+        private String variantSubtag;
+
+        String getLanguage() {
+            return languageSubtag;
+        }
+
+        String getScript() {
+            return scriptSubtag;
+        }
+
+        String getRegion() {
+            return regionSubtag;
+        }
+
+        String getVariant() {
+            return variantSubtag;
+        }
+
+        LocaleId setLanguage(String languageSubtag) {
+            checkNotNull(languageSubtag, "language subtag must not be null");
+            checkArgument(!languageSubtag.isEmpty(), "language subtag must not be empty");
+            this.languageSubtag = languageSubtag;
+            return this;
+        }
+
+        LocaleId setScript(String scriptSubtag) {
+            this.scriptSubtag = Strings.emptyToNull(scriptSubtag);
+            return this;
+        }
+
+        LocaleId setRegion(String regionSubtag) {
+            this.regionSubtag = Strings.emptyToNull(regionSubtag);
+            return this;
+        }
+
+        LocaleId setVariant(String variantSubtag) {
+            this.variantSubtag = Strings.emptyToNull(variantSubtag);
+            return this;
+        }
+
+        @Override public String toString() {
+            StringBuilder id = new StringBuilder(languageSubtag);
+            if (scriptSubtag != null) {
+                id.append("_").append(scriptSubtag);
+            }
+            if (regionSubtag != null) {
+                id.append("_").append(regionSubtag);
+            }
+            if (variantSubtag != null) {
+                id.append("_").append(variantSubtag);
+            }
+            return id.toString();
+        }
+
+        @Override public boolean equals(Object o) {
+            if (!(o instanceof LocaleId)) {
+                return false;
+            }
+            LocaleId other = (LocaleId) o;
+            return Objects.equals(languageSubtag, other.languageSubtag)
+                && Objects.equals(scriptSubtag, other.scriptSubtag)
+                && Objects.equals(regionSubtag, other.regionSubtag)
+                && Objects.equals(variantSubtag, other.variantSubtag);
+        }
+
+        @Override public int hashCode() {
+            return Objects.hash(languageSubtag, scriptSubtag, regionSubtag, variantSubtag);
+        }
+    }
+
+    private final ImmutableTable<Alias, String, String> aliasTable;
+    private final ImmutableMap<String, String> parentLocaleMap;
+    private final ImmutableMap<String, String> defaultCalendarMap;
+    private final ImmutableMap<String, String> likelySubtagMap;
+
+    private SupplementalData(
+        Table<Alias, String, String> aliasTable,
+        Map<String, String> parentLocaleMap,
+        Map<String, String> defaultCalendarMap,
+        Map<String, String> likelySubtagMap) {
+        this.aliasTable = ImmutableTable.copyOf(aliasTable);
+        this.parentLocaleMap = ImmutableMap.copyOf(parentLocaleMap);
+        this.defaultCalendarMap = ImmutableMap.copyOf(defaultCalendarMap);
+        this.likelySubtagMap = ImmutableMap.copyOf(likelySubtagMap);
+    }
+
+    /**
+     * Returns the "maximized" form of a given locale ID, by adding likely subtags where possible.
+     */
+    public Optional<String> maximize(String localeId) {
+        return addLikelySubtags(localeId).map(Object::toString);
+    }
+
+    /**
+     * Returns the locale ID with any deprecated elements replaced. This is an
+     * implementation of the algorithm specified in
+     * <a href="http://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers">the LDML
+     * specification</a> but without any "minimizing" of the final result (as happens for
+     * canonicalization in the CLDR tools).
+     */
+    public String replaceDeprecatedTags(String localeId) {
+        if (localeId.equals("root")) {
+            return localeId;
+        }
+        LocaleId id = LocaleId.parse(localeId);
+
+        // ---- LDML Specification ----
+        // If the region subtag matches the type attribute of a territoryAlias element in
+        // Supplemental Data, replace the region subtag with the replacement value, as follows:
+        //
+        // * If there is a single territory in the replacement, use it.
+        // * If there are multiple territories:
+        //   * Look up the most likely territory for the base language code (and script, if there
+        //     is one).
+        //   * If that likely territory is in the list, use it.
+        //   * Otherwise, use the first territory in the list.
+        // ----
+        // However there is a footnote that says:
+        //   Formally, replacement of multiple territories uses Section 4.3 Likely Subtags.
+        //   However, there are a small number of cases of multiple territories, so the mappings
+        //   can be precomputed. This results in a faster lookup with a very small subset of the
+        //   likely subtags data.
+        //
+        // Note that (contrary to the order implied by the LDML specification) this step is
+        // performed _before_ the language alias lookup. This is to allow IDs such as "sr_YU" to
+        // work, where "YU" should be replaced with "RS" and _then_ "sr_RS" is expanded to
+        // "sr_Cyrl_RS" by the language alias lookup. In the other order, you just get "sr_RS" out.
+        //
+        // TODO: Can we simplify this by just using "addLikelySubtags()" when region is missing?
+        if (id.getRegion() != null) {
+            String replacementRegions = aliasTable.get(Alias.TERRITORY, id.getRegion());
+            if (replacementRegions != null) {
+                List<String> regions = LIST_SPLITTER.splitToList(replacementRegions);
+                checkArgument(!regions.isEmpty(), "invalid empty region list for %s", localeId);
+                if (regions.size() == 1) {
+                    id.setRegion(regions.get(0));
+                } else {
+                    LocaleId key = LocaleId.of(id.getLanguage(), id.getScript(), null);
+                    String likelyId = likelySubtagMap.get(key.toString());
+                    if (likelyId == null) {
+                        likelyId = likelySubtagMap.get(key.setScript(null).toString());
+                    }
+                    String likelyRegion =
+                        likelyId != null ? LocaleId.parse(likelyId).getRegion() : null;
+                    if (regions.contains(likelyRegion)) {
+                        id.setRegion(likelyRegion);
+                    } else {
+                        id.setRegion(regions.get(0));
+                    }
+                }
+            }
+        }
+
+        // While it's not mentioned in the LDML specification, there is data in the alias table for
+        // replacement scripts (currently it contains exactly one entry with one value). Because
+        // it's not clear if this is intended to only be single values or a list (and how to handle
+        // it if it were a list), there's a hard check to ensure it's only ever a single value.
+        if (id.getScript() != null) {
+            String replacementScript = aliasTable.get(Alias.SCRIPT, id.getScript());
+            if (replacementScript != null) {
+                checkArgument(whitespace().matchesNoneOf(replacementScript),
+                    "unexpected list of replacement scripts: %s", replacementScript);
+                id.setScript(replacementScript);
+            }
+        }
+
+        // ---- LDML Specification ----
+        // If the language subtag matches the type attribute of a languageAlias element in
+        // Supplemental Data, replace the language subtag with the replacement value.
+        //
+        // If there are additional subtags in the replacement value, add them to the result, but
+        // only if there is no corresponding subtag already in the tag.
+        // ----
+        // Contrary to the precise wording of the specification, we don't just check the language
+        // subtag, since language aliases can contain script and even region information. Instead
+        // we check the alias table using the same order as defined in subtag maximizing:
+        //
+        // <language>_<script>_<region>
+        // <language>_<region>
+        // <language>_<script>
+        // <language>
+        //
+        // There is no need to check for "und" however since that's not aliased to anything, but
+        // since it shares the same code it's harmless to do.
+        resolveLocaleId(id, s -> aliasTable.get(Alias.LANGUAGE, s))
+            .ifPresent(resolvedId -> {
+                id.setLanguage(checkNotNull(resolvedId.getLanguage(),
+                     "missing language subtag in language alias: %s", resolvedId));
+                if (id.getScript() == null) {
+                    id.setScript(resolvedId.getScript());
+                }
+                if (id.getRegion() == null) {
+                    id.setRegion(resolvedId.getRegion());
+                }
+                if (id.getVariant() == null) {
+                    id.setVariant(resolvedId.getVariant());
+                }
+            });
+        return id.toString();
+    }
+
+    /**
+     * Returns a suitable default calendar for a given locale if it's different from the default
+     * calendar inferred by the locale's parent.
+     *
+     * <p>Note that since the default calendar data is keyed from territory (region subtag) rather
+     * than the complete locale ID, it is impossible to encode some real life cases (e.g. the fact
+     * that "ja_JP_TRADITIONAL" has a different default calendar to "ja_JP"). This is currently
+     * handled with hard-coded special casing, but should probably be data driven eventually.
+     */
+    public Optional<String> getDefaultCalendar(String localeId) {
+        Optional<String> calendar = getSpecialCaseCalendar(localeId);
+        if (calendar.isPresent()) {
+            return calendar;
+        }
+        String t = territoryOf(localeId);
+        calendar = Optional.ofNullable(defaultCalendarMap.get(t));
+        if (!calendar.isPresent()) {
+            return Optional.empty();
+        }
+        String rootCalendar = defaultCalendarMap.get("001");
+        checkState(rootCalendar != null && !rootCalendar.isEmpty(), "missing root calendar");
+        if (localeId.equals("root")) {
+            return Optional.of(rootCalendar);
+        }
+        // All locales reach "root" eventually, and that maps to territory "001" which
+        // we already know has a value, so this loop *must* exit.
+        String parentCalendar;
+        do {
+            localeId = getParent(localeId);
+            String territory = territoryOf(localeId);
+            parentCalendar = defaultCalendarMap.get(territory);
+        } while (parentCalendar == null);
+        return parentCalendar.equals(calendar.get()) ? Optional.empty() : calendar;
+    }
+
+    // Hack to work around the limitation that CLDR data cannot represent default calendars that
+    // change because of non-territory information. Since this is limited to exactly two cases at
+    // the moment, and is unlikely to be expanded, it's being done directly in code.
+    private Optional<String> getSpecialCaseCalendar(String localeId) {
+        // Only a locale that can be maximized is eligible for a special case; a null here
+        // simply fails both comparisons below.
+        String maximizedId = maximize(localeId).orElse(null);
+        if ("ja_Jpan_JP_TRADITIONAL".equals(maximizedId)) {
+            return Optional.of("japanese");
+        }
+        if ("th_Thai_TH_TRADITIONAL".equals(maximizedId)) {
+            return Optional.of("buddhist");
+        }
+        return Optional.empty();
+    }
+
+    /**
+     * Returns the parent of a non-root locale ID. This is more complex than simple truncation for
+     * two reasons:
+     * <ul>
+     *     <li>There may be an explicit parent locale ID specified in the CLDR data.
+     *     <li>Removal of non-default script subtags makes the parent locale "root" (unless there
+     *         was an explicit parent specified).
+     * </ul>
+     * Note that all valid locale ID parent "chains" must end up at "root" eventually.
+     *
+     * For example (showing parent "chains"):
+     * <ul>
+     *     <li>{@code en_GB} --> {@code en_001} --> {@code en} --> {@code root}
+     *     <li>{@code en_Cyrl_RU} --> {@code en_Cyrl} --> {@code root}
+     * </ul>
+     *
+     * @throws IllegalArgumentException if the given locale ID is invalid or "root".
+     */
+    public String getParent(String localeId) {
+        checkState(!localeId.equals("root"), "cannot ask for parent of 'root' locale");
+        // Always defer to an explicit parent locale set in the CLDR data.
+        Optional<String> explicitParent = getExplicitParentLocaleOf(localeId);
+        if (explicitParent.isPresent()) {
+            return explicitParent.get();
+        }
+        // Now look for the start of the last ID "part" in order to truncate.
+        int lastPartSeperatorIndex = localeId.lastIndexOf('_');
+        // The parent of a base language ID (e.g. "en" or "fr") is always "root".
+        if (lastPartSeperatorIndex == -1) {
+            return "root";
+        }
+        String parentId = localeId.substring(0, lastPartSeperatorIndex);
+
+        // However, if the script of the locale is what's being truncated and it's NOT the default
+        // script for the language, return "root" as the parent rather than truncating.
+        String lastPart = localeId.substring(lastPartSeperatorIndex + 1);
+        if (SCRIPT_SUBTAG.matcher(lastPart).matches() && !lastPart.equals(scriptOf(parentId))) {
+            return "root";
+        }
+        return !parentId.isEmpty() ? parentId : "root";
+    }
+
+    /**
+     * Returns the explicit parent of a locale ID if specified in the CLDR data.
+     *
+     * Note that this method will not return a value for most locale IDs, since they do not have
+     * an explicit parent set. If you just want "normal" parent of a locale ID, use {@link
+     * #getParent(String)}.
+     */
+    public Optional<String> getExplicitParentLocaleOf(String localeId) {
+        return Optional.ofNullable(parentLocaleMap.get(localeId));
+    }
+
+    private String territoryOf(String localeId) {
+        // "root" is never maximized; it always maps to territory "001".
+        if (localeId.equals("root")) {
+            return "001";
+        }
+        // Fall back to "ZZ" when no region can be determined for the ID.
+        return addLikelySubtags(localeId).map(LocaleId::getRegion).orElse("ZZ");
+    }
+
+    private String scriptOf(String localeId) {
+        // Fall back to "Zzzz" when no script can be determined for the ID.
+        Optional<LocaleId> maximized = addLikelySubtags(localeId);
+        return maximized.map(LocaleId::getScript).orElse("Zzzz");
+    }
+
+    // From: https://unicode.org/reports/tr35/#Likely_Subtags
+    //
+    // Add Likely Subtags
+    // ------------------
+    // Given a source locale X, to return a locale Y where the empty subtags have been filled in
+    // by the most likely subtags. A subtag is called empty if it is a missing script or region
+    // subtag, or it is a base language subtag with the value "und".
+    //
+    // Canonicalize
+    // ------------
+    // Make sure the input locale is in canonical form ...
+    // ...
+    // Remove the script code 'Zzzz' and the region code 'ZZ' if they occur.
+    //
+    // Note that this implementation does not need to handle "grandfathered" tags.
+    private Optional<LocaleId> addLikelySubtags(String localeId) {
+        if (localeId.equals("root")) {
+            return Optional.empty();
+        }
+
+        LocaleId id = LocaleId.parse(localeId);
+        // LDML canonicalization: the script code 'Zzzz' and the region code 'ZZ' are removed
+        // if they occur.
+        if ("Zzzz".equals(id.getScript())) {
+            id.setScript(null);
+        }
+        if ("ZZ".equals(id.getRegion())) {
+            id.setRegion(null);
+        }
+        // LDML: a subtag is "empty" if it is a missing script or region subtag, or a base
+        // language subtag with the value "und".
+        boolean hasEmptySubtag =
+            id.getLanguage().equals("und") || id.getScript() == null || id.getRegion() == null;
+        if (!hasEmptySubtag) {
+            // Nothing to fill in, so the ID is returned as it is.
+            return Optional.of(id);
+        }
+        LocaleId subtags = resolveLocaleId(id, likelySubtagMap::get).orElse(null);
+        if (subtags == null) {
+            return Optional.empty();
+        }
+        checkArgument(!subtags.getLanguage().equals("und"), "invalid subtags: %s", subtags);
+        // Fill each empty element of the original ID from the likely subtags.
+        if (id.getLanguage().equals("und")) {
+            id.setLanguage(subtags.getLanguage());
+        }
+        if (id.getScript() == null) {
+            id.setScript(checkNotNull(subtags.getScript()));
+        }
+        if (id.getRegion() == null) {
+            id.setRegion(checkNotNull(subtags.getRegion()));
+        }
+        // At this point the language is not "und" and both script and region are set.
+        return Optional.of(id);
+    }
+
+    // From: https://unicode.org/reports/tr35/#Likely_Subtags
+    //
+    // Lookup
+    // ------
+    // Lookup each of the following in order, and stop on the first match:
+    // <language>_<script>_<region>
+    // <language>_<region>
+    // <language>_<script>
+    // <language>
+    // "und"_<script>
+    private Optional<LocaleId> resolveLocaleId(LocaleId id, Function<String, String> fn) {
+        String lang = id.getLanguage();
+        String script = id.getScript();
+        String region = id.getRegion();
+        // Candidates are collected in lookup priority order.
+        Stream.Builder<LocaleId> candidates = Stream.builder();
+        candidates.add(LocaleId.of(lang, script, region));
+        candidates.add(LocaleId.of(lang, null, region));
+        candidates.add(LocaleId.of(lang, script, null));
+        candidates.add(LocaleId.of(lang, null, null));
+        // "und"_<script> is only a candidate when a script exists; without one the lookup
+        // would maximize "und" on its own ("en_Latn_US"), which is not intended.
+        if (script != null) {
+            candidates.add(LocaleId.of("und", script, null));
+        }
+        // Duplicates are dropped (the first occurrence wins), then the first candidate for
+        // which the lookup function yields a non-null value is parsed and returned.
+        Optional<String> firstMatch = candidates.build()
+            .distinct()
+            .map(candidate -> candidate.toString())
+            .map(fn)
+            .filter(value -> value != null)
+            .findFirst();
+        return firstMatch.map(LocaleId::parse);
+    }
+}
diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/Bcp47Mapper.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/Bcp47Mapper.java

new file mode 100644 (file)

index 0000000..4d80a69
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/Bcp47Mapper.java
@@ -0,0 +1,246 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.mapper;
+
+import static com.google.common.base.Ascii.toLowerCase;
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkNotNull;
+import static com.google.common.base.Preconditions.checkState;
+import static org.unicode.cldr.api.AttributeKey.keyOf;
+import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY;
+import static org.unicode.cldr.api.CldrDataType.BCP47;
+
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Optional;
+import java.util.Set;
+
+import javax.annotation.Nullable;
+
+import org.unicode.cldr.api.AttributeKey;
+import org.unicode.cldr.api.CldrData.PrefixVisitor;
+import org.unicode.cldr.api.CldrData.ValueVisitor;
+import org.unicode.cldr.api.CldrDataSupplier;
+import org.unicode.cldr.api.CldrDataType;
+import org.unicode.cldr.api.CldrPath;
+import org.unicode.cldr.api.CldrValue;
+
+import com.google.common.base.Ascii;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Sets;
+import org.unicode.icu.tool.cldrtoicu.IcuData;
+import org.unicode.icu.tool.cldrtoicu.PathMatcher;
+import org.unicode.icu.tool.cldrtoicu.RbPath;
+
+/**
+ * A mapper to collect BCP-47 data from {@link CldrDataType#BCP47 BCP47} data under paths
+ * matching:
+ * <pre>{@code
+ *   //ldmlBCP47/keyword/key[@name=*]/type[@name=*]
+ * }</pre>
+ */
+public final class Bcp47Mapper {
+    // Other attributes (e.g. "alias") are value attributes and don't need to be matched here.
+    private static final PathMatcher KEY = PathMatcher.of("ldmlBCP47/keyword/key[@name=*]");
+    private static final AttributeKey KEY_NAME = keyOf("key", "name");
+    private static final AttributeKey KEY_ALIAS = keyOf("key", "alias");
+    private static final AttributeKey KEY_VALUE_TYPE = keyOf("key", "valueType");
+
+    private static final PathMatcher TYPE = PathMatcher.of("type[@name=*]");
+    private static final AttributeKey TYPE_NAME = keyOf("type", "name");
+    private static final AttributeKey TYPE_ALIASES = keyOf("type", "alias");
+    private static final AttributeKey PREFERRED_TYPE_NAME = keyOf("type", "preferred");
+
+    // Deprecation of the data is not the same as deprecation of attributes themselves. This
+    // deprecation relates to identifying data which exists, but is no longer the right way to
+    // represent things (which means it can be important for clients to know about).
+    private static final AttributeKey KEY_DEPRECATED = keyOf("key", "deprecated");
+    private static final AttributeKey TYPE_DEPRECATED = keyOf("type", "deprecated");
+
+    // Attributes that can be emitted under the /keyInfo or /typeInfo paths for auxiliary
+    // information in the ICU data. If the value is equal to the declared default, it is ignored.
+    // NOTE: The need for hard-coded default values is a hack because there's no nice way (yet)
+    // to determine the default for implicit values via the DTD. Ideally this would be automatic
+    // and the AttributeKey class would be able to have a method like "isDefault(String value)".
+    private static final ImmutableMap<AttributeKey, String> INFO_ATTRIBUTES =
+        ImmutableMap.of(KEY_VALUE_TYPE, "", KEY_DEPRECATED, "false", TYPE_DEPRECATED, "false");
+
+    private static final RbPath RB_KEYMAP = RbPath.of("keyMap");
+    private static final RbPath RB_TYPE_ALIAS = RbPath.of("typeAlias", "timezone:alias");
+    private static final RbPath RB_MAP_ALIAS = RbPath.of("typeMap", "timezone:alias");
+    private static final RbPath RB_BCP_ALIAS = RbPath.of("bcpTypeAlias", "tz:alias");
+
+    /**
+     * Processes data from the given supplier to generate Timezone and BCP-47 ICU data.
+     *
+     * <p>All BCP-47 paths are visited first, after which the collected key map values and the
+     * timezone alias entries are added to the "keyTypeData" instance.
+     *
+     * @param src the CLDR data supplier to process.
+     * @return A list of IcuData instances containing BCP-47 data to be written to files (the
+     *     "keyTypeData" instance followed by the "timezoneTypes" instance).
+     */
+    public static ImmutableList<IcuData> process(CldrDataSupplier src) {
+        Bcp47Visitor visitor = new Bcp47Visitor();
+        src.getDataForType(BCP47).accept(ARBITRARY, visitor);
+        visitor.addKeyMapValues();
+        return ImmutableList.of(visitor.keyTypeData.icuData, visitor.tzData.icuData);
+    }
+
+    // Outer visitor which handles "key" paths by installing sub-visitor methods to process
+    // each child "type" element. Depending on the key name, values are stored in different
+    // IcuData instances.
+    private static final class Bcp47Visitor implements PrefixVisitor {
+        private final ValueCollector tzData =
+            new ValueCollector(new IcuData("timezoneTypes", false));
+        private final ValueCollector keyTypeData =
+            new ValueCollector(new IcuData("keyTypeData", false));
+
+        // The current key name from the parent path element (set when a prefix is matched).
+        @Nullable private String keyName = null;
+        // A map collecting each key and values as they are visited.
+        // TODO: Convert this to a Map<RbPath, String> which involves removing the '@' prefix hack.
+        private Map<String, String> keyMap = new LinkedHashMap<>();
+
+        @Override
+        public void visitPrefixStart(CldrPath prefix, Context ctx) {
+            if (KEY.matches(prefix)) {
+                // Don't inline this since it also sets the field!!
+                keyName = Ascii.toLowerCase(KEY_NAME.valueFrom(prefix));
+
+                // How the data is visited is the same for both timezone and other BCP-47 data,
+                // it's just split into different data files, so we just install a different
+                // instance of the visitor class according to where the data in this sub-hierarchy
+                // should end up.
+                ctx.install(keyName.equals("tz") ? tzData : keyTypeData);
+            }
+        }
+
+        // Post processing to add additional captured attribute values and some special cases.
+        private void addKeyMapValues() {
+            IcuData keyData = keyTypeData.icuData;
+            // Add all the keyMap values into the IcuData file.
+            for (Entry<String, String> kmData : keyMap.entrySet()) {
+                String bcpKey = kmData.getKey();
+                String key = kmData.getValue();
+                if (bcpKey.startsWith("@")) {
+                    // Undoing the weird hack in addInfoAttributes(). This can be done better.
+                    // We use "parse()" because these are full paths, and not single elements.
+                    keyData.add(RbPath.parse(bcpKey.substring(1)), key);
+                    continue;
+                }
+                if (bcpKey.equals(key)) {
+                    // An empty value indicates that the BCP47 key is same as the legacy key.
+                    bcpKey = "";
+                }
+                keyData.add(RB_KEYMAP.extendBy(key), bcpKey);
+            }
+            // Add aliases for timezone data.
+            keyData.add(RB_TYPE_ALIAS, "/ICUDATA/timezoneTypes/typeAlias/timezone");
+            keyData.add(RB_MAP_ALIAS, "/ICUDATA/timezoneTypes/typeMap/timezone");
+            keyData.add(RB_BCP_ALIAS, "/ICUDATA/timezoneTypes/bcpTypeAlias/tz");
+        }
+
+        private final class ValueCollector implements ValueVisitor {
+            // Mutable ICU data collected into during visitation.
+            private final IcuData icuData;
+
+            ValueCollector(IcuData data) {
+                this.icuData = checkNotNull(data);
+            }
+
+            @Override
+            public void visit(CldrValue value) {
+                checkArgument(TYPE.matchesSuffixOf(value.getPath()),
+                    "unexpected child element: %s", value.getPath());
+                String typeName = TYPE_NAME.valueFrom(value);
+                // Note that if a "preferred" type exists, we treat the value specially and add
+                // it only as an alias. We expect values with a preferred replacement to
+                // always be explicitly deprecated.
+                Optional<String> prefName = PREFERRED_TYPE_NAME.optionalValueFrom(value);
+                if (prefName.isPresent()) {
+                    checkState(KEY_DEPRECATED.booleanValueFrom(value, false)
+                            || TYPE_DEPRECATED.booleanValueFrom(value, false),
+                        "unexpected 'preferred' attribute for non-deprecated value: %s", value);
+                    icuData.add(RbPath.of("bcpTypeAlias", keyName, typeName), prefName.get());
+                    return;
+                }
+                // Note: There are some deprecated values which don't have a preferred
+                // replacement and these will be processed below (in particular we need to emit
+                // the fact that they are deprecated).
+
+                // According to the old mapper code, it's an error not to have an alias, but
+                // it's emitted via debug logging and not actually enforced.
+                // TODO: Consider making this an error if possible.
+                String keyAlias = toLowerCase(KEY_ALIAS.valueFrom(value, keyName));
+
+                keyMap.put(keyName, keyAlias);
+                RbPath typeMapPrefix = RbPath.of("typeMap", keyAlias);
+                List<String> typeAliases = TYPE_ALIASES.listOfValuesFrom(value);
+                if (typeAliases.isEmpty()) {
+                    // Generate type map entry using empty value (an empty value indicates same
+                    // type name is used for both BCP47 and legacy type).
+                    icuData.add(typeMapPrefix.extendBy(typeName), "");
+                } else {
+                    String mainAlias = typeAliases.get(0);
+                    icuData.add(typeMapPrefix.extendBy(quoteAlias(mainAlias)), typeName);
+                    // Put additional aliases as secondary aliases referencing the main alias.
+                    RbPath typeAliasPrefix = RbPath.of("typeAlias", keyAlias);
+                    typeAliases.stream()
+                        .skip(1)
+                        .map(Bcp47Visitor::quoteAlias)
+                        .forEach(a -> icuData.add(typeAliasPrefix.extendBy(a), mainAlias));
+                }
+                addInfoAttributes(keyName, typeName, value.getValueAttributes());
+            }
+
+            // Add any additional attributes present to the attribute map. Note that this code was
+            // copied from largely undocumented code, and the precise reasoning for why this is
+            // needed or why it's done this way is not completely clear. It is very likely that it
+            // can be simplified.
+            //
+            // The '@' symbol added here is just a magic token that gets stripped off again in the
+            // addKeyMapValues() method, it appears to just be a way to distinguish keys added via
+            // this method vs during the visit method. A better approach might just be to have two
+            // maps.
+            // TODO: Remove the use of '@' and simplify the logic for "info" attributes (infoMap?).
+            private void addInfoAttributes(
+                String keyName, String typeName, ImmutableMap<AttributeKey, String> attributes) {
+                // Only emit deprecation for the "key" level, even if all types below that are also
+                // marked as deprecated. Only do this for a subset of attributes (INFO_ATTRIBUTES).
+                Set<AttributeKey> keys =
+                    Sets.intersection(attributes.keySet(), INFO_ATTRIBUTES.keySet());
+                for (AttributeKey a : keys) {
+                    String value = attributes.get(a);
+                    // Skip empty or default values in attributes.
+                    if (value.isEmpty() || INFO_ATTRIBUTES.get(a).equals(value)) {
+                        continue;
+                    }
+                    // The ID for the xxxInfo paths in ICU is the path fragment at which the
+                    // attribute exists. Since we only process complete paths here, we must do a
+                    // bit of reconstruction based on the element name of the attribute we are
+                    // processing. This relies on explicit knowledge that the paths are "<key>" or
+                    // "<key>/<type>". This all gets less messy if we switch to RbPath.
+                    String id =
+                        a.getElementName().equals("key") ? keyName : keyName + "/" + typeName;
+                    keyMap.put(
+                        "@" + a.getElementName() + "Info/" + a.getAttributeName() + "/" + id,
+                        value);
+                }
+            }
+        }
+
+        /**
+         * Escapes alias values containing '/' so they can appear in resource bundle paths. This
+         * function replaces '/' with ':' and quotes the result (e.g. foo/bar -> "foo:bar").
+         * Values which do not contain '/' are returned unchanged (and unquoted).
+         *
+         * <p>This is needed for timezone "metazone" ID strings which are of the form 'Foo/Bar'
+         * in the CLDR data.
+         */
+        // TODO: Switch to RbPath and do quoting automatically when ICU data is written out.
+        private static String quoteAlias(String str) {
+            return str.indexOf('/') == -1 ? str : '"' + str.replace('/', ':') + '"';
+        }
+    }
+
+    private Bcp47Mapper() {}
+}
diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/BreakIteratorMapper.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/BreakIteratorMapper.java

new file mode 100644 (file)

index 0000000..15a4f98
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/BreakIteratorMapper.java
@@ -0,0 +1,147 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.mapper;
+
+import static org.unicode.cldr.api.AttributeKey.keyOf;
+import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY;
+import static org.unicode.cldr.api.CldrData.PathOrder.DTD;
+import static org.unicode.cldr.api.CldrDataSupplier.CldrResolution.UNRESOLVED;
+
+import java.util.Optional;
+
+import org.unicode.cldr.api.AttributeKey;
+import org.unicode.cldr.api.CldrData;
+import org.unicode.cldr.api.CldrDataSupplier;
+import org.unicode.cldr.api.CldrDataType;
+import org.unicode.cldr.api.CldrPath;
+import org.unicode.cldr.api.CldrValue;
+
+import com.google.common.escape.UnicodeEscaper;
+import org.unicode.icu.tool.cldrtoicu.IcuData;
+import org.unicode.icu.tool.cldrtoicu.PathMatcher;
+import org.unicode.icu.tool.cldrtoicu.RbPath;
+
+/**
+ * A mapper to collect break-iterator data from {@link CldrDataType#LDML LDML} data under
+ * paths matching:
+ * <pre>{@code
+ *   //ldml/segmentations/segmentation/suppressions/suppression
+ *   //ldml/special/icu:breakIteratorData/...
+ * }</pre>
+ */
+// TODO: This class can almost certainly be replaced with a small RegexTransformer config.
+public final class BreakIteratorMapper {
+    // The "type" attribute is not required here, so cannot appear in the matcher.
+    private static final PathMatcher SUPPRESSION =
+        PathMatcher.of("ldml/segmentations/segmentation/suppressions/suppression");
+    private static final AttributeKey SEGMENTATION_TYPE = keyOf("segmentation", "type");
+
+    // Note: This could be done with an intermediate matcher for
+    // "ldml/special/icu:breakIteratorData" but there are so few "special" values it's not worth it
+    private static final PathMatcher BOUNDARIES =
+        PathMatcher.of("ldml/special/icu:breakIteratorData/icu:boundaries/*");
+    private static final PathMatcher DICTIONARY =
+        PathMatcher.of("ldml/special/icu:breakIteratorData/icu:dictionaries/icu:dictionary");
+
+    private static final AttributeKey DICTIONARY_DEP = keyOf("icu:dictionary", "icu:dependency");
+    private static final AttributeKey DICTIONARY_TYPE = keyOf("icu:dictionary", "type");
+
+    /**
+     * Processes data from the given supplier to generate break-iterator data for a single locale
+     * ID.
+     *
+     * <p>The additional ICU data (if present) is visited first, followed by the unresolved CLDR
+     * data for the locale.
+     *
+     * @param localeId the locale ID to generate data for.
+     * @param src the CLDR data supplier to process.
+     * @param icuSpecialData additional ICU data (in the "icu:" namespace)
+     * @return IcuData containing break-iterator data for the given locale ID.
+     * @throws IllegalArgumentException if a matched boundary or dictionary value in the ICU
+     *     special data has no "icu:dependency" attribute.
+     */
+    public static IcuData process(
+        String localeId, CldrDataSupplier src, Optional<CldrData> icuSpecialData) {
+
+        BreakIteratorMapper mapper = new BreakIteratorMapper(localeId);
+        icuSpecialData.ifPresent(s -> s.accept(ARBITRARY, mapper::addSpecials));
+        src.getDataForLocale(localeId, UNRESOLVED).accept(DTD, mapper::addSuppression);
+        return mapper.icuData;
+    }
+
+    // The per-locale ICU data being collected by this visitor.
+    private final IcuData icuData;
+
+    private BreakIteratorMapper(String localeId) {
+        this.icuData = new IcuData(localeId, true);
+    }
+
+    private void addSuppression(CldrValue v) {
+        if (SUPPRESSION.matches(v.getPath())) {
+            String type = SEGMENTATION_TYPE.valueFrom(v);
+            // TODO: Understand and document why we escape values here, but not for collation data.
+            icuData.add(
+                RbPath.of("exceptions", type + ":array"),
+                ESCAPE_NON_ASCII.escape(v.getValue()));
+        }
+    }
+
+    private void addSpecials(CldrValue v) {
+        CldrPath p = v.getPath();
+        if (BOUNDARIES.matches(p)) {
+            addDependency(
+                getDependencyName(v),
+                getBoundaryType(v),
+                getBoundaryDependency(v));
+        } else if (DICTIONARY.matches(p)) {
+            addDependency(
+                getDependencyName(v),
+                DICTIONARY_TYPE.valueFrom(v),
+                DICTIONARY_DEP.optionalValueFrom(v));
+        }
+    }
+
+    private void addDependency(String name, String type, Optional<String> dependency) {
+        icuData.add(
+            RbPath.of(name, type + ":process(dependency)"),
+            dependency.orElseThrow(() -> new IllegalArgumentException("missing dependency")));
+    }
+
+    // Must match the BOUNDARIES or DICTIONARY path.
+    private static String getDependencyName(CldrValue value) {
+        return stripXmlNamespace(value.getPath().getParent().getName());
+    }
+
+    // Must match the BOUNDARIES path.
+    private static String getBoundaryType(CldrValue value) {
+        String elementName = value.getPath().getName();
+        String type = stripXmlNamespace(elementName);
+        return keyOf(elementName, "alt")
+            .optionalValueFrom(value).map(a -> type + "_" + a).orElse(type);
+    }
+
+    // Must match the BOUNDARIES path.
+    private static Optional<String> getBoundaryDependency(CldrValue value) {
+        return keyOf(value.getPath().getName(), "icu:dependency").optionalValueFrom(value);
+    }
+
+    // Strips the first prefix of the form "xxx:" from a string (no-op if there is no ':').
+    private static String stripXmlNamespace(String s) {
+        return s.substring(s.indexOf(':') + 1);
+    }
+
+    /*
+     * Convert characters outside the range U+0020 to U+007F to Unicode escapes, and convert
+     * backslash to a double backslash. Code points up to U+FFFF are written as "\\uXXXX" and
+     * anything above that as "\\UXXXXXXXX" (shown here as Java string literals, so each escape
+     * starts with a single backslash in the output). This class is super slow for non-ASCII
+     * escaping due to using "String.format()", however there are < 100 values that need any
+     * escaping, so it's fine.
+     */
+    private static final UnicodeEscaper ESCAPE_NON_ASCII = new UnicodeEscaper() {
+        private final char[] DOUBLE_BACKSLASH = "\\\\".toCharArray();
+
+        @Override
+        protected char[] escape(int cp) {
+            // Returning null means "do not escape".
+            if (0x0020 <= cp && cp <= 0x007F) {
+                return cp == '\\' ? DOUBLE_BACKSLASH : null;
+            } else if (cp <= 0xFFFF) {
+                return String.format("\\u%04X", cp).toCharArray();
+            }
+            return String.format("\\U%08X", cp).toCharArray();
+        }
+    };
+}
diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/CollationMapper.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/CollationMapper.java

new file mode 100644 (file)

index 0000000..bf9f740
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/CollationMapper.java
@@ -0,0 +1,198 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.mapper;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static org.unicode.cldr.api.AttributeKey.keyOf;
+import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY;
+import static org.unicode.cldr.api.CldrDataSupplier.CldrResolution.UNRESOLVED;
+
+import java.util.Optional;
+
+import org.unicode.cldr.api.AttributeKey;
+import org.unicode.cldr.api.CldrData;
+import org.unicode.cldr.api.CldrData.PrefixVisitor;
+import org.unicode.cldr.api.CldrDataSupplier;
+import org.unicode.cldr.api.CldrDataType;
+import org.unicode.cldr.api.CldrPath;
+import org.unicode.cldr.api.CldrValue;
+
+import com.google.common.base.CharMatcher;
+import com.google.common.base.Splitter;
+import org.unicode.icu.tool.cldrtoicu.IcuData;
+import org.unicode.icu.tool.cldrtoicu.PathMatcher;
+import org.unicode.icu.tool.cldrtoicu.RbPath;
+import org.unicode.icu.tool.cldrtoicu.RbValue;
+
+/**
+ * A mapper to collect collation data from {@link CldrDataType#LDML LDML} data via the paths:
+ * <pre>{@code
+ *   //ldml/collations/*
+ *   //ldml/special/icu:UCARules
+ *   //ldml/special/icu:depends
+ * }</pre>
+ */
+public final class CollationMapper {
+    private static final PathMatcher COLLATIONS = PathMatcher.of("ldml/collations");
+
+    // Note that the 'type' attribute is optional, so cannot be in the path matcher.
+    // However since the CLDR data never actually omits the value, it would be easy to change the
+    // attribute metadata to stop it being an implicit attribute and then it could appear.
+    private static final PathMatcher COLLATION_RULE = PathMatcher.of("collation/cr");
+    private static final AttributeKey COLLATION_TYPE = keyOf("collation", "type");
+    private static final AttributeKey COLLATION_RULE_ALT = keyOf("cr", "alt");
+
+    private static final PathMatcher DEFAULT_COLLATION = PathMatcher.of("defaultCollation");
+
+    private static final PathMatcher SPECIAL = PathMatcher.of("ldml/special");
+    private static final AttributeKey SPECIAL_RULES = keyOf("icu:UCARules", "icu:uca_rules");
+    private static final AttributeKey SPECIAL_DEP = keyOf("icu:depends", "icu:dependency");
+
+    private static final RbPath RB_COLLATIONS_DEFAULT = RbPath.of("collations", "default");
+    private static final RbPath RB_STANDARD_SEQUENCE =
+        RbPath.of("collations", "standard", "Sequence");
+    private static final RbPath RB_STANDARD_VERSION =
+        RbPath.of("collations", "standard", "Version");
+
+    private static final Splitter LINE_SPLITTER =
+        Splitter.on('\n').trimResults().omitEmptyStrings();
+
+    /**
+     * Processes data from the given supplier to generate collation data for the given locale ID.
+     *
+     * @param localeId the locale ID to generate data for.
+     * @param src the CLDR data supplier to process.
+     * @param icuSpecialData additional ICU data (in the "icu:" namespace)
+     * @return IcuData containing collation data for the given locale ID.
+     */
+    public static IcuData process(
+        String localeId, CldrDataSupplier src, Optional<CldrData> icuSpecialData) {
+
+        CollationVisitor visitor = new CollationVisitor(localeId);
+        icuSpecialData.ifPresent(s -> s.accept(ARBITRARY, visitor));
+        src.getDataForLocale(localeId, UNRESOLVED).accept(ARBITRARY, visitor);
+        return visitor.icuData;
+    }
+
+    final static class CollationVisitor implements PrefixVisitor {
+        private final IcuData icuData;
+
+        CollationVisitor(String localeId) {
+            this.icuData = new IcuData(localeId, true);
+            // Super special hack case because the XML data is a bit broken for the root collation
+            // data (there's an empty <collation> element that's a non-leaf element and thus not
+            // visited, but we should add an empty sequence to the output data).
+            if (localeId.equals("root")) {
+                icuData.replace(RB_STANDARD_SEQUENCE, "");
+                // TODO: Collation versioning probably needs to be improved.
+                icuData.replace(RB_STANDARD_VERSION, CldrDataSupplier.getCldrVersionString());
+            }
+        }
+
+        @Override
+        public void visitPrefixStart(CldrPath prefix, Context ctx) {
+            if (COLLATIONS.matchesPrefixOf(prefix)) {
+                ctx.install(this::collectRules);
+            } else if (SPECIAL.matchesPrefixOf(prefix)) {
+                ctx.install(this::maybeAddSpecial);
+            }
+        }
+
+        private void collectRules(CldrValue v) {
+            CldrPath p = v.getPath();
+            if (COLLATION_RULE.matchesSuffixOf(p)) {
+                String type = COLLATION_TYPE.valueFrom(v);
+                RbPath rbPath = RbPath.of("collations", type, "Sequence");
+
+                // WARNING: This is almost certainly a bug, since while @alt can have the value
+                // "short" it can also have other values. This code was copied from the original
+                // CollationMapper, which has the line:
+                //   isShort = attr.getValue("alt") != null;
+                boolean isShort = COLLATION_RULE_ALT.optionalValueFrom(v).isPresent();
+
+                // Note that it's not clear why there's a check for "contains()" here. The code
+                // from which this was derived is largely undocumented and this check could have
+                // been overly defensive (perhaps a duplicate key should be an error?).
+                if (isShort || !icuData.contains(rbPath)) {
+                    RbValue rules = RbValue.of(
+                        LINE_SPLITTER.splitToList(v.getValue()).stream()
+                            .map(CollationMapper::removeComment)
+                            .filter(s -> !s.isEmpty())::iterator);
+                    icuData.replace(rbPath, rules);
+                    icuData.replace(
+                        RbPath.of("collations", type, "Version"),
+                        CldrDataSupplier.getCldrVersionString());
+                }
+            } else if (DEFAULT_COLLATION.matchesSuffixOf(p)) {
+                icuData.add(RB_COLLATIONS_DEFAULT, v.getValue());
+            }
+        }
+
+        // This is a bit special since the attribute we want to add depends on the element we are
+        // visiting (which is somewhat unusual in the transformation classes).
+        private void maybeAddSpecial(CldrValue value) {
+            AttributeKey key;
+            switch (value.getPath().getName()) {
+            case "icu:UCARules":
+                key = SPECIAL_RULES;
+                break;
+            case "icu:depends":
+                key = SPECIAL_DEP;
+                break;
+            default:
+                return;
+            }
+            // substring(4) just removes the "icu:" prefix (which we know is present in the key).
+            RbPath rbPath = RbPath.of(
+                String.format("%s:process(%s)",
+                    key.getElementName().substring(4), key.getAttributeName().substring(4)));
+            icuData.add(rbPath, key.valueFrom(value));
+        }
+    }
+
+    // Collation data can contain # to mark an end-of-line comment, but it can also contain data
+    // with # in it. In the latter case it must be in a single-quoted string (e.g. 'x#y'). However
+    // the precise semantics of the quoting rules are not particularly clear, so this method
+    // assumes that:
+    // * single quote (apostrophe) begins and ends quoting.
+    // * outside a quoted section, all characters are literal.
+    // * inside a quoted section, backslash '\' escapes any single character (e.g \a, \', \\)
+    private static String removeComment(String s) {
+        int i = findCommentStart(s);
+        if (i >= 0) {
+            s = CharMatcher.whitespace().trimTrailingFrom(s.substring(0, i));
+        }
+        return s;
+    }
+
+    // Returns the index of the first unquoted '#' in the string.
+    private static int findCommentStart(String s) {
+        boolean quoted = false;
+        for (int i = 0; i < s.length(); i++) {
+            switch (s.charAt(i)) {
+            case '\'':
+                quoted = !quoted;
+                break;
+
+            case '\\':
+                if (quoted) {
+                    i++;
+                }
+                break;
+
+            case '#':
+                if (!quoted) {
+                    return i;
+                }
+                break;
+
+            default:
+                // Do nothing and consume the character
+            }
+        }
+        checkArgument(!quoted, "mismatched quotes in: %s", s);
+        return -1;
+    }
+
+    private CollationMapper() {}
+}
diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/DayPeriodsMapper.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/DayPeriodsMapper.java

new file mode 100644 (file)

index 0000000..8235c9c
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/DayPeriodsMapper.java
@@ -0,0 +1,98 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.mapper;
+
+import static org.unicode.cldr.api.AttributeKey.keyOf;
+import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY;
+import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL;
+
+import java.util.Optional;
+
+import org.unicode.cldr.api.AttributeKey;
+import org.unicode.cldr.api.CldrData;
+import org.unicode.cldr.api.CldrData.PrefixVisitor;
+import org.unicode.cldr.api.CldrDataSupplier;
+import org.unicode.cldr.api.CldrDataType;
+import org.unicode.cldr.api.CldrPath;
+import org.unicode.cldr.api.CldrValue;
+
+import org.unicode.icu.tool.cldrtoicu.IcuData;
+import org.unicode.icu.tool.cldrtoicu.PathMatcher;
+import org.unicode.icu.tool.cldrtoicu.RbPath;
+
+/**
+ * A mapper to collect day-period data from {@link CldrDataType#SUPPLEMENTAL SUPPLEMENTAL}
+ * data via the paths:
+ * <pre>{@code
+ *   //supplementalData/dayPeriodRuleSet/*
+ * }</pre>
+ */
+public final class DayPeriodsMapper {
+    private static final PathMatcher RULESET =
+        PathMatcher.of("supplementalData/dayPeriodRuleSet");
+    private static final AttributeKey RULESET_TYPE = keyOf("dayPeriodRuleSet", "type");
+
+    private static final PathMatcher RULES = PathMatcher.of("dayPeriodRules[@locales=*]");
+    private static final AttributeKey RULES_LOCALES = keyOf("dayPeriodRules", "locales");
+
+    private static final PathMatcher RULE = PathMatcher.of("dayPeriodRule[@type=*]");
+    private static final AttributeKey RULE_TYPE = keyOf("dayPeriodRule", "type");
+
+    private static final RbPath RB_LOCALES = RbPath.of("locales");
+
+    /**
+     * Processes data from the given supplier to generate day-period ICU data.
+     *
+     * @param src the CLDR data supplier to process.
+     * @return the IcuData instance to be written to a file.
+     */
+    public static IcuData process(CldrDataSupplier src) {
+        RuleSetVisitor mapper = new RuleSetVisitor();
+        CldrData data = src.getDataForType(SUPPLEMENTAL);
+        data.accept(ARBITRARY, mapper);
+        return mapper.icuData;
+    }
+
+    private static final class RuleSetVisitor implements PrefixVisitor {
+        // Mutable ICU data collected into during visitation.
+        private final IcuData icuData = new IcuData("dayPeriods", false);
+        private int setNum = 0;
+
+        @Override
+        public void visitPrefixStart(CldrPath prefix, Context ctx) {
+            if (RULESET.matches(prefix)) {
+                ctx.install(new RuleVisitor(RULESET_TYPE.optionalValueFrom(prefix)));
+            }
+        }
+
+        private final class RuleVisitor implements PrefixVisitor {
+            private final RbPath localePrefix;
+
+            private RuleVisitor(Optional<String> type) {
+                // If there's a given type, add it to the prefix path.
+                this.localePrefix = type.map(t -> RbPath.of("locales_" + t)).orElse(RB_LOCALES);
+            }
+
+            @Override
+            public void visitPrefixStart(CldrPath prefix, Context ctx) {
+                if (RULES.matchesSuffixOf(prefix)) {
+                    // Sets are arbitrarily identified by the string "setNN".
+                    String setName = "set" + (++setNum);
+                    RULES_LOCALES.listOfValuesFrom(prefix)
+                        .forEach(locale -> icuData.add(localePrefix.extendBy(locale), setName));
+                    ctx.install(this::visitRule);
+                }
+            }
+
+            private void visitRule(CldrValue value) {
+                if (RULE.matchesSuffixOf(value.getPath())) {
+                    RbPath prefix = RbPath.of("rules", "set" + setNum, RULE_TYPE.valueFrom(value));
+                    value.getValueAttributes()
+                        .forEach((k, v) -> icuData.add(prefix.extendBy(k.getAttributeName()), v));
+                }
+            }
+        }
+    }
+
+    private DayPeriodsMapper() {}
+}
diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/LocaleMapper.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/LocaleMapper.java

new file mode 100644 (file)

index 0000000..2395d6f
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/LocaleMapper.java
@@ -0,0 +1,183 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.mapper;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkNotNull;
+import static com.google.common.base.Preconditions.checkState;
+import static com.google.common.collect.Ordering.natural;
+import static org.unicode.cldr.api.AttributeKey.keyOf;
+import static org.unicode.cldr.api.CldrData.PathOrder.DTD;
+import static org.unicode.cldr.api.CldrDataSupplier.CldrResolution.RESOLVED;
+import static org.unicode.cldr.api.CldrDataSupplier.CldrResolution.UNRESOLVED;
+
+import java.util.HashSet;
+import java.util.List;
+import java.util.Optional;
+import java.util.Set;
+
+import org.unicode.cldr.api.AttributeKey;
+import org.unicode.cldr.api.CldrData;
+import org.unicode.cldr.api.CldrData.ValueVisitor;
+import org.unicode.cldr.api.CldrDataSupplier;
+import org.unicode.cldr.api.CldrDataType;
+import org.unicode.cldr.api.CldrValue;
+
+import com.google.common.collect.ImmutableListMultimap;
+import com.google.common.collect.LinkedHashMultimap;
+import com.google.common.collect.SetMultimap;
+import org.unicode.icu.tool.cldrtoicu.IcuData;
+import org.unicode.icu.tool.cldrtoicu.PathMatcher;
+import org.unicode.icu.tool.cldrtoicu.PathValueTransformer;
+import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.DynamicVars;
+import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.Result;
+import org.unicode.icu.tool.cldrtoicu.RbPath;
+import org.unicode.icu.tool.cldrtoicu.RbValue;
+import org.unicode.icu.tool.cldrtoicu.SupplementalData;
+
+/**
+ * Generate locale {@link IcuData} by transforming {@link CldrDataType#LDML LDML} data using a
+ * {@link PathValueTransformer}.
+ *
+ * <p>This is currently driven by the {@code ldml2icu_locale.txt} configuration file via a
+ * {@code RegexTransformer}, but could use any {@link PathValueTransformer} implementation.
+ */
+public final class LocaleMapper {
+    // Match territory paths so we can skip processing deprecated territories.
+    private static final PathMatcher TERRITORY = PathMatcher.of(
+        "ldml/localeDisplayNames/territories/territory[@type=*]");
+    private static final AttributeKey TERRITORY_TYPE = keyOf("territory", "type");
+
+    // The default calendar (only set if different from inherited parent value).
+    private static final RbPath RB_CALENDAR = RbPath.of("calendar", "default");
+
+    /**
+     * Processes data from the given supplier to generate general locale data for the given locale
+     * ID.
+     *
+     * @param localeId the locale ID to generate data for.
+     * @param src the CLDR data supplier to process.
+     * @param icuSpecialData additional ICU data (in the "icu:" namespace)
+     * @param transformer the transformer to match and transform each CLDR path/value pair.
+     * @param supplementalData additional necessary data derived from
+     *        {@link org.unicode.cldr.api.CldrDataType#SUPPLEMENTAL SUPPLEMENTAL} data.
+     * @return IcuData containing locale data for the given locale ID.
+     */
+    public static IcuData process(
+        String localeId,
+        CldrDataSupplier src,
+        Optional<CldrData> icuSpecialData,
+        PathValueTransformer transformer,
+        SupplementalData supplementalData) {
+
+        IcuData icuData = new IcuData(localeId, true);
+        // Write out the results into the IcuData class, preserving result grouping and expanding
+        // path references as necessary.
+        ResultsCollector collector = new ResultsCollector(transformer);
+        icuData.addResults(collector.collectResultsFor(localeId, src, icuSpecialData));
+        doDateTimeHack(icuData);
+        supplementalData.getDefaultCalendar(icuData.getName())
+            .ifPresent(c -> icuData.add(RB_CALENDAR, c));
+        return icuData;
+    }
+
+    // This is an awful hack for post-processing the date-time format patterns to inject a 13th
+    // pattern at index 8, which is just a duplicate of the "medium" date-time pattern. The reasons
+    // for this are lost in the mists of time, but essentially there's ICU library code that just
+    // expects the value at index 8 to be this "default" value, and reads the date-time values
+    // starting at index 9.
+    //
+    // Before the hack, the "medium" date-time pattern is at index 10, since there are 3 groups:
+    //   "time" -> "date" -> "date-time"
+    // with 4 patterns each:
+    //   "full" -> "long" -> "medium" -> "short"
+    private static void doDateTimeHack(IcuData icuData) {
+        for (RbPath rbPath : icuData.getPaths()) {
+            if (rbPath.length() == 3
+                && rbPath.getSegment(0).equals("calendar")
+                && rbPath.getSegment(2).equals("DateTimePatterns")) {
+                // This cannot be null and should not be empty, since the path is in this data.
+                List<RbValue> valuesToHack = icuData.get(rbPath);
+                checkArgument(valuesToHack.size() == 12,
+                    "unexpected number of date/time patterns for '%s': %s", rbPath, valuesToHack);
+                valuesToHack.add(8, valuesToHack.get(10));
+            }
+        }
+    }
+
+    private static final class ResultsCollector {
+        private final PathValueTransformer transformer;
+        private final Set<RbPath> validRbPaths = new HashSet<>();
+
+        // WARNING: TreeMultimap() is NOT suitable here, even though it would sort the values for
+        // each key. The reason is that result comparison is not "consistent with equals", and
+        // TreeMultimap uses the comparator to decide if two elements are equal (not the equals()
+        // method), and it does this even if using the add() method of the sorted set (this is in
+        // fact in violation of the stated behaviour of Set#add).
+        private final SetMultimap<RbPath, Result> resultsByRbPath = LinkedHashMultimap.create();
+
+        ResultsCollector(PathValueTransformer transformer) {
+            this.transformer = checkNotNull(transformer);
+        }
+
+        ImmutableListMultimap<RbPath, Result> collectResultsFor(
+            String localeId, CldrDataSupplier src, Optional<CldrData> icuSpecialData) {
+
+            CldrData unresolved = src.getDataForLocale(localeId, UNRESOLVED);
+            CldrData resolved = src.getDataForLocale(localeId, RESOLVED);
+            DynamicVars varFn = p -> {
+                CldrValue cldrValue = resolved.get(p);
+                return cldrValue != null ? cldrValue.getValue() : null;
+            };
+
+            collectPaths(unresolved, varFn);
+            collectResults(resolved, varFn);
+            icuSpecialData.ifPresent(s -> collectSpecials(s, varFn));
+
+            ImmutableListMultimap.Builder<RbPath, Result> out = ImmutableListMultimap.builder();
+            out.orderValuesBy(natural());
+            for (RbPath rbPath : resultsByRbPath.keySet()) {
+                Set<Result> existingResults = resultsByRbPath.get(rbPath);
+                out.putAll(rbPath, existingResults);
+                for (Result fallback : transformer.getFallbackResultsFor(rbPath, varFn)) {
+                    if (existingResults.stream().noneMatch(fallback::isFallbackFor)) {
+                        out.put(rbPath, fallback);
+                    }
+                }
+            }
+            return out.build();
+        }
+
+        private void collectPaths(CldrData unresolved, DynamicVars varFn) {
+            ValueVisitor collectPaths =
+                v -> transformer.transform(v, varFn).forEach(this::collectResultPath);
+            unresolved.accept(DTD, collectPaths);
+        }
+
+        private void collectResultPath(Result result) {
+            RbPath rbPath = result.getKey();
+            validRbPaths.add(rbPath);
+            if (rbPath.isAnonymous()) {
+                RbPath parent = rbPath.getParent();
+                checkState(!parent.isAnonymous(),
+                    "anonymous paths should not be nested: %s", rbPath);
+                validRbPaths.add(parent);
+            }
+        }
+
+        void collectResults(CldrData resolved, DynamicVars varFn) {
+            ValueVisitor collectResults =
+                v -> transformer.transform(v, varFn).stream()
+                    .filter(r -> validRbPaths.contains(r.getKey()))
+                    .forEach(r -> resultsByRbPath.put(r.getKey(), r));
+            resolved.accept(DTD, collectResults);
+        }
+
+        private void collectSpecials(CldrData cldrData, DynamicVars varFn) {
+            cldrData.accept(DTD, v ->
+                transformer.transform(v, varFn).forEach(r -> resultsByRbPath.put(r.getKey(), r)));
+        }
+    }
+
+    private LocaleMapper() {}
+}
diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/PluralRangesMapper.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/PluralRangesMapper.java

new file mode 100644 (file)

index 0000000..74542e2
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/PluralRangesMapper.java
@@ -0,0 +1,88 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.mapper;
+
+import static com.google.common.base.Preconditions.checkState;
+import static org.unicode.cldr.api.AttributeKey.keyOf;
+import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY;
+import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL;
+
+import org.unicode.cldr.api.AttributeKey;
+import org.unicode.cldr.api.CldrData;
+import org.unicode.cldr.api.CldrData.PrefixVisitor;
+import org.unicode.cldr.api.CldrDataSupplier;
+import org.unicode.cldr.api.CldrDataType;
+import org.unicode.cldr.api.CldrPath;
+import org.unicode.cldr.api.CldrValue;
+
+import org.unicode.icu.tool.cldrtoicu.IcuData;
+import org.unicode.icu.tool.cldrtoicu.PathMatcher;
+import org.unicode.icu.tool.cldrtoicu.RbPath;
+import org.unicode.icu.tool.cldrtoicu.RbValue;
+
+/**
+ * A mapper to collect plural range data from {@link CldrDataType#SUPPLEMENTAL SUPPLEMENTAL} data
+ * via the paths:
+ * <pre>{@code
+ *   //supplementalData/plurals/pluralRanges[@locales=*]/...
+ * }</pre>
+ */
+public final class PluralRangesMapper {
+    private static final PathMatcher RANGES =
+        PathMatcher.of("supplementalData/plurals/pluralRanges[@locales=*]");
+    private static final AttributeKey RANGES_LOCALES = keyOf("pluralRanges", "locales");
+
+    private static final PathMatcher RANGE = PathMatcher.of("pluralRange[@start=*][@end=*]");
+    private static final AttributeKey RANGE_START = keyOf("pluralRange", "start");
+    private static final AttributeKey RANGE_END = keyOf("pluralRange", "end");
+    private static final AttributeKey RANGE_RESULT = keyOf("pluralRange", "result");
+
+    private static final RbPath RB_RULES = RbPath.of("rules");
+    private static final RbPath RB_LOCALES = RbPath.of("locales");
+
+    /**
+     * Processes data from the given supplier to generate plural-range ICU data.
+     *
+     * @param src the CLDR data supplier to process.
+     * @return the IcuData instance to be written to a file.
+     */
+    public static IcuData process(CldrDataSupplier src) {
+        PluralRangesVisitor visitor = new PluralRangesVisitor();
+        CldrData data = src.getDataForType(SUPPLEMENTAL);
+        data.accept(ARBITRARY, visitor);
+        return visitor.icuData;
+    }
+
+    private static final class PluralRangesVisitor implements PrefixVisitor {
+        private final IcuData icuData = new IcuData("pluralRanges", false);
+
+        private int setIndex = 0;
+        private String ruleLabel = null;
+
+        @Override
+        public void visitPrefixStart(CldrPath prefix, Context ctx) {
+            // Each matching "pluralRanges" element starts a new rule set labelled "setNN".
+            if (RANGES.matches(prefix)) {
+                ruleLabel = String.format("set%02d", setIndex++);
+                RANGES_LOCALES.listOfValuesFrom(prefix)
+                    .forEach(l -> icuData.add(RB_LOCALES.extendBy(l), ruleLabel));
+                ctx.install(this::visitRange);
+            }
+        }
+
+        private void visitRange(CldrValue value) {
+            checkState(RANGE.matchesSuffixOf(value.getPath()),
+                "unexpected path: %s", value.getPath());
+            // Note: "range:start" and "range:end" are optional attributes, but the CLDR DTD
+            // specifies a default via comments. They should probably be changed to just have a
+            // default in the DTD (and possibly converted to use an enum here).
+            icuData.add(RB_RULES.extendBy(ruleLabel),
+                RbValue.of(
+                    RANGE_START.valueFrom(value, "all"),
+                    RANGE_END.valueFrom(value, "all"),
+                    RANGE_RESULT.valueFrom(value)));
+        }
+    }
+
+    private PluralRangesMapper() {}
+}
diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/PluralsMapper.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/PluralsMapper.java

new file mode 100644 (file)

index 0000000..d20d31d
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/PluralsMapper.java
@@ -0,0 +1,150 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.mapper;
+
+import static com.google.common.base.Preconditions.checkNotNull;
+import static com.google.common.base.Preconditions.checkState;
+import static org.unicode.cldr.api.AttributeKey.keyOf;
+import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY;
+import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL;
+
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.unicode.cldr.api.AttributeKey;
+import org.unicode.cldr.api.CldrData;
+import org.unicode.cldr.api.CldrData.PrefixVisitor;
+import org.unicode.cldr.api.CldrDataSupplier;
+import org.unicode.cldr.api.CldrDataType;
+import org.unicode.cldr.api.CldrPath;
+
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Iterables;
+import org.unicode.icu.tool.cldrtoicu.IcuData;
+import org.unicode.icu.tool.cldrtoicu.PathMatcher;
+import org.unicode.icu.tool.cldrtoicu.RbPath;
+
+/**
+ * A mapper to collect plural data from {@link CldrDataType#SUPPLEMENTAL SUPPLEMENTAL} data via
+ * the paths:
+ * <pre>{@code
+ *   //supplementalData/plurals[@type=*]/pluralRules[@locales=*]/pluralRule[@count=*]
+ * }</pre>
+ */
+public final class PluralsMapper {
+    private static final PathMatcher PLURALS = PathMatcher.of("supplementalData/plurals[@type=*]");
+    private static final AttributeKey PLURALS_TYPE = keyOf("plurals", "type");
+
+    private static final PathMatcher RULES = PathMatcher.of("pluralRules[@locales=*]");
+    private static final AttributeKey RULES_LOCALES = keyOf("pluralRules", "locales");
+
+    private static final PathMatcher RULE = PathMatcher.of("pluralRule[@count=*]");
+    private static final AttributeKey RULE_COUNT = keyOf("pluralRule", "count");
+
+    private static final ImmutableMap<String, RbPath> ICU_PREFIX_MAP =
+        ImmutableMap.of("cardinal", RbPath.of("locales"), "ordinal", RbPath.of("locales_ordinals"));
+
+    /**
+     * Processes data from the given supplier to generate plural ICU data.
+     *
+     * @param src the CLDR data supplier to process.
+     * @return the IcuData instance to be written to a file.
+     */
+    public static IcuData process(CldrDataSupplier src) {
+        PluralsVisitor visitor = new PluralsVisitor();
+        CldrData data = src.getDataForType(SUPPLEMENTAL);
+        // Note: We explicitly reset the type to mimic the order of the existing code, since this
+        // affects the set indices we generate during processing. Ideally this would all be immune
+        // to ordering (or just enforce DTD ordering) but right now it's very dependent on
+        // mimicking the order of the existing code to get identical output.
+        data.accept(ARBITRARY, visitor.setType("cardinal"));
+        data.accept(ARBITRARY, visitor.setType("ordinal"));
+        return visitor.icuData;
+    }
+
+    private static final class PluralsVisitor implements PrefixVisitor {
+        // Mutable ICU data collected into during visitation.
+        // In a post XML-aware API, is recording the XML file names really a good idea?
+        private final IcuData icuData = new IcuData("plurals", false);
+        // Filter for the type we are processing now (this could be removed if we don't mind which
+        // order the types are processed, and switching to DTD ordering would make it stable).
+        private String type = null;
+        private final List<ImmutableMap<String, String>> previousRules = new ArrayList<>();
+
+        // Hack method to allow a single type to be processed at a time (the visitor would otherwise
+        // happily handle both types in a single pass). We can't do this as two different visitors
+        // (one for each type) because the current behaviour relies on carrying over the calculated
+        // set numbers from one pass to the next. Once migration is complete we should revisit this
+        // and allow this visitor to work in a single pass (probably with DTD order for stability).
+        PluralsVisitor setType(String type) {
+            this.type = checkNotNull(type);
+            return this;
+        }
+
+        @Override
+        public void visitPrefixStart(CldrPath prefix, Context ctx) {
+            if (PLURALS.matches(prefix)) {
+                // Note: "plurals:type" is an optional attribute but the CLDR DTD specifies a
+                // default via comments. It should probably be changed to just have a default in
+                // the DTD.
+                if (PLURALS_TYPE.valueFrom(prefix, "cardinal").equals(type)) {
+                    ctx.install(new RulesVisitor(ICU_PREFIX_MAP.get(type)));
+                }
+            }
+        }
+
+        private final class RulesVisitor implements PrefixVisitor {
+            private final RbPath icuPrefix;
+            private final List<String> locales = new ArrayList<>();
+            private final Map<String, String> rules = new LinkedHashMap<>();
+
+            RulesVisitor(RbPath icuPrefix) {
+                this.icuPrefix = checkNotNull(icuPrefix);
+            }
+
+            @Override
+            public void visitPrefixStart(CldrPath prefix, Context ctx) {
+                if (RULES.matchesSuffixOf(prefix)) {
+                    Iterables.addAll(locales, RULES_LOCALES.listOfValuesFrom(prefix));
+                    ctx.install(value -> {
+                        if (RULE.matchesSuffixOf(value.getPath())) {
+                            rules.put(RULE_COUNT.valueFrom(value), value.getValue());
+                        }
+                    });
+                }
+            }
+
+            @Override
+            public void visitPrefixEnd(CldrPath prefix) {
+                checkState(!locales.isEmpty(), "missing locale data for plurals: %s", prefix);
+                // Note: The original mapper code "sort of" coped with empty rules, but it's not
+                // completely well behaved (or documented), so since this doesn't happen in the
+                // current CLDR data, I decided to just prohibit it in the new code. Support can
+                // easily be added in once the expected semantics are clear.
+                checkState(!rules.isEmpty(), "missing rule data for plurals: %s", prefix);
+
+                // Have we seen this set of rules before? If so, reuse the existing index. Note
+                // that an IDE might report this call as suspicious because the key is not yet an
+                // immutable map (saves creating immutable maps just to check for inclusion) but
+                // this is fine because collection equality is based only on contents, not
+                // collection type.
+                int idx = previousRules.indexOf(rules);
+                if (idx == -1) {
+                    int newIdx = previousRules.size();
+                    rules.forEach((k, v) -> icuData.add(RbPath.of("rules", "set" + newIdx, k), v));
+                    // Since "rules" is mutable and reused, we must take an immutable copy here.
+                    previousRules.add(ImmutableMap.copyOf(rules));
+                    idx = newIdx;
+                }
+                String setName = "set" + idx;
+                locales.forEach(locale -> icuData.add(icuPrefix.extendBy(locale), setName));
+                rules.clear();
+                locales.clear();
+            }
+        }
+    }
+
+    private PluralsMapper() {}
+}
diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/RbnfMapper.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/RbnfMapper.java

new file mode 100644 (file)

index 0000000..f7f4f73
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/RbnfMapper.java
@@ -0,0 +1,145 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.mapper;
+
+import static org.unicode.cldr.api.AttributeKey.keyOf;
+import static org.unicode.cldr.api.CldrData.PathOrder.DTD;
+import static org.unicode.cldr.api.CldrDataSupplier.CldrResolution.UNRESOLVED;
+
+import java.util.Optional;
+import java.util.concurrent.atomic.AtomicBoolean;
+
+import org.unicode.cldr.api.AttributeKey;
+import org.unicode.cldr.api.CldrData;
+import org.unicode.cldr.api.CldrData.PrefixVisitor;
+import org.unicode.cldr.api.CldrDataSupplier;
+import org.unicode.cldr.api.CldrDataType;
+import org.unicode.cldr.api.CldrPath;
+
+import com.google.common.escape.UnicodeEscaper;
+import org.unicode.icu.tool.cldrtoicu.IcuData;
+import org.unicode.icu.tool.cldrtoicu.PathMatcher;
+import org.unicode.icu.tool.cldrtoicu.RbPath;
+
+/**
+ * A mapper to collect RBNF data from {@link CldrDataType#LDML LDML} data via the paths:
+ * <pre>{@code
+ *   //ldml/rbnf/rulesetGrouping[@type=*]/ruleset[@type=*]
+ * }</pre>
+ *
+ * <p>Every {@code rbnfrule} value found below a matching ruleset is written under the
+ * {@code RBNFRules/<grouping type>} resource path, preceded by one header line per ruleset.
+ */
+// TODO: This class can almost certainly be written using RegexTransformer and a small config.
+public final class RbnfMapper {
+    private static final PathMatcher RULE_SET =
+        PathMatcher.of("ldml/rbnf/rulesetGrouping[@type=*]/ruleset[@type=*]");
+    private static final AttributeKey GROUPING_TYPE = keyOf("rulesetGrouping", "type");
+    private static final AttributeKey RULESET_TYPE = keyOf("ruleset", "type");
+
+    private static final PathMatcher RBNF_RULE = PathMatcher.of("rbnfrule");
+    private static final AttributeKey RBNF_VALUE = keyOf("rbnfrule", "value");
+    private static final AttributeKey RBNF_RADIX = keyOf("rbnfrule", "radix");
+    private static final AttributeKey RULESET_ACCESS = keyOf("ruleset", "access");
+
+    private static final RbPath RB_PARENT = RbPath.of("%%Parent");
+    // This is the ICU path prefix, below which everything generated by this visitor will go.
+    private static final RbPath RB_ROOT = RbPath.of("RBNFRules");
+
+    /**
+     * Processes data from the given supplier to generate RBNF data for a single locale ID.
+     *
+     * @param localeId the locale ID to generate data for.
+     * @param src the CLDR data supplier to process.
+     * @param icuSpecialData additional ICU data (in the "icu:" namespace); when present it is
+     *     visited before the locale's own CLDR data.
+     * @return IcuData containing RBNF data for the given locale ID.
+     */
+    public static IcuData process(
+        String localeId, CldrDataSupplier src, Optional<CldrData> icuSpecialData) {
+
+        RulesetVisitor rulesetVisitor = new RulesetVisitor(localeId);
+        // Both sources are walked in DTD order. The RBNF paths contain ordered elements, so
+        // visiting them already sorted saves having to re-sort the results afterwards.
+        icuSpecialData.ifPresent(special -> special.accept(DTD, rulesetVisitor));
+        src.getDataForLocale(localeId, UNRESOLVED).accept(DTD, rulesetVisitor);
+        return rulesetVisitor.icuData;
+    }
+
+    /** Prefix visitor which accumulates the rules of every matching ruleset into one IcuData. */
+    static final class RulesetVisitor implements PrefixVisitor {
+
+        private final IcuData icuData;
+
+        private RulesetVisitor(String localeId) {
+            this.icuData = new IcuData(localeId, true);
+        }
+
+        @Override public void visitPrefixStart(CldrPath prefix, Context context) {
+            if (!RULE_SET.matchesPrefixOf(prefix)) {
+                return;
+            }
+            RbPath rbPath = RB_ROOT.extendBy(GROUPING_TYPE.valueFrom(prefix));
+            String rulesetType = RULESET_TYPE.valueFrom(prefix);
+            // Rules in the "lenient-parse" ruleset are emitted without a "value[/radix]: " prefix.
+            boolean isLenient = "lenient-parse".equals(rulesetType);
+
+            // The "access" attribute belongs to the parent (ruleset) element, but because it is
+            // a value attribute it is not part of the prefix path and can only be read once a
+            // child value is visited. The ruleset header must therefore be written lazily,
+            // exactly once, immediately before the first child rule, which needs a flag.
+            //
+            // A plain boolean local cannot be used for the flag because variables captured by
+            // the lambda below must be "effectively final".
+            AtomicBoolean hasHeader = new AtomicBoolean(false);
+            context.install(value -> {
+                if (!RBNF_RULE.matchesSuffixOf(value.getPath())) {
+                    return;
+                }
+                if (!hasHeader.get()) {
+                    boolean isPrivate =
+                        RULESET_ACCESS.valueFrom(value, "public").equals("private");
+                    String marker = isPrivate ? "%%" : "%";
+                    icuData.add(rbPath, marker + rulesetType + ":");
+                    hasHeader.set(true);
+                }
+                StringBuilder rule = new StringBuilder();
+                if (!isLenient) {
+                    // Strict rules are prefixed with "<value>: " or "<value>/<radix>: ".
+                    rule.append(RBNF_VALUE.valueFrom(value));
+                    RBNF_RADIX.optionalValueFrom(value)
+                        .ifPresent(radix -> rule.append("/").append(radix));
+                    rule.append(": ");
+                }
+                rule.append(ESCAPE_RBNF_DATA.escape(value.getValue()));
+                icuData.add(rbPath, rule.toString());
+            });
+        }
+
+        /*
+         * Escapes RBNF rule text: backslash becomes a double backslash, the arrows U+2190 and
+         * U+2192 become '<' and '>' respectively, and any other character outside the range
+         * U+0020 to U+007F (inclusive) becomes a Unicode escape (\\uXXXX, or \\UXXXXXXXX above
+         * the BMP). This class is super slow for non-ASCII escaping due to using
+         * "String.format()", however there's < 100 values that need any escaping, so it's fine.
+         */
+        private static final UnicodeEscaper ESCAPE_RBNF_DATA = new UnicodeEscaper() {
+            private final char[] DOUBLE_BACKSLASH = "\\\\".toCharArray();
+            private final char[] LEFT_ANGLE = "<".toCharArray();
+            private final char[] RIGHT_ANGLE = ">".toCharArray();
+
+            @Override
+            protected char[] escape(int cp) {
+                if (cp == '\\') {
+                    return DOUBLE_BACKSLASH;
+                }
+                if (cp == '←') {
+                    return LEFT_ANGLE;
+                }
+                if (cp == '→') {
+                    return RIGHT_ANGLE;
+                }
+                if (0x0020 <= cp && cp <= 0x007F) {
+                    // Returning null means "do not escape".
+                    return null;
+                }
+                String format = (cp <= 0xFFFF) ? "\\u%04X" : "\\U%08X";
+                return String.format(format, cp).toCharArray();
+            }
+        };
+    }
+}
diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/SupplementalMapper.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/SupplementalMapper.java

new file mode 100644 (file)

index 0000000..4817e1d
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/SupplementalMapper.java
@@ -0,0 +1,119 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.mapper;
+
+import static com.google.common.base.Preconditions.checkNotNull;
+import static com.google.common.collect.Ordering.natural;
+import static org.unicode.cldr.api.CldrData.PathOrder.NESTED_GROUPING;
+
+import java.util.Set;
+
+import org.unicode.cldr.api.CldrData;
+import org.unicode.cldr.api.CldrDataSupplier;
+import org.unicode.cldr.api.CldrDataType;
+import org.unicode.cldr.api.CldrValue;
+
+import com.google.common.collect.ImmutableListMultimap;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.LinkedHashMultimap;
+import com.google.common.collect.SetMultimap;
+import org.unicode.icu.tool.cldrtoicu.IcuData;
+import org.unicode.icu.tool.cldrtoicu.PathMatcher;
+import org.unicode.icu.tool.cldrtoicu.PathValueTransformer;
+import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.Result;
+import org.unicode.icu.tool.cldrtoicu.RbPath;
+
+/**
+ * Generate supplemental {@link IcuData} by transforming {@link CldrDataType#SUPPLEMENTAL
+ * SUPPLEMENTAL} data using a {@link PathValueTransformer}.
+ *
+ * <p>This is currently driven by the {@code ldml2icu_supplemental.txt} configuration file via a
+ * {@code RegexTransformer}, but could use any {@link PathValueTransformer} implementation.
+ */
+public final class SupplementalMapper {
+    // Placeholder path segment which is replaced by a per-value counter (see visit()).
+    private static final RbPath RB_FIFO = RbPath.of("<FIFO>");
+
+    /**
+     * Processes a subset of supplemental data from the given supplier.
+     *
+     * @param src the CLDR data supplier to process.
+     * @param transformer the transformer to match and transform each CLDR path/value pair.
+     * @param icuName the name for the generated IcuData.
+     * @param includePaths a matcher to select the CLDR paths to be transformed.
+     * @return An IcuData instance containing the specified subset of supplemental data with the
+     *     given ICU name.
+     */
+    // TODO: Improve external data splitting and remove need for a PathMatcher here.
+    public static IcuData process(
+        CldrDataSupplier src, PathValueTransformer transformer, String icuName,
+        PathMatcher includePaths) {
+        ResultsCollector collector = new ResultsCollector(includePaths, transformer);
+        // Write out the results into the IcuData class, preserving result grouping and expanding
+        // path references as necessary.
+        IcuData icuData = new IcuData(icuName, false);
+        icuData.addResults(collector.getResults(src));
+        return icuData;
+    }
+
+    /**
+     * Collects transformed results, grouped by resource bundle path, for the CLDR values
+     * accepted by the path matcher. Instances accumulate state while visiting values.
+     */
+    private static final class ResultsCollector {
+        private final PathMatcher pathMatcher;
+        private final PathValueTransformer transformer;
+
+        // WARNING: TreeMultimap() is NOT suitable here, even though it would sort the values for
+        // each key. The reason is that result comparison is not "consistent with equals", and
+        // TreeMultimap uses the comparator to decide if two elements are equal (not the equals()
+        // method), and it does this even if using the add() method of the sorted set (this is in
+        // fact in violation of the stated behaviour of Set#add).
+        private final SetMultimap<RbPath, Result> resultsByRbPath = LinkedHashMultimap.create();
+        // Incremented once per matched CLDR value (not once per result).
+        private int fifoCounter = 0;
+
+        ResultsCollector(PathMatcher pathMatcher, PathValueTransformer transformer) {
+            this.pathMatcher = checkNotNull(pathMatcher);
+            this.transformer = checkNotNull(transformer);
+        }
+
+        /**
+         * Transforms a single CLDR value (if its path is matched) and records each result under
+         * its resource bundle path, replacing any {@code <FIFO>} segment with the current counter.
+         */
+        private void visit(CldrValue value) {
+            if (pathMatcher.matchesPrefixOf(value.getPath())) {
+                for (Result r : transformer.transform(value)) {
+                    RbPath rbPath = r.getKey();
+                    if (rbPath.contains(RB_FIFO)) {
+                        // The fifo counter needs to be formatted with leading zeros for sorting.
+                        // Locale.ROOT is used explicitly because "%d" is locale-sensitive, and the
+                        // generated label must not depend on the default locale of the machine
+                        // running the tool (some locales format with non-ASCII digits).
+                        rbPath = rbPath.mapSegments(
+                            s -> s.equals("<FIFO>")
+                                ? String.format(java.util.Locale.ROOT, "<%04d>", fifoCounter)
+                                : s);
+                    }
+                    resultsByRbPath.put(rbPath, r);
+                }
+                fifoCounter++;
+            }
+        }
+
+        /**
+         * Visits all supplemental data from the given supplier and returns the collected results,
+         * together with any fallback results not already covered by a collected result.
+         *
+         * @param supplier the supplier from which supplemental data is obtained.
+         * @return the results for each resource bundle path, with values in their natural order.
+         */
+        ImmutableListMultimap<RbPath, Result> getResults(CldrDataSupplier supplier) {
+            // DTD and NESTED_GROUPING order differ because of how the magic <FIFO> label works (it
+            // basically enforces "encounter order" onto things in unlabeled sequences, which matches
+            // the old behaviour). If it wouldn't break anything, it might be worth moving to DTD order
+            // to remove any lingering implicit dependencies on the CLDR data behaviour.
+            CldrData supplementalData = supplier.getDataForType(CldrDataType.SUPPLEMENTAL);
+            // Resolves a dynamic variable to the value at the requested path in the supplemental
+            // data, or to null if there is no value for that path.
+            PathValueTransformer.DynamicVars varFn = p -> {
+                CldrValue cldrValue = supplementalData.get(p);
+                return cldrValue != null ? cldrValue.getValue() : null;
+            };
+
+            supplementalData.accept(NESTED_GROUPING, this::visit);
+
+            ImmutableListMultimap.Builder<RbPath, Result> out = ImmutableListMultimap.builder();
+            out.orderValuesBy(natural());
+            for (RbPath rbPath : resultsByRbPath.keySet()) {
+                Set<Result> existingResults = resultsByRbPath.get(rbPath);
+                out.putAll(rbPath, existingResults);
+                // Only add a fallback if no collected result already stands in for it.
+                for (Result fallback : transformer.getFallbackResultsFor(rbPath, varFn)) {
+                    if (existingResults.stream().noneMatch(fallback::isFallbackFor)) {
+                        out.put(rbPath, fallback);
+                    }
+                }
+            }
+            return out.build();
+        }
+    }
+
+    private SupplementalMapper() {}
+}
diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/TransformsMapper.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/TransformsMapper.java

new file mode 100644 (file)

index 0000000..5a4741e
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/TransformsMapper.java
@@ -0,0 +1,183 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.mapper;
+
+import static com.google.common.base.CharMatcher.whitespace;
+import static com.google.common.base.Preconditions.checkNotNull;
+import static java.nio.file.StandardOpenOption.CREATE;
+import static java.nio.file.StandardOpenOption.TRUNCATE_EXISTING;
+import static org.unicode.cldr.api.AttributeKey.keyOf;
+import static org.unicode.cldr.api.CldrData.PathOrder.DTD;
+import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL;
+
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Optional;
+import java.util.function.Function;
+
+import org.unicode.cldr.api.AttributeKey;
+import org.unicode.cldr.api.CldrData.ValueVisitor;
+import org.unicode.cldr.api.CldrDataSupplier;
+import org.unicode.cldr.api.CldrDataType;
+import org.unicode.cldr.api.CldrValue;
+
+import org.unicode.icu.tool.cldrtoicu.IcuData;
+import org.unicode.icu.tool.cldrtoicu.PathMatcher;
+import org.unicode.icu.tool.cldrtoicu.RbPath;
+import org.unicode.icu.tool.cldrtoicu.RbValue;
+import com.ibm.icu.text.Transliterator;
+
+/**
+ * A mapper to collect transliteration data from {@link CldrDataType#SUPPLEMENTAL SUPPLEMENTAL}
+ * data via the paths:
+ * <pre>{@code
+ *   //supplementalData/transforms/transform/tRule
+ * }</pre>
+ *
+ * <p>This mapper also writes out the transform rule files into a specified directory.
+ */
+public final class TransformsMapper {
+    private static final PathMatcher TRULE =
+        PathMatcher.of("supplementalData/transforms/transform/tRule");
+    private static final AttributeKey TRANSFORM_SOURCE = keyOf("transform", "source");
+    private static final AttributeKey TRANSFORM_TARGET = keyOf("transform", "target");
+    private static final AttributeKey TRANSFORM_DIRECTION = keyOf("transform", "direction");
+    private static final AttributeKey TRANSFORM_VARIANT = keyOf("transform", "variant");
+    private static final AttributeKey TRANSFORM_VISIBILITY = keyOf("transform", "visibility");
+    private static final AttributeKey TRANSFORM_ALIAS = keyOf("transform", "alias");
+    private static final AttributeKey TRANSFORM_BACKALIAS = keyOf("transform", "backwardAlias");
+
+    private static final RbPath RB_TRANSLITERATOR_IDS = RbPath.of("RuleBasedTransliteratorIDs");
+
+    // This decomposes some accented characters with accents in the "Mn" (Mark, non-spacing)
+    // Unicode range by representing the accents in the \u1234 hex form. For example, it converts:
+    // "ɪ̈" to "ɪ\u0308" and "ɯ̽" to "ɯ\u033D". This does not affect all accented character (e.g.
+    // ä) and the precise reason this is done was never clearly documented in the code from which
+    // this code was derived (but it seems necessary to generate the expected output in the
+    // transliteration rules).
+    //
+    // This is one of the only, apparently necessary direct dependencies on the icu4j library.
+    // TODO: Make this depend icu4j from this project rather than the older version from CLDR.
+    private static final Transliterator FIXUP = Transliterator.getInstance("[:Mn:]any-hex/java");
+
+    // Don't rename these enum constants, they need to match the data directly.
+    private enum Direction { forward, backward, both }
+    private enum Visibility { internal, external }
+
+    /**
+     * Processes data from the given supplier to generate transliteration ICU data, writing
+     * auxiliary transliteration rule files in the process. This is a potentially destructive call
+     * and will overwrite existing transformation rule files in the specified directory.
+     *
+     * @param src the CLDR data supplier to process.
+     * @param ruleFileOutputDir the directory into which transliteration rule files will be written.
+     * @return the IcuData instance to be written to a file.
+     * @throws RuntimeException if a rule file cannot be opened or written.
+     */
+    public static IcuData process(CldrDataSupplier src, Path ruleFileOutputDir) {
+        RuleVisitor visitor = new RuleVisitor(p -> {
+            Path file = ruleFileOutputDir.resolve(p);
+            try {
+                return new PrintWriter(Files.newBufferedWriter(file, CREATE, TRUNCATE_EXISTING));
+            } catch (IOException e) {
+                throw new RuntimeException("error opening file: " + file, e);
+            }
+        });
+        src.getDataForType(SUPPLEMENTAL).accept(DTD, visitor);
+        return visitor.icuData;
+    }
+
+    /**
+     * Visitor which writes one rule file per visited {@code tRule} value and adds the
+     * corresponding index entries to the "root" IcuData.
+     */
+    private static class RuleVisitor implements ValueVisitor {
+        private final IcuData icuData = new IcuData("root", false);
+        // Opens a writer for a rule file, given its path relative to the output directory.
+        private final Function<Path, PrintWriter> outFn;
+
+        RuleVisitor(Function<Path, PrintWriter> outFn) {
+            this.outFn = checkNotNull(outFn);
+            icuData.setFileComment("File: root.txt");
+
+            // I have _no_ idea what any of this is about, I'm just trying to mimic the original
+            // (complex and undocumented) code in "ConvertTransforms.java".
+            icuData.add(RbPath.of("TransliteratorNamePattern"), "{0,choice,0#|1#{1}|2#{1}-{2}}");
+            // Note that this quoting of path segments is almost certainly unnecessary. It matches
+            // the old "ConvertTransforms" behaviour, but '%' is used elsewhere without quoting, so
+            // it seems very likely that it's not needed here.
+            // TODO: Once migration done, remove quotes here & check in RbPath for unwanted quotes.
+            icuData.add(RbPath.of("\"%Translit%Hex\""), "%Translit%Hex");
+            icuData.add(RbPath.of("\"%Translit%UnicodeName\""), "%Translit%UnicodeName");
+            icuData.add(RbPath.of("\"%Translit%UnicodeChar\""), "%Translit%UnicodeChar");
+            // Special case, where Latin is a no-op.
+            icuData.add(RbPath.of("TransliterateLATIN"), RbValue.of("", ""));
+            // Some hard-coded special case mappings.
+            icuData.add(
+                RB_TRANSLITERATOR_IDS.extendBy("Tone-Digit", "alias"),
+                "Pinyin-NumericPinyin");
+            icuData.add(
+                RB_TRANSLITERATOR_IDS.extendBy("Digit-Tone", "alias"),
+                "NumericPinyin-Pinyin");
+        }
+
+        @Override public void visit(CldrValue value) {
+            // The other possible element is "comment" but we currently ignore those.
+            if (TRULE.matches(value.getPath())) {
+                String source = getExpectedOptionalAttribute(value, TRANSFORM_SOURCE);
+                String target = getExpectedOptionalAttribute(value, TRANSFORM_TARGET);
+                Optional<String> variant = TRANSFORM_VARIANT.optionalValueFrom(value);
+                // Rule files are named "<source>_<target>[_<variant>].txt".
+                String baseFilename = source + "_" + target;
+                String filename =
+                    variant.map(v -> baseFilename + "_" + v).orElse(baseFilename) + ".txt";
+                writeRootIndexEntry(value, source, target, variant, filename);
+                writeDataFile(filename, value);
+            }
+        }
+
+        /**
+         * Writes the rule file for a single transform: a fixed header followed by the trimmed
+         * rule text after applying the {@code FIXUP} transliterator.
+         *
+         * @throws RuntimeException if the rule file could not be written completely.
+         */
+        private void writeDataFile(String filename, CldrValue value) {
+            try (PrintWriter out = outFn.apply(Paths.get(filename))) {
+                out.println("\uFEFF# © 2016 and later: Unicode, Inc. and others.");
+                out.println("# License & terms of use: http://www.unicode.org/copyright.html#License");
+                out.println("#");
+                out.println("# File: " + filename);
+                out.println("# Generated from CLDR");
+                out.println("#");
+                out.println();
+                out.println(FIXUP.transliterate(whitespace().trimFrom(value.getValue())));
+                out.println();
+                // PrintWriter never throws IOException; it only records that a write failed. Flush
+                // and check explicitly so that a truncated rule file is not left behind silently.
+                if (out.checkError()) {
+                    throw new RuntimeException("error writing file: " + filename);
+                }
+            }
+        }
+
+        /**
+         * Adds the index entries for a single transform to the root data. An entry (plus any
+         * aliases) is added for the forward ID unless the direction is "backward", and for the
+         * reverse ID unless the direction is "forward"; both entries reference the same rule file.
+         */
+        private void writeRootIndexEntry(
+            CldrValue value, String source, String target, Optional<String> variant, String filename) {
+            Visibility visibility = TRANSFORM_VISIBILITY.valueFrom(value, Visibility.class);
+            String status = visibility == Visibility.internal ? "internal" : "file";
+
+            Direction dir = TRANSFORM_DIRECTION.valueFrom(value, Direction.class);
+            if (dir != Direction.backward) {
+                String id = getId(source, target, variant);
+                TRANSFORM_ALIAS.listOfValuesFrom(value)
+                    .forEach(a -> icuData.add(RB_TRANSLITERATOR_IDS.extendBy(a, "alias"), id));
+                RbPath rbPrefix = RB_TRANSLITERATOR_IDS.extendBy(id, status);
+                icuData.add(rbPrefix.extendBy("resource:process(transliterator)"), filename);
+                icuData.add(rbPrefix.extendBy("direction"), "FORWARD");
+            }
+            if (dir != Direction.forward) {
+                // Source and target are swapped for the reverse transform.
+                String id = getId(target, source, variant);
+                TRANSFORM_BACKALIAS.listOfValuesFrom(value)
+                    .forEach(a -> icuData.add(RB_TRANSLITERATOR_IDS.extendBy(a, "alias"), id));
+                RbPath rbPrefix = RB_TRANSLITERATOR_IDS.extendBy(id, status);
+                icuData.add(rbPrefix.extendBy("resource:process(transliterator)"), filename);
+                icuData.add(rbPrefix.extendBy("direction"), "REVERSE");
+            }
+        }
+    }
+
+    /** Returns the transliterator ID in the form {@code <from>-<to>[/<variant>]}. */
+    private static String getId(String from, String to, Optional<String> variant) {
+        String baseId = from + "-" + to;
+        return variant.map(v -> baseId + "/" + v).orElse(baseId);
+    }
+
+    /**
+     * Returns the value of an attribute which is optional in the data but required by this
+     * mapper.
+     *
+     * @throws IllegalArgumentException if the attribute is not present on the given value.
+     */
+    private static String getExpectedOptionalAttribute(CldrValue value, AttributeKey key) {
+        return key.optionalValueFrom(value).orElseThrow(() ->
+            new IllegalArgumentException(String.format("missing data for %s in: %s", key, value)));
+    }
+}
diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/regex/Instruction.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/regex/Instruction.java

new file mode 100644 (file)

index 0000000..c95ac81
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/regex/Instruction.java
@@ -0,0 +1,26 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.regex;
+
+import com.google.common.base.Ascii;
+
+/**
+ * The kinds of instruction which can appear in a result specification of the configuration file
+ * (e.g. "values=..." or "fallback=...").
+ */
+enum Instruction {
+    /** Defines processing and transformation of CLDR values. */
+    VALUES,
+    /** Defines fallback values to be used if no result was matched in a resource bundle. */
+    FALLBACK,
+    /** Defines an xpath used to hack result equality to make deduplication work. */
+    BASE_XPATH,
+    // TODO: Figure out how to remove this hack (probably by supporting partial matches).
+    /**
+     * Defines whether result values should be appended one at a time to a resource bundle
+     * (default) or grouped into a separate array.
+     */
+    GROUP;
+
+    /**
+     * Returns the instruction enum for its ID as it appears in the configuration file. The ID is
+     * converted to ASCII upper case and must then match the name of a constant exactly.
+     */
+    static Instruction forId(String id) {
+        String constantName = Ascii.toUpperCase(id);
+        return valueOf(constantName);
+    }
+}
diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/regex/NamedFunction.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/regex/NamedFunction.java

new file mode 100644 (file)

index 0000000..2ee573d
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/regex/NamedFunction.java
@@ -0,0 +1,58 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.regex;
+
+import static com.google.common.base.CharMatcher.whitespace;
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkNotNull;
+
+import java.util.List;
+import java.util.function.Function;
+
+import com.google.common.base.CharMatcher;
+import com.google.common.base.Splitter;
+
+/**
+ * Function used by {@code RegexTransformer} to convert CLDR values in special ways. See also
+ * {@code IcuFunctions}.
+ */
+public final class NamedFunction implements Function<List<String>, String> {
+    private static final CharMatcher NAME_CHARS =
+        CharMatcher.inRange('a', 'z').or(CharMatcher.is('_'));
+    private static final Splitter ARG_SPLITTER = Splitter.on(',').trimResults(whitespace());
+
+    /**
+     * Returns a named function which delegates to the given function.
+     *
+     * @param name the non-empty function name, consisting only of 'a'-'z' and '_'.
+     * @param argCount the maximum number of arguments the function accepts (must be &gt;= 0).
+     * @param fn the function to which argument lists are passed; it must not return null when
+     *     invoked via {@link #call(String)}.
+     * @throws IllegalArgumentException if the name or argument count is invalid.
+     */
+    public static NamedFunction create(
+        String name, int argCount, Function<List<String>, String> fn) {
+        return new NamedFunction(name, argCount, fn);
+    }
+
+    private final String name;
+    private final int maxArgs;
+    private final Function<List<String>, String> fn;
+
+    private NamedFunction(String name, int argCount, Function<List<String>, String> fn) {
+        checkArgument(!name.isEmpty() && NAME_CHARS.matchesAllOf(name),
+            "invalid function name (must be lower_case_underscore): %s", name);
+        checkArgument(argCount >= 0, "invalid argument count: %s", argCount);
+        this.name = name;
+        this.maxArgs = argCount;
+        this.fn = checkNotNull(fn);
+    }
+
+    /**
+     * Splits the given comma separated argument list (trimming whitespace around each argument)
+     * and invokes this function with the resulting arguments.
+     *
+     * @param argList the comma separated arguments.
+     * @return the non-null result of the function.
+     * @throws IllegalArgumentException if more arguments are given than the function accepts.
+     * @throws NullPointerException if the underlying function returns null.
+     */
+    public String call(String argList) {
+        List<String> args = ARG_SPLITTER.splitToList(argList);
+        // Splitting a blank string yields a single empty argument rather than no arguments, which
+        // would make it impossible to ever call a function declared with zero arguments. Only the
+        // zero-argument case is adjusted so functions accepting arguments see exactly what they
+        // did before.
+        if (maxArgs == 0 && args.size() == 1 && args.get(0).isEmpty()) {
+            args = args.subList(0, 0);
+        }
+        checkArgument(args.size() <= maxArgs,
+            "too many arguments for function '%s' (max=%s)", name, maxArgs);
+        return checkNotNull(apply(args),
+            "named functions must never return null: function=%s", name);
+    }
+
+    /** Returns the name of this function as given when it was created. */
+    public String getName() {
+        return name;
+    }
+
+    /** Applies the underlying function directly, without checking the number of arguments. */
+    @Override
+    public String apply(List<String> args) {
+        return fn.apply(args);
+    }
+}
diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/regex/RegexTransformer.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/regex/RegexTransformer.java

new file mode 100644 (file)

index 0000000..10d4c36
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/regex/RegexTransformer.java
@@ -0,0 +1,173 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.regex;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkNotNull;
+import static com.google.common.collect.ImmutableList.toImmutableList;
+import static com.google.common.collect.ImmutableListMultimap.toImmutableListMultimap;
+import static com.google.common.collect.ImmutableSetMultimap.toImmutableSetMultimap;
+import static java.util.function.Function.identity;
+
+import java.io.PrintWriter;
+import java.io.StringWriter;
+import java.util.Arrays;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Optional;
+import java.util.Set;
+import java.util.function.BiFunction;
+import java.util.function.Function;
+import java.util.regex.Pattern;
+
+import org.unicode.cldr.api.CldrDataType;
+import org.unicode.cldr.api.CldrPath;
+import org.unicode.cldr.api.CldrValue;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableListMultimap;
+import com.google.common.collect.ImmutableSetMultimap;
+import org.unicode.icu.tool.cldrtoicu.PathValueTransformer;
+import org.unicode.icu.tool.cldrtoicu.RbPath;
+
+/**
+ * Path/value transformer configured by {@code ldml2icu_xxx.txt} mapping and configuration files.
+ * See {@code ldml2icu_readme.txt} for details on the configuration file format and
+ * {@link PathValueTransformer} for the public API description and usage.
+ *
+ * <p>This class is thread safe.
+ */
+// TODO: Rewrite the readme to match current behaviour and describe edge cases properly.
+public final class RegexTransformer extends PathValueTransformer {
+    /**
+     * Returns a new transformer based on transformation rules defined in the given configuration
+     * file contents, and using the specified functions for resolving ICU values.
+     */
+    public static PathValueTransformer fromConfigLines(
+        List<String> lines, NamedFunction... functions) {
+        return new RegexTransformer(RuleParser.parseConfig(lines, Arrays.asList(functions)));
+    }
+
+    // Map of path prefixes grouped by DTD type (for early efficient filtering of paths).
+    private final ImmutableSetMultimap<CldrDataType, String> prefixMap;
+    // Transformation rules loading from the configuration file, grouped by path prefix.
+    private final ImmutableListMultimap<String, Rule> rulesMap;
+    // Functions which can generate a fallback value from a given resource bundle path.
+    private final ImmutableList<BiFunction<RbPath, DynamicVars, Optional<Result>>> fallbackFunctions;
+    // Records the total set of rules, removing them as they are matched. Used for reporting any
+    // unused rules for debugging purposes.
+    private final Set<Rule> unusedRules = new LinkedHashSet<>();
+
+    private RegexTransformer(List<Rule> rules) {
+        this.prefixMap =
+            rules.stream().collect(toImmutableSetMultimap(Rule::getDataType, Rule::getPathPrefix));
+        this.rulesMap =
+            rules.stream().collect(toImmutableListMultimap(Rule::getPathPrefix, identity()));
+        this.fallbackFunctions =
+            rules.stream().flatMap(Rule::getFallbackFunctions).collect(toImmutableList());
+        // Add all rules first and remove as they are matched.
+        this.unusedRules.addAll(rules);
+    }
+
+    @Override
+    public ImmutableList<Result> transform(CldrValue value) {
+        return transform(value, p -> null);
+    }
+
+    @Override
+    public ImmutableList<Result> transform(CldrValue value, DynamicVars varLookupFn) {
+        // This early rejection of non-matching paths, combined with "bucketing" the rules by path
+        // path prefix for easy lookup dramatically reduces the transformation time.
+        String pathPrefix = getPathPrefix(value);
+        if (!prefixMap.get(value.getDataType()).contains(pathPrefix)) {
+            return ImmutableList.of();
+        }
+        // Even though this is just derived from the value, resolve it here and pass it into each
+        // rule to avoid recalculating the same thing every time.
+        String fullXPath = getFullXPathWithoutSortIndices(value);
+        // Bucketing the rules by the path prefix means that each incoming value is only tested
+        // against likely matches. This reduces the number of tests per value by about 10x.
+        for (Rule rule : rulesMap.get(pathPrefix)) {
+            // We break after the first matching rule, since there is an implicit assumption
+            // that no paths will match more than one rule.
+            // TODO: Add a debug mode that checks that only one rule matches any given CLDR path.
+            ImmutableList<Result> results = rule.transform(value, fullXPath, varLookupFn);
+            if (!results.isEmpty()) {
+                unusedRules.remove(rule);
+                return results;
+            }
+        }
+        return ImmutableList.of();
+    }
+
+    // All "leaf" paths must have at least two elements, so we can find the "prefix" which is
+    // the first element after the DTD root. This corresponds to the value extracted via
+    // PATH_SPEC_PREFIX in the parser.
+    private static String getPathPrefix(CldrValue value) {
+        CldrPath prefix = value.getPath();
+        checkArgument(prefix.getLength() >= 2, "unexpectedly short path: %s", prefix);
+        while (prefix.getLength() > 2) {
+            prefix = prefix.getParent();
+        }
+        return prefix.getName();
+    }
+
+    // A regex to capture any sort-indices in the full path string (which must be removed).
+    private static final Pattern SORT_INDEX = Pattern.compile("(/\\w+)#[0-9]+");
+
+    // Note that the full path we get here contains the "sort index" suffix for ORDERED
+    // elements. This means that some element names are "foo#N" where N is the sort index.
+    // Since the regex transformer works around "ordered elements" in a completely different
+    // way and doesn't have them in the regular expressions, we can just remove them.
+    private static String getFullXPathWithoutSortIndices(CldrValue v) {
+        String fullPath = v.getFullPath();
+        for (CldrPath p = v.getPath(); p != null; p = p.getParent()) {
+            if (p.getSortIndex() != -1) {
+                // Only do expensive regex stuff if there's an "ordered" element with a sort index.
+                return SORT_INDEX.matcher(fullPath).replaceAll("$1");
+            }
+        }
+        // No path parts have a sort index, so the original full path string is safe to return.
+        return fullPath;
+    }
+
+    @Override
+    public ImmutableList<Result> getFallbackResultsFor(RbPath rbPath, DynamicVars varLookupFn) {
+        return fallbackFunctions.stream()
+            .map(f -> f.apply(rbPath, varLookupFn))
+            .filter(Optional::isPresent)
+            .map(Optional::get)
+            .collect(toImmutableList());
+    }
+
+    @Override public String toString() {
+        StringWriter buf = new StringWriter();
+        PrintWriter out = new PrintWriter(buf);
+        out.println(getClass().getName() + "{");
+        out.println("  Rules: " + rulesMap.size());
+        if (!unusedRules.isEmpty()) {
+            out.println("  Unused Rules:");
+            unusedRules.forEach(
+                r -> out.format("    [line=%3d] %s\n", r.getLineNumber(), r.getXpathSpec()));
+        }
+        out.println('}');
+        out.flush();
+        return buf.toString();
+    }
+
+    // Package use helper for substituting single-character place-holders like '$N' or '%X'.
+    static String substitute(String s, char token, Function<Character, String> replaceFn) {
+        if (s.indexOf(token) == -1) {
+            return s;
+        }
+        StringBuilder out = new StringBuilder();
+        int i = 0;
+        for (int j = s.indexOf(token); j != -1; i = j + 2, j = s.indexOf(token, i)) {
+            char varChar = s.charAt(j + 1);
+            String replacement =
+                checkNotNull(replaceFn.apply(varChar), "no such variable %s%s", token, varChar);
+            out.append(s, i, j).append(replacement);
+        }
+        return out.append(s.substring(i)).toString();
+    }
+}
diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/regex/ResultSpec.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/regex/ResultSpec.java

new file mode 100644 (file)

index 0000000..7c1467e
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/regex/ResultSpec.java
@@ -0,0 +1,632 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.regex;
+
+import static com.google.common.base.CharMatcher.whitespace;
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkElementIndex;
+import static com.google.common.base.Preconditions.checkNotNull;
+import static com.google.common.base.Preconditions.checkState;
+import static com.google.common.collect.ImmutableList.toImmutableList;
+import static java.util.Comparator.comparing;
+import static java.util.Comparator.nullsLast;
+import static org.unicode.cldr.api.CldrPath.parseDistinguishingPath;
+
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.function.BiFunction;
+import java.util.function.Function;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Stream;
+
+import org.unicode.cldr.api.CldrPath;
+import org.unicode.cldr.api.CldrValue;
+
+import com.google.common.base.Splitter;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Lists;
+import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.DynamicVars;
+import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.Result;
+import org.unicode.icu.tool.cldrtoicu.RbPath;
+
+/**
+ * A specification for building a result from the arguments in a matched xpath. Results always
+ * hold a reference to their originating specification to allow them to be ordered in the same
+ * order as the corresponding specifications in the configuration file.
+ */
+final class ResultSpec {
+    // Subtle ordering for results to ensure "config file order" for things in the same
+    // resource bundle while being "friendly" towards a global ordering. This is NOT consistent
+    // with equals if duplicate results exist.
+    //
+    // This is ESSENTIAL for correct grouping and ordering within resource bundles.
+    //
+    // In normal use this is expected only to be used to reorder results within a resource
+    // bundle (i.e. those sharing the same resource bundle path "key"). Resource bundles
+    // themselves can just be managed in "visitation order" or similar.
+    //
+    // Ordering priority is:
+    // 1: Result key (resource bundle):     Groups results by resource bundle.
+    // 2: Result specification line number: Orders resource bundle contents by "file order".
+    // 3: Result distinguishing xpath:      Tie breaking if duplicates are not yet removed.
+    //
+    // Note that the currently uses the String representation of the resource bundle path (key)
+    // as the primary order to match legacy behaviour. However it would be better to use the
+    // natural lexicographical RbPath order (the difference relates to having '/' as the
+    // separator in the string representation of the path). The string form of a path is a bad
+    // choice because some paths can contain a literal '/', which makes ordering problematic in
+    // rare case. However changing this will have the effect of reodering path elements, which
+    // while it should be safe, must be done with caution.
+    // TODO: Fix this to use RbPath ordering and NOT the String representation
+    private static final Comparator<AbstractResult> RESULT_ORDERING =
+        Comparator.<AbstractResult, String>comparing(r -> r.getKey().toString())
+            .thenComparing(r -> r.getSpec().lineNumber)
+            .thenComparing(nullsLast(comparing(r -> r.getPath().orElse(null))));
+
+    // Splitter for any values (either in CLDR data or results specifications). The only time
+    // values are split differently is when quoting exists in the "values" instruction.
+    private static final Splitter VALUE_SPLITTER = Splitter.on(whitespace()).omitEmptyStrings();
+
+    // Matcher for "&foo_bar(a,b,c)" which captures function name and complete argument list.
+    private static final Pattern FUNCTION = Pattern.compile("\\&(\\w++)\\(([^\\)]++)\\)");
+
+    // Resource bundle path specification with placeholders (e.g. "/foo/$1/bar") exactly as it
+    // appears in the configuration file.
+    private final String rbPathSpec;
+
+    // Declared instructions with which to generate result values (see Instruction).
+    private final ImmutableMap<Instruction, VarString> instructions;
+
+    // The index of the xpath argument whose value should be split to create multiple results.
+    // This mechanism is used when an xpath attribute is a space separated list of values and
+    // one result should be created for each value (e.g. [@territories="AA BB CC"] where you
+    // want a resource bundle for each region code: "foo/AA/bar", "foo/BB/bar", "foo/CC/bar").
+    // At most one argument is ever split (corresponding to the first unquoted placeholder in
+    // the resource bundle path specification).
+    private final int splitArgIndex;
+
+    // The line number of the result specification in the file which defines the ordering of
+    // results within a resource bundle. This needn't be a line number, but must be unique for
+    // each specification.
+    private final int lineNumber;
+
+    // The named functions available to the parser. Ideally the rules and result specifications
+    // would be an inner class of some kind of context/environment and just share this.
+    private final ImmutableMap<String, NamedFunction> icuFunctions;
+
+    // The map of dynamic variables (looked up from CldrPaths when a rule is resolved).
+    private final Function<Character, CldrPath> dynamicVarFn;
+
+    ResultSpec(
+        String rbPathSpec,
+        Map<Instruction, VarString> instructions,
+        int lineNumber,
+        Map<String, NamedFunction> icuFunctions,
+        Function<Character, CldrPath> dynamicVarFn) {
+        this.rbPathSpec = checkNotNull(rbPathSpec);
+        this.instructions = ImmutableMap.copyOf(instructions);  // Immutable snapshot.
+        this.splitArgIndex = getSplitArgIndex(rbPathSpec);  // -1 means "no split" in transform().
+        this.lineNumber = lineNumber;
+        this.icuFunctions = ImmutableMap.copyOf(icuFunctions);  // Immutable snapshot.
+        this.dynamicVarFn = checkNotNull(dynamicVarFn);
+    }
+
+    /**
+     * Transforms a path/value into a sequence of results. The given matcher has successfully
+     * matched the path and contains the captured arguments corresponding to $1..$N in the
+     * various result specification strings.
+     */
+    Stream<Result> transform(CldrValue value, Matcher m, DynamicVars varLookupFn) {
+        // Discard group(0) since that's always the full xpath that was matched, and we don't
+        // need that any more (so "$N" is args.get(N - 1)).
+        List<String> args = new ArrayList<>();
+        for (int i = 1; i <= m.groupCount(); i++) {
+            // A group which did not participate in the match is null, so fail early and clearly.
+            args.add(checkNotNull(m.group(i),
+                "captured regex arguments must always be present\n"
+                    + "(use an non-capturing groups for optional arguments): %s", m.pattern()));
+        }
+
+        // The first unquoted argument in any resource bundle path declaration, is defined as
+        // being "splittable". Typically this happens if the value of the captured xpath
+        // argument is expected to be a list of items.
+        //
+        // In this case, we generate one result for each individual argument, replacing the
+        // appropriate captured list with each split value in turn. Thus with original arguments:
+        //   ["foo", "bar baz", "quux"]
+        // where splitArgIndex == 1, we get two results using the argument lists:
+        //   ["foo", "bar", "quux"]
+        //   ["foo", "baz", "quux"]
+        //
+        // Note also that since the splittability of the arguments is technically defined
+        // by the resource bundle path specification (not the xpath regular expression) it
+        // could differ per ResultSpec instance (but currently never does).
+        if (splitArgIndex != -1) {
+            List<String> splitArgs = VALUE_SPLITTER.splitToList(args.get(splitArgIndex));
+            // Only bother if there was more than one argument there anyway.
+            if (splitArgs.size() > 1) {
+                return splitArgs.stream().map(a -> {
+                    // Copy per value: the lazy stream must not mutate shared state.
+                    List<String> argsForValue = new ArrayList<>(args);
+                    argsForValue.set(splitArgIndex, a);
+                    return matchedResult(value, argsForValue, varLookupFn);
+                });
+            }
+        }
+        // No splittable argument, or a splittable argument with only one value.
+        return Stream.of(matchedResult(value, args, varLookupFn));
+    }
+
+    // Simple helper to make results.
+    private Result matchedResult(CldrValue value, List<String> args, DynamicVars varLookupFn) {
+        // Resolved in the same order as the constructor arguments: key, values, source path.
+        RbPath key = getRbPath(args);
+        ImmutableList<String> values = getValues(value.getValue(), args, varLookupFn);
+        CldrPath path = getResultPath(value.getPath(), args, varLookupFn);
+        return new MatchedResult(key, values, path);
+    }
+
+    // Resource bundle paths are a bit special (unsurprisingly). The captured arguments can
+    // contain '/' and will extend the path structure. Thus "foo/$1/bar" might end up as
+    // "foo/x/y/bar" after argument substitution.
+    //
+    // However (a hack for timezone "metazone" paths) if the argument placeholder is quoted
+    // (e.g. "foo/"$1"/bar") then '/' in arguments is replaced by ':' and quotes are retained
+    // (e.g. "foo/"x:y"/bar").
+    // TODO: Replace hard coded hack here with an explicit function in the config file.
+    private RbPath getRbPath(List<String> args) {
+        // Telling whether quotes in a resource bundle path specification actually surround a
+        // placeholder would need more careful parsing. Quotes are currently only used in a
+        // small number of cases, and only for this purpose, so the presence of any quote in
+        // the path specification is taken to trigger the '/' to ':' replacement.
+        boolean hasQuotes = rbPathSpec.indexOf('"') != -1;
+        // The transforming list is a lazy view, which avoids char replacement in arguments
+        // that don't appear in the resource bundle path.
+        List<String> pathArgs =
+            hasQuotes ? Lists.transform(args, arg -> arg.replace('/', ':')) : args;
+        String resolvedPath = substituteArgs(rbPathSpec, pathArgs);
+        return RbPath.parse(resolvedPath);
+    }
+
+    // Create an array of output values according to the CLDR value (if present) and the
+    // "values" instruction in the result specification (if present). Any functions present in
+    // the "values" instruction are invoked here.
+    private ImmutableList<String> getValues(
+        String value, List<String> args, DynamicVars varLookupFn) {
+        VarString valuesSpec = instructions.get(Instruction.VALUES);
+        if (valuesSpec == null) {
+            // No "values" instruction, so just use the _unsplit_ CLDR value. To split a CLDR
+            // value use "values={value}" in the result specification.
+            return ImmutableList.of(value);
+        }
+        // The "values" instruction is not expected to have any dynamic %N variables in it,
+        // since those only represent CLDR path mappings, which should not be directly present
+        // in the ICU data. Hence the valuesSpec should have been fully resolved by the static
+        // variables applied earlier, so get() is expected to simply return the final String.
+        String resolved = valuesSpec.get();
+
+        // First substitute the $N arguments in since they need to be passed to the
+        // functions.
+        //
+        // WARNING: This doesn't strictly work, since an argument or function result could
+        // (in theory) contain the string "{value}" which would then be substituted in an
+        // unexpected way. The better way to do this is with a single pass which handles
+        // arguments, function calling and the special "{value}" token together. This comes
+        // down to the fact that the mapping file syntax doesn't have a well defined concept
+        // of escaping or invocation order.
+        // TODO: Fix this, possibly by rewriting the whole transformer "language" to be consistent.
+        resolved = substituteArgs(resolved, args);
+
+        Matcher m = FUNCTION.matcher(resolved);
+        if (m.find()) {
+            StringBuilder buffer = new StringBuilder();
+            int index = 0;
+            do {
+                // Append up to the start of the function call.
+                buffer.append(resolved, index, m.start());
+
+                // Replace '{value}' here so functions can be called with the CLDR value as well
+                // as captured path arguments. We also have to replace it below, which is all a bit
+                // dodgy if a function ever returned '{value}'.
+                NamedFunction fn = icuFunctions.get(m.group(1));
+                checkArgument(fn != null, "no such function: %s", m.group(1));
+                buffer.append(fn.call(m.group(2).replace("{value}", value)));
+                index = m.end();
+            } while (m.find());
+            resolved = buffer.append(resolved.substring(index)).toString();
+        }
+        // Having done function invocation, we handle the special "{value}" token and split
+        // the value (taking quoting into account).
+        return splitValues(resolved.replace("{value}", value));
+    }
+
+    // IMPORTANT: The path of a result is either:
+    // * The original distinguishing path
+    // * The specified "base_xpath" (which must also be a distinguishing xpath).
+    // and this is used as part of the equality semantics (which are very subtle).
+    //
+    // The existence of "base_xpath" is a hack to get around the fact that xpaths can only be
+    // matched in full, rather than by a prefix. For some cases this means that the "same"
+    // result will be created many times by potentially different distinguishing xpaths,
+    // perhaps even via different result specifications. "base_xpath" exists as a hack to give
+    // these duplicate results the same "fake" xpath, so deduplication can occur.
+    private CldrPath getResultPath(CldrPath path, List<String> args, DynamicVars varLookupFn) {
+        VarString baseSpec = instructions.get(Instruction.BASE_XPATH);
+        if (baseSpec != null) {
+            String resolved = baseSpec.apply(dynamicVarFn.andThen(varLookupFn)).get();
+            return parseDistinguishingPath(substituteArgs(resolved, args));
+        }
+        return path;
+    }
+
+    /**
+     * Returns a fallback function if this specification has the "fallback=" instruction.
+     * The function takes a resolved resource bundle path and returns the possible fallback
+     * values for it. Note that currently fallback values do not support either quoting or
+     * grouping (but they easily could).
+     */
+    Optional<BiFunction<RbPath, DynamicVars, Optional<Result>>> getFallbackFunction() {
+        VarString declaredSpec = instructions.get(Instruction.FALLBACK);
+        if (declaredSpec == null) {
+            return Optional.empty();
+        }
+        // This is the only place where any hacking of regular expressions occurs. The fallback
+        // function must only return a value if the given resolved resource bundle path could
+        // have been a match for the path specification.
+        //
+        // To avoid ambiguity for paths such as "foo/$1/$2/bar" and "foo/$1/bar" (which should
+        // not both be matched) '/' is explicitly disallowed in argument values. In theory
+        // this is problematic, since '/' should be an allowed character, but the issues caused
+        // by ambiguous matching are worse.
+        // TODO: Fix/replace all of this fallback mess with something cleaner.
+        Pattern rbPathMatcher = getRbPathMatcher(rbPathSpec);
+
+        // Another, frankly terrifying, bit of hackery to support fallback specifications with
+        // $N argument substitution (this currently only happens once, but must be supported).
+        // Just another reason to want to replace the current fallback mechanism.
+        // A separate local is used so that it is effectively final for the lambda below.
+        VarString rewrittenSpec = maybeRewriteFallbackSpec(declaredSpec);
+
+        BiFunction<RbPath, DynamicVars, Optional<Result>> fallbackFn =
+            (p, varFn) -> getFallbackResult(p, varFn, rbPathMatcher, rewrittenSpec);
+        return Optional.of(fallbackFn);
+    }
+
+    private Optional<Result> getFallbackResult(
+        RbPath rbPath, DynamicVars varFn, Pattern rbPathMatcher, VarString fallbackSpec) {
+        // Check if the given rbPath could be associated with this fallback (most are not).
+        Matcher matcher = rbPathMatcher.matcher(rbPath.toString());
+        if (!matcher.matches()) {
+            return Optional.empty();
+        }
+        // Expect that once any dynamic variables are provided to the fallback specification,
+        // we can get the resolved fallback specification (potentially with $N placeholders to
+        // be filled in from the resource bundle path).
+        String specStr = fallbackSpec.apply(dynamicVarFn.andThen(varFn)).get();
+        if (matcher.groupCount() > 0) {
+            specStr = substituteArgs(specStr, n -> matcher.group(n + 1), matcher.groupCount());
+        }
+
+        // Split the fallback value _without_ considering quoting. This matches the original
+        // behaviour but could cause all sorts of subtle issues if values contained quotes.
+        // TODO: Rework transformation rules to make quoting behaviour deterministic.
+        Iterable<String> values =
+            VALUE_SPLITTER.splitToList(specStr).stream()
+                // Values starting with "//" are parsed as CLDR paths and resolved via varFn.
+                .map(v -> v.startsWith("//") ? varFn.apply(parseDistinguishingPath(v)) : v)
+                .collect(toImmutableList());
+        return Optional.of(new FallbackResult(rbPath, values));
+    }
+
+    // WARNING: Another very hacky behaviour (used exactly once) is that "$N" argument
+    // substitutions are allowed in fallback values. This is highly problematic:
+    // since the fallback value must be synthesized only from the resource bundle path,
+    // there's no way for this substitution to handle:
+    // 1: multi-valued list arguments
+    // 2: arguments that didn't appear in the resource bundle path
+    // 3: dynamic path variables (e.g. %D=//some/path)
+    //
+    // An example would be something like a resource bundle specification of:
+    //   /Baz/$2/$1
+    // and a fallback value of:
+    //   Foo$1/Bar$2
+    //
+    // Here the order of substitution is not maintained and the original path specification
+    // has values that are not naturally ordered (or possibly even duplicated). The pattern
+    // we calculate from the resource bundle path specification will match/capture groups in
+    // "natural order" (i.e. "/Baz/(...)/(...)") so we have to rewrite the order of the
+    // placeholders in the fallback specification to match (e.g. "Foo$2/Bar$1").
+    // TODO: Figure out a way to remove all of this extreme complexity.
+    private VarString maybeRewriteFallbackSpec(
+        VarString fallbackSpec) {
+        Optional<String> fallback = fallbackSpec.resolve();
+        // If the fallback string is not present, it's because the VarString still has
+        // unresolved "dynamic" variables for late binding. This is okay, but should not
+        // be mixed with argument substitution.
+        if (!fallback.isPresent() || !fallback.get().contains("$")) {
+            return fallbackSpec;
+        }
+        // After the quick rejection check for '$', do a proper search for $N variables (since
+        // '$' is permitted as a literal if not followed by a digit).
+        Matcher fallbackMatcher = ARG_PLACEHOLDER.matcher(fallback.get());
+        if (!fallbackMatcher.find()) {
+            return fallbackSpec;
+        }
+
+        // Fallback spec has $N in it, triggering super hacky behaviour.
+        Matcher pathMatcher = ARG_PLACEHOLDER.matcher(rbPathSpec);
+        checkState(pathMatcher.find(),
+            "$N arguments in fallback must be present in the resource bundle path: %s",
+            rbPathSpec);
+        // Explicit group characters ("1"..."9") in the order they appear in the
+        // resource bundle path. There can be duplicates (e.g. "/Foo/$1/Bar$1").
+        List<Character> groupIds = new ArrayList<>();
+        do {
+            groupIds.add(pathMatcher.group().charAt(1));
+        } while (pathMatcher.find());
+
+        // Special check to avoid a horrible bug if we ever had more than 9 placeholders
+        // (counting duplicates, and essentially impossible with current data). If it did
+        // happen, the returned index below could be >= 9 and we would get "$X", where 'X'
+        // was not a numeric value.
+        checkState(groupIds.size() < 10,
+            "too many placeholders in resource bundle path: %s", rbPathSpec);
+
+        // Now find each placeholder in the fallback specification string and map it to
+        // the equivalent index for the path matcher we just created.
+        StringBuilder rewrittenFallbackSpec = new StringBuilder(fallback.get());
+        do {
+            int placeholderPos = fallbackMatcher.start() + 1;
+            // The new ID is the index of the first matching placeholder, offset by '1'.
+            char placeholderDigit = rewrittenFallbackSpec.charAt(placeholderPos);
+            int newPlaceholderIndex = groupIds.indexOf(placeholderDigit);
+            checkState(newPlaceholderIndex != -1,
+                "fallback values may only contain arguments from the resource bundle path: %s",
+                fallback.get());
+            rewrittenFallbackSpec.setCharAt(placeholderPos, (char)('1' + newPlaceholderIndex));
+        } while (fallbackMatcher.find());
+        return VarString.of(rewrittenFallbackSpec.toString());
+    }
+
+    /** Base class of matched and fallback results, defining their equality and ordering. */
+    private abstract class AbstractResult extends Result {
+        // Split and resolved values for this result (see also "isGrouped()").
+        private final ImmutableList<String> values;
+
+        // The "source" CLDR path of a matched result (omitted if this is a fallback result).
+        // Note that this is the resolved "base_xpath" if it was specified in the instructions.
+        private final Optional<CldrPath> basePath;
+
+        // Calculated eagerly since we always expect results to need to be deduplicated.
+        private final int hashCode;
+
+        AbstractResult(RbPath key, Iterable<String> values, Optional<CldrPath> path) {
+            super(key);
+            this.values = ImmutableList.copyOf(values);
+            this.basePath = checkNotNull(path);
+            // Same attributes as equals(). NOTE(review): isGrouped() runs before subclass init.
+            this.hashCode = Objects.hash(getKey(), getPath(), isGrouped(), getValues());
+        }
+
+        // Returns the specification from which this result was obtained. This is essential for
+        // correct ordering and determining fallback values, but is not directly used for
+        // determining result equality (since duplicate results can be generated by different
+        // specifications).
+        final ResultSpec getSpec() {
+            return ResultSpec.this;
+        }
+
+        final Optional<CldrPath> getPath() {
+            return basePath;
+        }
+
+        final boolean wasMatched() {
+            // Only MatchedResult counts as matched (this could also be a boolean field).
+            return this instanceof MatchedResult;
+        }
+
+        @Override
+        public final ImmutableList<String> getValues() {
+            return values;
+        }
+
+        @Override
+        public final int compareTo(Result other) {
+            checkArgument(other instanceof AbstractResult,
+                "unknown result type: %s", other.getClass());
+            return RESULT_ORDERING.compare(this, (AbstractResult) other);
+        }
+
+        @Override
+        public final int hashCode() {
+            return hashCode;
+        }
+
+        // Equality semantics of results is ESSENTIAL for correct behaviour, especially the
+        // deduplication of results. See also "getSpec()", "getPath()", and RESULT_ORDERING.
+        @Override
+        public final boolean equals(Object obj) {
+            // Different subclasses are never equal, so test class directly (not instanceof).
+            if (obj == null || !getClass().equals(obj.getClass())) {
+                return false;
+            }
+            AbstractResult other = (AbstractResult) obj;
+            // DO NOT test the result specifier here. Equal results can be generated from
+            // different result specifications (if "base_xpath" was used).
+            return getKey().equals(other.getKey())
+                && getPath().equals(other.getPath())
+                && isGrouped() == other.isGrouped()
+                // Alternatively assert that values are equal if everything else is.
+                && getValues().equals(other.getValues());
+        }
+    }
+
+    // Result created for an explicit path match using captured arguments.
+    private final class MatchedResult extends AbstractResult {
+        MatchedResult(RbPath key, Iterable<String> values, CldrPath path) {
+            super(key, values, Optional.of(path));
+        }
+
+        @Override
+        public boolean isGrouped() {
+            // We don't need to use the "group" value at all and it can be removed from the
+            // configuration file at some point.
+            return instructions.containsKey(Instruction.GROUP);
+        }
+
+        @Override
+        public boolean isFallbackFor(Result r) {
+            // Matched results are never a fallback for anything.
+            return false;
+        }
+    }
+
+    // Result created to hold possible fallback values for a specified resource bundle path.
+    private final class FallbackResult extends AbstractResult {
+        FallbackResult(RbPath rbPath, Iterable<String> values) {
+            super(rbPath, values, Optional.empty());
+        }
+
+        // Delete this method and move the other one into AbstractResult if we decide to allow
+        // grouping for fallback values (it's not clear if it's a good idea).
+        @Override
+        public boolean isGrouped() {
+            return false;
+        }
+
+        @Override
+        public boolean isFallbackFor(Result r) {
+            // We are a fallback if we came from the same specification as a matched result.
+            // To prevent duplication of fallback results, we also return true if the result we
+            // are "equal()" to the given result (equivalent fallback results can come from
+            // different input paths).
+            checkArgument(r instanceof AbstractResult, "unsupported result type: %s", r);
+            AbstractResult result = (AbstractResult) r;
+            return result.wasMatched() ? getSpec().equals(result.getSpec()) : equals(result);
+        }
+    }
+
+    // ==== Static helper functions ====
+
+    // Matches any "$N" placeholder without capturing.
+    private static final Pattern ARG_PLACEHOLDER = Pattern.compile("\\$[1-9]");
+
+    // Turn "$N" into a capturing groups.
+    //
+    // Note that this code currently assumes that each "$N" placeholder matches a single path
+    // segment (i.e. the captured values cannot contain '/'). This is an artificial restriction
+    // since resource bundle paths can have quoting in, so we could detect quoted placeholders
+    // and allow any characters. However at the moment this isn't an issue, and none of the
+    // "$N" placeholders in the paths expects to match anything with '/' in.
+    //
+    // TODO: Fix this to handle quoted placeholders (e.g. "$N" or <$N>) properly.
+    private static Pattern getRbPathMatcher(String rbPathSpec) {
+        // An RbPath instance's toString() does not have a leading '/' on it, so well have to
+        // account for that here (or we could just remove the leading '/' from paths in the
+        // config file...
+        if (rbPathSpec.startsWith("/")) {
+            rbPathSpec = rbPathSpec.substring(1);
+        }
+        // Protect potential regex meta-characters in the original resource bundle path. Using
+        // '\Q' and '\E' to mark quotation boundaries is the safest way to do this, but that
+        // means we also need to handle '\E' in the original string (incredibly unlikely but it
+        // would be super hard to debug if it ever happened).
+        // TODO: If resource paths cannot contain literal '\' or '$', add checks and simplify.
+        String regex = "\\Q" + rbPathSpec.replace("\\E", "\\E\\E\\Q") + "\\E";
+
+        // Remember that you could get "$1$2" here and the regex groups that replace them will
+        // abut. Use reluctant matching (i.e. "+?") to avoid any backtracking in this case.
+        // We assume that the substituted arguments contained at least one character, and so we
+        // capture at least one character per group here.
+        regex = ARG_PLACEHOLDER.matcher(regex).replaceAll("\\\\E([^/]+?)\\\\Q");
+        return Pattern.compile(regex);
+    }
+
+    // Convenience overload which takes the "$N" argument values from a list.
+    private static String substituteArgs(String spec, List<String> args) {
+        int argCount = args.size();
+        return substituteArgs(spec, args::get, argCount);
+    }
+
+    // Replaces each "$N" placeholder (N = 1...9) in the given specification with the value
+    // obtained from a zero-indexed lookup function, so "$N" is replaced by args(N - 1). The
+    // index is range checked against "size" before the function is invoked.
+    private static String substituteArgs(String spec, Function<Integer, String> args, int size) {
+        return RegexTransformer.substitute(spec, '$', c -> {
+            int index = checkElementIndex(c - '1', size, "argument index");
+            return args.apply(index);
+        });
+    }
+
+    // Matches arguments with or without enclosing quotes.
+    private static final Pattern ARGUMENT = Pattern.compile("[<\"]?\\$(\\d)[\">]?");
+
+    // Logic mostly copied from original RegexManager class. Finds first unquoted $N (N=1..9)
+    // and returns N-1 (or -1 if no match). We do not permit $0 to appear even though it is
+    // captured by the regex because it's just the entire path.
+    private static int getSplitArgIndex(String rbPath) {
+        // Captures a $N placeholder, but might catch surrounding quoting as well.
+        Matcher matcher = ARGUMENT.matcher(rbPath);
+        while (matcher.find()) {
+            char startChar = rbPath.charAt(matcher.start());
+            char endChar = rbPath.charAt(matcher.end() - 1);
+            // Splitting occurs for the first unquoted placeholder, so ignore <$1> and "$N".
+            // Q: Why two different "quoting" schemes?
+            // A: It's complex and relates the something called "hidden labels".
+            boolean shouldSplit = !((startChar == '"' && endChar == '"') ||
+                (startChar == '<' && endChar == '>'));
+            if (shouldSplit) {
+                // Allowed "$N" argument placeholders go from $1 to $9 ($0 is disallowed) and
+                // arguments are zero-indexed, so we expect an index from 0 to 8.
+                int groupNumber = Integer.parseInt(matcher.group(1));
+                checkArgument(groupNumber >= 1 && groupNumber <= 9,
+                    "invalid split argument: %s", groupNumber);
+                return groupNumber - 1;
+            }
+        }
+        return -1;
+    }
+
+    // Splits a value which may contain double-quoted regions, in which \" stands for a literal
+    // quote. Text outside quotes is split using VALUE_SPLITTER, while each quoted region
+    // yields exactly one value.
+    //
+    // This is a bit dubious since \\ is neither detected nor unescaped, so a quoted string
+    // cannot end in a single '\' (e.g. "$1" where the expansion of $1 has a trailing '\').
+    // It is also impossible to have a value which should be split but which contains '"'.
+    //
+    // This mimics the original RegexManager behaviour, in which spaces and quotes in
+    // substituted values are _not_ escaped.
+    private static ImmutableList<String> splitValues(String value) {
+        int openQuote = nextBareQuoteIndex(value, 0);
+        if (openQuote == -1) {
+            // No quoted regions at all, so the entire value can be split in one go.
+            return ImmutableList.copyOf(VALUE_SPLITTER.split(value));
+        }
+        ImmutableList.Builder<String> parts = ImmutableList.builder();
+        int unquotedStart = 0;
+        while (openQuote != -1) {
+            // First split whatever unquoted text precedes the opening quote...
+            String unquoted = value.substring(unquotedStart, openQuote);
+            parts.addAll(VALUE_SPLITTER.split(unquoted));
+            // ... then take the quoted region as a single value, unescaping any '"' within it.
+            int closeQuote = nextBareQuoteIndex(value, openQuote + 1);
+            checkArgument(closeQuote != -1, "mismatched quotes in splittable value: %s", value);
+            String quoted = value.substring(openQuote + 1, closeQuote);
+            parts.add(quoted.replace("\\\"", "\""));
+            unquotedStart = closeQuote + 1;
+            openQuote = nextBareQuoteIndex(value, unquotedStart);
+        }
+        // Finally split any unquoted text left after the last closing quote.
+        parts.addAll(VALUE_SPLITTER.split(value.substring(unquotedStart)));
+        return parts.build();
+    }
+
+    // Returns the index of the first '"' character at or after index i which is not directly
+    // preceded by a '\' (i.e. the next unescaped quote), or -1 if no such character exists.
+    private static int nextBareQuoteIndex(String s, int i) {
+        i = s.indexOf('"', i);
+        // A quote at index 0 has no preceding character, so it must be "bare". Otherwise skip
+        // over every quote which is escaped by a preceding '\'.
+        while (i > 0 && s.charAt(i - 1) == '\\') {
+            // Resume the search at the next quote character. Searching for the next '\' here
+            // would return the position of a backslash (or miss later quotes entirely).
+            i = s.indexOf('"', i + 1);
+        }
+        return i;
+    }
+}
diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/regex/Rule.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/regex/Rule.java

new file mode 100644 (file)

index 0000000..7f51188
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/regex/Rule.java
@@ -0,0 +1,180 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.regex;
+
+import static com.google.common.base.Preconditions.checkNotNull;
+import static com.google.common.collect.ImmutableList.toImmutableList;
+
+import java.util.Optional;
+import java.util.function.BiFunction;
+import java.util.function.Function;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Stream;
+
+import org.unicode.cldr.api.CldrDataType;
+import org.unicode.cldr.api.CldrPath;
+import org.unicode.cldr.api.CldrValue;
+
+import com.google.common.collect.ImmutableList;
+import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.DynamicVars;
+import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.Result;
+import org.unicode.icu.tool.cldrtoicu.RbPath;
+
+/*
+ * Each rule corresponds to a single target xpath specification in the configuration file
+ * (lines starting //) but may have more than one result specification. For example:
+ *
+ * //supplementalData/languageData/language[@type="(%W)"][@scripts="(%W)"][@territories="(%W)"]
+ *      ; /languageData/$1/primary/scripts ; values=$2
+ *      ; /languageData/$1/primary/territories; values=$3
+ *
+ * is represented by a single rule with two result specifications.
+ */
+abstract class Rule {
+    /** Returns a rule for which all '%X' arguments have been resolved (almost all cases). */
+    static Rule staticRule(
+        CldrDataType dtdType,
+        String prefix,
+        Iterable<ResultSpec> specs,
+        String pathRegex,
+        String xpathSpec,
+        int lineNumber) {
+
+        return new StaticRule(dtdType, prefix, specs, pathRegex, xpathSpec, lineNumber);
+    }
+
+    /** Returns a rule for which some '%X' arguments are unresolved until matching occurs. */
+    static Rule dynamicRule(
+        CldrDataType dtdType,
+        String pathRegex,
+        Iterable<ResultSpec> specs,
+        VarString varString,
+        Function<Character, CldrPath> varFn,
+        String xpathSpec,
+        int lineNumber) {
+
+        return new DynamicRule(dtdType, pathRegex, specs, varString, varFn, xpathSpec, lineNumber);
+    }
+
+    // Type of CLDR path which can match this rule.
+    private final CldrDataType dtdType;
+    // The first path element below the root, used to do fast rejection of non-matching paths
+    // and to "bucket" rules by their prefix to speed up matching.
+    private final String pathPrefix;
+    // One or more result specifications to be processed for matching CLDR paths/values.
+    private final ImmutableList<ResultSpec> resultSpecs;
+    // Debug information only to help determine unused rules.
+    private final String xpathSpec;
+    private final int lineNumber;
+
+    private Rule(
+        CldrDataType dtdType,
+        String pathPrefix,
+        Iterable<ResultSpec> resultSpecs,
+        String xpathSpec,
+        int lineNumber) {
+
+        this.dtdType = checkNotNull(dtdType);
+        this.pathPrefix = checkNotNull(pathPrefix);
+        this.resultSpecs = ImmutableList.copyOf(resultSpecs);
+        this.xpathSpec = checkNotNull(xpathSpec);
+        this.lineNumber = lineNumber;
+    }
+
+    /** Returns the CLDR DTD type of the path that the rule can match. */
+    final CldrDataType getDataType() {
+        return dtdType;
+    }
+
+    /** Returns the name of the first path element below the path root. */
+    final String getPathPrefix() {
+        return pathPrefix;
+    }
+
+    /** Returns the regular expression against which CLDR path strings are matched. */
+    abstract Pattern getPathPattern(DynamicVars varLookupFn);
+
+    /**
+     * Attempts to match the incoming xpath and (if successful) use captured arguments to
+     * generate one result for each result specification.
+     */
+    final ImmutableList<Result> transform(CldrValue v, String fullXPath, DynamicVars varFn) {
+        Matcher m = getPathPattern(varFn).matcher(fullXPath);
+        return m.matches()
+            ? resultSpecs.stream()
+                .flatMap(r -> r.transform(v, m, varFn))
+                .collect(toImmutableList())
+            : ImmutableList.of();
+    }
+
+    /**
+     * Returns any fallback functions defined in results specifications. These are used to
+     * determine the set of possible fallback values for a given resource bundle path.
+     */
+    final Stream<BiFunction<RbPath, DynamicVars, Optional<Result>>> getFallbackFunctions() {
+        return resultSpecs.stream()
+            .map(ResultSpec::getFallbackFunction)
+            .filter(Optional::isPresent)
+            .map(Optional::get);
+    }
+
+    // Debugging only
+    final String getXpathSpec() {
+        return xpathSpec;
+    }
+
+    // Debugging only
+    final int getLineNumber() {
+        return lineNumber;
+    }
+
+    private static final class StaticRule extends Rule {
+        // The processed xpath specification yielding an xpath matching regular expression. This is
+        // only suitable for matching incoming xpaths and cannot be processed in any other way.
+        private final Pattern xpathPattern;
+
+        StaticRule(
+            CldrDataType dtdType,
+            String prefix,
+            Iterable<ResultSpec> specs,
+            String pathRegex,
+            String xpathSpec,
+            int lineNumber) {
+
+            super(dtdType, prefix, specs, xpathSpec, lineNumber);
+            this.xpathPattern = Pattern.compile(pathRegex);
+        }
+
+        @Override
+        Pattern getPathPattern(DynamicVars varLookupFn) {
+            return xpathPattern;
+        }
+    }
+
+    private static final class DynamicRule extends Rule {
+        // The processed xpath specification yielding an xpath matching regular expression. This is
+        // only suitable for matching incoming xpaths and cannot be processed in any other way.
+        private final VarString varString;
+        private final Function<Character, CldrPath> dynamicVarFn;
+
+        DynamicRule(
+            CldrDataType dtdType,
+            String prefix,
+            Iterable<ResultSpec> specs,
+            VarString varString,
+            Function<Character, CldrPath> varFn,
+            String xpathSpec,
+            int lineNumber) {
+
+            super(dtdType, prefix, specs, xpathSpec, lineNumber);
+            this.varString = checkNotNull(varString);
+            this.dynamicVarFn = checkNotNull(varFn);
+        }
+
+        @Override Pattern getPathPattern(DynamicVars varLookupFn) {
+            String pathRegex = varString.apply(dynamicVarFn.andThen(varLookupFn)).get();
+            return Pattern.compile(pathRegex);
+        }
+    }
+}
diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/regex/RuleParser.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/regex/RuleParser.java

new file mode 100644 (file)

index 0000000..01fb46c
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/regex/RuleParser.java
@@ -0,0 +1,152 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.regex;
+
+import static com.google.common.base.CharMatcher.whitespace;
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.collect.ImmutableMap.toImmutableMap;
+import static com.google.common.collect.Maps.filterValues;
+import static com.google.common.collect.Maps.transformValues;
+import static java.util.function.Function.identity;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Optional;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.unicode.cldr.api.CldrDataType;
+import org.unicode.cldr.api.CldrPath;
+
+import com.google.common.base.Splitter;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.escape.CharEscaperBuilder;
+import com.google.common.escape.Escaper;
+
+/** Parser for rule specifications in the regex transformer configuration files. */
+final class RuleParser {
+    // Pattern to capture first two path elements (for the dtd type and path prefix).
+    private static final Pattern PATH_SPEC_PREFIX = Pattern.compile("//([^/]+)/([^/]+)/");
+
+    // Preprocessing replaces %X variables defined in the configuration file. This helps to
+    // keep the path specification a bit easier to read.
+    private static final Pattern VAR = Pattern.compile("^%([A-Z])=(.*)$");
+
+    // Multi-line rules start with " ; " for some optional amount of whitespace.
+    private static final Pattern RULE_PARTS_SEPERATOR = Pattern.compile("\\s*+;\\s*+");
+
+    // Splitter for the resource bundle / value declarations.
+    private static final Splitter RULE_PARTS_SPLITTER =
+        Splitter.on(RULE_PARTS_SEPERATOR).trimResults(whitespace()).omitEmptyStrings();
+
+    // Splitter for instruction name/expressions.
+    private static final Splitter INSTRUCTION_SPLITTER =
+        Splitter.on('=').trimResults(whitespace()).limit(2);
+
+    // Only '[',']' need escaping in path specifications (so we can write "foo{@bar="baz"]").
+    private static final Escaper SPECIAL_CHARS_ESCAPER =
+        new CharEscaperBuilder().addEscape('[', "\\[").addEscape(']', "\\]").toEscaper();
+
+    /** Parses a configuration file to create a sequence of transformation rules. */
+    static ImmutableList<Rule> parseConfig(
+        List<String> configLines, List<NamedFunction> functions) {
+        // Extract '%X' variable declarations in the first pass.
+        ImmutableMap<Character, String> varMap = configLines.stream()
+            .filter(s -> s.startsWith("%"))
+            .map(VAR::matcher)
+            .peek(m -> checkArgument(m.matches(), "invalid argument declaration: %s", m))
+            .collect(ImmutableMap.toImmutableMap(m -> m.group(1).charAt(0), m -> m.group(2)));
+        return new RuleParser(varMap, functions).parseLines(configLines);
+    }
+
+    private final ImmutableMap<Character, String> staticVarMap;
+    private final ImmutableMap<Character, CldrPath> dynamicVarMap;
+    private final ImmutableMap<String, NamedFunction> fnMap;
+
+    private RuleParser(ImmutableMap<Character, String> varMap, List<NamedFunction> functions) {
+        this.staticVarMap = ImmutableMap.copyOf(filterValues(varMap, s -> !s.startsWith("//")));
+        this.dynamicVarMap = ImmutableMap.copyOf(
+            transformValues(
+                filterValues(varMap, s -> s.startsWith("//")),
+                CldrPath::parseDistinguishingPath));
+        this.fnMap =
+            functions.stream().collect(toImmutableMap(NamedFunction::getName, identity()));
+    }
+
+    private ImmutableList<Rule> parseLines(List<String> configLines) {
+        List<Rule> rules = new ArrayList<>();
+        for (int lineIndex = 0; lineIndex < configLines.size(); lineIndex++) {
+            String line = configLines.get(lineIndex);
+            try {
+                if (line.startsWith("//")) {
+                    // Either it's "//xpath ; resource-bundle-path ; values"
+                    // Or "//xpath" with " ; resource-bundle-path ; values" on subsequent lines.
+                    int ruleLineNumber = lineIndex + 1;
+                    int xpathEnd = line.indexOf(";");
+                    String xpath;
+                    List<ResultSpec> specs = new ArrayList<>();
+                    if (xpathEnd != -1) {
+                        // Single line rule, extract result specification from trailing part.
+                        xpath = whitespace().trimFrom(line.substring(0, xpathEnd));
+                        // Keep leading " ; " in the transformation string since it matches the
+                        // multi-rule case and is handled the same.
+                        specs.add(parseResultSpec(line.substring(xpathEnd), lineIndex + 1));
+                    } else {
+                        xpath = line;
+                        while (++lineIndex < configLines.size()
+                            && RULE_PARTS_SEPERATOR.matcher(configLines.get(lineIndex)).lookingAt()) {
+                            specs.add(parseResultSpec(configLines.get(lineIndex), lineIndex + 1));
+                        }
+                        // The loop above moved us past the last line of the rule, so readjust.
+                        lineIndex--;
+                    }
+                    rules.add(parseRule(xpath, specs, ruleLineNumber));
+                }
+            } catch (Exception e) {
+                throw new RuntimeException(
+                    String.format("parse error at line %d: %s", lineIndex + 1, line), e);
+            }
+        }
+        return ImmutableList.copyOf(rules);
+    }
+
+    private ResultSpec parseResultSpec(String spec, int lineNumber) {
+        // The result specifier still has leading separator (e.g. " ; /foo/bar/$1 ; value=$2"),
+        // but that's okay because the splitter ignores empty results.
+        List<String> rbPathAndInstructions = RULE_PARTS_SPLITTER.splitToList(spec);
+        String rbPathSpec = rbPathAndInstructions.get(0);
+
+        ImmutableMap<Instruction, VarString> instructions =
+            rbPathAndInstructions.stream()
+                .skip(1)
+                .map(INSTRUCTION_SPLITTER::splitToList)
+                .collect(toImmutableMap(
+                    p -> Instruction.forId(p.get(0)),
+                    p -> VarString.of(p.size() > 1 ? p.get(1) : "", staticVarMap::get)));
+        return new ResultSpec(rbPathSpec, instructions, lineNumber, fnMap, dynamicVarMap::get);
+    }
+
+    private Rule parseRule(String xpathSpec, List<ResultSpec> resultSpecs, int lineNumber) {
+        // The escaped path is nearly a regular expression, but still contains '%X' variables.
+        String escapedPathSpec = SPECIAL_CHARS_ESCAPER.escape(xpathSpec);
+        Matcher m = PATH_SPEC_PREFIX.matcher(escapedPathSpec);
+        checkArgument(m.lookingAt(), "unexpected path spec: %s", escapedPathSpec);
+
+        // Extract type a path prefix for rule grouping and fast rejection during matching.
+        CldrDataType dtdType = CldrDataType.forXmlName(m.group(1));
+        String pathPrefix = m.group(2);
+
+        // If the variable string contains a "dynamic" argument, is cannot be resolved yet and
+        // must result in a "dynamic" rule being created here (this is very rare though).
+        VarString varString = VarString.of(escapedPathSpec, staticVarMap::get);
+        Optional<String> resolved = varString.resolve();
+        // Don't turn this into a "map().orElse()" chain (despite what your IDE might suggest)
+        // because we don't want to create lots of unused dynamic rules!
+        return resolved.isPresent()
+            ? Rule.staticRule(
+                dtdType, pathPrefix, resultSpecs, resolved.get(), xpathSpec, lineNumber)
+            : Rule.dynamicRule(
+                dtdType, pathPrefix, resultSpecs, varString, dynamicVarMap::get, xpathSpec, lineNumber);
+    }
+}
diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/regex/VarString.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/regex/VarString.java

new file mode 100644 (file)

index 0000000..1df3599
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/regex/VarString.java
@@ -0,0 +1,90 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.regex;
+
+import static com.google.common.base.Preconditions.checkNotNull;
+import static com.google.common.base.Preconditions.checkState;
+
+import java.util.Optional;
+import java.util.function.Function;
+
+import com.google.common.base.CharMatcher;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.ImmutableSet;
+
+/**
+ * An immutable representation of a String with placeholders for variable substitution. A
+ * VarString can be "resolved" or "partially resolved" by providing a mapping from placeholder
+ * characters to strings, and any remaining unresolved variables are tracked. This is a very
+ * private bit of implementation detail with a far from ideal API, so it's probably best not to
+ * use it elsewhere without careful thought.
+ */
+final class VarString {
+    private static final CharMatcher VAR_CHAR = CharMatcher.inRange('A', 'Z');
+
+    static VarString of(String varString) {
+        ImmutableSet.Builder<Character> requiredChars = ImmutableSet.builder();
+        // Variable placeholders are any % followed by upper-case ASCII letter (A-Z).
+        // Other '%' chars are ignored.
+        for (int i = 0; i < varString.length() - 1; i++) {
+            if (varString.charAt(i) == '%') {
+                char c = varString.charAt(i + 1);
+                if (VAR_CHAR.matches(c)) {
+                    requiredChars.add(c);
+                }
+            }
+        }
+        return new VarString(varString, requiredChars.build(), ImmutableMap.of());
+    }
+
+    static VarString of(String s, Function<Character, String> varFn) {
+        return of(s).apply(varFn);
+    }
+
+    private final String varString;
+    private final ImmutableSet<Character> requiredChars;
+    private final ImmutableMap<Character, String> varMap;
+
+    private VarString(
+        String varString,
+        ImmutableSet<Character> requiredChars,
+        ImmutableMap<Character, String> varMap) {
+        this.varString = checkNotNull(varString);
+        this.requiredChars = checkNotNull(requiredChars);
+        this.varMap = checkNotNull(varMap);
+    }
+
+    /** Applies a variable function to produce a new, potentially resolved, VarString. */
+    VarString apply(Function<Character, String> varFn) {
+        ImmutableMap.Builder<Character, String> newVarMap = ImmutableMap.builder();
+        newVarMap.putAll(this.varMap);
+        for (Character c : requiredChars) {
+            if (!varMap.containsKey(c)) {
+                // Allowed to return null if the function cannot resolve a variable.
+                String v = varFn.apply(c);
+                if (v != null) {
+                    newVarMap.put(c, v);
+                }
+            }
+        }
+        return new VarString(varString, requiredChars, newVarMap.build());
+    }
+
+    /** Returns a resolved value if all variables are available for substitution. */
+    Optional<String> resolve() {
+        return varMap.keySet().equals(requiredChars)
+            ? Optional.of(
+                RegexTransformer.substitute(varString, '%', c -> varMap.getOrDefault(c, "%" + c)))
+            : Optional.empty();
+    }
+
+    /** Returns the resolved value or fails if not all variables are available. */
+    String get() {
+        checkState(varMap.keySet().equals(requiredChars), "unresolved variable string: %s", this);
+        return RegexTransformer.substitute(varString, '%', c -> varMap.getOrDefault(c, "%" + c));
+    }
+
+    @Override public String toString() {
+        return varString + ": " + varMap;
+    }
+}
diff --git a/tools/cldr/cldr-to-icu/src/main/resources/ldml2icu_header.txt b/tools/cldr/cldr-to-icu/src/main/resources/ldml2icu_header.txt

new file mode 100644 (file)

index 0000000..a9bd38f
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/main/resources/ldml2icu_header.txt
@@ -0,0 +1,2 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+\ No newline at end of file
diff --git a/tools/cldr/cldr-to-icu/src/main/resources/ldml2icu_locale.txt b/tools/cldr/cldr-to-icu/src/main/resources/ldml2icu_locale.txt

new file mode 100644 (file)

index 0000000..db42103
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/main/resources/ldml2icu_locale.txt
@@ -0,0 +1,350 @@
+# ldml2icu_locale.txt
+#
+# © 2016 and later: Unicode, Inc. and others.
+#
+# CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
+# For terms of use, see http://www.unicode.org/copyright.html
+#
+# Used by LdmlLocaleMapper.
+# Data-driven file for mapping LDML locale paths to ICU paths.
+# See ldml2icu_readme.txt for a detailed explanation of this file.
+
+# Variables
+# Attribute value
+%A=[^"']++
+# Word
+%W=[\w\-]++
+# Greedy word match
+%G=[\w\-]+
+# Number match
+%N=\d++
+# The default numbering system to be used.
+%D=//ldml/numbers/defaultNumberingSystem
+
+# Main locale data
+
+# Aliases
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/alias[@source="locale"][@path="../calendar[@type='(%A)']"]
+     ; /calendar/$1:alias ; values=/LOCALE/calendar/$2
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/alias[@source="locale"][@path="../../calendar[@type='(%A)']/dayPeriods"]
+     ; /calendar/$1/AmPmMarkers:alias ; values=/LOCALE/calendar/$2/AmPmMarkers
+     ; /calendar/$1/AmPmMarkersNarrow:alias ; values=/LOCALE/calendar/$2/AmPmMarkersNarrow
+     ; /calendar/$1/NoonMarker:alias ; values=/LOCALE/calendar/$2/NoonMarker
+     ; /calendar/$1/NoonMarkerNarrow:alias ; values=/LOCALE/calendar/$2/NoonMarkerNarrow
+
+//ldml/dates/calendars/calendar[@type="gregorian"]/dayPeriods/dayPeriodContext[@type="format"]/dayPeriodWidth[@type="wide"]/alias[@source="locale"][@path="../dayPeriodWidth[@type='abbreviated']"]
+     ; /calendar/gregorian/AmPmMarkers:alias ; values=/LOCALE/calendar/gregorian/AmPmMarkersAbbr
+//ldml/dates/calendars/calendar[@type="gregorian"]/dayPeriods/dayPeriodContext[@type="format"]/dayPeriodWidth[@type="narrow"]/alias[@source="locale"][@path="../dayPeriodWidth[@type='abbreviated']"]
+     ; /calendar/gregorian/AmPmMarkersNarrow:alias ; values=/LOCALE/calendar/gregorian/AmPmMarkersAbbr
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/(eras|quarters|cyclicNameSets|monthPatterns)/alias[@source="locale"][@path="../../%W[@type='(%A)']/%W"]
+     ; /calendar/$1/$2:alias ; values=/LOCALE/calendar/$3/$2
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/eras/eraNarrow/alias[@source="locale"][@path="../eraAbbr"]
+     ; /calendar/$1/eras/narrow:alias ; values=/LOCALE/calendar/$1/eras/abbreviated
+//ldml/dates/calendars/calendar[@type="(%A)"]/eras/eraNames/alias[@source="locale"][@path="../eraAbbr"]
+     ; /calendar/$1/eras/wide:alias ; values=/LOCALE/calendar/$1/eras/abbreviated
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/(cyclicNameSet)s/\2[@type="(%W)"]/alias[@source="locale"][@path="../\2[@type='(%A)']"]
+     ; /calendar/$1/$2s/$3:alias ; values=/LOCALE/calendar/$1/$2s/$4
+//ldml/dates/calendars/calendar[@type="(%A)"]/(cyclicNameSet)s/\2[@type="(%W)"]/(cyclicName)Context[@type="(%W)"]/\4Width[@type="(%W)"]/alias[@source="locale"][@path="../../../\4Set[@type='(%A)']/\4Context[@type='(%A)']/\4Width[@type='(%A)']"]
+     ; /calendar/$1/$2s/$3/$5/$6:alias ; values=/LOCALE/calendar/$1/$2s/$7/$8/$9
+//ldml/dates/calendars/calendar[@type="(%A)"]/(cyclicNameSet)s/\2[@type="(%W)"]/(cyclicName)Context[@type="(%W)"]/\4Width[@type="(%W)"]/alias[@source="locale"][@path="../\4Width[@type='(%A)']"]
+     ; /calendar/$1/$2s/$3/$5/$6:alias ; values=/LOCALE/calendar/$1/$2s/$3/$5/$7
+//ldml/dates/calendars/calendar[@type="(%A)"]/(cyclicNameSet|monthPattern|quarter)s/\2Context[@type="(%W)"]/\2Width[@type="(%W)"]/alias[@source="locale"][@path="../../\2Context[@type='(%W)']/\2Width[@type='(%A)']"]
+     ; /calendar/$1/$2s/$3/$4:alias    ; values=/LOCALE/calendar/$1/$2s/$5/$6
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/(day|month)s/alias[@source="locale"][@path="../../%W[@type='(%A)']/%W"]
+     ; /calendar/$1/$2Names:alias ; values=/LOCALE/calendar/$3/$2Names
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/(day|month)s/\2Context[@type="(%W)"]/\2Width[@type="(%W)"]/alias[@source="locale"][@path="../\2Width[@type='(%A)']"]
+     ; /calendar/$1/$2Names/$3/$4:alias ; values=/LOCALE/calendar/$1/$2Names/$3/$5
+//ldml/dates/calendars/calendar[@type="(%A)"]/(day|month)s/\2Context[@type="(%W)"]/\2Width[@type="(%W)"]/alias[@source="locale"][@path="../../\2Context[@type='(%W)']/\2Width[@type='(%A)']"]
+     ; /calendar/$1/$2Names/$3/$4:alias ; values=/LOCALE/calendar/$1/$2Names/$5/$6
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/(monthPattern|quarter)s/\2Context[@type="(%W)"]/\2Width[@type="(%W)"]/alias[@source="locale"][@path="../\2Width[@type='(%A)']"]
+     ; /calendar/$1/$2s/$3/$4:alias ; values=/LOCALE/calendar/$1/$2s/$3/$5
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/dateFormats/alias[@source="locale"][@path="../../calendar[@type='(%A)']/dateFormats"]
+     ; /calendar/$1/DateTimePatterns:alias ; values=/LOCALE/calendar/$2/DateTimePatterns
+//ldml/dates/calendars/calendar[@type="(%A)"]/dateTimeFormats/alias[@source="locale"][@path="../../calendar[@type='(%A)']/dateTimeFormats"]
+     ; /calendar/$1/availableFormats:alias ; values=/LOCALE/calendar/$2/availableFormats
+     ; /calendar/$1/appendItems:alias      ; values=/LOCALE/calendar/$2/appendItems
+     ; /calendar/$1/intervalFormats:alias  ; values=/LOCALE/calendar/$2/intervalFormats
+//ldml/dates/calendars/calendar[@type="(%A)"]/dateTimeFormats/(availableFormats|appendItems|intervalFormats)/alias[@source="locale"][@path="../../../calendar[@type='(%A)']/dateTimeFormats/\2"]
+     ; /calendar/$1/$2:alias ; values=/LOCALE/calendar/$3/$2
+
+//ldml/units/unitLength[@type="long"]/alias[@source="locale"][@path="../unitLength[@type='short']"]
+     ; /units:alias ; values=/LOCALE/unitsShort
+//ldml/units/unitLength[@type="narrow"]/alias[@source="locale"][@path="../unitLength[@type='short']"]
+     ; /unitsNarrow:alias ; values=/LOCALE/unitsShort
+
+//ldml/listPatterns/listPattern[@type="(%A)"]/alias[@source="locale"][@path="../listPattern"]
+     ; /listPattern/$1/start:alias  ; values=/LOCALE/listPattern/standard/start
+     ; /listPattern/$1/middle:alias ; values=/LOCALE/listPattern/standard/middle
+     ; /listPattern/$1/end:alias    ; values=/LOCALE/listPattern/standard/end
+     ; /listPattern/$1/2:alias      ; values=/LOCALE/listPattern/standard/2
+//ldml/listPatterns/listPattern[@type="(%A)"]/alias[@source="locale"][@path="../listPattern[@type='(%A)']"]
+     ; /listPattern/$1/start:alias  ; values=/LOCALE/listPattern/$2/start
+     ; /listPattern/$1/middle:alias ; values=/LOCALE/listPattern/$2/middle
+     ; /listPattern/$1/end:alias    ; values=/LOCALE/listPattern/$2/end
+     ; /listPattern/$1/2:alias      ; values=/LOCALE/listPattern/$2/2
+
+//ldml/numbers/currencyFormats[@numberSystem="(%A)"]/currencyFormatLength/currencyFormat[@type="accounting"]/alias[@source="locale"][@path="../(%W)[@type='standard']"] ; /NumberElements/$1/patterns/accountingFormat:alias ; values=/LOCALE/NumberElements/$1/patterns/$2
+
+# Characters
+
+//ldml/characters/exemplarCharacters[@type="auxiliary"]      ; /AuxExemplarCharacters
+//ldml/characters/exemplarCharacters[@type="currencySymbol"] ; /ExemplarCharactersCurrency
+//ldml/characters/exemplarCharacters[@type="index"]          ; /ExemplarCharactersIndex
+//ldml/characters/exemplarCharacters[@type="punctuation"]    ; /ExemplarCharactersPunctuation
+//ldml/characters/exemplarCharacters[@type="numbers"]        ; /ExemplarCharactersNumbers
+//ldml/characters/exemplarCharacters                         ; /ExemplarCharacters
+
+//ldml/characters/ellipsis[@type="(%A)"]                     ; /Ellipsis/$1
+//ldml/characters/moreInformation                            ; /MoreInformation
+//ldml/characters/special/icu:scripts/icu:script[@type="%N"] ; /LocaleScript
+
+//ldml/characters/parseLenients[@scope="(%A)"][@level="(%A)"]/parseLenient[@sample="%A"] ; /parse/$1/$2
+
+# Defaults
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/(monthPattern)s/\2Context[@type="(%W)"]/\2Width[@type="(%W)"]/\2[@type="(%W)"]
+     ; /calendar/$1/$2s/$3/$4/$5
+
+# Dates
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/cyclicNameSets/cyclicNameSet[@type="(%A)"]/cyclicNameContext[@type="(%A)"]/cyclicNameWidth[@type="(%A)"]/cyclicName[@type="(%A)"]
+     ; /calendar/$1/cyclicNameSets/$2/$3/$4 ;
+
+# ---- /calendar/xxx/DateTimePatterns
+# Rules are split to force manual ordering within the array produced by them (they share the same output path).
+#
+# Note that (like many other places) the uncaptured "type" attributes are just expected to be "standard", and the %A
+# variable is only used to save a bit of space. The final output array has 3 groups ("time" -> "date" -> "date-time")
+# each with 4 elements based on the pattern length ("full" -> "long" -> "medium" -> "short") giving 12 patterns in
+# total.
+#
+# However due to an awful hack, there end up being 13 values in the array, with the medium date-time value being
+# duplicated at index 8. However this hack is done later, because the regex transformer does not permit the same
+# CLDR path to emit values in different places in an array.
+
+# Time patterns (4 x values)
+//ldml/dates/calendars/calendar[@type="(%A)"]/(timeFormat)s/\2Length[@type="(%A)"]/\2[@type="%A"]/pattern[@type="%A"]
+     ; /calendar/$1/DateTimePatterns
+
+# Date patterns (4 x values)
+#
+# This is a weird edge case. When the number attribute is present in the xpath, its value needs to be grouped
+# together with the xpath value in its own special array, which is treated like just another value in
+# /DateTimePatterns. The group keyword is used here to specify that values from the same xpath should be grouped
+# into their own separate array. Since each possible pattern length can have patterns with and without the number
+# attribute, we must explicitly split the rules to enforce correct output order.
+#
+# So far (Jan 2014), this only happens in the Chinese calendar for ja/zh/zh_Hant and the Hebrew calendar for he,
+# and all calendars for haw (which has numbers="M=romanlow").
+//ldml/dates/calendars/calendar[@type="(%A)"]/(dateFormat)s/\2Length[@type="(full)"]/\2[@type="%A"]/pattern[@type="%A"]
+     ; /calendar/$1/DateTimePatterns
+//ldml/dates/calendars/calendar[@type="(%A)"]/(dateFormat)s/\2Length[@type="(full)"]/\2[@type="%A"]/pattern[@type="%A"][@numbers="(%A)"]
+     ; /calendar/$1/DateTimePatterns ;  values="{value}" $4 ; group
+//ldml/dates/calendars/calendar[@type="(%A)"]/(dateFormat)s/\2Length[@type="(long)"]/\2[@type="%A"]/pattern[@type="%A"]
+     ; /calendar/$1/DateTimePatterns
+//ldml/dates/calendars/calendar[@type="(%A)"]/(dateFormat)s/\2Length[@type="(long)"]/\2[@type="%A"]/pattern[@type="%A"][@numbers="(%A)"]
+     ; /calendar/$1/DateTimePatterns ;  values="{value}" $4 ; group
+//ldml/dates/calendars/calendar[@type="(%A)"]/(dateFormat)s/\2Length[@type="(medium)"]/\2[@type="%A"]/pattern[@type="%A"]
+     ; /calendar/$1/DateTimePatterns
+//ldml/dates/calendars/calendar[@type="(%A)"]/(dateFormat)s/\2Length[@type="(medium)"]/\2[@type="%A"]/pattern[@type="%A"][@numbers="(%A)"]
+     ; /calendar/$1/DateTimePatterns ;  values="{value}" $4 ; group
+//ldml/dates/calendars/calendar[@type="(%A)"]/(dateFormat)s/\2Length[@type="(short)"]/\2[@type="%A"]/pattern[@type="%A"]
+     ; /calendar/$1/DateTimePatterns
+//ldml/dates/calendars/calendar[@type="(%A)"]/(dateFormat)s/\2Length[@type="(short)"]/\2[@type="%A"]/pattern[@type="%A"][@numbers="(%A)"]
+     ; /calendar/$1/DateTimePatterns ;  values="{value}" $4 ; group
+
+# DateTime patterns (4 x values)
+//ldml/dates/calendars/calendar[@type="(%A)"]/(dateTimeFormat)s/\2Length[@type="(%A)"]/\2[@type="%A"]/pattern[@type="%A"]
+     ; /calendar/$1/DateTimePatterns
+# ----
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/dateTimeFormats/appendItems/appendItem[@request="(%A)"] ; /calendar/$1/appendItems/$2
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/dateTimeFormats/availableFormats/dateFormatItem[@id="(%A)"]                ; /calendar/$1/availableFormats/$2
+//ldml/dates/calendars/calendar[@type="(%A)"]/dateTimeFormats/availableFormats/dateFormatItem[@id="(%A)"][@count="(%A)"] ; /calendar/$1/availableFormats/$2/$3
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/dateTimeFormats/intervalFormats/intervalFormatItem[@id="(%A)"]/greatestDifference[@id="(%A)"] ; /calendar/$1/intervalFormats/$2/$3
+//ldml/dates/calendars/calendar[@type="(%A)"]/dateTimeFormats/intervalFormats/intervalFormatFallback                                        ; /calendar/$1/intervalFormats/fallback
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="format"]/dayPeriodWidth[@type="wide"]/dayPeriod[@type="(am|pm)"][@alt="(%A)"]        ; /calendar/$1/AmPmMarkers%$3
+//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="format"]/dayPeriodWidth[@type="narrow"]/dayPeriod[@type="(am|pm)"][@alt="(%A)"]      ; /calendar/$1/AmPmMarkersNarrow%$3
+//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="format"]/dayPeriodWidth[@type="abbreviated"]/dayPeriod[@type="(am|pm)"][@alt="(%A)"] ; /calendar/$1/AmPmMarkersAbbr%$3
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="format"]/dayPeriodWidth[@type="wide"]/dayPeriod[@type="(am|pm)"]        ; /calendar/$1/AmPmMarkers
+//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="format"]/dayPeriodWidth[@type="abbreviated"]/dayPeriod[@type="(am|pm)"] ; /calendar/$1/AmPmMarkersAbbr
+//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="format"]/dayPeriodWidth[@type="narrow"]/dayPeriod[@type="(am|pm)"]      ; /calendar/$1/AmPmMarkersNarrow
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="(stand-alone)"]/dayPeriodWidth[@type="(%A)"]/dayPeriod[@type="(am|pm)"][@alt="(%A)"] ; /calendar/$1/dayPeriod/$2/$3/$4%$5
+//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="(stand-alone)"]/dayPeriodWidth[@type="(%A)"]/dayPeriod[@type="(am|pm)"]              ; /calendar/$1/dayPeriod/$2/$3/$4
+//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="(%A)"]/dayPeriodWidth[@type="(%A)"]/dayPeriod[@type="(?!am|pm)(%A)"][@alt="(%A)"]    ; /calendar/$1/dayPeriod/$2/$3/$4%$5
+//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="(%A)"]/dayPeriodWidth[@type="(%A)"]/dayPeriod[@type="(?!am|pm)(%A)"]                 ; /calendar/$1/dayPeriod/$2/$3/$4
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/eras/eraNarrow/era[@type="(%A)"][@alt="(%A)"] ; /calendar/$1/eras/narrow%$3
+//ldml/dates/calendars/calendar[@type="(%A)"]/eras/eraAbbr/era[@type="(%A)"][@alt="(%A)"]   ; /calendar/$1/eras/abbreviated%$3
+//ldml/dates/calendars/calendar[@type="(%A)"]/eras/eraNames/era[@type="(%A)"][@alt="(%A)"]  ; /calendar/$1/eras/wide%$3
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/eras/eraNarrow/era[@type="(%A)"] ; /calendar/$1/eras/narrow
+//ldml/dates/calendars/calendar[@type="(%A)"]/eras/eraAbbr/era[@type="(%A)"]   ; /calendar/$1/eras/abbreviated
+//ldml/dates/calendars/calendar[@type="(%A)"]/eras/eraNames/era[@type="(%A)"]  ; /calendar/$1/eras/wide
+
+# Leap year names go after other month names.
+# "yeartype" is an #IMPLIED attribute in the DTD and it should implicitly default to "standard".
+# In practice "standard" is never explicitly given, but it could be (so must match it here).
+//ldml/dates/calendars/calendar[@type="(%A)"]/(day|month)s/%W[@type="(%A)"]/%W[@type="(%A)"]/%W[@type="(%A)"](?:[@yeartype="standard"])? ; /calendar/$1/$2Names/$3/$4
+//ldml/dates/calendars/calendar[@type="(%A)"]/(day|month)s/%W[@type="(%A)"]/%W[@type="(%A)"]/%W[@type="(%A)"][@yeartype="leap"] ; /calendar/$1/$2Names/$3/$4
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/(quarters)/%W[@type="(%A)"]/%W[@type="(%A)"]/%W[@type="%A"] ; /calendar/$1/$2/$3/$4
+
+//ldml/dates/fields/field[@type="(%A)"]/displayName[@alt="(%A)"] ; /fields/$1/dn%$2
+//ldml/dates/fields/field[@type="(%A)"]/displayName ; /fields/$1/dn
+//ldml/dates/fields/field[@type="(%A)"]/relative[@type="(%A)"] ; /fields/$1/relative/"$2"
+//ldml/dates/fields/field[@type="(%A)"]/relativePeriod ; /fields/$1/relativePeriod
+//ldml/dates/fields/field[@type="(%A)"]/relativeTime[@type="(%A)"]/relativeTimePattern[@count="(%A)"] ; /fields/$1/relativeTime/$2/$3
+
+//ldml/dates/fields/field[@type="(%A)"]/alias[@source="locale"][@path="../field[@type='(%A)']"] ; /fields/$1:alias ; values=/LOCALE/fields/$2
+
+//ldml/dates/timeZoneNames/regionFormat[@type="daylight"]   ; /zoneStrings/regionFormatDaylight
+//ldml/dates/timeZoneNames/regionFormat[@type="standard"]   ; /zoneStrings/regionFormatStandard
+//ldml/dates/timeZoneNames/(%GFormat)   ; /zoneStrings/$1
+
+//ldml/dates/timeZoneNames/metazone[@type="(%A)"]/(\w)%W/(\w)%W ; /zoneStrings/"meta:$1"/$2$3
+
+//ldml/dates/timeZoneNames/zone[@type="(%W)/(%W)"]/exemplarCity[@alt="(%A)"] ; /zoneStrings/"$1:$2"/ec%$3
+//ldml/dates/timeZoneNames/zone[@type="(%W)/(%W)"]/exemplarCity ; /zoneStrings/"$1:$2"/ec
+//ldml/dates/timeZoneNames/zone[@type="(%W)/(%W)"]/(\w)%W/(\w)%W ; /zoneStrings/"$1:$2"/$3$4
+//ldml/dates/timeZoneNames/zone[@type="(%W)/(%W)/(%W)"]/exemplarCity[@alt="(%A)"] ; /zoneStrings/"$1:$2:$3"/ec%$4
+//ldml/dates/timeZoneNames/zone[@type="(%W)/(%W)/(%W)"]/exemplarCity ; /zoneStrings/"$1:$2:$3"/ec
+
+# Locale Display Names
+
+//ldml/localeDisplayNames/codePatterns/codePattern[@type="(%A)"] ; /codePatterns/$1
+//ldml/localeDisplayNames/annotationPatterns/annotationPattern[@type="(%A)"] ; /codePatterns/$1
+
+//ldml/localeDisplayNames/keys/key[@type="(%A)"] ; /Keys/$1
+
+//ldml/localeDisplayNames/languages/language[@type="(%A)"][@alt="(%A)"] ; /Languages%$2/$1
+//ldml/localeDisplayNames/languages/language[@type="(%A)"] ; /Languages/$1
+
+//ldml/localeDisplayNames/localeDisplayPattern/localeKeyTypePattern ; /localeDisplayPattern/keyTypePattern
+//ldml/localeDisplayNames/localeDisplayPattern/localePattern ; /localeDisplayPattern/pattern
+//ldml/localeDisplayNames/localeDisplayPattern/localeSeparator ; /localeDisplayPattern/separator
+
+//ldml/localeDisplayNames/measurementSystemNames/measurementSystemName[@type="(%A)"] ; /measurementSystemNames/$1
+
+//ldml/localeDisplayNames/scripts/script[@type="(%A)"][@alt="(%A)"] ; /Scripts%$2/$1
+//ldml/localeDisplayNames/scripts/script[@type="(%A)"] ; /Scripts/$1
+
+//ldml/localeDisplayNames/territories/territory[@type="(%A)"][@alt="(%A)"] ; /Countries%$2/$1
+//ldml/localeDisplayNames/territories/territory[@type="(%A)"] ; /Countries/$1
+
+//ldml/localeDisplayNames/transformNames/transformName[@type="(%W)"] ; /transformNames/$1
+
+//ldml/localeDisplayNames/types/type[@key="(%A)"][@type="(%A)"][@alt="(%A)"] ; /Types%$3/$1/$2
+//ldml/localeDisplayNames/types/type[@key="(%A)"][@type="(%A)"] ; /Types/$1/$2
+
+//ldml/localeDisplayNames/variants/variant[@type="(%A)"][@alt="(%A)"] ; /Variants%$2/$1
+//ldml/localeDisplayNames/variants/variant[@type="(%A)"] ; /Variants/$1
+
+# Numbers
+
+//ldml/numbers/currencies/currency[@type="(%A)"]/displayName[@count="(%A)"] ; /CurrencyPlurals/$1/$2
+
+//ldml/numbers/currencies/currency[@type="(%W)"]/symbol[@alt="(%A)"] ; /Currencies%$2/$1
+
+# ---- /Currencies/XXX bundles
+# Ordering of rules is critical here since they write into the same resource bundle path and the
+# last 3 values are grouped together as a single value (via the special <FIFO> hidden label).
+#
+# Note that the <FIFO> label is needed here (not the "group" instruction) because the grouped
+# values must be seen as having a resource bundle path that is a child of the "/Currencies/$1"
+# path. This is so that the grouped values only appear when one of them is present rather than
+# whenever any of the other values in the main resource bundle path exist.
+#
+# Due to the optional nature of the final sub-array in the bundle, it would be very hard to ever
+# add more elements after it.
+//ldml/numbers/currencies/currency[@type="(%W)"]/symbol
+     ; /Currencies/$1 ; fallback=$1
+//ldml/numbers/currencies/currency[@type="(%W)"]/displayName
+     ; /Currencies/$1 ; fallback=$1
+//ldml/numbers/currencies/currency[@type="(%W)"]/pattern[@type="standard"]
+     ; /Currencies/$1/<FIFO> ; fallback=//ldml/numbers/currencyFormats[@numberSystem="%D"]/currencyFormatLength/currencyFormat[@type="standard"]/pattern[@type="standard"]
+//ldml/numbers/currencies/currency[@type="(%W)"]/decimal
+     ; /Currencies/$1/<FIFO> ; fallback=//ldml/numbers/symbols[@numberSystem="%D"]/decimal
+//ldml/numbers/currencies/currency[@type="(%W)"]/group
+     ; /Currencies/$1/<FIFO> ; fallback=//ldml/numbers/symbols[@numberSystem="%D"]/group
+# ----
+
+//ldml/numbers/currencyFormats[@numberSystem="%D"]/currencySpacing/(%W)/(%W)  ; /currencySpacing/$1/$2
+//ldml/numbers/currencyFormats[@numberSystem="%D"]/unitPattern[@count="(%W)"] ; /CurrencyUnitPatterns/$1
+
+//ldml/numbers/defaultNumberingSystem[@alt="(%A)"] ; /NumberElements/default_$1
+//ldml/numbers/defaultNumberingSystem              ; /NumberElements/default
+//ldml/numbers/minimumGroupingDigits               ; /NumberElements/minimumGroupingDigits
+//ldml/numbers/otherNumberingSystems/(%W)          ; /NumberElements/$1
+
+//ldml/numbers/symbols[@numberSystem="(%A)"]/(%W) ; /NumberElements/$1/symbols/$2
+//ldml/numbers/(%GFormat)s[@numberSystem="(%W)"]/\1Length/\1[@type="standard"]/pattern[@type="standard"] ; /NumberElements/$2/patterns/$1
+//ldml/numbers/currencyFormats[@numberSystem="(%W)"]/currencyFormatLength/currencyFormat[@type="accounting"]/pattern[@type="standard"] ; /NumberElements/$1/patterns/accountingFormat
+//ldml/numbers/currencyFormats[@numberSystem="(%W)"]/currencyFormatLength[@type="short"]/currencyFormat[@type="standard"]/pattern[@type="(%N)"][@count="(%W)"] ; /NumberElements/$1/patternsShort/currencyFormat/$2/$3
+//ldml/numbers/decimalFormats[@numberSystem="(%W)"]/decimalFormatLength[@type="short"]/decimalFormat[@type="standard"]/pattern[@type="(%N)"][@count="(%W)"] ; /NumberElements/$1/patternsShort/decimalFormat/$2/$3
+//ldml/numbers/decimalFormats[@numberSystem="(%W)"]/decimalFormatLength[@type="long"]/decimalFormat[@type="standard"]/pattern[@type="(%N)"][@count="(%W)"] ; /NumberElements/$1/patternsLong/decimalFormat/$2/$3
+
+//ldml/numbers/miscPatterns[@numberSystem="(%W)"]/pattern[@type="(%W)"] ; /NumberElements/$1/miscPatterns/$2
+//ldml/numbers/minimalPairs/ordinalMinimalPairs[@ordinal="(%A)"] ; /NumberElements/minimalPairs/ordinal/$1
+//ldml/numbers/minimalPairs/pluralMinimalPairs[@count="(%A)"] ; /NumberElements/minimalPairs/plural/$1
+
+# Misc
+
+# Ordering of rules is critical here since they write into the same resource bundle path.
+//ldml/contextTransforms/contextTransformUsage[@type="(%W)"]/contextTransform[@type="uiListOrMenu"] ; /contextTransforms/$1:intvector ; values=&context_transform_index({value}) ; fallback=0
+//ldml/contextTransforms/contextTransformUsage[@type="(%W)"]/contextTransform[@type="stand-alone"]  ; /contextTransforms/$1:intvector ; values=&context_transform_index({value}) ; fallback=0
+
+//ldml/delimiters/(%W) ; /delimiters/$1
+
+//ldml/layout/orientation/(%G)Order ; /layout/$1s
+
+//ldml/listPatterns/listPattern/listPatternPart[@type="(%A)"] ; /listPattern/standard/$1
+//ldml/listPatterns/listPattern[@type="(%A)"]/listPatternPart[@type="(%A)"] ; /listPattern/$1/$2
+
+//ldml/units/unitLength[@type="narrow"]/unit[@type="(\w++)-(%A)"]/displayName ; /unitsNarrow/$1/$2/dnam
+//ldml/units/unitLength[@type="short"]/unit[@type="(\w++)-(%A)"]/displayName ; /unitsShort/$1/$2/dnam
+//ldml/units/unitLength[@type="long"]/unit[@type="(\w++)-(%A)"]/displayName ; /units/$1/$2/dnam
+
+//ldml/units/unitLength[@type="narrow"]/unit[@type="(\w++)-(%A)"]/unitPattern[@count="(%A)"] ; /unitsNarrow/$1/$2/$3
+//ldml/units/unitLength[@type="short"]/unit[@type="(\w++)-(%A)"]/unitPattern[@count="(%A)"] ; /unitsShort/$1/$2/$3
+//ldml/units/unitLength[@type="long"]/unit[@type="(\w++)-(%A)"]/unitPattern[@count="(%A)"] ; /units/$1/$2/$3
+
+//ldml/units/unitLength[@type="narrow"]/compoundUnit[@type="(%A)"]/compoundUnitPattern ; /unitsNarrow/compound/$1
+//ldml/units/unitLength[@type="short"]/compoundUnit[@type="(%A)"]/compoundUnitPattern ; /unitsShort/compound/$1
+//ldml/units/unitLength[@type="long"]/compoundUnit[@type="(%A)"]/compoundUnitPattern ; /units/compound/$1
+
+//ldml/units/unitLength[@type="narrow"]/coordinateUnit/displayName ; /unitsNarrow/coordinate/dnam
+//ldml/units/unitLength[@type="short"]/coordinateUnit/displayName ; /unitsShort/coordinate/dnam
+//ldml/units/unitLength[@type="long"]/coordinateUnit/displayName ; /units/coordinate/dnam
+
+//ldml/units/unitLength[@type="narrow"]/coordinateUnit/coordinateUnitPattern[@type="(%A)"] ; /unitsNarrow/coordinate/$1
+//ldml/units/unitLength[@type="short"]/coordinateUnit/coordinateUnitPattern[@type="(%A)"] ; /unitsShort/coordinate/$1
+//ldml/units/unitLength[@type="long"]/coordinateUnit/coordinateUnitPattern[@type="(%A)"] ; /units/coordinate/$1
+
+//ldml/units/unitLength[@type="narrow"]/unit[@type="(\w++)-(%A)"]/perUnitPattern ; /unitsNarrow/$1/$2/per
+//ldml/units/unitLength[@type="short"]/unit[@type="(\w++)-(%A)"]/perUnitPattern ; /unitsShort/$1/$2/per
+//ldml/units/unitLength[@type="long"]/unit[@type="(\w++)-(%A)"]/perUnitPattern ; /units/$1/$2/per
+
+//ldml/units/durationUnit[@type="(%A)"]/durationUnitPattern ; /durationUnits/$1
+
+//ldml/units/unitLength[@type="narrow"]/unit[@type="(\w++)-(%A)"]/alias[@source="locale"][@path="../unit[@type='(\w++)-(%A)']"] ; /unitsNarrow/$1/$2:alias ; values=/LOCALE/unitsNarrow/$3/$4
+//ldml/units/unitLength[@type="short"]/unit[@type="(\w++)-(%A)"]/alias[@source="locale"][@path="../unit[@type='(\w++)-(%A)']"] ; /unitsShort/$1/$2:alias ; values=/LOCALE/unitsShort/$3/$4
+//ldml/units/unitLength[@type="long"]/unit[@type="(\w++)-(%A)"]/alias[@source="locale"][@path="../unit[@type='(\w++)-(%A)']"] ; /units/$1/$2:alias ; values=/LOCALE/units/$3/$4
+
+//ldml/characterLabels/characterLabelPattern[@type="(%A)"][@count="(%A)"] ; /characterLabelPattern/$1/$2
+//ldml/characterLabels/characterLabelPattern[@type="(%A)"] ; /characterLabelPattern/$1 
+//ldml/characterLabels/characterLabel[@type="(%A)"] ; /characterLabel/$1 
diff --git a/tools/cldr/cldr-to-icu/src/main/resources/ldml2icu_readme.txt b/tools/cldr/cldr-to-icu/src/main/resources/ldml2icu_readme.txt

new file mode 100644 (file)

index 0000000..0db6e92
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/main/resources/ldml2icu_readme.txt
@@ -0,0 +1,386 @@
+# README for configuration files used by org.unicode.icu.tool.cldrtoicu.regex.RegexTransformer.
+#
+# © 2019 and later: Unicode, Inc. and others.
+#
+# CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
+# For terms of use, see http://www.unicode.org/copyright.html
+
+======
+Basics
+======
+
+The RegexTransformer class converts CLDR paths and values to ICU Resource Bundle paths
+and values, based on a set of transformation rules typically loaded from a text file
+(e.g. ldml2icu_locale.txt).
+
+The basic format of transformation rules is:
+  <path-specification> ; <resource-bundle-specification> [; <instruction>=<argument>]*
+
+A simple example of a transformation rule is:
+
+  //ldml/localeDisplayNames/keys/key[@type="(%A)"] ; /Keys/$1
+
+which transforms CLDR values whose path matches the path specification, and emits:
+* A resource bundle path "/Keys/xx", where 'xx' is the captured type attribute.
+* A resource bundle value, which is just the CLDR value's base value.
+
+A path specification can be thought of as a regular expression which matches the CLDR
+path and can capture some element names or attribute values; however unlike a regular
+expression, the '[',']' characters are treated as literals, similar to XPath expressions.
+
+If a single CLDR value should produce more than one resource bundle path/value, then
+it should be written:
+
+  <path-specification>
+     ; <resource-bundle-1-specification> [; <instruction> ]*
+     ; <resource-bundle-2-specification> [; <instruction> ]*
+
+=====================
+Argument Substitution
+=====================
+
+Before a rule can be matched, any %-variables must be substituted. These are defined
+in the same configuration file as the rules, and look something like:
+  %W=[\w\-]++
+or:
+  %D=//ldml/numbers/defaultNumberingSystem
+
+The first case can be thought of as just a snippet of regular expression (in this case
+something that matches hyphen separated words) and, importantly, here '[' and ']' are
+treated as regular expression metacharacters. These arguments are static and will be
+substituted exactly as-is into the regular expression to be used for matching.
+
+The second case (used exactly once) is a dynamic argument which references a CLDR value
+in the set of data being transformed. This is simply indicated by the fact that it starts
+with '//'. This path is resolved and the value is substituted just prior to matching.
+
+Variable names are limited to a single upper-case letter (A-Z).
+
+===========================
+Implicit Argument Splitting
+===========================
+
+This is a (somewhat non-obvious) mechanism which allows for a single rule to generate
+multiple results from a single input path when an argument is a list of tokens.
+
+Consider the rule:
+
+//supplementalData/timeData/hours[@allowed="(%W)"][@preferred="(%W)"][@regions="(%W)"]
+  ; /timeData/$3/allowed   ; values=$1
+  ; /timeData/$3/preferred ; values=$2
+
+where the "regions" attribute (which is captured as '$3') contains a whitespace separated
+list of region codes (e.g. "US GB AU NZ"). In this case the rule is applied once for each
+region, producing paths such as "/timeData/US/allowed" or "/timeData/NZ/preferred". Note
+that there is no explicit instruction to do this, it just happens.
+
+The rule is that the first unquoted argument in the resource bundle path is always treated
+as splittable.
+
+To suppress this behaviour, the argument must be quoted (e.g. /timeData/"$3"/allowed). Now,
+if there were another following unquoted argument, that would become implicitly splittable
+(but only one argument is ever splittable).
+
+============
+Instructions
+============
+
+Additional instructions can be supplied to control value transformation and specify fallback
+values. The set of instructions is:
+* values:     The most common instruction which defines how values are transformed.
+* fallback:   Defines a fallback value to be used if this rule was not matched.
+
+There are two other special case instructions which should (if at all possible) not be used,
+and might be removed at some point:
+* group:      Causes values to be grouped as sub-arrays for very specific use cases
+              (prefer using "Hidden Labels" where possible).
+* base_xpath: Allows deduplication of results between multiple different rules (this is a
+              hack to work around limitations in how matching is performed).
+
+-------------------
+values=<expression>
+-------------------
+
+The "values" instruction defines an expression whose evaluated result becomes the output
+resource bundle value(s). Unless quoting is present, this evaluated expression is split
+on whitespace and can become multiple values in the resulting resource bundle.
+
+Examples:
+
+* values=$1 $2 $3
+
+  Produces three separate values in the resource bundle for the first three captured
+  arguments.
+
+* values="$1 $2" $3
+
+  Produces two values in the resource bundle, the first of which is two captured values
+  separated by a space character.
+
+* values={value}
+
+  Substitutes the CLDR value, but then performs whitespace splitting on the result. This
+  differs from the behaviour when no "values" instruction is present (which does not
+  split the results).
+
+* values="{value}" $1
+
+  Produces two values, the first of which is the unsplit CLDR value, and the second is a
+  captured argument.
+
+* values=&func($1, {value})
+
+  Invokes a transformation function, passing in a captured argument and the CLDR value,
+  and the result is then split. The set of functions available to a transformer is
+  configured when it is created.
+
+Note that in the above examples, it is assumed that the $N arguments do not contain spaces.
+If they did, it would result in more output values. To be strict about things, every value
+which should not be split must be quoted (e.g. values="$1" "$2" "$3") but since captured
+values are often IDs or other tokens, this is not what is seen in practice, so it is not
+reflected in these examples.
+
+---------------------
+fallback=<expression>
+---------------------
+
+The fallback instruction provides a way for default values to be emitted for a path that
+was not matched. Fallbacks are useful when several different rules produce values for the
+same resource bundle. In this case the output path produced by one rule can be used as
+the "key" for any unmatched rules with fallback values (to "fill in the gaps").
+
+Consider the two rules which can emit the same resource bundle path:
+
+//ldml/numbers/currencies/currency[@type="(%W)"]/symbol
+     ; /Currencies/$1 ; fallback=$1
+//ldml/numbers/currencies/currency[@type="(%W)"]/displayName
+     ; /Currencies/$1 ; fallback=$1
+
+These rules, if both matched, will produce two values for the same resource bundle path.
+Consider the CLDR values:
+
+//ldml/numbers/currencies/currency[@type="USD"]/symbol      ==> "$"
+//ldml/numbers/currencies/currency[@type="USD"]/displayName ==> "US Dollar"
+
+After matching both of these paths, the values for the resource bundle "/Currencies/USD"
+will be the array { "$", "US Dollar" }.
+
+However, if only one value were present to be converted, the converter could use the
+matched path "/Currencies/XXX" and infer the missing fallback value, ensuring that the
+output array (if it was emitted at all) was always two values.
+
+Note that in order for this to work, the fallback value must be derivable only from the
+matched path. E.g. it cannot contain arguments that are not also present in the matched
+path, and obviously cannot reference the "{value}" at all. Thus the following would not
+be permitted:
+
+//ldml/foo/bar[@type="(%W)"][@region="(%A)"] ; /Foo/$1 ; fallback=$2
+
+However the fallback value can reference existing CLDR or resource bundle paths (expected
+to be present from other rules). For example:
+  fallback=/weekData/001:intvector[0]
+or:
+  fallback=//ldml/numbers/symbols[@numberSystem="%D"]/decimal
+
+The latter case is especially complex because it also uses the "dynamic" argument:
+  %D=//ldml/numbers/defaultNumberingSystem
+
+So determining the resulting value will require:
+1) resolving "//ldml/numbers/defaultNumberingSystem" to, for example, "arab"
+2) looking up the value of "//ldml/numbers/symbols[@numberSystem="arab"]/decimal"
+
+-----------------
+base_xpath=<path>
+-----------------
+
+The base_xpath instruction allows a rule to specify a proxy path which is used in place of
+the originally matched path in the returned result. This is a useful hack for cases where
+values are derived from information in a path prefix.
+
+Because path matching for transformation happens only on full paths, it is possible that
+several distinct CLDR paths might effectively generate the same result if they share the
+same prefix (i.e. paths in the same "sub hierarchy" of the CLDR data).
+
+If this happens, then you end up generating "the same" result from different paths. To
+fix this, a "surrogate" CLDR path can be specified as a proxy for the source path,
+allowing several results to appear to have come from the same source, which results in
+deduplication of the final value.
+
+For example, the two rules:
+
+//supplementalData/territoryInfo/territory[...][@writingPercent="(%N)"][@populationPercent="(%N)"][@officialStatus="(%W)"](?:[@references="%W"])?
+    ; /territoryInfo/$1/territoryF:intvector ; values=&exp($2) &exp($3,-2) &exp($4) ; base_xpath=//supplementalData/territoryInfo/territory[@type="$1"]
+
+//supplementalData/territoryInfo/territory[...][@writingPercent="(%N)"][@populationPercent="(%N)"](?:[@references="%W"])?
+    ; /territoryInfo/$1/territoryF:intvector ; values=&exp($2) &exp($3,-2) &exp($4) ; base_xpath=//supplementalData/territoryInfo/territory[@type="$1"]
+
+Produce the same results for different paths (with or without the "officialStatus"
+attribute) but only one such result is desired. By specifying the same base_xpath on
+both rules, the conversion logic can deduplicate these to produce only one result.
+
+When using base_xpath, it is worth noting that:
+1) Base xpaths must be valid "distinguishing" paths (but are never matched to any rule).
+2) Base xpaths can use arguments to achieve the necessary level of uniqueness.
+3) Rules which share the same base xpath must always produce the same values.
+
+Note however that this is still very much a hack because, since two rules are responsible
+for generating the same result, there is no well defined "line number" to use for ordering
+of values. Thus this mechanism should only be used for rules which produce "single"
+values, and must not be used in cases where the ordering of values in arrays is important.
+
+This mechanism only exists because there is currently no mechanism for partial matching
+or a way to match one path against multiple rules.
+
+-----
+group
+-----
+
+The "group" instruction should be considered a "last resort" hack for controlling value
+grouping, in cases where "hidden labels" are not suitable (see below).
+
+==============================
+Value Arrays and Hidden Labels
+==============================
+
+In the simplest case, one rule produces one or more output path/values per matched CLDR
+value (i.e. one-to-one or one-to-many). If that happens, then output ordering of the
+resource bundle paths is just the natural resource bundle path ordering.
+
+However it is also possible for several rules to produce values for a single output path
+(i.e. many-to-one). When this happens there are some important details about how results
+are grouped and ordered.
+
+------------
+Value Arrays
+------------
+
+If several rules produce results for the same resource bundle path, the values produced
+by the rules are always ordered according to the order of the rules in the configuration
+file (and it is best practice to group any such rules together for clarity).
+
+If each rule produces multiple values, then depending on grouping, those values can either
+be concatenated together in a single array or grouped individually to create an array
+of arrays.
+
+In the example below, there are four rules producing values for the same path:
+
+//.../firstDay[@day="(%W)"][@territories="(%W)"]     ; /weekData/$2:intvector ; values=&day_number($1)
+//.../minDays[@count="(%N)"][@territories="(%W)"]    ; /weekData/$2:intvector ; values=$1
+//.../weekendStart[@day="(%W)"][@territories="(%W)"] ; /weekData/$2:intvector ; values=&day_number($1) 0
+//.../weekendEnd[@day="(%W)"][@territories="(%W)"]   ; /weekData/$2:intvector ; values=&day_number($1) 86400000
+
+The first two rules produce one value each, and the last two produce two values each. This
+results in the resource bundle "/weekData/xxx:intvector" having a single array consisting
+of six values. In the real configuration, these rules also use fallback instructions to
+ensure that the resulting array of values is always six values, even if some CLDR paths are
+not present.
+
+-------------
+Hidden Labels
+-------------
+
+Sometimes rules should produce separate "sub-arrays" of values, rather than having all the
+values appended to a single array. Consider the following path/value pairs:
+
+x/y: a
+x/y: b
+x/y: c
+
+Which produce the resource bundle "x/y" with three values:
+
+x{
+  y{
+    "a",
+    "b",
+    "c"
+  }
+}
+
+Now suppose we want to make a resource bundle where the values are grouped into their
+own sub-array:
+
+x{
+  y{
+    { "a", "b", "c" }
+  }
+}
+
+We can think of this as coming from the path/value pairs:
+
+x/y/-: a
+x/y/-: b
+x/y/-: c
+
+where to represent the sub-array we introduce the idea of an empty path element '-'.
+
+In a transformation rule, these "empty elements" are represented as "hidden labels", and look
+like "<some-label>". They are treated as "normal" path elements for purposes of ordering and
+grouping, but are treated as empty when the paths are written to the ICU data files.
+
+For example the rule:
+
+//.../currencyCodes[@type="(%W)"][@numeric="(%N)"].* ; /codeMappingsCurrency/<$1> ; values=$1 $2
+
+Generates a series of grouped, 2-element sub-arrays split by the captured type attribute.
+
+  codeMappingsCurrency{
+    { type-1, numeric-1 }
+    { type-2, numeric-2 }
+    { type-3, numeric-3 }
+  }
+
+<FIFO> is a special hidden label which is replaced by an incrementing counter when
+sorting paths. It ensures that values in the same array are sorted in the order that they
+were encountered. However this mechanism imposes a strict requirement that the ordering
+of CLDR values to be transformed matches the expected ICU value order, so it should be
+avoided where possible to avoid this implicit, subtle dependency. Note that this mechanism
+is currently only enabled for the transformation of "supplemental data" and may eventually
+be removed.
+
+Hidden labels are a neat solution which permits the generation of sub-array values, but they
+don't quite work in every case. For example if you need to produce a resource bundle with a
+mix of values and sub-arrays, like:
+
+x{
+  y{
+    "a",
+    { "b", "c" },
+    "d"
+  }
+}
+
+which can be thought of as coming from the path/value pairs:
+
+x/y: a
+x/y/<z>: b
+x/y/<z>: c
+x/y: d
+
+we find that, after sorting the resource bundle paths, we end up with:
+
+x/y: a
+x/y: d
+x/y/<z>: b
+x/y/<z>: c
+
+which produces the wrong result. This happens because values with different paths are
+sorted primarily by their path. In cases like this, where a mix of values and sub-arrays
+are required, the "group" instruction can be used instead.
+
+For example:
+
+//ldml/numbers/currencies/currency[@type="(%W)"]/symbol      ; /Currencies/$1
+//ldml/numbers/currencies/currency[@type="(%W)"]/displayName ; /Currencies/$1
+//ldml/numbers/currencies/currency[@type="(%W)"]/pattern     ; /Currencies/$1 ; group
+//ldml/numbers/currencies/currency[@type="(%W)"]/decimal     ; /Currencies/$1 ; group
+//ldml/numbers/currencies/currency[@type="(%W)"]/group       ; /Currencies/$1 ; group
+
+Produces resource bundles which look like:
+
+Currencies{
+  xxx{
+     "<symbol>",
+     "<display name>",
+     { "<pattern>", "<decimal>", "<group>" }
+  }
+}
diff --git a/tools/cldr/cldr-to-icu/src/main/resources/ldml2icu_supplemental.txt b/tools/cldr/cldr-to-icu/src/main/resources/ldml2icu_supplemental.txt

new file mode 100644 (file)

index 0000000..be58239
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/main/resources/ldml2icu_supplemental.txt
@@ -0,0 +1,202 @@
+# ldml2icu_supplemental.txt
+#
+# © 2016 and later: Unicode, Inc. and others.
+#
+# CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
+# For terms of use, see http://www.unicode.org/copyright.html
+#
+# Used by SupplementalMapper.
+# Data-driven file for mapping supplemental LDML paths to ICU paths.
+# See ldml2icu_readme.txt for a detailed explanation of this file.
+
+# Attribute value
+%A=[^"']++
+# Attribute value, no underscore
+%B=[^"'_]++
+# Word/Zone match
+%W=[\s\w\-/]++
+# Greedy word match
+%G=[\s\w\-]+
+# Number match
+%N=[\d\.]++
+
+# supplementalData.xml
+//supplementalData/currencyData/region[@iso3166="(%W)"]/currency[@iso4217="(%W)"]
+     ; /CurrencyMap/$1/<FIFO>/id ; values=$2
+//supplementalData/currencyData/region[@iso3166="(%W)"]/currency[@iso4217="(%W)"][@tender="false"]
+     ; /CurrencyMap/$1/<FIFO>/id ; values=$2
+     ; /CurrencyMap/$1/<FIFO>/tender ; values=false
+//supplementalData/currencyData/region[@iso3166="(%W)"]/currency[@from="(%W)"][@iso4217="(%W)"]
+     ; /CurrencyMap/$1/<FIFO>/id ; values=$3
+     ; /CurrencyMap/$1/<FIFO>/from:intvector ; values=&date($2, from)
+//supplementalData/currencyData/region[@iso3166="(%W)"]/currency[@from="(%W)"][@iso4217="(%W)"][@tender="false"]
+     ; /CurrencyMap/$1/<FIFO>/id ; values=$3
+     ; /CurrencyMap/$1/<FIFO>/from:intvector ; values=&date($2, from)
+     ; /CurrencyMap/$1/<FIFO>/tender ; values=false
+//supplementalData/currencyData/region[@iso3166="(%W)"]/currency[@from="(%W)"][@to="(%W)"][@iso4217="(%W)"]
+     ; /CurrencyMap/$1/<FIFO>/id ; values=$4
+     ; /CurrencyMap/$1/<FIFO>/from:intvector ; values=&date($2, from)
+     ; /CurrencyMap/$1/<FIFO>/to:intvector ; values=&date($3, to)
+//supplementalData/currencyData/region[@iso3166="(%W)"]/currency[@from="(%W)"][@to="(%W)"][@iso4217="(%W)"][@tender="false"]
+     ; /CurrencyMap/$1/<FIFO>/id ; values=$4
+     ; /CurrencyMap/$1/<FIFO>/from:intvector ; values=&date($2, from)
+     ; /CurrencyMap/$1/<FIFO>/to:intvector ; values=&date($3, to)
+     ; /CurrencyMap/$1/<FIFO>/tender ; values=false
+//supplementalData/currencyData/region[@iso3166="(%W)"]/currency[@to="(%W)"][@iso4217="(%W)"][@tender="false"]
+     ; /CurrencyMap/$1/<FIFO>/id ; values=$3
+     ; /CurrencyMap/$1/<FIFO>/to:intvector ; values=&date($2, to)
+     ; /CurrencyMap/$1/<FIFO>/tender ; values=false
+//supplementalData/currencyData/fractions/info[@iso4217="(%W)"][@digits="(%N)"][@rounding="(%N)"][@cashDigits="(%N)"][@cashRounding="(%N)"] ; /CurrencyMeta/$1:intvector ; values=$2 $3 $4 $5
+//supplementalData/currencyData/fractions/info[@iso4217="(%W)"][@digits="(%N)"][@rounding="(%N)"][@cashRounding="(%N)"] ; /CurrencyMeta/$1:intvector ; values=$2 $3 $2 $4
+//supplementalData/currencyData/fractions/info[@iso4217="(%W)"][@digits="(%N)"][@rounding="(%N)"] ; /CurrencyMeta/$1:intvector ; values=$2 $3 $2 $3
+
+//supplementalData/calendarPreferenceData/calendarPreference[@territories="(%A)"][@ordering="(%A)"] ; /calendarPreferenceData/$1    ; values=$2
+//supplementalData/codeMappings/territoryCodes[@type="(%W)"][@numeric="(%N)"][@alpha3="(%W)"].* ; /codeMappings/<$1> ; values=$1 $2 $3
+
+//supplementalData/codeMappings/currencyCodes[@type="(%W)"][@numeric="(%N)"].* ; /codeMappingsCurrency/<$1> ; values=$1 $2
+
+//supplementalData/languageData/language[@type="(%W)"][@scripts="(%W)"][@territories="(%W)"][@alt="secondary"]
+     ; /languageData/$1/secondary/scripts ; values=$2
+     ; /languageData/$1/secondary/territories ; values=$3
+//supplementalData/languageData/language[@type="(%W)"][@scripts="(%W)"][@alt="secondary"] ; /languageData/$1/secondary/scripts ; values=$2
+//supplementalData/languageData/language[@type="(%W)"][@territories="(%G)"][@alt="secondary"] ; /languageData/$1/secondary/territories ; values=$2
+
+//supplementalData/languageData/language[@type="(%W)"][@scripts="(%W)"][@territories="(%W)"]
+     ; /languageData/$1/primary/scripts ; values=$2
+     ; /languageData/$1/primary/territories; values=$3
+//supplementalData/languageData/language[@type="(%W)"][@scripts="(%W)"] ; /languageData/$1/primary/scripts ; values=$2
+//supplementalData/languageData/language[@type="(%W)"][@territories="(%W)"] ; /languageData/$1/primary/territories ; values=$2
+
+//supplementalData/territoryContainment/group[@type="(%W)"][@contains="(%A)"][@status="deprecated"] ; /territoryContainment/deprecated/$1 ; values=$2
+//supplementalData/territoryContainment/group[@type="(%W)"][@contains="(%A)"][@status="grouping"] ; /territoryContainment/containedGroupings/$1 ; values=$2
+//supplementalData/territoryContainment/group[@type="(%W)"][@contains="(%A)"][@grouping="true"] ; /territoryContainment/grouping/$1 ; values=$2
+//supplementalData/territoryContainment/group[@type="(%W)"][@contains="(%A)"]; /territoryContainment/$1 ; values=$2
+
+//supplementalData/subdivisionContainment/subgroup[@type="(%W)"][@contains="(%A)"]; /subdivisionContainment/$1 ; values=$2
+//supplementalData/subdivisionContainment/subgroup[@type="(%W)"][@subtype="(%W)"][@contains="(%A)"]; /subdivisionContainment/$1-$2 ; values=$3
+
+//supplementalData/weekData/firstDay[@day="(%W)"][@territories="(%W)"](?:[@references="(?:%A)"])?[@alt="(%A)"] ; /weekData%$3/$2:intvector ; values=&day_number($1) ; fallback=/weekData/001:intvector[0]
+
+//supplementalData/weekData/firstDay[@day="(%W)"][@territories="(%W)"]     ; /weekData/$2:intvector ; values=&day_number($1) ; fallback=/weekData/001:intvector[0]
+//supplementalData/weekData/minDays[@count="(%N)"][@territories="(%W)"]    ; /weekData/$2:intvector ; values=$1 ; fallback=/weekData/001:intvector[1]
+//supplementalData/weekData/weekendStart[@day="(%W)"][@territories="(%W)"] ; /weekData/$2:intvector ; values=&day_number($1) 0 ; fallback=/weekData/001:intvector[2] /weekData/001:intvector[3]
+//supplementalData/weekData/weekendEnd[@day="(%W)"][@territories="(%W)"]   ; /weekData/$2:intvector ; values=&day_number($1) 86400000 ; fallback=/weekData/001:intvector[4] /weekData/001:intvector[5]
+
+//supplementalData/weekData/weekOfPreference[@locales="(%A)"][@ordering="(%A)"] ; /weekOfPreference/$1    ; values=$2
+
+//supplementalData/timeData/hours[@allowed="(%W)"][@preferred="(%W)"][@regions="(%W)"]
+     ; /timeData/$3/allowed ; values=$1
+     ; /timeData/$3/preferred ; values=$2
+
+//supplementalData/measurementData/measurementSystem[@type="metric"][@category="(%W)"][@territories="(%W)"] ; /measurementData/$2/MeasurementSystemCategory/$1:int ; values=0
+//supplementalData/measurementData/measurementSystem[@type="US"][@category="(%W)"][@territories="(%W)"] ; /measurementData/$2/MeasurementSystemCategory/$1:int ; values=1
+//supplementalData/measurementData/measurementSystem[@type="UK"][@category="(%W)"][@territories="(%W)"] ; /measurementData/$2/MeasurementSystemCategory/$1:int ; values=2
+
+//supplementalData/measurementData/measurementSystem[@type="metric"][@territories="(%W)"] ; /measurementData/$1/MeasurementSystem:int ; values=0
+//supplementalData/measurementData/measurementSystem[@type="US"][@territories="(%W)"] ; /measurementData/$1/MeasurementSystem:int ; values=1
+//supplementalData/measurementData/measurementSystem[@type="UK"][@territories="(%W)"] ; /measurementData/$1/MeasurementSystem:int ; values=2
+//supplementalData/measurementData/paperSize[@type="A4"][@territories="(%W)"] ; /measurementData/$1/PaperSize:intvector ; values=297 210
+//supplementalData/measurementData/paperSize[@type="US-Letter"][@territories="(%W)"] ; /measurementData/$1/PaperSize:intvector ; values=279 216
+
+//supplementalData/unitPreferenceData/unitPreferences[@category="(%W)"][@usage="(%A)"][@scope="small"]/unitPreference[@regions="(%A)"][@alt="informal"] ; /unitPreferenceData/$3/$1-$2-small-informal
+//supplementalData/unitPreferenceData/unitPreferences[@category="(%W)"][@usage="(%A)"][@scope="small"]/unitPreference[@regions="(%A)"] ; /unitPreferenceData/$3/$1-$2-small
+//supplementalData/unitPreferenceData/unitPreferences[@category="(%W)"][@usage="(%A)"]/unitPreference[@regions="(%A)"][@alt="informal"] ; /unitPreferenceData/$3/$1-$2-informal
+//supplementalData/unitPreferenceData/unitPreferences[@category="(%W)"][@usage="(%A)"]/unitPreference[@regions="(%A)"] ; /unitPreferenceData/$3/$1-$2
+
+//supplementalData/territoryInfo/territory[@type="(%W)"][@gdp="(%N)"][@literacyPercent="(%N)"][@population="(%N)"]/languagePopulation[@type="(%W)"][@writingPercent="(%N)"][@populationPercent="(%N)"][@officialStatus="(%W)"](?:[@references="%W"])?
+    ; /territoryInfo/$1/territoryF:intvector ; values=&exp($2) &exp($3,-2) &exp($4) ; base_xpath=//supplementalData/territoryInfo/territory[@type="$1"]
+    ; /territoryInfo/$1/$5/writingShareF:int ; values=&exp($6,-2)
+    ; /territoryInfo/$1/$5/populationShareF:int ; values=&exp($7,-2)
+    ; /territoryInfo/$1/$5/officialStatus ; values=$8
+
+//supplementalData/territoryInfo/territory[@type="(%W)"][@gdp="(%N)"][@literacyPercent="(%N)"][@population="(%N)"]/languagePopulation[@type="(%W)"][@writingPercent="(%N)"][@populationPercent="(%N)"](?:[@references="%W"])?
+    ; /territoryInfo/$1/territoryF:intvector ; values=&exp($2) &exp($3,-2) &exp($4) ; base_xpath=//supplementalData/territoryInfo/territory[@type="$1"]
+    ; /territoryInfo/$1/$5/writingShareF:int ; values=&exp($6,-2)
+    ; /territoryInfo/$1/$5/populationShareF:int ; values=&exp($7,-2)
+
+//supplementalData/territoryInfo/territory[@type="(%W)"][@gdp="(%N)"][@literacyPercent="(%N)"][@population="(%N)"]/languagePopulation[@type="(%W)"][@literacyPercent="(%N)"][@populationPercent="(%N)"][@officialStatus="(%W)"](?:[@references="%W"])?
+    ; /territoryInfo/$1/territoryF:intvector ; values=&exp($2) &exp($3,-2) &exp($4) ; base_xpath=//supplementalData/territoryInfo/territory[@type="$1"]
+    ; /territoryInfo/$1/$5/literacyShareF:int ; values=&exp($6,-2)
+    ; /territoryInfo/$1/$5/populationShareF:int ; values=&exp($7,-2)
+    ; /territoryInfo/$1/$5/officialStatus ; values=$8
+
+//supplementalData/territoryInfo/territory[@type="(%W)"][@gdp="(%N)"][@literacyPercent="(%N)"][@population="(%N)"]/languagePopulation[@type="(%W)"][@literacyPercent="(%N)"][@populationPercent="(%N)"](?:[@references="%W"])?
+    ; /territoryInfo/$1/territoryF:intvector ; values=&exp($2) &exp($3,-2) &exp($4) ; base_xpath=//supplementalData/territoryInfo/territory[@type="$1"]
+    ; /territoryInfo/$1/$5/literacyShareF:int ; values=&exp($6,-2)
+    ; /territoryInfo/$1/$5/populationShareF:int ; values=&exp($7,-2)
+
+//supplementalData/territoryInfo/territory[@type="(%W)"][@gdp="(%N)"][@literacyPercent="(%N)"][@population="(%N)"]/languagePopulation[@type="(%W)"][@populationPercent="(%N)"][@officialStatus="(%W)"](?:[@references="%W"])?
+    ; /territoryInfo/$1/territoryF:intvector ; values=&exp($2) &exp($3,-2) &exp($4) ; base_xpath=//supplementalData/territoryInfo/territory[@type="$1"]
+    ; /territoryInfo/$1/$5/populationShareF:int ; values=&exp($6,-2)
+    ; /territoryInfo/$1/$5/officialStatus ; values=$7
+
+//supplementalData/territoryInfo/territory[@type="(%W)"][@gdp="(%N)"][@literacyPercent="(%N)"][@population="(%N)"]/languagePopulation[@type="(%W)"][@populationPercent="(%N)"](?:[@references="%W"])?
+    ; /territoryInfo/$1/territoryF:intvector ; values=&exp($2) &exp($3,-2) &exp($4) ; base_xpath=//supplementalData/territoryInfo/territory[@type="$1"]
+    ; /territoryInfo/$1/$5/populationShareF:int ; values=&exp($6,-2)
+
+# This only exists right now for 'ZZ', which has no <languagePopulation> child elements.
+//supplementalData/territoryInfo/territory[@type="(%W)"][@gdp="(%N)"][@literacyPercent="(%N)"][@population="(%N)"]
+    ; /territoryInfo/$1/territoryF:intvector ; values=&exp($2) &exp($3,-2) &exp($4) ; base_xpath=//supplementalData/territoryInfo/territory[@type="$1"]
+
+//supplementalData/calendarData/calendar[@type="(%W)"]/calendarSystem[@type="(%W)"] ; /calendarData/$1/system ; values=$2
+//supplementalData/calendarData/calendar[@type="(%W)"]/eras/era[@type="(%W)"][@(start|end)="(%A)"][@named="(%W)"]
+    ; /calendarData/$1/eras/$2/$3:intvector ; values=&ymd($4)
+    ; /calendarData/$1/eras/$2/named ; values=$5
+//supplementalData/calendarData/calendar[@type="(%W)"]/eras/era[@type="(%W)"][@(start|end)="(%A)"]
+    ; /calendarData/$1/eras/$2/$3:intvector ; values=&ymd($4)
+
+# languageInfo.xml
+
+//supplementalData/languageMatching/languageMatches[@type="(%B)_new"]/paradigmLocales[@locales="(%A)"] ; /languageMatchingInfo/$1/paradigmLocales ; values=$2
+//supplementalData/languageMatching/languageMatches[@type="(%B)_new"]/matchVariable[@id="\$(%A)"][@value="(%A)"] ; /languageMatchingInfo/$1/matchVariable/$2 ; values=$3
+
+//supplementalData/languageMatching/languageMatches[@type="(%B)_new"]/languageMatch[@desired="(%A)"][@supported="(%A)"][@distance="(%N)"][@oneway="true"] ; /languageMatchingNew/$1/<FIFO> ; values=$2 $3 $4 1
+//supplementalData/languageMatching/languageMatches[@type="(%B)_new"]/languageMatch[@desired="(%A)"][@supported="(%A)"][@distance="(%N)"] ; /languageMatchingNew/$1/<FIFO> ; values=$2 $3 $4 0
+
+//supplementalData/languageMatching/languageMatches[@type="(%B)"]/languageMatch[@desired="(%A)"][@supported="(%A)"][@percent="(%N)"][@oneway="true"] ; /languageMatching/$1/<FIFO> ; values=$2 $3 $4 1
+//supplementalData/languageMatching/languageMatches[@type="(%B)"]/languageMatch[@desired="(%A)"][@supported="(%A)"][@percent="(%N)"] ; /languageMatching/$1/<FIFO> ; values=$2 $3 $4 0
+
+# likelySubtags.xml
+//supplementalData/likelySubtags/likelySubtag[@from="(%A)"][@to="(%A)"] ; /$1 ; values=$2
+
+# metaZones.xml - metaZones.txt
+//supplementalData/metaZones/mapTimezones[@type="metazones"]/mapZone[@type="(%A)"][@other="(%W)"][@territory="(%W)"] ; /mapTimezones/$2/$3 ; values=$1
+//supplementalData/metaZones/metazoneInfo/timezone[@type="(%W)"]/usesMetazone[@mzone="(%W)"] ; /metazoneInfo/"$1"/<$2> ; values=$2
+//supplementalData/metaZones/metazoneInfo/timezone[@type="(%W)"]/usesMetazone[@from="(%A)"][@mzone="(%W)"] ; /metazoneInfo/"$1"/<$2> ; values=$3 "$2" "9999-12-31 23:59"
+//supplementalData/metaZones/metazoneInfo/timezone[@type="(%W)"]/usesMetazone[@from="(%A)"][@to="(%A)"][@mzone="(%W)"] ; /metazoneInfo/"$1"/<$2> ; values=$4 "$2" "$3"
+//supplementalData/metaZones/metazoneInfo/timezone[@type="(%W)"]/usesMetazone[@to="(%A)"][@mzone="(%W)"] ; /metazoneInfo/"$1"/<1970-01-01 00:00> ; values=$3 "1970-01-01 00:00" "$2"
+
+//supplementalData/primaryZones/primaryZone[@iso3166="(%W)"] ; /primaryZones/$1 ; values={value}
+
+# numberingSystems.txt
+//supplementalData/numberingSystems/numberingSystem[@type="algorithmic"][@id="(%W)"][@rules="(%A)"]
+     ; /numberingSystems/$1/algorithmic:int ; values=1
+     ; /numberingSystems/$1/desc ; values=&algorithm($2)
+     ; /numberingSystems/$1/radix:int ; values=10
+
+//supplementalData/numberingSystems/numberingSystem[@type="numeric"][@id="(%W)"][@digits="(%A)"]
+     ; /numberingSystems/$1/algorithmic:int ; values=0
+     ; /numberingSystems/$1/desc ; values=$2
+     ; /numberingSystems/$1/radix:int ; values=10
+
+# windowsZones.txt
+//supplementalData/windowsZones/mapTimezones/mapZone[@type="(%A)"][@other="(%A)"][@territory="(%W)"] ; /mapTimezones/"$2"/$3 ; values="$1"
+
+# genderList.txt
+//supplementalData/gender/personList[@type="(%W)"][@locales="(%W)"] ; /genderList/$2 ; values=$1
+
+# locale info
+//supplementalData/parentLocales/parentLocale[@parent="(%A)"][@locales="(%A)"] ; /parentLocales/$1 ; values=$2
+
+# supplementalMetadata.xml (metadata.txt)
+//supplementalData/metadata/defaultContent[@locales="(%A)"] ; /defaultContent ; values=$1
+//supplementalData/metadata/alias/(language|script|territory|subdivision|variant)Alias[@type="(%A)"][@replacement="(%A)"][@reason="(%A)"]
+     ; /alias/$1/$2/reason ; values="$4"
+     ; /alias/$1/$2/replacement ; values="$3"
+
+# Region codes used by ICU's Region class
+# Specify the value explicitly so that the LDMLConverter will split it.
+//supplementalData/metadata/validity/variable[@type="choice"][@id="\$territory"] ; /regionCodes ; values={value}
+
+# validity
+//supplementalData/idValidity/id[@type="(%A)"][@idStatus="(%A)"] ; /idValidity/$1/$2 ; values={value}
diff --git a/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/PathMatcherTest.java b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/PathMatcherTest.java

new file mode 100644 (file)

index 0000000..dd81a2b
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/PathMatcherTest.java
@@ -0,0 +1,127 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu;
+
+import static com.google.common.truth.Truth.assertThat;
+import static com.google.common.truth.Truth8.assertThat;
+import static org.junit.Assert.fail;
+import static org.unicode.cldr.api.CldrPath.parseDistinguishingPath;
+import static org.unicode.icu.tool.cldrtoicu.testing.AssertUtils.assertThrows;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+import org.unicode.cldr.api.CldrPath;
+
+@RunWith(JUnit4.class)
+public class PathMatcherTest {  // Tests PathMatcher specifications against distinguishing CldrPaths.
+    @Test
+    public void testMatcher() {  // Exercises matches(), matchesPrefixOf() and matchesSuffixOf().
+        CldrPath calEra = parseDistinguishingPath(
+            "//ldml/dates/calendars/calendar[@type=\"buddhist\"]/eras/eraAbbr/era[@type=\"0\"]");
+        CldrPath chineseMon1 = monthInfo("chinese", "format", "abbreviated", 1);
+        CldrPath chineseMon2 = monthInfo("chinese", "format", "abbreviated", 2);
+        CldrPath genericMon1 = monthInfo("generic", "stand-alone", "narrow", 1);
+        CldrPath genericMon2 = monthInfo("generic", "stand-alone", "narrow", 2);
+        List<CldrPath> calPaths =
+            Arrays.asList(calEra, chineseMon1, chineseMon2, genericMon1, genericMon2);
+
+        PathMatcher anyCalendarPaths = PathMatcher.of("ldml/dates/calendars/calendar");
+        assertThat(calPaths.stream().allMatch(anyCalendarPaths::matchesPrefixOf)).isTrue();
+        assertThat(calPaths.stream().noneMatch(anyCalendarPaths::matches)).isTrue();
+        assertThat(calPaths.stream().noneMatch(anyCalendarPaths::matchesSuffixOf)).isTrue();
+
+        PathMatcher chineseCalendars =
+            PathMatcher.of("ldml/dates/calendars/calendar[@type=\"chinese\"]");
+        assertThat(calPaths.stream().filter(chineseCalendars::matchesPrefixOf))
+            .containsExactly(chineseMon1, chineseMon2);
+
+        PathMatcher anyMonth = PathMatcher.of("monthWidth[@type=*]/month[@type=*]");
+        assertThat(calPaths.stream().filter(anyMonth::matchesSuffixOf))
+            .containsExactly(chineseMon1, chineseMon2, genericMon1, genericMon2);
+
+        PathMatcher narrowMonth = PathMatcher.of("monthWidth[@type=\"narrow\"]/month[@type=*]");
+        assertThat(calPaths.stream().filter(narrowMonth::matchesSuffixOf))
+            .containsExactly(genericMon1, genericMon2);
+        assertThat(calPaths.stream().filter(narrowMonth::matches)).isEmpty();
+
+        PathMatcher firstMonth = PathMatcher.of("month[@type=\"1\"]");
+        assertThat(calPaths.stream().filter(firstMonth::matchesSuffixOf))
+            .containsExactly(chineseMon1, genericMon1);
+
+        PathMatcher fullMatch = PathMatcher.of("ldml/dates"
+            + "/calendars/calendar[@type=\"generic\"]"
+            + "/months/monthContext[@type=\"stand-alone\"]"
+            + "/monthWidth[@type=\"narrow\"]"
+            + "/month[@type=\"2\"]");
+        assertThat(calPaths.stream().filter(fullMatch::matches)).containsExactly(genericMon2);
+    }
+
+    @Test
+    public void testWildcardSegment() {  // '*' is used as an element name and as an attribute value.
+        PathMatcher wildcard = PathMatcher.of("ldml/dates"
+            + "/calendars/calendar[@type=\"generic\"]"
+            + "/*/*[@type=\"format\"]/*[@type=\"narrow\"]/*[@type=*]");
+
+        assertThat(wildcard.matches(monthInfo("generic", "format", "narrow", 1))).isTrue();
+        assertThat(wildcard.matches(monthInfo("generic", "format", "narrow", 9))).isTrue();
+        assertThat(wildcard.matches(dayInfo("generic", "format", "narrow", "sun"))).isTrue();
+
+        assertThat(wildcard.matches(monthInfo("chinese", "format", "narrow", 1))).isFalse();
+        assertThat(wildcard.matches(monthInfo("generic", "stand-alone", "narrow", 1))).isFalse();
+        assertThat(wildcard.matches(dayInfo("generic", "format", "wide", "mon"))).isFalse();
+    }
+
+    @Test
+    public void testAnyOf() {  // The combined matcher accepts a path if either delegate does.
+        PathMatcher monthMatch = PathMatcher.of("monthWidth[@type=\"narrow\"]/month[@type=*]");
+        PathMatcher dayMatch = PathMatcher.of("dayWidth[@type=\"narrow\"]/day[@type=*]");
+        PathMatcher combined = PathMatcher.anyOf(monthMatch, dayMatch);
+
+        assertThat(combined.matchesSuffixOf(monthInfo("generic", "format", "narrow", 1))).isTrue();
+        assertThat(combined.matchesSuffixOf(dayInfo("generic", "format", "narrow", "sun"))).isTrue();
+
+        assertThat(combined.matchesSuffixOf(monthInfo("generic", "format", "wide", 1))).isFalse();
+        assertThat(combined.matchesSuffixOf(dayInfo("generic", "format", "wide", "mon"))).isFalse();
+    }
+
+    @Test
+    public void testBadSpecifiers() {  // Every specification below must be rejected by of().
+        assertInvalidPathSpecification("");
+        // Leading and trailing '/' are not permitted (they imply empty segments).
+        assertInvalidPathSpecification("/foo/");
+        assertInvalidPathSpecification("foo//bar");
+        assertInvalidPathSpecification("foo/bad segment name");
+        assertInvalidPathSpecification("foo/bar[type=*]");
+        assertInvalidPathSpecification("foo/bar[@type=**]");
+        assertInvalidPathSpecification("foo/bar[@type='double-quotes-only']");
+    }
+
+    private void assertInvalidPathSpecification(String spec) {  // The error must echo the spec.
+        IllegalArgumentException e =
+            assertThrows(IllegalArgumentException.class, () -> PathMatcher.of(spec));
+        assertThat(e).hasMessageThat().startsWith("invalid path specification");
+        assertThat(e).hasMessageThat().contains(spec);
+    }
+
+    private static CldrPath monthInfo(String type, String context, String width, int number) {  // Month path.
+        return CldrPath.parseDistinguishingPath(String.format(
+            "//ldml/dates/calendars/calendar[@type=\"%s\"]"
+                + "/months/monthContext[@type=\"%s\"]"
+                + "/monthWidth[@type=\"%s\"]"
+                + "/month[@type=\"%d\"]",
+            type, context, width, number));
+    }
+
+    private static CldrPath dayInfo(String type, String context, String width, String id) {  // Day path.
+        return CldrPath.parseDistinguishingPath(String.format(
+            "//ldml/dates/calendars/calendar[@type=\"%s\"]"
+                + "/days/dayContext[@type=\"%s\"]"
+                + "/dayWidth[@type=\"%s\"]"
+                + "/day[@type=\"%s\"]",
+            type, context, width, id));
+    }
+}
diff --git a/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/RbPathTest.java b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/RbPathTest.java

new file mode 100644 (file)

index 0000000..d7aaf80
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/RbPathTest.java
@@ -0,0 +1,44 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu;
+
+import static org.unicode.icu.tool.cldrtoicu.testing.RbPathSubjectFactory.assertThat;
+import static org.unicode.icu.tool.cldrtoicu.testing.AssertUtils.assertThrows;
+import static com.google.common.truth.Truth.assertThat;
+import static com.google.common.truth.Truth8.assertThat;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+/** Unit tests for {@code RbPath} construction and parsing. */
+@RunWith(JUnit4.class)
+public class RbPathTest {
+    /** The empty path has no segments and a length of zero. */
+    @Test
+    public void testEmpty() {
+        assertThat(RbPath.empty()).hasSegments();
+        assertThat(RbPath.empty()).hasLength(0);
+    }
+
+    /** {@code of()} takes its arguments as literal segments, while {@code parse()} splits on '/'. */
+    @Test
+    public void testParseVsOf() {
+        // Each argument given to of() is one segment, even if it contains a '/'.
+        assertThat(RbPath.of("foo", "bar")).hasSegments("foo", "bar");
+        assertThat(RbPath.of("foo/bar")).hasSegments("foo/bar");
+        // parse() treats '/' as the segment separator.
+        assertThat(RbPath.parse("foo/bar")).hasSegments("foo", "bar");
+    }
+
+    /** Malformed path strings are rejected by {@code parse()} with a descriptive message. */
+    @Test
+    public void testBadArgs() {
+        // Empty paths and empty segments.
+        assertBadPath("", "empty path string");
+        assertBadPath("foo//bar", "empty path segment");
+        // Unbalanced quoting.
+        assertBadPath("foo/<bar/baz", "mismatched quoting");
+        assertBadPath("foo/\"bar", "mismatched quoting");
+        // Characters which are not permitted where they appear.
+        assertBadPath("foo/\"bar\"baz\"", "invalid character");
+        assertBadPath("foo/bar baz", "invalid character");
+    }
+
+    /**
+     * Asserts that parsing {@code path} throws an {@link IllegalArgumentException} whose message
+     * contains {@code errorSnippet}.
+     */
+    private static void assertBadPath(String path, String errorSnippet) {
+        IllegalArgumentException thrown =
+            assertThrows(IllegalArgumentException.class, () -> RbPath.parse(path));
+        assertThat(thrown).hasMessageThat().contains(errorSnippet);
+    }
+}
diff --git a/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/SupplementalDataTest.java b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/SupplementalDataTest.java

new file mode 100644 (file)

index 0000000..af10861
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/SupplementalDataTest.java
@@ -0,0 +1,357 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu;
+
+import static com.google.common.truth.Truth.assertThat;
+import static com.google.common.truth.Truth.assertWithMessage;
+import static com.google.common.truth.Truth8.assertThat;
+import static java.util.Arrays.asList;
+import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL;
+import static org.unicode.cldr.api.CldrValue.parseValue;
+
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.function.Function;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+import org.unicode.cldr.api.CldrDataSupplier;
+import org.unicode.cldr.api.CldrValue;
+import org.unicode.cldr.tool.LikelySubtags;
+import org.unicode.cldr.util.LanguageTagCanonicalizer;
+import org.unicode.cldr.util.LocaleIDParser;
+import org.unicode.cldr.util.SupplementalDataInfo;
+
+import com.google.common.base.Joiner;
+import com.google.common.collect.ImmutableSet;
+
+/**
+ * Unit tests for the supplemental data API. These tests either use fake data for unit testing, or
+ * compare behaviour between this API and the equivalent CLDR utility tool for regression testing.
+ */
+@RunWith(JUnit4.class)
+public class SupplementalDataTest {
+    private static SupplementalData regressionData;
+    private static LikelySubtags likelySubtags;
+
+    /**
+     * Loads the supplemental data used by the regression tests, once per class.
+     *
+     * <p>The CLDR root directory is read from the {@code CLDR_DIR} system property. The same
+     * directory is used to create both the {@code SupplementalData} instance under test and the
+     * CLDR {@code LikelySubtags} instance it is compared against.
+     *
+     * @throws IllegalStateException if the {@code CLDR_DIR} system property is not set.
+     */
+    @BeforeClass
+    public static void loadRegressionData() {
+        String cldrDir = System.getProperty("CLDR_DIR");
+        if (cldrDir == null) {
+            // Without this check Paths.get(null) fails with a bare NullPointerException, which
+            // gives no hint as to what is misconfigured.
+            throw new IllegalStateException(
+                "the CLDR_DIR system property must be set to the CLDR root directory");
+        }
+        Path cldrRoot = Paths.get(cldrDir);
+        regressionData = SupplementalData
+            .create(CldrDataSupplier.forCldrFilesIn(cldrRoot).getDataForType(SUPPLEMENTAL));
+        SupplementalDataInfo sdi =
+            SupplementalDataInfo.getInstance(cldrRoot.resolve("common/supplemental").toString());
+        likelySubtags = new LikelySubtags(sdi);
+    }
+
+    /** Checks parent lookup for locales which have an explicit (non-truncation) parent. */
+    @Test
+    public void testGetParent_explicit() {
+        // a.k.a "English is weird": en_AU and en_GB are explicitly parented to en_001.
+        CldrValue explicitParents = parentLocales("en_001", "en_AU", "en_GB");
+        SupplementalData fakeData = fakeSupplementalData(explicitParents);
+
+        // Only the locales listed in the parentLocales data have an explicit parent.
+        assertThat(fakeData.getExplicitParentLocaleOf("en_GB")).hasValue("en_001");
+        assertThat(fakeData.getExplicitParentLocaleOf("en_AU")).hasValue("en_001");
+        assertThat(fakeData.getExplicitParentLocaleOf("en_US")).isEmpty();
+        assertThat(fakeData.getExplicitParentLocaleOf("en")).isEmpty();
+
+        // getParent() uses the explicit parent where there is one, and truncation otherwise.
+        assertThat(fakeData.getParent("en_GB")).isEqualTo("en_001");
+        assertThat(fakeData.getParent("en_AU")).isEqualTo("en_001");
+        assertThat(fakeData.getParent("en_001")).isEqualTo("en");
+        assertThat(fakeData.getParent("en_US")).isEqualTo("en");
+        assertThat(fakeData.getParent("en")).isEqualTo("root");
+    }
+
+    /** Checks that the parent of a script-only locale depends on the likely (default) script. */
+    @Test
+    public void testGetParent_likelyScript() {
+        // Likely subtags are what determine the default script of a language.
+        CldrValue chineseDefaults = likelySubtag("zh", "zh_Hans_CN");
+        SupplementalData fakeData = fakeSupplementalData(chineseDefaults);
+
+        // Removing a non-default script means the parent becomes "root".
+        assertThat(fakeData.getParent("zh_Hant")).isEqualTo("root");
+        // "Hans" is the default script for "zh", so the parent is obtained via truncation.
+        assertThat(fakeData.getParent("zh_Hans")).isEqualTo("zh");
+    }
+
+    /** Checks that maximizing a locale ID adds likely subtags while keeping existing ones. */
+    @Test
+    public void testMaximize() {
+        CldrValue english = likelySubtag("en", "en_Latn_US");
+        CldrValue portuguese = likelySubtag("pt", "pt_Latn_BR");
+        CldrValue undetermined = likelySubtag("und", "en_Latn_US");
+        SupplementalData fakeData = fakeSupplementalData(english, portuguese, undetermined);
+
+        // "root" cannot be maximized.
+        assertThat(fakeData.maximize("root")).isEmpty();
+        // Subtags already present in the ID are preserved.
+        assertThat(fakeData.maximize("en")).hasValue("en_Latn_US");
+        assertThat(fakeData.maximize("en_GB")).hasValue("en_Latn_GB");
+        assertThat(fakeData.maximize("en_VARIANT")).hasValue("en_Latn_US_VARIANT");
+        // The same applies to other languages, and to the undetermined language "und".
+        assertThat(fakeData.maximize("pt")).hasValue("pt_Latn_BR");
+        assertThat(fakeData.maximize("pt_PT")).hasValue("pt_Latn_PT");
+        assertThat(fakeData.maximize("und")).hasValue("en_Latn_US");
+    }
+
+    /** Checks that "root" is returned unchanged, even with no supplemental data at all. */
+    @Test
+    public void testReplaceDeprecatedTags_iAmRoot() {
+        SupplementalData emptyData = fakeSupplementalData();
+        assertThat(emptyData.replaceDeprecatedTags("root")).isEqualTo("root");
+    }
+
+    /** Checks that IDs without deprecated subtags are returned unchanged. */
+    @Test
+    public void testReplaceDeprecatedTags_sameSubtags() {
+        CldrValue english = likelySubtag("en", "en_Latn_US");
+        SupplementalData fakeData = fakeSupplementalData(english);
+
+        // Even though "Latn" is the likely script, replacement neither removes nor adds it.
+        assertThat(fakeData.replaceDeprecatedTags("en_Latn_GB")).isEqualTo("en_Latn_GB");
+        assertThat(fakeData.replaceDeprecatedTags("en_GB")).isEqualTo("en_GB");
+    }
+
+    /** Checks replacement of a single deprecated language, script or territory subtag. */
+    @Test
+    public void testReplaceDeprecatedTags_subtagReplacement() {
+        CldrValue welsh = languageAlias("cym", "cy");
+        CldrValue inherited = scriptAlias("Qaai", "Zinh");
+        CldrValue yugoslavia = territoryAlias("YU", "RS");
+        SupplementalData fakeData = fakeSupplementalData(welsh, inherited, yugoslavia);
+
+        // Deprecated region.
+        assertThat(fakeData.replaceDeprecatedTags("en_YU")).isEqualTo("en_RS");
+        // Deprecated script.
+        assertThat(fakeData.replaceDeprecatedTags("ar_Qaai_IR")).isEqualTo("ar_Zinh_IR");
+        // Deprecated language.
+        assertThat(fakeData.replaceDeprecatedTags("cym_GB")).isEqualTo("cy_GB");
+    }
+
+    /** Checks how language aliases, territory aliases and likely subtags interact. */
+    @Test
+    public void testReplaceDeprecatedTags_complex() {
+        CldrValue serboCroatian = languageAlias("sh", "sr_Latn");
+        CldrValue taiwanChinese = languageAlias("zh_TW", "zh_Hant_TW");
+        CldrValue tamazight = languageAlias("tzm_Latn_MA", "tzm_MA");
+        CldrValue yugoslavia = territoryAlias("YU", "RS");
+        CldrValue serbianDefaults = likelySubtag("sr", "sr_Cyrl_RS");
+        CldrValue traditionalChineseDefaults = likelySubtag("zh_Hant", "zh_Hant_TW");
+        SupplementalData fakeData = fakeSupplementalData(
+            serboCroatian,
+            taiwanChinese,
+            tamazight,
+            yugoslavia,
+            serbianDefaults,
+            traditionalChineseDefaults);
+
+        // The alias "sh" -> "sr_Latn" wins over the fact that "sr" maximizes to "sr_Cyrl_RS".
+        assertThat(fakeData.replaceDeprecatedTags("sh_YU")).isEqualTo("sr_Latn_RS");
+        // An alias lookup may add subtags to the ID.
+        assertThat(fakeData.replaceDeprecatedTags("zh_TW")).isEqualTo("zh_Hant_TW");
+        // It will NOT remove subtags though, even though the languageAlias table has an entry
+        // mapping "tzm_Latn_MA" to "tzm_MA".
+        assertThat(fakeData.replaceDeprecatedTags("tzm_Latn_MA")).isEqualTo("tzm_Latn_MA");
+    }
+
+    /** Checks that a default calendar is only reported where it differs from the parent's. */
+    @Test
+    public void testGetDefaultCalendar() {
+        CldrValue worldCalendar = defaultCalendar("gregorian", "001");
+        CldrValue afghanCalendar = defaultCalendar("persian", "AF");
+        CldrValue uzbek = likelySubtag("uz", "uz_Latn_UZ");
+        CldrValue uzbekInAfghanistan = likelySubtag("uz_AF", "uz_Arab_AF");
+        CldrValue uzbekInArabic = likelySubtag("uz_Arab", "uz_Arab_AF");
+        SupplementalData fakeData = fakeSupplementalData(
+            worldCalendar, afghanCalendar, uzbek, uzbekInAfghanistan, uzbekInArabic);
+
+        assertThat(fakeData.getDefaultCalendar("root")).hasValue("gregorian");
+        // Empty since "gregorian" is already the default found in the parent locale.
+        assertThat(fakeData.getDefaultCalendar("en_US")).isEmpty();
+        assertThat(fakeData.getDefaultCalendar("uz")).isEmpty();
+        assertThat(fakeData.getDefaultCalendar("uz_AF")).hasValue("persian");
+        assertThat(fakeData.getDefaultCalendar("uz_Arab")).hasValue("persian");
+        // Empty since "uz_Arab" already defines the persian calendar.
+        assertThat(fakeData.getDefaultCalendar("uz_Arab_AF")).isEmpty();
+    }
+
+    /** Checks the default calendars reported for "TRADITIONAL" variant locales. */
+    @Test
+    public void testGetDefaultCalendar_secretHacks() {
+        CldrValue worldCalendar = defaultCalendar("gregorian", "001");
+        CldrValue japanese = likelySubtag("ja", "ja_Jpan_JP");
+        CldrValue thai = likelySubtag("th", "th_Thai_TH");
+        SupplementalData fakeData = fakeSupplementalData(worldCalendar, japanese, thai);
+
+        // Empty since "gregorian" is already the default found in the parent locale.
+        assertThat(fakeData.getDefaultCalendar("ja_US")).isEmpty();
+        assertThat(fakeData.getDefaultCalendar("ja")).isEmpty();
+
+        // The CLDR calendar mapping is keyed by territory only, so it cannot express the
+        // traditional calendar of a region. These therefore exist as hard coded "hacks" in
+        // SupplementalData. They could be moved into the configuration API, but ideally they
+        // would just be derived from CLDR data directly.
+        assertThat(fakeData.getDefaultCalendar("ja_JP_TRADITIONAL")).hasValue("japanese");
+        assertThat(fakeData.getDefaultCalendar("ja_TRADITIONAL")).hasValue("japanese");
+        assertThat(fakeData.getDefaultCalendar("th_TH_TRADITIONAL")).hasValue("buddhist");
+        assertThat(fakeData.getDefaultCalendar("th_TRADITIONAL")).hasValue("buddhist");
+    }
+
+    /**
+     * Compares the chain of parent locales produced by {@code SupplementalData.getParent()} with
+     * the chain produced by the CLDR {@code LocaleIDParser}, for every test locale ID.
+     */
+    @Test
+    public void testGetParent_regression() {
+        for (String id : TEST_LOCALE_IDS) {
+            Iterable<String> actualChain = getIdChain(id, regressionData::getParent);
+            Iterable<String> expectedChain = getIdChain(id, LocaleIDParser::getParent);
+            assertWithMessage("id=%s", id).that(actualChain).isEqualTo(expectedChain);
+        }
+    }
+
+    /**
+     * Compares {@code SupplementalData.maximize()} with the CLDR {@code LikelySubtags} tool for
+     * every test locale ID (an empty result is compared as {@code null}).
+     */
+    @Test
+    public void testMaximize_regression() {
+        for (String id : TEST_LOCALE_IDS) {
+            String actual = regressionData.maximize(id).orElse(null);
+            String expected = likelySubtags.maximize(id);
+            assertWithMessage("id=%s", id).that(actual).isEqualTo(expected);
+        }
+
+        // "ars" is currently a special case since it is in the ICU data as an alias, but not in
+        // the CLDR data at all. Thus, while it is a structurally valid language code, it cannot
+        // be maximized.
+        assertThat(regressionData.maximize("ars")).isEmpty();
+    }
+
+    /**
+     * Compares {@code SupplementalData.replaceDeprecatedTags()} with the CLDR
+     * {@code LanguageTagCanonicalizer} for every test locale ID. Both sides are maximized before
+     * being compared.
+     */
+    @Test
+    public void testReplaceDeprecatedTags_regression() {
+        LanguageTagCanonicalizer ltc = new LanguageTagCanonicalizer();
+        for (String id : TEST_LOCALE_IDS) {
+            // Canonicalize once and reuse the result for the expected value below.
+            String canonicalId;
+            // Work around:
+            // https://unicode-org.atlassian.net/projects/CLDR/issues/CLDR-13194
+            try {
+                canonicalId = ltc.transform(id);
+            } catch (NullPointerException e) {
+                // The ID is skipped rather than failed; it is printed so the skip is visible.
+                System.out.println("--> " + id);
+                continue;
+            }
+            // Need to maximize to work around:
+            // https://unicode-org.atlassian.net/projects/CLDR/issues/CLDR-13196
+            assertWithMessage("id=%s", id)
+                .that(regressionData.maximize(regressionData.replaceDeprecatedTags(id)).orElse(null))
+                .isEqualTo(likelySubtags.maximize(canonicalId));
+        }
+    }
+
+    /**
+     * Returns the sequence of IDs obtained by starting at {@code id} and repeatedly applying
+     * {@code fn} until "root" is reached. The result always ends with "root", and is just "root"
+     * if that is the starting ID.
+     */
+    private static Iterable<String> getIdChain(String id, Function<String, String> fn) {
+        List<String> chain = new ArrayList<>();
+        for (String current = id; ; current = fn.apply(current)) {
+            chain.add(current);
+            if (current.equals("root")) {
+                return chain;
+            }
+        }
+    }
+
+    // The locale IDs iterated over by the "_regression" tests in this class. Each ID is passed to
+    // both the SupplementalData API under test and the equivalent CLDR utility for comparison.
+    private static final ImmutableSet<String> TEST_LOCALE_IDS = ImmutableSet.of(
+        "af", "af_NA", "af_ZA", "agq", "agq_CM", "ak", "ak_GH", "am", "am_ET", "ar", "ar_001",
+        "ar_AE", "ar_BH", "ar_DJ", "ar_DZ", "ar_EG", "ar_EH", "ar_ER", "ar_IL", "ar_IQ", "ar_JO",
+        "ar_KM", "ar_KW", "ar_LB", "ar_LY", "ar_MA", "ar_MR", "ar_OM", "ar_PS", "ar_QA", "ar_SA",
+        "ar_SD", "ar_SO", "ar_SS", "ar_SY", "ar_TD", "ar_TN", "ar_YE", "ars", "as", "as_IN",
+        "asa", "asa_TZ", "ast", "ast_ES", "az", "az_AZ", "az_Cyrl", "az_Cyrl_AZ", "az_Latn",
+        "az_Latn_AZ", "bas", "bas_CM", "be", "be_BY", "bem", "bem_ZM", "bez", "bez_TZ", "bg",
+        "bg_BG", "bm", "bm_ML", "bn", "bn_BD", "bn_IN", "bo", "bo_CN", "bo_IN", "br", "br_FR",
+        "brx", "brx_IN", "bs", "bs_Cyrl", "bs_Cyrl_BA", "bs_Latn", "bs_Latn_BA", "bs_BA", "ca",
+        "ca_AD", "ca_ES", "ca_FR", "ca_IT", "ccp", "ccp_BD", "ccp_IN", "ce", "ce_RU", "ceb",
+        "ceb_PH", "cgg", "cgg_UG", "chr", "chr_US", "ckb", "ckb_IQ", "ckb_IR", "cs", "cs_CZ", "cy",
+        "cy_GB", "da", "da_DK", "da_GL", "dav", "dav_KE", "de", "de_AT", "de_BE", "de_CH", "de_DE",
+        "de_IT", "de_LI", "de_LU", "dje", "dje_NE", "dsb", "dsb_DE", "dua", "dua_CM", "dyo",
+        "dyo_SN", "dz", "dz_BT", "ebu", "ebu_KE", "ee", "ee_GH", "ee_TG", "el", "el_CY", "el_GR",
+        "en", "en_001", "en_150", "en_AE", "en_AG", "en_AI", "en_AS", "en_AT", "en_AU", "en_BB",
+        "en_BE", "en_BI", "en_BM", "en_BS", "en_BW", "en_BZ", "en_CA", "en_CC", "en_CH", "en_CK",
+        "en_CM", "en_CX", "en_CY", "en_DE", "en_DG", "en_DK", "en_DM", "en_ER", "en_FI", "en_FJ",
+        "en_FK", "en_FM", "en_GB", "en_GD", "en_GG", "en_GH", "en_GI", "en_GM", "en_GU", "en_GY",
+        "en_HK", "en_IE", "en_IL", "en_IM", "en_IN", "en_IO", "en_JE", "en_JM", "en_KE", "en_KI",
+        "en_KN", "en_KY", "en_LC", "en_LR", "en_LS", "en_MG", "en_MH", "en_MO", "en_MP", "en_MS",
+        "en_MT", "en_MU", "en_MW", "en_MY", "en_NA", "en_NF", "en_NG", "en_NL", "en_NR", "en_NU",
+        "en_NZ", "en_PG", "en_PH", "en_PK", "en_PN", "en_PR", "en_PW", "en_RH", "en_RW", "en_SB",
+        "en_SC", "en_SD", "en_SE", "en_SG", "en_SH", "en_SI", "en_SL", "en_SS", "en_SX", "en_SZ",
+        "en_TC", "en_TK", "en_TO", "en_TT", "en_TV", "en_TZ", "en_UG", "en_UM", "en_US",
+        "en_US_POSIX", "en_VC", "en_VG", "en_VI", "en_VU", "en_WS", "en_ZA", "en_ZM", "en_ZW", "eo",
+        "eo_001", "es", "es_003", "es_419", "es_AR", "es_BO", "es_BR", "es_BZ", "es_CL", "es_CO",
+        "es_CR", "es_CU", "es_DO", "es_EA", "es_EC", "es_ES", "es_GQ", "es_GT", "es_HN", "es_IC",
+        "es_MX", "es_NI", "es_PA", "es_PE", "es_PH", "es_PR", "es_PY", "es_SV", "es_US", "es_UY",
+        "es_VE", "et", "et_EE", "eu", "eu_ES", "ewo", "ewo_CM", "fa", "fa_AF", "fa_IR", "ff",
+        "ff_CM", "ff_GN", "ff_Latn", "ff_Latn_BF", "ff_Latn_CM", "ff_Latn_GH", "ff_Latn_GM",
+        "ff_Latn_GN", "ff_Latn_GW", "ff_Latn_LR", "ff_Latn_MR", "ff_Latn_NE", "ff_Latn_NG",
+        "ff_Latn_SL", "ff_Latn_SN", "ff_MR", "ff_SN", "fi", "fi_FI", "fil", "fil_PH", "fo", "fo_DK",
+        "fo_FO", "fr", "fr_BE", "fr_BF", "fr_BI", "fr_BJ", "fr_BL", "fr_CA", "fr_CD", "fr_CF",
+        "fr_CG", "fr_CH", "fr_CI", "fr_CM", "fr_DJ", "fr_DZ", "fr_FR", "fr_GA", "fr_GF", "fr_GN",
+        "fr_GP", "fr_GQ", "fr_HT", "fr_KM", "fr_LU", "fr_MA", "fr_MC", "fr_MF", "fr_MG", "fr_ML",
+        "fr_MQ", "fr_MR", "fr_MU", "fr_NC", "fr_NE", "fr_PF", "fr_PM", "fr_RE", "fr_RW", "fr_SC",
+        "fr_SN", "fr_SY", "fr_TD", "fr_TG", "fr_TN", "fr_VU", "fr_WF", "fr_YT", "fur", "fur_IT",
+        "fy", "fy_NL", "ga", "ga_IE", "gd", "gd_GB", "gl", "gl_ES", "gsw", "gsw_CH", "gsw_FR",
+        "gsw_LI", "gu", "gu_IN", "guz", "guz_KE", "gv", "gv_IM", "ha", "ha_GH", "ha_NE", "ha_NG",
+        "haw", "haw_US", "he", "he_IL", "hi", "hi_IN", "hr", "hr_BA", "hr_HR", "hsb", "hsb_DE",
+        "hu", "hu_HU", "hy", "hy_AM", "ia", "ia_001", "id", "id_ID", "ig", "ig_NG", "ii", "ii_CN",
+        "in", "in_ID", "is", "is_IS", "it", "it_CH", "it_IT", "it_SM", "it_VA", "iw", "iw_IL", "ja",
+        "ja_JP", "jgo", "jgo_CM", "jmc", "jmc_TZ", "jv", "jv_ID", "ka", "ka_GE", "kab", "kab_DZ",
+        "kam", "kam_KE", "kde", "kde_TZ", "kea", "kea_CV", "khq", "khq_ML", "ki", "ki_KE", "kk",
+        "kk_KZ", "kkj", "kkj_CM", "kl", "kl_GL", "kln", "kln_KE", "km", "km_KH", "kn", "kn_IN",
+        "ko", "ko_KP", "ko_KR", "kok", "kok_IN", "ks", "ks_IN", "ksb", "ksb_TZ", "ksf", "ksf_CM",
+        "ksh", "ksh_DE", "ku", "ku_TR", "kw", "kw_GB", "ky", "ky_KG", "lag", "lag_TZ", "lb",
+        "lb_LU", "lg", "lg_UG", "lkt", "lkt_US", "ln", "ln_AO", "ln_CD", "ln_CF", "ln_CG", "lo",
+        "lo_LA", "lrc", "lrc_IQ", "lrc_IR", "lt", "lt_LT", "lu", "lu_CD", "luo", "luo_KE", "luy",
+        "luy_KE", "lv", "lv_LV", "mas", "mas_KE", "mas_TZ", "mer", "mer_KE", "mfe", "mfe_MU", "mg",
+        "mg_MG", "mgh", "mgh_MZ", "mgo", "mgo_CM", "mi", "mi_NZ", "mk", "mk_MK", "ml", "ml_IN",
+        "mn", "mn_MN", "mo", "mr", "mr_IN", "ms", "ms_BN", "ms_MY", "ms_SG", "mt", "mt_MT", "mua",
+        "mua_CM", "my", "my_MM", "mzn", "mzn_IR", "naq", "naq_NA", "nb", "nb_NO", "nb_SJ", "nd",
+        "nd_ZW", "nds", "nds_DE", "nds_NL", "ne", "ne_IN", "ne_NP", "nl", "nl_AW", "nl_BE", "nl_BQ",
+        "nl_CW", "nl_NL", "nl_SR", "nl_SX", "nmg", "nmg_CM", "nn", "nn_NO", "nnh", "nnh_CM", "no",
+        "no_NO", "nus", "nus_SS", "nyn", "nyn_UG", "om", "om_ET", "om_KE", "or", "or_IN", "os",
+        "os_GE", "os_RU", "pa", "pa_Arab", "pa_Arab_PK", "pa_Guru", "pa_Guru_IN", "pa_IN", "pa_PK",
+        "pl", "pl_PL", "ps", "ps_AF", "ps_PK", "pt", "pt_AO", "pt_BR", "pt_CH", "pt_CV", "pt_GQ",
+        "pt_GW", "pt_LU", "pt_MO", "pt_MZ", "pt_PT", "pt_ST", "pt_TL", "qu", "qu_BO", "qu_EC",
+        "qu_PE", "rm", "rm_CH", "rn", "rn_BI", "ro", "ro_MD", "ro_RO", "rof", "rof_TZ", "ru",
+        "ru_BY", "ru_KG", "ru_KZ", "ru_MD", "ru_RU", "ru_UA", "rw", "rw_RW", "rwk", "rwk_TZ", "sah",
+        "sah_RU", "saq", "saq_KE", "sbp", "sbp_TZ", "sd", "sd_PK", "se", "se_FI", "se_NO", "se_SE",
+        "seh", "seh_MZ", "ses", "ses_ML", "sg", "sg_CF", "sh", "sh_BA", "sh_CS", "sh_YU", "shi",
+        "shi_Latn", "shi_Latn_MA", "shi_Tfng", "shi_Tfng_MA", "shi_MA", "si", "si_LK", "sk",
+        "sk_SK", "sl", "sl_SI", "smn", "smn_FI", "sn", "sn_ZW", "so", "so_DJ", "so_ET", "so_KE",
+        "so_SO", "sq", "sq_AL", "sq_MK", "sq_XK", "sr", "sr_Cyrl", "sr_Cyrl_BA", "sr_Cyrl_ME",
+        "sr_Cyrl_RS", "sr_Cyrl_CS", "sr_Cyrl_XK", "sr_Cyrl_YU", "sr_Latn", "sr_Latn_BA",
+        "sr_Latn_ME", "sr_Latn_RS", "sr_Latn_CS", "sr_Latn_XK", "sr_Latn_YU", "sr_BA", "sr_ME",
+        "sr_RS", "sr_CS", "sr_YU", "sv", "sv_AX", "sv_FI", "sv_SE", "sw", "sw_CD", "sw_KE", "sw_TZ",
+        "sw_UG", "ta", "ta_IN", "ta_LK", "ta_MY", "ta_SG", "te", "te_IN", "teo", "teo_KE", "teo_UG",
+        "tg", "tg_TJ", "th", "th_TH", "ti", "ti_ER", "ti_ET", "tk", "tk_TM", "tl", "tl_PH", "to",
+        "to_TO", "tr", "tr_CY", "tr_TR", "tt", "tt_RU", "twq", "twq_NE", "tzm", "tzm_MA", "ug",
+        "ug_CN", "uk", "uk_UA", "ur", "ur_IN", "ur_PK", "uz", "uz_AF", "uz_Arab", "uz_Arab_AF",
+        "uz_Cyrl", "uz_Cyrl_UZ", "uz_Latn", "uz_Latn_UZ", "uz_UZ", "vai", "vai_Latn", "vai_Latn_LR",
+        "vai_LR", "vai_Vaii", "vai_Vaii_LR", "vi", "vi_VN", "vun", "vun_TZ", "wae", "wae_CH", "wo",
+        "wo_SN", "xh", "xh_ZA", "xog", "xog_UG", "yav", "yav_CM", "yi", "yi_001", "yo", "yo_BJ",
+        "yo_NG", "yue", "yue_Hans", "yue_Hans_CN", "yue_Hant", "yue_Hant_HK", "zgh", "zgh_MA", "zh",
+        "zh_Hans", "zh_Hans_CN", "zh_Hans_HK", "zh_Hans_MO", "zh_Hans_SG", "zh_Hant", "zh_Hant_HK",
+        "zh_Hant_MO", "zh_Hant_TW", "zh_CN", "zh_HK", "zh_MO", "zh_SG", "zh_TW", "zu", "zu_ZA");
+
+    /** Returns a fake parentLocale value mapping the (space separated) locales to the parent. */
+    private static CldrValue parentLocales(String parent, String... locales) {
+        String localeList = Joiner.on(' ').join(locales);
+        return supplementalData(
+            "parentLocales/parentLocale[@parent=\"%s\"][@locales=\"%s\"]", parent, localeList);
+    }
+
+    /**
+     * Returns a fake calendarPreference value giving the calendar ordering for the (space
+     * separated) territories.
+     */
+    private static CldrValue defaultCalendar(String calendar, String... territories) {
+        String territoryList = Joiner.on(' ').join(territories);
+        return supplementalData(
+            "calendarPreferenceData/calendarPreference[@territories=\"%s\"][@ordering=\"%s\"]",
+            territoryList,
+            calendar);
+    }
+
+    /** Returns a fake likelySubtag value with the given "from" and "to" attributes. */
+    private static CldrValue likelySubtag(String from, String to) {
+        String pathTemplate = "likelySubtags/likelySubtag[@from=\"%s\"][@to=\"%s\"]";
+        return supplementalData(pathTemplate, from, to);
+    }
+
+    /** Returns a fake languageAlias value with the given type and replacement. */
+    private static CldrValue languageAlias(String type, String replacement) {
+        String pathTemplate = "metadata/alias/languageAlias[@type=\"%s\"][@replacement=\"%s\"]";
+        return supplementalData(pathTemplate, type, replacement);
+    }
+
+    /** Returns a fake scriptAlias value with the given type and replacement. */
+    private static CldrValue scriptAlias(String type, String replacement) {
+        String pathTemplate = "metadata/alias/scriptAlias[@type=\"%s\"][@replacement=\"%s\"]";
+        return supplementalData(pathTemplate, type, replacement);
+    }
+
+    /** Returns a fake territoryAlias value with the given type and replacement. */
+    private static CldrValue territoryAlias(String type, String replacement) {
+        String pathTemplate = "metadata/alias/territoryAlias[@type=\"%s\"][@replacement=\"%s\"]";
+        return supplementalData(pathTemplate, type, replacement);
+    }
+
+    /**
+     * Formats {@code path} (relative to "//supplementalData/") with the given arguments and parses
+     * the result as a CLDR value whose value string is empty.
+     */
+    private static CldrValue supplementalData(String path, Object... args) {
+        String fullPath = String.format("//supplementalData/" + path, args);
+        return parseValue(fullPath, "");
+    }
+
+    /** Creates a {@code SupplementalData} instance backed only by the given fake values. */
+    private static SupplementalData fakeSupplementalData(CldrValue... values) {
+        CldrDataSupplier fakeSupplier = CldrDataSupplier.forValues(asList(values));
+        return SupplementalData.create(fakeSupplier);
+    }
+}
diff --git a/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/regex/RegexTransformerTest.java b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/regex/RegexTransformerTest.java

new file mode 100644 (file)

index 0000000..1a66a2a
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/regex/RegexTransformerTest.java
@@ -0,0 +1,538 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.regex;
+
+import static com.google.common.truth.Truth.assertThat;
+import static java.util.Arrays.asList;
+import static org.unicode.icu.tool.cldrtoicu.testing.AssertUtils.assertThrows;
+import static org.unicode.icu.tool.cldrtoicu.testing.ResultSubjectFactory.assertThat;
+
+import java.util.List;
+
+import javax.annotation.concurrent.Immutable;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+import org.unicode.cldr.api.CldrPath;
+import org.unicode.cldr.api.CldrValue;
+import org.unicode.icu.tool.cldrtoicu.PathValueTransformer;
+import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.Result;
+import org.unicode.icu.tool.cldrtoicu.RbPath;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Iterables;
+
+/**
+ * Tests for the regex transformer class. Note that in most cases, the rules used here are taken
+ * directly from one of the config files, simply because it avoids having to invent valid paths
+ * for testing (and we still need "real" CLDR paths since the path parsing verifies attributes
+ * against the DTD metadata). Basing tests on real rules illustrates that all of these tests are
+ * asserting about relied-upon behaviour, however there is nothing inherently special about these
+ * paths.
+ */
+@RunWith(JUnit4.class)
+public class RegexTransformerTest {
+    /** Checks rules which produce one result and capture at most one group from the path. */
+    @Test
+    public void testSingleResults_singleCapture() {
+        PathValueTransformer rules = transformer(
+            "%A=[^\"']++",
+            "%W=[\\w\\-]++",
+            "//ldml/numbers/defaultNumberingSystem[@alt=\"(%A)\"] ; /NumberElements/default_$1",
+            "//ldml/numbers/defaultNumberingSystem                ; /NumberElements/default",
+            "//ldml/numbers/otherNumberingSystems/(%W)            ; /NumberElements/$1");
+
+        // No captured group.
+        ImmutableList<Result> defaultResults = rules.transform(
+            CldrValue.parseValue("//ldml/numbers/defaultNumberingSystem", "foobar"));
+        assertSingleResult(defaultResults, "NumberElements/default", "foobar");
+
+        // The "alt" attribute is captured and appended to the output path.
+        ImmutableList<Result> altResults = rules.transform(
+            CldrValue.parseValue("//ldml/numbers/defaultNumberingSystem[@alt=\"foo\"]", "bar"));
+        assertSingleResult(altResults, "NumberElements/default_foo", "bar");
+
+        // The element name is captured and used as the final output path segment.
+        ImmutableList<Result> otherResults = rules.transform(
+            CldrValue.parseValue("//ldml/numbers/otherNumberingSystems/finance", "foo bar"));
+        assertSingleResult(otherResults, "NumberElements/finance", "foo bar");
+    }
+
+    /** Checks a rule which produces one result and uses two captured groups in its output path. */
+    @Test
+    public void testSingleResults_multipleCapture() {
+        PathValueTransformer rules = transformer(
+            "%A=[^\"']++",
+            "//ldml/characters"
+                + "/parseLenients[@scope=\"(%A)\"][@level=\"(%A)\"]"
+                + "/parseLenient[@sample=\"%A\"]"
+                + " ; /parse/$1/$2");
+
+        // The "sample" attribute is matched but not captured, so it is absent from the output.
+        ImmutableList<Result> lenientResults = rules.transform(CldrValue.parseValue(
+            "//ldml/characters"
+                + "/parseLenients[@scope=\"general\"][@level=\"lenient\"]"
+                + "/parseLenient[@sample=\"ignored\"]",
+            "foo"));
+        assertSingleResult(lenientResults, "/parse/general/lenient", "foo");
+
+        ImmutableList<Result> stricterResults = rules.transform(CldrValue.parseValue(
+            "//ldml/characters"
+                + "/parseLenients[@scope=\"number\"][@level=\"stricter\"]"
+                + "/parseLenient[@sample=\"ignored\"]",
+            "bar"));
+        assertSingleResult(stricterResults, "/parse/number/stricter", "bar");
+    }
+
+    /** Checks that a path with several result specifications yields one result for each. */
+    @Test
+    public void testMultipleResults() {
+        PathValueTransformer rules = transformer(
+            "%A=[^\"']++",
+            "%W=[\\s\\w\\-/]++",
+            "//supplementalData/numberingSystems"
+                + "/numberingSystem[@type=\"numeric\"][@id=\"(%W)\"][@digits=\"(%A)\"]",
+            " ; /numberingSystems/$1/algorithmic:int ; values=0",
+            " ; /numberingSystems/$1/desc ; values=$2",
+            " ; /numberingSystems/$1/radix:int ; values=10");
+
+        CldrValue numberingSystem = CldrValue.parseValue(
+            "//supplementalData/numberingSystems"
+                + "/numberingSystem[@type=\"numeric\"][@id=\"foo\"][@digits=\"bar\"]",
+            "");
+        ImmutableList<Result> results = rules.transform(numberingSystem);
+        assertThat(results).hasSize(3);
+
+        // Results appear in the order of the result specifications above.
+        Result algorithmic = results.get(0);
+        assertThat(algorithmic).hasKey("/numberingSystems/foo/algorithmic:int");
+        assertThat(algorithmic).hasValues("0");
+        assertThat(algorithmic).isGrouped(false);
+
+        Result description = results.get(1);
+        assertThat(description).hasKey("/numberingSystems/foo/desc");
+        assertThat(description).hasValues("bar");
+        assertThat(description).isGrouped(false);
+
+        Result radix = results.get(2);
+        assertThat(radix).hasKey("/numberingSystems/foo/radix:int");
+        assertThat(radix).hasValues("10");
+        assertThat(radix).isGrouped(false);
+    }
+
+    /**
+     * Checks that a captured argument containing spaces produces one result per space separated
+     * part, and that the argument which is split is the first unquoted placeholder in the output
+     * path.
+     */
+    @Test
+    public void testImplicitArgumentSplitting() {
+        PathValueTransformer transformer = transformer(
+            "%A=[^\"']++",
+            "%W=[\\s\\w\\-/]++",
+            "//supplementalData/gender/personList[@type=\"(%W)\"][@locales=\"(%W)\"]"
+                + " ; /genderList/$2 ; values=$1",
+            "//supplementalData/windowsZones/mapTimezones"
+                + "/mapZone[@type=\"(%A)\"][@other=\"(%A)\"][@territory=\"(%W)\"]"
+                + " ; /mapTimezones/\"$2\"/$3 ; values=\"$1\"");
+
+        // Implicit splitting is based on the first unquoted placeholder in the output path ($2 in
+        // this case) and not the first captured group of the input path.
+        CldrValue personList = CldrValue.parseValue(
+            "//supplementalData/gender/personList[@type=\"neutral\"][@locales=\"xx yy zz\"]", "");
+        ImmutableList<Result> results = transformer.transform(personList);
+        assertThat(results).hasSize(3);
+        assertThat(results.get(0)).hasKey("/genderList/xx");
+        assertThat(results.get(0)).hasValues("neutral");
+        assertThat(results.get(1)).hasKey("/genderList/yy");
+        assertThat(results.get(1)).hasValues("neutral");
+        assertThat(results.get(2)).hasKey("/genderList/zz");
+        assertThat(results.get(2)).hasValues("neutral");
+
+        // Quoting prevents the first captured argument with spaces from triggering multiple
+        // results (it will trigger on the first un-quoted argument in the output path). This
+        // quoting must appear in the output however since spaces are "structural" in paths in
+        // ICU data files.
+        CldrValue mapZone = CldrValue.parseValue(
+            "//supplementalData/windowsZones/mapTimezones/mapZone"
+                + "[@type=\"foo\"]"
+                + "[@other=\"not split\"]"
+                + "[@territory=\"XX YY ZZ\"]",
+            "");
+        results = transformer.transform(mapZone);
+        assertThat(results).hasSize(3);
+        // Check the key and values of each result by its own index, so that the values of the
+        // first two results are verified as well as those of the last.
+        assertThat(results.get(0)).hasKey("/mapTimezones/\"not split\"/XX");
+        assertThat(results.get(0)).hasValues("foo");
+        assertThat(results.get(1)).hasKey("/mapTimezones/\"not split\"/YY");
+        assertThat(results.get(1)).hasValues("foo");
+        assertThat(results.get(2)).hasKey("/mapTimezones/\"not split\"/ZZ");
+        assertThat(results.get(2)).hasValues("foo");
+    }
+
+    /** Checks that values are split on spaces unless the placeholder in "values=" is quoted. */
+    @Test
+    public void testValueSplitting() {
+        PathValueTransformer rules = transformer(
+            "%A=[^\"']++",
+            "%W=[\\s\\w\\-/]++",
+            "//supplementalData/parentLocales/parentLocale[@parent=\"(%A)\"][@locales=\"(%A)\"]"
+                + " ; /parentLocales/$1 ; values=$2",
+            "//supplementalData/windowsZones/mapTimezones"
+                + "/mapZone[@type=\"(%A)\"][@other=\"(%A)\"][@territory=\"(%W)\"]"
+                + " ; /mapTimezones/\"$2\"/$3 ; values=\"$1\"");
+
+        // The value comes from an explicit (unquoted) values instruction, so it is split by space.
+        CldrValue parentLocale = CldrValue.parseValue(
+            "//supplementalData/parentLocales"
+                + "/parentLocale[@parent=\"foo\"][@locales=\"value is split\"]",
+            "");
+        ImmutableList<Result> splitResults = rules.transform(parentLocale);
+        assertSingleResult(splitResults, "/parentLocales/foo", "value", "is", "split");
+
+        // A placeholder which is quoted in the values instruction is not split.
+        CldrValue mapZone = CldrValue.parseValue(
+            "//supplementalData/windowsZones/mapTimezones/mapZone"
+                + "[@type=\"value is not split\"]"
+                + "[@other=\"foo\"]"
+                + "[@territory=\"XX\"]",
+            "");
+        ImmutableList<Result> unsplitResults = rules.transform(mapZone);
+        assertSingleResult(unsplitResults, "/mapTimezones/\"foo\"/XX", "value is not split");
+    }
+
+    /** Checks that named functions can be invoked from within a "values=" instruction. */
+    @Test
+    public void testResultFunctionCalling() {
+        List<String> ruleLines = asList(
+            "%A=[^\"']++",
+            "%W=[\\s\\w\\-/]++",
+            "//supplementalData/numberingSystems"
+                + "/numberingSystem[@type=\"(%W)\"][@id=\"(%W)\"][@rules=\"(%A)\"]",
+            " ; /numberingSystems/foo ; values=&swap( $1 , $2 ) $3",
+            " ; /numberingSystems/bar ; values=\"&swap( $1, quux )\"",
+            " ; /numberingSystems/baz ; values=\"&swap( $1-$2, $3{value} )\"");
+
+        CldrValue numberingSystem = CldrValue.parseValue(
+            "//supplementalData/numberingSystems"
+                + "/numberingSystem[@type=\"foo\"][@id=\"bar\"][@rules=\"baz\"]",
+            "-VALUE");
+
+        // Joining two arguments with a space is a rather trivial function, but it shows that the
+        // output of a function is still subject to value splitting unless it is quoted. In fact
+        // a common function (&ymd) splits year/month/day strings using spaces exactly so that
+        // they are treated as separate values.
+        // Note also that spaces around the arguments given to the function are ignored.
+        NamedFunction swapFn =
+            NamedFunction.create("swap", 2, args -> args.get(1) + " " + args.get(0));
+        PathValueTransformer rules = RegexTransformer.fromConfigLines(ruleLines, swapFn);
+        ImmutableList<Result> results = rules.transform(numberingSystem);
+
+        assertThat(results).hasSize(3);
+        // Unquoted: the swapped output "bar foo" is split into separate values.
+        assertThat(results.get(0)).hasValues("bar", "foo", "baz");
+        // Quoted: the swapped output stays a single value.
+        assertThat(results.get(1)).hasValues("quux foo");
+        assertThat(results.get(2)).hasValues("baz-VALUE foo-bar");
+    }
+
+    @Test
+    public void testResultFunctionCalling_edgeCases() {
+        List<String> configLines = asList(
+            "%A=[^\"']++",
+            "%W=[\\s\\w\\-/]++",
+            "//supplementalData/numberingSystems"
+                + "/numberingSystem[@type=\"(%W)\"][@id=\"(%W)\"][@rules=\"(%A)\"]",
+            " ; /numberingSystems/foo ; values=\"&join( {value} , $1 $2 $3, {value} )\"");
+
+        // This illustrates a fundamental problem with the way that quoting and splitting is
+        // defined in this config language. Splitting is always down after value substitution,
+        // which is just done as a single pass. This, if a value has a double-quote in it can
+        // upset the quoting behaviour in odd ways. Here it prevents the outermost quoting from
+        // working and results in multiple values where there should be one.
+        //
+        // To fix this, the implicit splitting should be replaced by a "split()" function and the
+        // rules should be parsed into something approximating a proper expression AST.
+        CldrValue badValue = CldrValue.parseValue(
+            "//supplementalData/numberingSystems"
+                + "/numberingSystem[@type=\"foo\"][@id=\"bar\"][@rules=\"baz\"]",
+            "<< \" >>");
+
+        NamedFunction joinFn =
+            NamedFunction.create("join", 3, args -> args.get(0) + args.get(1) + args.get(2));
+        PathValueTransformer transformer = RegexTransformer.fromConfigLines(configLines, joinFn);
+        ImmutableList<Result> results = transformer.transform(badValue);
+        // If outer quoting worked, this would be a single value, not five.
+        assertSingleResult(results, "/numberingSystems/foo", "<< ", ">>foo", "bar", "baz<<", " >>");
+    }
+
+    @Test
+    public void testDynamicVars() {
+        PathValueTransformer transformer = transformer(
+            "%W=[\\w\\-]++",
+            "%D=//ldml/numbers/defaultNumberingSystem",
+            "//ldml/numbers/currencyFormats[@numberSystem=\"%D\"]/currencySpacing/(%W)/(%W)",
+            " ; /currencySpacing/$1/$2");
+        CldrValue cldrValue = CldrValue.parseValue(
+            "//ldml/numbers/currencyFormats[@numberSystem=\"latn\"]"
+                + "/currencySpacing/beforeCurrency/currencyMatch",
+            "format");
+        // The path we expect to be resolved by the dynamic variable function.
+        CldrPath expectedPath =
+            CldrPath.parseDistinguishingPath("//ldml/numbers/defaultNumberingSystem");
+        ImmutableList<Result> format = transformer.transform(cldrValue, p -> {
+            assertThat(p).isEqualTo(expectedPath);
+            return "latn";
+        });
+        assertSingleResult(format, "/currencySpacing/beforeCurrency/currencyMatch", "format");
+    }
+
+    @Test
+    public void testFallbacks_simple() {
+        PathValueTransformer transformer = transformer(
+            "%W=[\\w\\-/]++",
+            "//ldml/numbers/currencies/currency[@type=\"(%W)\"]/symbol"
+                + " ; /Currencies/$1 ; fallback=$1",
+            "//ldml/numbers/currencies/currency[@type=\"(%W)\"]/displayName"
+                + " ; /Currencies/$1 ; fallback=$1");
+
+        ImmutableList<Result> symbol = transformer.transform(
+            CldrValue.parseValue(
+                "//ldml/numbers/currencies/currency[@type=\"Foo\"]/symbol", "symbol"));
+        assertSingleResult(symbol, "Currencies/Foo", "symbol");
+        ImmutableList<Result> name = transformer.transform(
+            CldrValue.parseValue(
+                "//ldml/numbers/currencies/currency[@type=\"Foo\"]/displayName", "name"));
+        assertSingleResult(name, "Currencies/Foo", "name");
+
+        RbPath rbPath = RbPath.of("Currencies", "Foo");
+        ImmutableList<Result> fallbacks = transformer.getFallbackResultsFor(rbPath, p -> null);
+        assertThat(fallbacks).hasSize(2);
+
+        // Both fallbacks look like they are equal, but they didn't come from the same rule...
+        assertThat(fallbacks.get(0)).hasKey(rbPath);
+        assertThat(fallbacks.get(0)).hasValues("Foo");
+        assertThat(fallbacks.get(1)).hasKey(rbPath);
+        assertThat(fallbacks.get(1)).hasValues("Foo");
+
+        // ... so they correspond to different matched results.
+        assertThat(fallbacks.get(0).isFallbackFor(symbol.get(0))).isTrue();
+        assertThat(fallbacks.get(1).isFallbackFor(symbol.get(0))).isFalse();
+
+        assertThat(fallbacks.get(0).isFallbackFor(name.get(0))).isFalse();
+        assertThat(fallbacks.get(1).isFallbackFor(name.get(0))).isTrue();
+
+        // And they are ordered by their appearance in the configuration file.
+        assertThat(fallbacks.get(0)).isLessThan(fallbacks.get(1));
+
+        // BUT (and this is important) the fallback results are "equal". This is necessary for
+        // other situations where results are generated from different rules but should be
+        // considered "equal" for purposes of deduplication. Deduplication doesn't affect this
+        // situation though (but it's worth being explicit in this test). This is all a bit subtle
+        // and should be fixed properly at some point. See also "testBaseXpath()".
+        assertThat(fallbacks.get(0)).isEqualTo(fallbacks.get(1));
+    }
+
+    @Test
+    public void testFallbacks_multipleArgs() {
+        PathValueTransformer transformer = transformer(
+            "%W=[\\s\\w\\-/]++",
+            "//supplementalData/calendarData"
+                + "/calendar[@type=\"(%W)\"]/eras/era[@type=\"(%W)\"][@(start|end)=\"(%A)\"]",
+            " ; /fake/$2/$4/$1/$3 ; fallback=$1 $2 $3 $4 $3 $2 $1");
+        // Path elements match the $N indices so it's easy to see how reordering happens.
+        RbPath rbPath = RbPath.of("fake", "two", "four", "one", "three");
+        // This shows that the capturing of arguments done on the resource bundle path for the
+        // fallback correctly reordered the arguments. Having this many reordered arguments in a
+        // fallback is not something that really happens in the actual config files currently, but
+        // it's complex logic and needs to be tested. Note also how captured arguments can appear
+        // multiple times in the result.
+        assertSingleResult(
+            transformer.getFallbackResultsFor(rbPath, p -> null),
+            rbPath,
+            "one", "two", "three", "four", "three", "two", "one");
+    }
+
+    @Test
+    public void testFallbacks_valueSplitting() {
+        PathValueTransformer transformer = transformer(
+            "%A=[^\"']++",
+            "//supplementalData/likelySubtags/likelySubtag[@from=\"(%A)\"][@to=\"(%A)\"]",
+            " ; /fake/$1/$2 ; fallback=$1 and $2");
+
+        RbPath rbPath = RbPath.of("fake", "Foo", "Bar");
+        ImmutableList<Result> fallbacks = transformer.getFallbackResultsFor(rbPath, p -> null);
+        assertSingleResult(fallbacks, rbPath, "Foo", "and", "Bar");
+    }
+
+    @Test
+    public void testFallbacks_missingArgs() {
+        IllegalStateException e = assertThrows(
+            IllegalStateException.class,
+            () -> transformer(
+                "%A=[^\"']++",
+                "//supplementalData/likelySubtags/likelySubtag[@from=\"(%A)\"][@to=\"(%A)\"]",
+                " ; /$1 ; fallback=$2"));
+        // A bit brittle, but this message is important for debugging.
+        assertThat(e).hasMessageThat()
+            .contains("fallback values may only contain arguments from the resource bundle path");
+        assertThat(e).hasMessageThat().contains("$2");
+    }
+
+    @Test
+    public void testFallbacks_noValueSubstitution() {
+        PathValueTransformer transformer = transformer(
+            "%A=[^\"']++",
+            "//supplementalData/likelySubtags/likelySubtag[@from=\"(%A)\"][@to=\"(%A)\"]",
+            " ; /$1 ; fallback=$1-{value}");
+
+        RbPath rbPath = RbPath.of("Foo");
+        ImmutableList<Result> fallbacks = transformer.getFallbackResultsFor(rbPath, p -> null);
+        // The {value} token is not substituted in a fallback because there is not value.
+        // TODO: Make this into an error (since it's only ever going to happen by mistake)!
+        assertSingleResult(fallbacks, rbPath, "Foo-{value}");
+    }
+
+    @Test
+    public void testFallbacks_noQuotingSupport() {
+        PathValueTransformer transformer = transformer(
+            "%A=[^\"']++",
+            "//supplementalData/likelySubtags/likelySubtag[@from=\"(%A)\"][@to=\"(%A)\"]",
+            " ; /fake/$1 ; fallback=\"$1\"");
+
+        RbPath rbPath = RbPath.of("fake", "Foo");
+        ImmutableList<Result> fallbacks = transformer.getFallbackResultsFor(rbPath, p -> null);
+        // Fallbacks could support quoting of placeholders, but to match legacy behaviour,
+        // they don't yet. As it is you cannot prevent fallback values being split on spaces.
+        assertSingleResult(fallbacks, rbPath, "\"Foo\"");
+    }
+
+    @Test
+    public void testHiddenLabelsAndMetazones() {
+        PathValueTransformer transformer = transformer(
+            "%A=[^\"']++",
+            "%W=[\\s\\w\\-/]++",
+            "//supplementalData/metaZones/metazoneInfo"
+                + "/timezone[@type=\"(%W)\"]/usesMetazone[@mzone=\"(%W)\"]"
+                + " ; /metazoneInfo/\"$1\"/<$2> ; values=$2",
+            "//supplementalData/metaZones/metazoneInfo"
+                + "/timezone[@type=\"(%W)\"]/usesMetazone[@to=\"(%A)\"][@mzone=\"(%W)\"]"
+                + " ; /metazoneInfo/\"$1\"/<1970-01-01 00:00> ; values=$3 \"1970-01-01 00:00\" \"$2\"");
+
+        ImmutableList<Result> parisTz = transformPath(
+            transformer,
+            "//supplementalData/metaZones/metazoneInfo"
+                + "/timezone[@type=\"Europe/Paris\"]/usesMetazone[@mzone=\"Europe_Central\"]");
+
+        // The conversion from "Europe/Paris" to "Europe:Paris" is a built in special case when
+        // quoting values with '/' in. It's only actually necessary for these timezone identifiers,
+        // but the code is applied everywhere since that's easier. Ideally there'd be something
+        // like the function calling mechanism to make this transformation explicit, but at the
+        // moment, the output resource bunder paths have no way to control the transformation of
+        // substituted arguments, so it has to be built in.
+        assertSingleResult(
+            parisTz, "/metazoneInfo/\"Europe:Paris\"/<Europe_Central>", "Europe_Central");
+
+        ImmutableList<Result> britishTz = transformPath(
+            transformer,
+            "//supplementalData/metaZones/metazoneInfo"
+                + "/timezone[@type=\"Europe/London\"]"
+                + "/usesMetazone[@to=\"1971-10-31 02:00\"][@mzone=\"Europe_Central\"]");
+
+        // This example demonstrates that things like ' ' or ':' (normally prohibited in resource
+        // bundle path elements) are acceptable in hidden labels, since those will be stripped out
+        // while writing the resulting data file. The date-time values are quoted in the rule to
+        // ensure they are not split.
+        assertSingleResult(
+            britishTz,
+            "/metazoneInfo/\"Europe:London\"/<1970-01-01 00:00>",
+            "Europe_Central", "1970-01-01 00:00", "1971-10-31 02:00");
+    }
+
+    @Test
+    public void testBaseXpath() {
+        PathValueTransformer transformer = transformer(
+            "%W=[\\s\\w\\-/]++",
+            "%N=[\\d\\.]++",
+
+            // In the real data, these rules define multiple results which reflect the actual
+            // differences in the child elements, but the one tested is is only based on the
+            // <territory> path prefix, which is the same for many child elements (which is all
+            // that's ever actually transformed).
+            //
+            // So for a single path prefix you'll generate multiple identical results which need
+            // to be de-duplicated, which can only happen if they are considered to have come
+            // from the same source (since duplicate results happen all the time in general).
+            //
+            // This is what the base xpath does, it fakes a different source CLDR path which makes
+            // the results "equal" (even though they came from different CLDR paths sources).
+            "//supplementalData/territoryInfo"
+                + "/territory[@type=\"(%W)\"][@gdp=\"(%N)\"][@literacyPercent=\"(%N)\"][@population=\"(%N)\"]"
+                + "/languagePopulation[@type=\"(%W)\"][@populationPercent=\"(%N)\"]",
+            " ; /territoryInfo/$1/territoryF:intvector"
+                + " ; values=$2 $3 $4"
+                + " ; base_xpath=//supplementalData/territoryInfo/territory[@type=\"$1\"]",
+
+            // Same thing but with child element containing "writingPercent".
+            "//supplementalData/territoryInfo"
+                + "/territory[@type=\"(%W)\"][@gdp=\"(%N)\"][@literacyPercent=\"(%N)\"][@population=\"(%N)\"]"
+                + "/languagePopulation[@type=\"(%W)\"][@writingPercent=\"(%N)\"][@populationPercent=\"(%N)\"]",
+            " ; /territoryInfo/$1/territoryF:intvector"
+                + " ; values=$2 $3 $4"
+                + " ; base_xpath=//supplementalData/territoryInfo/territory[@type=\"$1\"]");
+
+        String commonPrefix =
+            "//supplementalData/territoryInfo"
+                + "/territory[@type=\"CI\"][@gdp=\"97160000000\"][@literacyPercent=\"57\"][@population=\"26260600\"]";
+
+        ImmutableList<Result> firstResult = transformPath(
+            transformer,
+            commonPrefix + "/languagePopulation[@type=\"kfo\"][@populationPercent=\"0.3\"]");
+
+        ImmutableList<Result> secondResult = transformPath(
+            transformer,
+            commonPrefix + "/languagePopulation[@type=\"sef\"][@writingPercent=\"5\"][@populationPercent=\"4\"]");
+
+        assertSingleResult(
+            firstResult, "/territoryInfo/CI/territoryF:intvector", "97160000000", "57", "26260600");
+        assertSingleResult(
+            secondResult, "/territoryInfo/CI/territoryF:intvector", "97160000000", "57", "26260600");
+
+        // Even though they come from different rules, these results are treated as interchangeably
+        // equal because the base path is the same. Without the base path this would not be equal.
+        assertThat(firstResult).isEqualTo(secondResult);
+    }
+
+    @Test
+    public void testResultGrouping() {
+        PathValueTransformer transformer = transformer(
+            "%W=[\\w\\-/]++",
+            "//ldml/numbers/currencies/currency[@type=\"(%W)\"]/symbol ; /Currencies/$1",
+            "//ldml/numbers/currencies/currency[@type=\"(%W)\"]/decimal ; /Currencies/$1 ; group");
+
+        Result ungrouped = transformSingleResult(
+            transformer, "//ldml/numbers/currencies/currency[@type=\"USD\"]/symbol", "$");
+        Result grouped = transformSingleResult(
+            transformer, "//ldml/numbers/currencies/currency[@type=\"USD\"]/decimal", ".");
+
+        // Note that grouping is important for some data, but isn't very interesting at the basic
+        // transformation level (it's just a bit). It's only interesting when the converter
+        // combines multiple results together.
+        assertThat(ungrouped).isGrouped(false);
+        assertThat(grouped).isGrouped(true);
+    }
+
+    /** Builds a transformer from the given configuration lines, with no extra functions. */
+    private static PathValueTransformer transformer(String... configLines) {
+        List<String> lines = asList(configLines);
+        return RegexTransformer.fromConfigLines(lines);
+    }
+
+    /** Transforms the given CLDR path, paired with an empty value, and returns all results. */
+    private static ImmutableList<Result> transformPath(
+        PathValueTransformer transformer, String cldrPath) {
+
+        CldrValue emptyValue = CldrValue.parseValue(cldrPath, "");
+        return transformer.transform(emptyValue);
+    }
+
+    /** Transforms the given path and value, asserting that exactly one result is produced. */
+    private static Result transformSingleResult(
+        PathValueTransformer transformer, String path, String value) {
+
+        CldrValue cldrValue = CldrValue.parseValue(path, value);
+        ImmutableList<Result> transformed = transformer.transform(cldrValue);
+        assertThat(transformed).hasSize(1);
+        return transformed.get(0);
+    }
+
+    /** Asserts that there is exactly one, ungrouped, result with the given key and values. */
+    private static void assertSingleResult(List<Result> results, RbPath path, String... values) {
+        assertThat(results).hasSize(1);
+        Result only = results.get(0);
+        assertThat(only).isGrouped(false);
+        assertThat(only).hasKey(path);
+        assertThat(only).hasValues(values);
+    }
+
+    /** As above, but the expected key is given as a string and parsed into a path. */
+    private static void assertSingleResult(List<Result> results, String path, String... values) {
+        RbPath expectedKey = RbPath.parse(path);
+        assertSingleResult(results, expectedKey, values);
+    }
+}
diff --git a/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/testing/AssertUtils.java b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/testing/AssertUtils.java

new file mode 100644 (file)

index 0000000..d807fb1
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/testing/AssertUtils.java
@@ -0,0 +1,29 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.testing;
+
+import static org.junit.Assert.fail;
+
+/** Static assertion helpers (some of which can be removed if JUnit version is updated). */
+public final class AssertUtils {
+    // Functional interface acting as a lambda target.
+    public interface CheckedRunnable<T extends Throwable> {
+        void run() throws T;
+    }
+
+    /** Asserts that an exception is thrown by a given runnable. */
+    public static <T extends Throwable> T assertThrows(Class<T> cls, CheckedRunnable<T> fn) {
+        try {
+            fn.run();
+        } catch (Throwable t) {
+            if (cls.isInstance(t)) {
+                return cls.cast(t);
+            }
+            fail("expected " + cls.getName() + " but got " + t.getClass().getName());
+        }
+        fail("expected " + cls.getName() + " but nothing was thrown");
+        throw new AssertionError("unreachable!");
+    }
+
+    private AssertUtils() {}
+}
diff --git a/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/testing/RbPathSubject.java b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/testing/RbPathSubject.java

new file mode 100644 (file)

index 0000000..ed12342
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/testing/RbPathSubject.java
@@ -0,0 +1,33 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.testing;
+
+import static com.google.common.base.Preconditions.checkArgument;
+
+import com.google.common.truth.FailureMetadata;
+import com.google.common.truth.Subject;
+import org.unicode.icu.tool.cldrtoicu.RbPath;
+
+public final class RbPathSubject extends Subject {
+    // For use when chaining from other subjects.
+    public static Subject.Factory<RbPathSubject, RbPath> rbPaths() {
+        return RbPathSubject::new;
+    }
+
+    private final RbPath actual;
+
+    protected RbPathSubject(FailureMetadata metadata, RbPath actual) {
+        super(metadata, actual);
+        this.actual = actual;
+    }
+
+    /** Asserts the value of the path, as segments (use this if a segment can contain '/'). */
+    public final void hasSegments(String... segments) {
+        check("<segments>").that(actual).isEqualTo(RbPath.of(segments));
+    }
+
+    public final void hasLength(int n) {
+        checkArgument(n >= 0, "invalid path length: %s", n);
+        check("length()").that(actual.length()).isEqualTo(n);
+    }
+}
diff --git a/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/testing/RbPathSubjectFactory.java b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/testing/RbPathSubjectFactory.java

new file mode 100644 (file)

index 0000000..537b4bb
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/testing/RbPathSubjectFactory.java
@@ -0,0 +1,22 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.testing;
+
+import com.google.common.truth.FailureMetadata;
+import com.google.common.truth.Subject;
+import com.google.common.truth.Truth;
+import org.unicode.icu.tool.cldrtoicu.RbPath;
+
+/** Truth subject for asserting about resource bundle paths (makes tests much more readable). */
+public final class RbPathSubjectFactory implements Subject.Factory<RbPathSubject, RbPath> {
+    public static RbPathSubject assertThat(RbPath result) {
+        return Truth.assertAbout(new RbPathSubjectFactory()).that(result);
+    }
+
+    @Override
+    public RbPathSubject createSubject(FailureMetadata failureMetadata, RbPath that) {
+        return new RbPathSubject(failureMetadata, that);
+    }
+
+    RbPathSubjectFactory() {}
+}
+\ No newline at end of file
diff --git a/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/testing/ResultSubject.java b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/testing/ResultSubject.java

new file mode 100644 (file)

index 0000000..dc23ee0
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/testing/ResultSubject.java
@@ -0,0 +1,53 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.testing;
+
+import static com.google.common.base.Preconditions.checkNotNull;
+
+import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.Result;
+import org.unicode.icu.tool.cldrtoicu.RbPath;
+
+import com.google.common.truth.ComparableSubject;
+import com.google.common.truth.FailureMetadata;
+import com.google.common.truth.IterableSubject;
+import com.google.common.truth.Subject;
+
+public final class ResultSubject extends ComparableSubject<Result> {
+    // For use when chaining from other subjects.
+    public static Subject.Factory<ResultSubject, Result> results() {
+        return ResultSubject::new;
+    }
+
+    private final Result actual;
+
+    protected ResultSubject(FailureMetadata metadata, Result result) {
+        super(metadata, checkNotNull(result));
+        this.actual = result;
+    }
+
+    public final void isGrouped(boolean grouped) {
+        if (grouped != actual.isGrouped()) {
+            check("isGrouped()").that(actual.isGrouped()).isEqualTo(grouped);
+        }
+    }
+
+    public final IterableSubject hasValueListThat() {
+        return check("getValues()").that(actual.getValues());
+    }
+
+    public final void hasValues(String... values) {
+        hasValueListThat().containsExactlyElementsIn(values);
+    }
+
+    public final RbPathSubject hasKeyThat() {
+        return check("getKey()").about(RbPathSubject.rbPaths()).that(actual.getKey());
+    }
+
+    public final void hasKey(RbPath path) {
+        hasKeyThat().isEqualTo(path);
+    }
+
+    public final void hasKey(String path) {
+        hasKey(RbPath.parse(path));
+    }
+}
diff --git a/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/testing/ResultSubjectFactory.java b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/testing/ResultSubjectFactory.java

new file mode 100644 (file)

index 0000000..a8a2f8f
--- /dev/null
+++ b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/testing/ResultSubjectFactory.java
@@ -0,0 +1,22 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.testing;
+
+import com.google.common.truth.FailureMetadata;
+import com.google.common.truth.Subject;
+import com.google.common.truth.Truth;
+import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.Result;
+
+/** Truth subject for asserting about transformation results (makes tests much more readable). */
+public class ResultSubjectFactory implements Subject.Factory<ResultSubject, Result> {
+    public static ResultSubject assertThat(Result result) {
+        return Truth.assertAbout(new ResultSubjectFactory()).that(result);
+    }
+
+    @Override
+    public ResultSubject createSubject(FailureMetadata failureMetadata, Result that) {
+        return new ResultSubject(failureMetadata, that);
+    }
+
+    private ResultSubjectFactory() {}
+}
+\ No newline at end of file
author	David Beaumont <dbeaumont@google.com>
	Sat, 24 Aug 2019 15:14:52 +0000 (15:14 +0000)
committer	Markus Scherer <markus.icu@gmail.com>
	Tue, 27 Aug 2019 17:28:01 +0000 (10:28 -0700)
tools/cldr/cldr-to-icu/.gitignore	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/README.txt	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/lib/README.txt	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/pom.xml	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/IcuConverterConfig.java	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/IcuData.java	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/IcuDataDumper.java	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/IcuFunctions.java	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/IcuTextWriter.java	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/LdmlConverter.java	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/LdmlConverterConfig.java	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/PathMatcher.java	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/PathValueTransformer.java	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/RbPath.java	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/RbValue.java	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/SupplementalData.java	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/Bcp47Mapper.java	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/BreakIteratorMapper.java	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/CollationMapper.java	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/DayPeriodsMapper.java	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/LocaleMapper.java	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/PluralRangesMapper.java	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/PluralsMapper.java	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/RbnfMapper.java	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/SupplementalMapper.java	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/TransformsMapper.java	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/regex/Instruction.java	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/regex/NamedFunction.java	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/regex/RegexTransformer.java	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/regex/ResultSpec.java	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/regex/Rule.java	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/regex/RuleParser.java	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/regex/VarString.java	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/main/resources/ldml2icu_header.txt	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/main/resources/ldml2icu_locale.txt	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/main/resources/ldml2icu_readme.txt	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/main/resources/ldml2icu_supplemental.txt	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/PathMatcherTest.java	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/RbPathTest.java	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/SupplementalDataTest.java	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/regex/RegexTransformerTest.java	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/testing/AssertUtils.java	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/testing/RbPathSubject.java	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/testing/RbPathSubjectFactory.java	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/testing/ResultSubject.java	[new file with mode: 0644]	patch \| blob
tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/testing/ResultSubjectFactory.java	[new file with mode: 0644]	patch \| blob