--- /dev/null
+# Exclude the Maven local repository but keep the lib directory and the top-level readme.
+/lib/**
+!/lib/README.txt
+
+# Ignore the default Maven target directory.
+/target
+
--- /dev/null
+*********************************************************************
+*** © 2019 and later: Unicode, Inc. and others. ***
+*** License & terms of use: http://www.unicode.org/copyright.html ***
+*********************************************************************
+
+Basic instructions for running the LdmlConverter via Maven
+==========================================================
+
+Note that these instructions do not currently support configuration of the converter for things
+such as limiting the set of files produced. That is supported in code and could be easily added
+to the binary, or encapsulated via an Ant task, but currently it is not directly supported.
+See the IcuConverterConfig class for the API by which this can be supported.
+
+
+Important directories
+---------------------
+
+<CLDR_DIR> = The root directory of the CLDR release.
+
+<ICU_DIR> = The root directory of the ICU release (probably a parent directory of where
+ this README file is located). This is an optional property and defaults to
+ the parent directory of the release from which it is run.
+
+<DTD_CACHE> = The temporary cache directory in which DTD files are downloaded (this is the
+ same directory as would be used when running tools from the CLDR project).
+ Note that the need to specify this directory is scheduled to be removed after
+ ICU release 65.
+
+<OUT_DIR> = The output directory into which ICU data files should be written.
+
+
+Generating all ICU data
+-----------------------
+
+$ mvn exec:java \
+ -DCLDR_DIR='<CLDR_DIR>' \
+ -DCLDR_DTD_CACHE='<DTD_CACHE>' \
+ -Dexec.args='<OUT_DIR>'
+
+
+Running unit tests
+------------------
+
+$ mvn test \
+ -DCLDR_DIR='<CLDR_DIR>' \
+ -DCLDR_DTD_CACHE='<DTD_CACHE>'
+
+
+Importing and running from an IDE
+---------------------------------
+
+This project should be easy to import into an IDE which supports Maven development, such
+as IntelliJ or Eclipse. It uses a local Maven repository directory for the unpublished
+CLDR libraries (which are included in the project), but otherwise gets all dependencies
+via Maven's public repositories.
\ No newline at end of file
--- /dev/null
+*********************************************************************
+*** © 2019 and later: Unicode, Inc. and others. ***
+*** License & terms of use: http://www.unicode.org/copyright.html ***
+*********************************************************************
+
+What is this directory and why is it empty?
+-------------------------------------------
+
+This is the root of a local Maven repository which needs to be populated before the
+code in this project can be executed.
+
+To do this, you need to have a local copy of the CLDR project configured on your
+computer and be able to build the API jar file and copy an existing utility
+jar file. In the examples below it is assumed that <CLDR_ROOT> references this CLDR
+release.
+
+
+Regenerating the CLDR API jar
+-----------------------------
+
+To regenerate the CLDR API jar you need to build the "jar" target using the Ant
+build.xml file in the "tools/java" directory of the CLDR project:
+
+$ cd <CLDR_ROOT>/tools/java
+$ ant clean jar
+
+This should result in the cldr.jar file being built into that directory, which can then
+be installed as a Maven dependency as described below.
+
+
+Updating local Maven repository
+-------------------------------
+
+To update the local Maven repository (e.g. to install the CLDR jar), run the following
+from this directory (lib/):
+
+$ mvn install:install-file \
+ -DgroupId=org.unicode.cldr \
+ -DartifactId=cldr-api \
+ -Dversion=0.1-SNAPSHOT \
+ -Dpackaging=jar \
+ -DgeneratePom=true \
+ -DlocalRepositoryPath=. \
+ -Dfile=<CLDR_ROOT>/tools/java/cldr.jar
+
+And also (for the utility jar):
+
+$ mvn install:install-file \
+ -DgroupId=com.ibm.icu \
+ -DartifactId=icu-utilities \
+ -Dversion=0.1-SNAPSHOT \
+ -Dpackaging=jar \
+ -DgeneratePom=true \
+ -DlocalRepositoryPath=. \
+ -Dfile=<CLDR_ROOT>/tools/java/libs/utilities.jar
+
+And if you have updated one of these libraries, run:
+
+$ mvn dependency:purge-local-repository -DsnapshotsOnly=true
+
+If you choose to update the version number, then remember to update the root pom.xml.
--- /dev/null
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- © 2019 and later: Unicode, Inc. and others.
+ License & terms of use: http://www.unicode.org/copyright.html
+ See README.txt for instructions on updating the local repository.
+ -->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <groupId>org.unicode.icu</groupId>
+ <artifactId>cldr-to-icu</artifactId>
+ <version>1.0-SNAPSHOT</version>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <version>3.5.1</version>
+ <configuration>
+ <source>8</source>
+ <target>8</target>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.codehaus.mojo</groupId>
+ <artifactId>exec-maven-plugin</artifactId>
+ <configuration>
+ <mainClass>org.unicode.icu.tool.cldrtoicu.LdmlConverter</mainClass>
+ <systemProperties>
+ <property>
+ <key>ICU_DIR</key>
+ <value>${project.basedir}/../../..</value>
+ </property>
+ </systemProperties>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+
+ <!-- This is where the snapshots of the CLDR API and additional auxiliary jars are held. -->
+ <repositories>
+ <repository>
+ <id>local-maven-repo</id>
+ <url>file:///${project.basedir}/lib</url>
+ </repository>
+ </repositories>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.unicode.cldr</groupId>
+ <artifactId>cldr-api</artifactId>
+ <version>0.1-SNAPSHOT</version>
+ </dependency>
+ <dependency>
+ <groupId>com.ibm.icu</groupId>
+ <artifactId>icu-utilities</artifactId>
+ <version>0.1-SNAPSHOT</version>
+ </dependency>
+ <dependency>
+ <groupId>com.ibm.icu</groupId>
+ <artifactId>icu4j</artifactId>
+ <version>64.2</version>
+ </dependency>
+ <dependency>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava</artifactId>
+ <version>27.1-jre</version>
+ </dependency>
+ <dependency>
+ <groupId>com.google.truth</groupId>
+ <artifactId>truth</artifactId>
+ <version>1.0</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>com.google.truth.extensions</groupId>
+ <artifactId>truth-java8-extension</artifactId>
+ <version>1.0</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+</project>
\ No newline at end of file
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkNotNull;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Arrays;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Set;
+
+import org.unicode.cldr.api.CldrDraftStatus;
+
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.ImmutableSet;
+import org.unicode.icu.tool.cldrtoicu.LdmlConverter.OutputType;
+
+/**
+ * The converter config intended to generate the standard ICU data files. This used to be something
+ * that was configured by text files such as "icu-locale-deprecates.xml" and "icu-config.xml".
+ */
+public final class IcuConverterConfig implements LdmlConverterConfig {
+
+ private static final Optional<Path> DEFAULT_CLDR_DIR =
+ Optional.ofNullable(System.getProperty("CLDR_DIR", null))
+ .map(d -> Paths.get(d).toAbsolutePath());
+
+ private static final Optional<Path> DEFAULT_ICU_DIR =
+ Optional.ofNullable(System.getProperty("ICU_DIR", null))
+ .map(d -> Paths.get(d).toAbsolutePath());
+
+ /** The builder with which to specify configuration for the {@link LdmlConverter}. */
+ public static final class Builder {
+     // Defaults are derived from the CLDR_DIR and ICU_DIR system properties (when set); any
+     // value left as null here must be supplied via the corresponding setter before build().
+     private Path cldrDir = DEFAULT_CLDR_DIR.orElse(null);
+     private Path outputDir =
+         DEFAULT_ICU_DIR.map(d -> d.resolve("icu4c/source/data")).orElse(null);
+     private Path specialsDir =
+         DEFAULT_ICU_DIR.map(d -> d.resolve("icu4c/source/data/xml")).orElse(null);
+     private ImmutableSet<OutputType> outputTypes = OutputType.ALL;
+     private CldrDraftStatus minimalDraftStatus = CldrDraftStatus.CONTRIBUTED;
+     private boolean emitReport = false;
+
+     /**
+      * Sets the CLDR base directory from which to load all CLDR data. This is optional if the
+      * {@code CLDR_DIR} system property is set, which will be used instead.
+      *
+      * @param cldrDir the CLDR base directory (converted to an absolute path).
+      * @return this builder.
+      * @throws NullPointerException if {@code cldrDir} is null.
+      */
+     public Builder setCldrDir(Path cldrDir) {
+         // Check for null before dereferencing, so the null check is actually meaningful.
+         this.cldrDir = checkNotNull(cldrDir).toAbsolutePath();
+         return this;
+     }
+
+     /**
+      * Sets the output directory in which the ICU data directories and files will go. This is
+      * optional if the {@code ICU_DIR} system property is set, which will be used to generate
+      * the path instead (i.e. {@code "icu4c/source/data"} inside the ICU release directory).
+      *
+      * @param outputDir the output directory.
+      * @return this builder.
+      * @throws NullPointerException if {@code outputDir} is null.
+      */
+     public Builder setOutputDir(Path outputDir) {
+         this.outputDir = checkNotNull(outputDir);
+         return this;
+     }
+
+     /**
+      * Sets the "specials" directory containing additional ICU specific data to be processed.
+      * This is optional if the {@code ICU_DIR} system property is set, which will be used to
+      * generate the path instead (i.e. {@code "icu4c/source/data/xml"} inside the ICU release
+      * directory).
+      *
+      * @param specialsDir the "specials" XML directory.
+      * @return this builder.
+      * @throws NullPointerException if {@code specialsDir} is null.
+      */
+     public Builder setSpecialsDir(Path specialsDir) {
+         this.specialsDir = checkNotNull(specialsDir);
+         return this;
+     }
+
+     /**
+      * Sets the output types which will be converted. This is optional and defaults to {@link
+      * OutputType#ALL}.
+      *
+      * @param types the output types to generate (copied into an immutable set).
+      * @return this builder.
+      */
+     public Builder setOutputTypes(Iterable<OutputType> types) {
+         this.outputTypes = ImmutableSet.copyOf(types);
+         return this;
+     }
+
+     /**
+      * Sets the minimum draft status for CLDR data to be converted (paths below this status are
+      * ignored during conversion). This is optional and defaults to {@link
+      * CldrDraftStatus#CONTRIBUTED}.
+      *
+      * @param minimalDraftStatus the minimum draft status.
+      * @return this builder.
+      * @throws NullPointerException if {@code minimalDraftStatus} is null.
+      */
+     public Builder setMinimalDraftStatus(CldrDraftStatus minimalDraftStatus) {
+         this.minimalDraftStatus = checkNotNull(minimalDraftStatus);
+         return this;
+     }
+
+     /**
+      * Sets the flag returned by the config's {@code emitReport()} method. This is optional and
+      * defaults to {@code false}.
+      *
+      * @param emitReport the value to be returned by {@code emitReport()}.
+      * @return this builder.
+      */
+     public Builder setEmitReport(boolean emitReport) {
+         this.emitReport = emitReport;
+         return this;
+     }
+
+     /** Returns a converter config from the current builder state. */
+     public LdmlConverterConfig build() {
+         return new IcuConverterConfig(this);
+     }
+ }
+
+ private final Path cldrDir;
+ private final Path outputDir;
+ private final Path specialsDir;
+ private final ImmutableSet<OutputType> outputTypes;
+ private final CldrDraftStatus minimalDraftStatus;
+ private final boolean emitReport;
+
+ /**
+  * Creates a config from the builder state, validating that all required directories have
+  * been specified (either explicitly or via system properties).
+  *
+  * @param builder the builder holding the configuration values.
+  * @throws NullPointerException if the CLDR, output or specials directory is unset.
+  * @throws IllegalArgumentException if the output directory is a regular file, the specials
+  *     directory is not an existing directory, or no output types were given.
+  */
+ private IcuConverterConfig(Builder builder) {
+     this.cldrDir = checkNotNull(builder.cldrDir,
+         "must set a CLDR directory, or the CLDR_DIR system property");
+     // An explicit directory which differs from the CLDR_DIR property is allowed, but it is
+     // suspicious enough to warrant a warning.
+     if (DEFAULT_CLDR_DIR.isPresent() && !this.cldrDir.equals(DEFAULT_CLDR_DIR.get())) {
+         System.err.format(
+             "Warning: Specified CLDR base directory does not appear to match the"
+                 + " directory inferred by the 'CLDR_DIR' system property.\n"
+                 + "Specified: %s\n"
+                 + "Inferred: %s\n",
+             this.cldrDir, DEFAULT_CLDR_DIR.get());
+     }
+     this.outputDir = checkNotNull(builder.outputDir,
+         "must set an output directory, or the ICU_DIR system property");
+     // The output directory need not exist yet, but it must not be an existing regular file.
+     checkArgument(!Files.isRegularFile(outputDir),
+         "specified output directory is not a directory: %s", outputDir);
+     this.specialsDir = checkNotNull(builder.specialsDir,
+         "must specify a 'specials' XML directory");
+     checkArgument(Files.isDirectory(specialsDir),
+         "specified specials directory does not exist: %s", specialsDir);
+     this.outputTypes = builder.outputTypes;
+     checkArgument(!this.outputTypes.isEmpty(),
+         "must specify at least one output type to be generated (possible values are: %s)",
+         Arrays.asList(OutputType.values()));
+     this.minimalDraftStatus = builder.minimalDraftStatus;
+     this.emitReport = builder.emitReport;
+ }
+
+ /** Returns a new builder with which to create a converter config. */
+ public static Builder builder() {
+     return new Builder();
+ }
+
+ /** Returns the CLDR base directory given to (or defaulted by) the builder. */
+ @Override
+ public Path getCldrDirectory() {
+     return this.cldrDir;
+ }
+
+ /** Returns the directory given to (or defaulted by) the builder as the output directory. */
+ @Override
+ public Path getOutputDir() {
+     return this.outputDir;
+ }
+
+ /** Returns the (non-empty) set of output types which were configured. */
+ @Override
+ public Set<OutputType> getOutputTypes() {
+     return this.outputTypes;
+ }
+
+ /** Returns the minimum draft status which was configured. */
+ @Override
+ public CldrDraftStatus getMinimumDraftStatus() {
+     return this.minimalDraftStatus;
+ }
+
+ /** Returns the "specials" XML directory given to (or defaulted by) the builder. */
+ @Override
+ public Path getSpecialsDir() {
+     return this.specialsDir;
+ }
+
+ /** Returns the value of the "emit report" flag set on the builder. */
+ @Override
+ public boolean emitReport() {
+     return this.emitReport;
+ }
+
+ // Currently hard-coded "hacks" which could be encoded via the builder if wanted.
+
+ @Override public Map<String, String> getForcedAliases(IcuLocaleDir dir) {
+ switch (dir) {
+ case COLL:
+ return ImmutableMap.<String, String>builder()
+ // It is not at all clear why this is being done (we expect "sr_Latn_ME" normally).
+ // TODO: Find out and document this properly.
+ .put("sr_ME", "sr_Cyrl_ME")
+
+ // This appears to be a hack to avoid needing to copy and maintain the same "zh"
+ // data for "yue". The files for "yue" in this directory should be empty otherwise.
+ //
+ // The maximized versions of "yue_Hans" is "yue_Hans_CN" (vs "zh_Hans_CN"), and for
+ // "yue" it's "yue_Hant_HK" (vs "zh_Hant_HK"), so the aliases are effectively just
+ // rewriting the base language.
+ .put("yue_Hans", "zh_Hans")
+ .put("yue", "zh_Hant")
+ .build();
+ case RBNF:
+ // It is not at all clear why this is being done. It's certainly not exactly the same
+ // as above, since (a) the alias is reversed (b) "zh_Hant" does exist, with different
+ // data than "yue", so this alias is not just rewriting the base language.
+ // TODO: Find out and document this properly.
+ return ImmutableMap.of("zh_Hant_HK", "yue");
+ default:
+ return ImmutableMap.of();
+ }
+ }
+
+ /**
+  * Returns the locale IDs to be generated for the given ICU locale directory. The set of locale
+  * files in each directory denotes the supported/available locales for that API. In most cases
+  * it's the same set, but a few directories support only a subset of IDs.
+  */
+ @Override
+ public ImmutableSet<String> getTargetLocaleIds(IcuLocaleDir dir) {
+     ImmutableSet<String> localeIds;
+     switch (dir) {
+     case COLL:
+         localeIds = COLL_LOCALE_IDS;
+         break;
+     case BRKITR:
+         localeIds = BRKITR_LOCALE_IDS;
+         break;
+     case RBNF:
+         localeIds = RBNF_LOCALE_IDS;
+         break;
+     default:
+         // Every other directory uses the primary set of locale IDs.
+         localeIds = ICU_LOCALE_IDS;
+         break;
+     }
+     return localeIds;
+ }
+
+ // The primary set of locale IDs to be generated. Other, directory specific, sets should be
+ // subsets of this. Some of these IDs are aliases, so XML files may not exist for all of them.
+ //
+ // This was further modified (in order to better match the set of generated ICU files) by:
+ // * Removing "es_003" (which just seems to be ignored in current code)
+ // * Adding: "en_NH", "sr_XK", "yue_CN", "yue_HK" (deprecated locale IDs in the manual config)
+ // * Adding: "no_NO_NY" (a not even structurally valid ID that exists for very legacy reasons)
+ private static final ImmutableSet<String> ICU_LOCALE_IDS = ImmutableSet.of(
+ "root",
+ // A
+ "af", "af_NA", "af_ZA", "agq", "agq_CM", "ak", "ak_GH", "am", "am_ET", "ar", "ar_001",
+ "ar_AE", "ar_BH", "ar_DJ", "ar_DZ", "ar_EG", "ar_EH", "ar_ER", "ar_IL", "ar_IQ",
+ "ar_JO", "ar_KM", "ar_KW", "ar_LB", "ar_LY", "ar_MA", "ar_MR", "ar_OM", "ar_PS",
+ "ar_QA", "ar_SA", "ar_SD", "ar_SO", "ar_SS", "ar_SY", "ar_TD", "ar_TN", "ar_YE", "ars",
+ "as", "as_IN", "asa", "asa_TZ", "ast", "ast_ES", "az", "az_AZ", "az_Cyrl", "az_Cyrl_AZ",
+ "az_Latn", "az_Latn_AZ",
+ // B
+ "bas", "bas_CM", "be", "be_BY", "bem", "bem_ZM", "bez", "bez_TZ", "bg", "bg_BG", "bm",
+ "bm_ML", "bn", "bn_BD", "bn_IN", "bo", "bo_CN", "bo_IN", "br", "br_FR", "brx", "brx_IN",
+ "bs", "bs_Cyrl", "bs_Cyrl_BA", "bs_Latn", "bs_Latn_BA", "bs_BA",
+ // C
+ "ca", "ca_AD", "ca_ES", "ca_FR", "ca_IT", "ccp", "ccp_BD", "ccp_IN", "ce", "ce_RU",
+ "ceb", "ceb_PH", "cgg", "cgg_UG", "chr", "chr_US", "ckb", "ckb_IQ", "ckb_IR", "cs",
+ "cs_CZ", "cy", "cy_GB",
+ // D
+ "da", "da_DK", "da_GL", "dav", "dav_KE", "de", "de_AT", "de_BE", "de_CH", "de_DE",
+ "de_IT", "de_LI", "de_LU", "dje", "dje_NE", "dsb", "dsb_DE", "dua", "dua_CM", "dyo",
+ "dyo_SN", "dz", "dz_BT",
+ // E
+ "ebu", "ebu_KE", "ee", "ee_GH", "ee_TG", "el", "el_CY", "el_GR", "en", "en_001",
+ "en_150", "en_AE", "en_AG", "en_AI", "en_AS", "en_AT", "en_AU", "en_BB", "en_BE",
+ "en_BI", "en_BM", "en_BS", "en_BW", "en_BZ", "en_CA", "en_CC", "en_CH", "en_CK",
+ "en_CM", "en_CX", "en_CY", "en_DE", "en_DG", "en_DK", "en_DM", "en_ER", "en_FI",
+ "en_FJ", "en_FK", "en_FM", "en_GB", "en_GD", "en_GG", "en_GH", "en_GI", "en_GM",
+ "en_GU", "en_GY", "en_HK", "en_IE", "en_IL", "en_IM", "en_IN", "en_IO", "en_JE",
+ "en_JM", "en_KE", "en_KI", "en_KN", "en_KY", "en_LC", "en_LR", "en_LS", "en_MG",
+ "en_MH", "en_MO", "en_MP", "en_MS", "en_MT", "en_MU", "en_MW", "en_MY", "en_NA",
+ "en_NF", "en_NG", "en_NH", "en_NL", "en_NR", "en_NU", "en_NZ", "en_PG", "en_PH",
+ "en_PK", "en_PN", "en_PR", "en_PW", "en_RH", "en_RW", "en_SB", "en_SC", "en_SD",
+ "en_SE", "en_SG", "en_SH", "en_SI", "en_SL", "en_SS", "en_SX", "en_SZ", "en_TC",
+ "en_TK", "en_TO", "en_TT", "en_TV", "en_TZ", "en_UG", "en_UM", "en_US", "en_US_POSIX",
+ "en_VC", "en_VG", "en_VI", "en_VU", "en_WS", "en_ZA", "en_ZM", "en_ZW", "eo",
+ "eo_001", "es", "es_419", "es_AR", "es_BO", "es_BR", "es_BZ", "es_CL", "es_CO",
+ "es_CR", "es_CU", "es_DO", "es_EA", "es_EC", "es_ES", "es_GQ", "es_GT", "es_HN",
+ "es_IC", "es_MX", "es_NI", "es_PA", "es_PE", "es_PH", "es_PR", "es_PY", "es_SV",
+ "es_US", "es_UY", "es_VE", "et", "et_EE", "eu", "eu_ES", "ewo", "ewo_CM",
+ // F
+ "fa", "fa_AF", "fa_IR", "ff", "ff_CM", "ff_GN", "ff_Latn", "ff_Latn_BF", "ff_Latn_CM",
+ "ff_Latn_GH", "ff_Latn_GM", "ff_Latn_GN", "ff_Latn_GW", "ff_Latn_LR", "ff_Latn_MR",
+ "ff_Latn_NE", "ff_Latn_NG", "ff_Latn_SL", "ff_Latn_SN", "ff_MR", "ff_SN", "fi",
+ "fi_FI", "fil", "fil_PH", "fo", "fo_DK", "fo_FO", "fr", "fr_BE", "fr_BF", "fr_BI",
+ "fr_BJ", "fr_BL", "fr_CA", "fr_CD", "fr_CF", "fr_CG", "fr_CH", "fr_CI", "fr_CM",
+ "fr_DJ", "fr_DZ", "fr_FR", "fr_GA", "fr_GF", "fr_GN", "fr_GP", "fr_GQ", "fr_HT",
+ "fr_KM", "fr_LU", "fr_MA", "fr_MC", "fr_MF", "fr_MG", "fr_ML", "fr_MQ", "fr_MR",
+ "fr_MU", "fr_NC", "fr_NE", "fr_PF", "fr_PM", "fr_RE", "fr_RW", "fr_SC", "fr_SN",
+ "fr_SY", "fr_TD", "fr_TG", "fr_TN", "fr_VU", "fr_WF", "fr_YT", "fur", "fur_IT",
+ "fy", "fy_NL",
+ // G
+ "ga", "ga_IE", "gd", "gd_GB", "gl", "gl_ES", "gsw", "gsw_CH", "gsw_FR", "gsw_LI",
+ "gu", "gu_IN", "guz", "guz_KE", "gv", "gv_IM",
+ // H
+ "ha", "ha_GH", "ha_NE", "ha_NG", "haw", "haw_US", "he", "he_IL", "hi", "hi_IN",
+ "hr", "hr_BA", "hr_HR", "hsb", "hsb_DE", "hu", "hu_HU", "hy", "hy_AM",
+ // I
+ "ia", "ia_001", "id", "id_ID", "ig", "ig_NG", "ii", "ii_CN", "in", "in_ID", "is",
+ "is_IS", "it", "it_CH", "it_IT", "it_SM", "it_VA", "iw", "iw_IL",
+ // J
+ "ja", "ja_JP", "ja_JP_TRADITIONAL", "jgo", "jgo_CM", "jmc", "jmc_TZ", "jv", "jv_ID",
+ // K
+ "ka", "ka_GE", "kab", "kab_DZ", "kam", "kam_KE", "kde", "kde_TZ", "kea", "kea_CV",
+ "khq", "khq_ML", "ki", "ki_KE", "kk", "kk_KZ", "kkj", "kkj_CM", "kl", "kl_GL", "kln",
+ "kln_KE", "km", "km_KH", "kn", "kn_IN", "ko", "ko_KP", "ko_KR", "kok", "kok_IN",
+ "ks", "ks_IN", "ksb", "ksb_TZ", "ksf", "ksf_CM", "ksh", "ksh_DE", "ku", "ku_TR",
+ "kw", "kw_GB", "ky", "ky_KG",
+ // L
+ "lag", "lag_TZ", "lb", "lb_LU", "lg", "lg_UG", "lkt", "lkt_US", "ln", "ln_AO",
+ "ln_CD", "ln_CF", "ln_CG", "lo", "lo_LA", "lrc", "lrc_IQ", "lrc_IR", "lt", "lt_LT",
+ "lu", "lu_CD", "luo", "luo_KE", "luy", "luy_KE", "lv", "lv_LV",
+ // M
+ "mas", "mas_KE", "mas_TZ", "mer", "mer_KE", "mfe", "mfe_MU", "mg", "mg_MG", "mgh",
+ "mgh_MZ", "mgo", "mgo_CM", "mi", "mi_NZ", "mk", "mk_MK", "ml", "ml_IN", "mn",
+ "mn_MN", "mo", "mr", "mr_IN", "ms", "ms_BN", "ms_MY", "ms_SG", "mt", "mt_MT", "mua",
+ "mua_CM", "my", "my_MM", "mzn", "mzn_IR",
+ // N
+ "naq", "naq_NA", "nb", "nb_NO", "nb_SJ", "nd", "nd_ZW", "nds", "nds_DE", "nds_NL",
+ "ne", "ne_IN", "ne_NP", "nl", "nl_AW", "nl_BE", "nl_BQ", "nl_CW", "nl_NL", "nl_SR",
+ "nl_SX", "nmg", "nmg_CM", "nn", "nn_NO", "nnh", "nnh_CM", "no", "no_NO", "no_NO_NY",
+ "nus", "nus_SS", "nyn", "nyn_UG",
+ // O
+ "om", "om_ET", "om_KE", "or", "or_IN", "os", "os_GE", "os_RU",
+ // P
+ "pa", "pa_Arab", "pa_Arab_PK", "pa_Guru", "pa_Guru_IN", "pa_IN", "pa_PK", "pl",
+ "pl_PL", "ps", "ps_AF", "ps_PK", "pt", "pt_AO", "pt_BR", "pt_CH", "pt_CV", "pt_GQ",
+ "pt_GW", "pt_LU", "pt_MO", "pt_MZ", "pt_PT", "pt_ST", "pt_TL",
+ // Q
+ "qu", "qu_BO", "qu_EC", "qu_PE",
+ // R
+ "rm", "rm_CH", "rn", "rn_BI", "ro", "ro_MD", "ro_RO", "rof", "rof_TZ", "ru",
+ "ru_BY", "ru_KG", "ru_KZ", "ru_MD", "ru_RU", "ru_UA", "rw", "rw_RW", "rwk", "rwk_TZ",
+ // S
+ "sah", "sah_RU", "saq", "saq_KE", "sbp", "sbp_TZ", "sd", "sd_PK", "se", "se_FI",
+ "se_NO", "se_SE", "seh", "seh_MZ", "ses", "ses_ML", "sg", "sg_CF", "sh", "sh_BA",
+ "sh_CS", "sh_YU", "shi", "shi_Latn", "shi_Latn_MA", "shi_Tfng", "shi_Tfng_MA",
+ "shi_MA", "si", "si_LK", "sk", "sk_SK", "sl", "sl_SI", "smn", "smn_FI", "sn",
+ "sn_ZW", "so", "so_DJ", "so_ET", "so_KE", "so_SO", "sq", "sq_AL", "sq_MK", "sq_XK",
+ "sr", "sr_Cyrl", "sr_Cyrl_BA", "sr_Cyrl_ME", "sr_Cyrl_RS", "sr_Cyrl_CS", "sr_Cyrl_XK",
+ "sr_Cyrl_YU", "sr_Latn", "sr_Latn_BA", "sr_Latn_ME", "sr_Latn_RS", "sr_Latn_CS",
+ "sr_Latn_XK", "sr_Latn_YU", "sr_BA", "sr_ME", "sr_RS", "sr_CS", "sr_XK", "sr_YU",
+ "sv", "sv_AX", "sv_FI", "sv_SE", "sw", "sw_CD", "sw_KE", "sw_TZ", "sw_UG",
+ // T
+ "ta", "ta_IN", "ta_LK", "ta_MY", "ta_SG", "te", "te_IN", "teo", "teo_KE", "teo_UG",
+ "tg", "tg_TJ", "th", "th_TH", "th_TH_TRADITIONAL", "ti", "ti_ER", "ti_ET", "tk",
+ "tk_TM", "tl", "tl_PH", "to", "to_TO", "tr", "tr_CY", "tr_TR", "tt", "tt_RU",
+ "twq", "twq_NE", "tzm", "tzm_MA",
+ // U
+ "ug", "ug_CN", "uk", "uk_UA", "ur", "ur_IN", "ur_PK", "uz", "uz_AF", "uz_Arab",
+ "uz_Arab_AF", "uz_Cyrl", "uz_Cyrl_UZ", "uz_Latn", "uz_Latn_UZ", "uz_UZ",
+ // V
+ "vai", "vai_Latn", "vai_Latn_LR", "vai_LR", "vai_Vaii", "vai_Vaii_LR", "vi",
+ "vi_VN", "vun", "vun_TZ",
+ // W
+ "wae", "wae_CH", "wo", "wo_SN",
+ // X
+ "xh", "xh_ZA", "xog", "xog_UG",
+ // Y
+ "yav", "yav_CM", "yi", "yi_001", "yo", "yo_BJ", "yo_NG", "yue", "yue_CN", "yue_HK",
+ "yue_Hans", "yue_Hans_CN", "yue_Hant", "yue_Hant_HK",
+ // Z
+ "zgh", "zgh_MA", "zh", "zh_Hans", "zh_Hans_CN", "zh_Hans_HK", "zh_Hans_MO",
+ "zh_Hans_SG", "zh_Hant", "zh_Hant_HK", "zh_Hant_MO", "zh_Hant_TW", "zh_CN",
+ "zh_HK", "zh_MO", "zh_SG", "zh_TW", "zu", "zu_ZA");
+
+ private static final ImmutableSet<String> COLL_LOCALE_IDS = ImmutableSet.of(
+ "root",
+ // A-B
+ "af", "am", "ars", "ar", "as", "az", "be", "bg", "bn", "bo", "bs_Cyrl", "bs",
+ // C-F
+ "ca", "ceb", "chr", "cs", "cy", "da", "de_AT", "de", "dsb", "dz", "ee", "el", "en",
+ "en_US_POSIX", "en_US", "eo", "es", "et", "fa_AF", "fa", "fil", "fi", "fo", "fr_CA", "fr",
+ // G-J
+ "ga", "gl", "gu", "ha", "haw", "he", "hi", "hr", "hsb", "hu", "hy",
+ "id_ID", "id", "ig", "in", "in_ID", "is", "it", "iw_IL", "iw", "ja",
+ // K-P
+ "ka", "kk", "kl", "km", "kn", "kok", "ko", "ku", "ky", "lb", "lkt", "ln", "lo", "lt", "lv",
+ "mk", "ml", "mn", "mo", "mr", "ms", "mt", "my", "nb", "ne", "nl", "nn", "no_NO", "no",
+ "om", "or", "pa_IN", "pa", "pa_Guru", "pl", "ps", "pt",
+ // R-T
+ "ro", "ru", "se", "sh_BA", "sh_CS", "sh", "sh_YU", "si", "sk", "sl", "smn", "sq",
+ "sr_BA", "sr_Cyrl_ME", "sr_Latn", "sr_ME", "sr_RS", "sr", "sv", "sw",
+ "ta", "te", "th", "tk", "to", "tr",
+ // U-Z
+ "ug", "uk", "ur", "uz", "vi", "wae", "wo", "xh", "yi", "yo", "yue_CN", "yue_Hans",
+ "yue", "zh_CN", "zh_Hant", "zh_HK", "zh_MO", "zh_SG", "zh_TW", "zh", "zu");
+
+ private static final ImmutableSet<String> BRKITR_LOCALE_IDS = ImmutableSet.of(
+ "root", "de", "el", "en", "en_US_POSIX", "en_US", "es", "fr", "it", "ja", "pt", "ru",
+ "zh_Hant", "zh");
+
+ private static final ImmutableSet<String> RBNF_LOCALE_IDS = ImmutableSet.of(
+ "root", "af", "ak", "am", "ars", "ar", "az", "be", "bg", "bs", "ca", "ccp", "chr", "cs",
+ "cy", "da", "de_CH", "de", "ee", "el", "en_001", "en_IN", "en", "eo", "es_419", "es_DO",
+ "es_GT", "es_HN", "es_MX", "es_NI", "es_PA", "es_PR", "es_SV", "es", "es_US", "et",
+ "fa_AF", "fa", "ff", "fil", "fi", "fo", "fr_BE", "fr_CH", "fr", "ga", "he", "hi", "hr",
+ "hu", "hy", "id", "in", "is", "it", "iw", "ja", "ka", "kl", "km", "ko", "ky", "lb",
+ "lo", "lrc", "lt", "lv", "mk", "ms", "mt", "my", "nb", "nl", "nn", "no", "pl", "pt_PT",
+ "pt", "qu", "ro", "ru", "se", "sh", "sk", "sl", "sq", "sr_Latn", "sr", "sv",
+ "sw", "ta", "th", "tr", "uk", "vi", "yue_Hans", "yue", "zh_Hant_HK", "zh_Hant", "zh_HK",
+ "zh_MO", "zh_TW", "zh");
+}
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu;
+
+import static com.google.common.base.Preconditions.checkArgument;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.NavigableSet;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import com.google.common.collect.ArrayListMultimap;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ListMultimap;
+
+/**
+ * Mutable ICU data, represented as a mapping from resource bundle paths to a sequence of values.
+ */
+public final class IcuData {
+ private static final RbPath RB_VERSION = RbPath.of("Version");
+ private static final Pattern ARRAY_INDEX = Pattern.compile("(/[^\\[]++)(?:\\[(\\d++)\\])?$");
+
+ private final String name;
+ private final boolean hasFallback;
+ private final NavigableSet<RbPath> paths = new TreeSet<>();
+ private final ListMultimap<RbPath, RbValue> rbPathToValues = ArrayListMultimap.create();
+ private ImmutableList<String> commentLines = ImmutableList.of();
+
+ /**
+ * IcuData constructor.
+ *
+ * @param name The name of the IcuData object, used as the name of the root node in the output file
+ * @param hasFallback true if the output file has another ICU file as a fallback.
+ */
+ public IcuData(String name, boolean hasFallback) {
+ this.hasFallback = hasFallback;
+ this.name = name;
+ }
+
+ /** @return whether data should fallback on data in other ICU files. */
+ public boolean hasFallback() {
+ return hasFallback;
+ }
+
+ /**
+ * @return the name of this ICU data instance. Used in the output filename, and in comments.
+ */
+ public String getName() {
+ return name;
+ }
+
+ /** Sets additional comment lines for the top of the file. */
+ public void setFileComment(String... commentLines) {
+ setFileComment(Arrays.asList(commentLines));
+ }
+
+ public void setFileComment(Iterable<String> commentLines) {
+ this.commentLines = ImmutableList.copyOf(commentLines);
+ }
+
+ public List<String> getFileComment() {
+ return commentLines;
+ }
+
+ /** Adds a singleton resource bundle value for a given path. */
+ public void add(RbPath rbPath, String element) {
+ add(rbPath, RbValue.of(element));
+ }
+
+ /** Adds a single resource bundle value for a given path. */
+ public void add(RbPath rbPath, RbValue rbValue) {
+ rbPathToValues.put(rbPath, rbValue);
+ paths.add(rbPath);
+ }
+
+ /** Adds a sequence of resource bundle values for a given path. */
+ public void add(RbPath rbPath, Iterable<RbValue> rbValues) {
+ rbValues.forEach(v -> rbPathToValues.put(rbPath, v));
+ paths.add(rbPath);
+ }
+
+ /** Replaces all resource bundle values for a given path with the specified singleton value. */
+ public void replace(RbPath rbPath, String element) {
+ rbPathToValues.removeAll(rbPath);
+ rbPathToValues.put(rbPath, RbValue.of(element));
+ paths.add(rbPath);
+ }
+
+ /** Replaces all resource bundle values for a given path with the specified value. */
+ public void replace(RbPath rbPath, RbValue rbValue) {
+ rbPathToValues.removeAll(rbPath);
+ add(rbPath, rbValue);
+ }
+
+ public void setVersion(String versionString) {
+ add(RB_VERSION, versionString);
+ }
+
+ public void addResults(ListMultimap<RbPath, PathValueTransformer.Result> resultsByRbPath) {
+ for (RbPath rbPath : resultsByRbPath.keySet()) {
+ for (PathValueTransformer.Result r : resultsByRbPath.get(rbPath)) {
+ if (r.isGrouped()) {
+ // Grouped results have all the values in a single value entry.
+ add(rbPath, RbValue.of(r.getValues()));
+ } else {
+ if (rbPath.getSegment(rbPath.length() - 1).endsWith(":alias")) {
+ r.getValues().forEach(v -> add(rbPath, RbValue.of(v)));
+ } else {
+ // Ungrouped results are one value per entry, but might be expanded into
+ // grouped results if they are a path referencing a grouped entry.
+ r.getValues().forEach(v -> add(rbPath, replacePathValues(v)));
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Replaces an ungrouped CLDR value for the form "/foo/bar" or "/foo/bar[N]" which is assumed
+ * to be a reference to an existing value in a resource bundle. Note that the referenced bundle
+ * might be grouped (i.e. an array with more than one element).
+ */
+ private RbValue replacePathValues(String value) {
+ Matcher m = ARRAY_INDEX.matcher(value);
+ if (!m.matches()) {
+ return RbValue.of(value);
+ }
+ // The only constraint is that the "path" value starts with a leading '/', but parsing into
+ // the RbPath ignores this. We must use "parse()" here, rather than RbPath.of(), since the
+ // captured value contains '/' characters to represent path delimiters.
+ RbPath replacePath = RbPath.parse(m.group(1));
+ List<RbValue> replaceValues = get(replacePath);
+ checkArgument(replaceValues != null, "Path %s is missing from IcuData", replacePath);
+ // If no index is given (e.g. "/foo/bar") then treat it as index 0 (i.e. "/foo/bar[0]").
+ int replaceIndex = m.groupCount() > 1 ? Integer.parseInt(m.group(2)) : 0;
+ return replaceValues.get(replaceIndex);
+ }
+
+ /**
+ * Returns the mutable list of values associated with the given path (or null if there are no
+ * associated values).
+ */
+ public List<RbValue> get(RbPath rbPath) {
+ return paths.contains(rbPath) ? rbPathToValues.get(rbPath) : null;
+ }
+
+ /** Returns an unmodifiable view of the set of paths in this instance. */
+ public Set<RbPath> getPaths() {
+ return Collections.unmodifiableSet(paths);
+ }
+
+ /** Returns whether the given path is present in this instance. */
+ public boolean contains(RbPath rbPath) {
+ return paths.contains(rbPath);
+ }
+
+ /** Returns whether there are any paths in this instance. */
+ public boolean isEmpty() {
+ return paths.isEmpty();
+ }
+}
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu;
+
+import static com.google.common.base.CharMatcher.whitespace;
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkElementIndex;
+import static com.google.common.base.Preconditions.checkNotNull;
+import static com.google.common.base.Preconditions.checkState;
+import static com.google.common.collect.ImmutableList.toImmutableList;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Deque;
+import java.util.List;
+import java.util.Optional;
+import java.util.function.Function;
+import java.util.function.Predicate;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Stream;
+
+import com.google.common.base.Joiner;
+import com.google.common.collect.ArrayListMultimap;
+import com.google.common.collect.HashMultiset;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableSetMultimap;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.ListMultimap;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Multiset;
+
+/**
+ * Helper tool to dump the resource bundle paths and values from an IcuData instance in a stable
+ * ordering, to allow easy comparison in cases where ICU ordering changes. This could easily be
+ * extended to be a more fully featured "diff" tool or a proper ICU data file parser.
+ *
+ * <p>This is a temporary debugging tool and should not be relied upon during any part of the data
+ * generation process.
+ */
+final class IcuDataDumper {
+ private static final Joiner LIST_JOINER = Joiner.on(',');
+ private static final RbPath VERSION = RbPath.of("Version");
+
+    /**
+     * Command line entry point: dumps the paths and values of either a single ICU data file or
+     * of every matching file below a directory.
+     *
+     * <ul>
+     *   <li>args[0] = file or directory to read.
+     *   <li>args[1] = optional regular expression which file names must fully match (only
+     *       permitted when args[0] is a directory).
+     * </ul>
+     *
+     * @throws IOException if a file cannot be read.
+     * @throws IllegalArgumentException if the number of arguments is wrong, or a name pattern is
+     *     given for a non-directory file.
+     */
+    public static void main(String... args) throws IOException {
+        Path fileOrDir;
+        Optional<Pattern> name = Optional.empty();
+        switch (args.length) {
+        case 2:
+            name = Optional.of(Pattern.compile(args[1]));
+            // Deliberate fall-through: the two argument form also needs the path from args[0].
+        case 1:
+            fileOrDir = Paths.get(args[0]);
+            break;
+        default:
+            throw new IllegalArgumentException("Usage: <file-or-dir> [<name-pattern>]");
+        }
+
+        if (Files.isDirectory(fileOrDir)) {
+            walkDirectory(fileOrDir, name);
+        } else {
+            checkArgument(!name.isPresent(),
+                "cannot specify a name pattern for a non-directory file: %s", fileOrDir);
+            IcuDataParser parser = new IcuDataParser(fileOrDir);
+            if (!parser.parse()) {
+                // The parser declined the file, so there is nothing to dump. Say so on stderr
+                // rather than silently printing nothing (stdout is left untouched).
+                System.err.println("not an ICU data file (nothing to dump): " + fileOrDir);
+                return;
+            }
+            dump(parser.icuData);
+        }
+    }
+
+    /**
+     * Parses every regular file below the given directory whose file name matches the optional
+     * pattern, merges the parsed paths into a single map and dumps the result.
+     *
+     * <p>A path seen in more than one file must carry identical values in each of them; the only
+     * exception is the {@code Version} path, which is allowed to differ.
+     *
+     * @param fileOrDir the directory to walk.
+     * @param name if present, a pattern which a file's name must fully match to be parsed.
+     * @throws IOException if the directory cannot be walked or a file cannot be read.
+     * @throws IllegalStateException if two files disagree on the values for a path.
+     */
+    private static void walkDirectory(Path fileOrDir, Optional<Pattern> name) throws IOException {
+        Predicate<Path> matchesName =
+            f -> name.map(n -> n.matcher(f.getFileName().toString()).matches()).orElse(true);
+        List<IcuDataParser> icuParsers;
+        // The stream returned by Files.walk() holds open directory handles and must be closed.
+        try (Stream<Path> files = Files.walk(fileOrDir)) {
+            icuParsers = files
+                .filter(Files::isRegularFile)
+                .filter(matchesName)
+                .map(IcuDataParser::new)
+                .collect(toImmutableList());
+        }
+        ListMultimap<RbPath, RbValue> allPaths = ArrayListMultimap.create();
+        for (IcuDataParser p : icuParsers) {
+            // Files rejected by parse() simply contribute no paths.
+            p.parse();
+            for (RbPath k : p.icuData.keySet()) {
+                List<RbValue> values = p.icuData.get(k);
+                if (!allPaths.containsKey(k)) {
+                    allPaths.putAll(k, values);
+                } else if (!VERSION.equals(k)) {
+                    // The template needs an explicit placeholder for the path argument.
+                    checkState(allPaths.get(k).equals(values), "inconsistent data for path: %s", k);
+                }
+            }
+        }
+        dump(allPaths);
+    }
+
+    /**
+     * Prints one line per path to standard output, in the natural sort order of the paths, with
+     * the values for each path joined by commas.
+     */
+    private static void dump(ListMultimap<RbPath, RbValue> allPaths) {
+        Iterable<RbPath> sortedPaths = allPaths.keySet().stream().sorted()::iterator;
+        for (RbPath path : sortedPaths) {
+            String joinedValues = LIST_JOINER.join(allPaths.get(path));
+            System.out.println(path + " :: " + joinedValues);
+        }
+    }
+
+ private static final class IcuDataParser {
+ // Path of the file being parsed (also used as a prefix in error messages).
+ private final Path path;
+
+ // Comment lines from the header (i.e. before the data starts), without comment characters.
+ private final List<String> headerComment = new ArrayList<>();
+ // ICU data name (the name of the root element). This stays null until the first group start
+ // is seen, which is also how the parser tells the header apart from the data.
+ private String name = null;
+ // ICU data values, keyed by resource bundle path.
+ private final ListMultimap<RbPath, RbValue> icuData = ArrayListMultimap.create();
+
+ // Current line number (1-indexed); it is incremented before each line is processed, so zero
+ // means that nothing has been read yet.
+ private int lineNumber = 0;
+ // The type of the previous (non-empty) line that was processed. Starting as COMMENT means the
+ // first line must be of a type that is permitted to follow a comment.
+ private LineType lastType = LineType.COMMENT;
+ // True when inside /* .. */ comments in the header.
+ private boolean inBlockComment = false;
+ // True once the group end which closes the root element has been seen.
+ private boolean inFinalGroup = false;
+ // True when a partial (line wrapped) value has been read.
+ private boolean isLineContinuation = false;
+ // Current path while parsing (NOT including the root element); the innermost segment is on
+ // top of the stack.
+ private Deque<String> pathStack = new ArrayDeque<>();
+ // Current sequence of values for the path (as defined in the current path stack).
+ private List<String> currentValue = new ArrayList<>();
+ // Current partially read value of a multi-line value.
+ private String wrappedValue = "";
+ // Per-depth counters used to auto-generate names for anonymous path segments (the key is the
+ // size of the path stack at the point the segment is created).
+ // TODO: Check if this is even needed and remove if not.
+ private Multiset<Integer> indices = HashMultiset.create();
+
+ // Creates a parser for the given file; nothing is read until parse() is called.
+ IcuDataParser(Path path) {
+ this.path = checkNotNull(path);
+ }
+
+        /**
+         * Reads the whole file and parses it line by line into {@code icuData}.
+         *
+         * @return false (having parsed nothing) if the file is empty or its first line does not
+         *     start with a byte order mark followed by {@code //}; true once the file has been
+         *     parsed successfully.
+         * @throws IOException if the file cannot be read.
+         * @throws RuntimeException if a line cannot be parsed.
+         * @throws IllegalStateException if the parser is not in the expected final state.
+         */
+        public boolean parse() throws IOException {
+            List<String> lines = Files.readAllLines(path);
+            // Best approximation to a magic number we have (BOM plus inline comment). This stops
+            // us trying to parse the transliteration files, which are a different type. An empty
+            // file has no first line to test and is rejected in the same way.
+            if (lines.isEmpty() || !lines.get(0).startsWith("\uFEFF//")) {
+                return false;
+            }
+            lines.stream().map(whitespace()::trimFrom).forEach(this::processLineWithCheck);
+
+            // Sanity check for expected final state. Just checking the "lastType" should be enough
+            // to catch everything else (due to transition rules and how the code tidies up) but it
+            // seems prudent to sanity check everything just in case.
+            checkState(lastType == LineType.GROUP_END);
+            checkState(!inBlockComment);
+            checkState(name != null);
+            checkState(pathStack.isEmpty() && inFinalGroup);
+            checkState(wrappedValue.isEmpty() && currentValue.isEmpty());
+            return true;
+        }
+
+        /**
+         * Counts and processes one (already trimmed) line, decorating any runtime failure with
+         * the file path, the line number and the text of the offending line.
+         *
+         * @param line the line to process; a leading byte order mark is dropped from line 1.
+         * @throws RuntimeException wrapping whatever runtime exception the line caused.
+         */
+        void processLineWithCheck(String line) {
+            lineNumber += 1;
+            boolean hasByteOrderMark = (lineNumber == 1) && line.startsWith("\uFEFF");
+            String content = hasByteOrderMark ? line.substring(1) : line;
+            try {
+                processLine(content);
+            } catch (RuntimeException cause) {
+                String message = String.format(
+                    "[%s:%s] %s (%s)", path, lineNumber, cause.getMessage(), content);
+                throw new RuntimeException(message, cause);
+            }
+        }
+
+        /**
+         * Classifies a single trimmed line and updates the parser state accordingly.
+         *
+         * <p>Empty lines (including lines that are empty once an end-of-line comment has been
+         * trimmed) are ignored and do not take part in the state transition checks.
+         *
+         * @param line the trimmed line, without any byte order mark.
+         * @throws IllegalStateException if the line cannot be classified, or its type is not
+         *     permitted to follow the type of the previous line.
+         */
+        void processLine(String line) {
+            line = maybeTrimEndOfLineComment(line);
+            if (line.isEmpty()) {
+                return;
+            }
+            LineMatch match = LineType.match(line, inBlockComment);
+            checkState(match.getType().isValidTransitionFrom(lastType),
+                "invalid state transition: %s --//-> %s", lastType, match.getType());
+            switch (match.getType()) {
+            case COMMENT:
+                if (name != null) {
+                    // Comments in data are ignored since they cannot be properly associated with
+                    // paths or values in an IcuData instance (only legacy tooling emits these).
+                    break;
+                }
+                if (line.startsWith("/*")) {
+                    inBlockComment = true;
+                }
+                headerComment.add(match.get(0));
+                if (inBlockComment && line.contains("*/")) {
+                    checkState(line.indexOf("*/") == line.length() - 2,
+                        "unexpected end of comment block");
+                    inBlockComment = false;
+                }
+                break;
+
+            case INLINE_VALUE:
+                icuData.put(
+                    getPathFromStack().extendBy(getSegment(match.get(0))),
+                    RbValue.of(unquote(match.get(1))));
+                break;
+
+            case GROUP_START:
+                checkState(currentValue.isEmpty());
+                if (name == null) {
+                    // The first group is the root element; it names the data and is not pushed.
+                    name = match.get(0);
+                    checkState(name != null, "cannot have anonymous top-level group");
+                } else {
+                    pathStack.push(getSegment(match.get(0)));
+                }
+                wrappedValue = "";
+                isLineContinuation = false;
+                break;
+
+            case QUOTED_VALUE:
+                // A quoted value without a trailing ',' continues on the next line.
+                wrappedValue += unquote(match.get(0));
+                isLineContinuation = !line.endsWith(",");
+                if (!isLineContinuation) {
+                    currentValue.add(wrappedValue);
+                    wrappedValue = "";
+                }
+                break;
+
+            case VALUE:
+                checkState(!isLineContinuation, "unexpected unquoted value");
+                currentValue.add(match.get(0));
+                break;
+
+            case GROUP_END:
+                // Account for quoted values without trailing ',' just before group end.
+                if (isLineContinuation) {
+                    currentValue.add(wrappedValue);
+                    // The pending text has been consumed. Without this reset it would linger until
+                    // the next group start, and trip the end-of-parse sanity check whenever no
+                    // further group follows.
+                    wrappedValue = "";
+                    isLineContinuation = false;
+                }
+                // Emit the collection sequence of values for the current path as an RbValue.
+                if (!currentValue.isEmpty()) {
+                    icuData.put(getPathFromStack(), RbValue.of(currentValue));
+                    currentValue.clear();
+                }
+                // Annoyingly the name is outside the stack so the stack will empty before the last
+                // end group.
+                if (!pathStack.isEmpty()) {
+                    pathStack.pop();
+                    // Anonymous segment numbering restarts for the group that was just closed.
+                    indices.setCount(pathStack.size(), 0);
+                } else {
+                    checkState(!inFinalGroup, "unexpected group end");
+                    inFinalGroup = true;
+                }
+                break;
+
+            case UNKNOWN:
+                throw new IllegalStateException("cannot parse line: " + match.get(0));
+            }
+            lastType = match.getType();
+        }
+
+        /**
+         * Builds the resource bundle path for the current path stack, leaving out the innermost
+         * segment if it is an auto-generated anonymous one (four digits in angle brackets).
+         *
+         * @return the current path, or the empty path if no segments remain.
+         */
+        private RbPath getPathFromStack() {
+            // Iterating the deque yields the most recently pushed (innermost) segment first.
+            List<String> innermostFirst = new ArrayList<>(pathStack);
+            boolean endsWithAnonymousSegment =
+                !innermostFirst.isEmpty() && innermostFirst.get(0).matches("<[0-9]{4}>");
+            if (endsWithAnonymousSegment) {
+                innermostFirst.remove(0);
+            }
+            if (innermostFirst.isEmpty()) {
+                return RbPath.empty();
+            }
+            return RbPath.of(Lists.reverse(innermostFirst));
+        }
+
+        /**
+         * Returns the given segment name or, for an anonymous group, a generated placeholder of
+         * the form {@code <NNNN>} numbered consecutively from zero per path stack depth.
+         *
+         * @param segmentOrNull the explicit segment name, or null for an anonymous group.
+         */
+        private String getSegment(String segmentOrNull) {
+            if (segmentOrNull == null) {
+                // Multiset.add(e, n) returns the count held before the addition, which is exactly
+                // the next unused index at this depth.
+                int nextIndex = indices.add(pathStack.size(), 1);
+                return String.format("<%04d>", nextIndex);
+            }
+            return segmentOrNull;
+        }
+
+        /**
+         * Removes a trailing {@code //} comment (and the whitespace before it) from a data line.
+         * Header lines, i.e. anything seen before the root element name is known, are returned
+         * unchanged.
+         *
+         * <p>A {@code //} inside a double-quoted string is never treated as a comment, wherever
+         * the string appears in the line, so inline values such as {@code key{"http://x"}} are
+         * left intact.
+         *
+         * @param line the trimmed line.
+         * @return the line without any end-of-line comment.
+         */
+        private String maybeTrimEndOfLineComment(String line) {
+            // Once the name is set, we are past the header and into the data.
+            if (name == null) {
+                return line;
+            }
+            int commentIdx = indexOfCommentOutsideQuotes(line);
+            if (commentIdx != -1) {
+                line = whitespace().trimTrailingFrom(line.substring(0, commentIdx));
+            }
+            return line;
+        }
+
+        // Returns the index of the first "//" which is not inside a double-quoted string, or -1.
+        private static int indexOfCommentOutsideQuotes(String line) {
+            boolean inQuotes = false;
+            for (int i = 0; i < line.length(); i++) {
+                char c = line.charAt(i);
+                if (inQuotes) {
+                    if (c == '\\') {
+                        // Skip the escaped character so that an escaped quote does not end the string.
+                        i++;
+                    } else if (c == '"') {
+                        inQuotes = false;
+                    }
+                } else if (c == '"') {
+                    inQuotes = true;
+                } else if (c == '/' && i + 1 < line.length() && line.charAt(i + 1) == '/') {
+                    return i;
+                }
+            }
+            return -1;
+        }
+
+        /**
+         * Strips the enclosing double quotes from a quoted value and un-escapes any
+         * backslash-escaped double quote or backslash inside it. A value which is not enclosed in
+         * quotes is returned unchanged.
+         *
+         * @throws IllegalStateException if an unquoted value contains a double quote.
+         */
+        private static String unquote(String s) {
+            boolean isQuoted = s.startsWith("\"") && s.endsWith("\"");
+            if (!isQuoted) {
+                checkState(!s.contains("\""), "invalid unquoted value: %s", s);
+                return s;
+            }
+            String inner = s.substring(1, s.length() - 1);
+            return inner.replaceAll("\\\\([\"\\\\])", "$1");
+        }
+
+ private static final class LineMatch {
+ private final LineType type;
+ private final Function<Integer, String> args;
+
+ LineMatch(LineType type, Function<Integer, String> args) {
+ this.type = checkNotNull(type);
+ this.args = checkNotNull(args);
+ }
+
+ String get(int n) {
+ return args.apply(n);
+ }
+
+ LineType getType() {
+ return type;
+ }
+ }
+
+        /**
+         * The kinds of line the parser distinguishes, each with the regular expression a whole
+         * line must match, together with the table of permitted transitions between them.
+         */
+        private enum LineType {
+            // Comment _start_ with any comment value captured.
+            COMMENT("(?://|/\\*)\\s*(.*)"),
+            // A combination of GROUP_START, VALUE and GROUP_END with whitespace.
+            INLINE_VALUE("(?:(.*\\S)\\s*)?\\{\\s*((?:\".*\")|(?:[^\"{}]*\\S))\\s*\\}"),
+            // Allows for empty segment names (anonymous arrays) which match 'null'.
+            GROUP_START("(?:(.*\\S)\\s*)?\\{"),
+            GROUP_END("\\}"),
+            QUOTED_VALUE("(\".*\"),?"),
+            // NOTE(review): the capturing group here is greedy and also accepts ',', so a trailing
+            // comma appears to end up inside the captured value - confirm whether that is intended.
+            VALUE("([^\"{}]+),?"),
+            UNKNOWN(".*");
+
+            // Table of allowed transitions expected during parsing.
+            // key=current state, values=set of permitted previous states
+            // The table is never reassigned, so it is declared final. Its keys are also the
+            // candidate types tried by match(); UNKNOWN is deliberately absent and is only ever
+            // used as the fallback.
+            private static final ImmutableSetMultimap<LineType, LineType> TRANSITIONS =
+                ImmutableSetMultimap.<LineType, LineType>builder()
+                    .putAll(COMMENT, COMMENT)
+                    .putAll(INLINE_VALUE, COMMENT, INLINE_VALUE, GROUP_START, GROUP_END)
+                    .putAll(GROUP_START, COMMENT, GROUP_START, GROUP_END, INLINE_VALUE)
+                    .putAll(VALUE, GROUP_START, VALUE, QUOTED_VALUE)
+                    .putAll(QUOTED_VALUE, GROUP_START, VALUE, QUOTED_VALUE)
+                    .putAll(GROUP_END, GROUP_END, INLINE_VALUE, VALUE, QUOTED_VALUE)
+                    .build();
+
+            private final Pattern pattern;
+
+            LineType(String regex) {
+                this.pattern = Pattern.compile(regex);
+            }
+
+            /** Returns whether a line of this type is permitted directly after the given type. */
+            boolean isValidTransitionFrom(LineType lastType) {
+                return TRANSITIONS.get(this).contains(lastType);
+            }
+
+            /**
+             * Classifies a line.
+             *
+             * @param line the trimmed, non-empty line.
+             * @param inBlockComment true if the line is inside a block comment, in which case it
+             *     is always a COMMENT and any leading '*' decoration is stripped.
+             * @return the match for the first type in the transition table whose pattern matches
+             *     the whole line, or an UNKNOWN match carrying the line itself.
+             */
+            static LineMatch match(String line, boolean inBlockComment) {
+                // Block comments kinda suck and it'd be great if the ICU data only used '//' style
+                // comments (it would definitely simplify any parsers out there). Once the
+                // transition to the new transformation tools is complete, they can be changed to
+                // only emit '//' style comments.
+                if (inBlockComment) {
+                    if (line.startsWith("*")) {
+                        line = whitespace().trimLeadingFrom(line.substring(1));
+                    }
+                    return new LineMatch(COMMENT, ImmutableList.of(line)::get);
+                }
+                for (LineType type : TRANSITIONS.keySet()) {
+                    // Regex groups start at 1, but we want the getter function to be zero-indexed.
+                    Matcher m = type.pattern.matcher(line);
+                    if (m.matches()) {
+                        return new LineMatch(type, n -> {
+                            checkElementIndex(n, m.groupCount());
+                            return m.group(n + 1);
+                        });
+                    }
+                }
+                return new LineMatch(UNKNOWN, ImmutableList.of(line)::get);
+            }
+        }
+ }
+}
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkNotNull;
+import static java.lang.Integer.parseInt;
+
+import java.time.LocalDate;
+import java.time.LocalDateTime;
+import java.time.ZoneOffset;
+import java.util.function.Function;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import com.google.common.base.Ascii;
+import com.google.common.base.CharMatcher;
+import com.google.common.collect.ImmutableMap;
+import org.unicode.icu.tool.cldrtoicu.regex.NamedFunction;
+
+/**
+ * The named functions used by the {@code RegexTransformer} for {@code ldml2icu_supplemental.txt}.
+ */
+final class IcuFunctions {
+ /**
+ * Converts an ISO date string to a space-separated pair of integer values representing the top
+ * and bottom parts of a deconstructed millisecond epoch value (i.e. {@code
+ * "<hi32bits> <low32bits>"}).
+ *
+ * <p>Note that the values are formatted as <em>signed</em> decimal values, so it's entirely
+ * possible that the low bits value will be appear as a negative number (the high bits won't
+ * appear negative for many thousands of years).
+ *
+ * <ul>
+ * <li>args[0] = ISO date string (e.g. "2019-05-23")
+ * <li>args[1] = Date field type name (e.g. "from")
+ * </ul>
+ */
+ static final NamedFunction DATE_FN =
+ NamedFunction.create("date", 2, args -> {
+ long millis =
+ DateFieldType.toEnum(args.get(1)).toEpochMillis(LocalDate.parse(args.get(0)));
+ // Strictly speaking the masking is redundant and could be removed.
+ int hiBits = (int) ((millis >>> 32) & 0xFFFFFFFFL);
+ int loBits = (int) (millis & 0xFFFFFFFFL);
+ return hiBits + " " + loBits;
+ });
+
+    // TODO(dbeaumont): Improve this documentation (e.g. why is this being done, give examples?).
+    /**
+     * Inserts a '%' into a numberingSystems description, immediately after its last '/' (or at
+     * the very start if it contains no '/').
+     *
+     * <ul>
+     *   <li>args[0] = numbering system description (string)
+     * </ul>
+     */
+    static final NamedFunction ALGORITHM_FN =
+        NamedFunction.create("algorithm", 1, args -> {
+            String description = args.get(0);
+            int insertAt = description.lastIndexOf('/') + 1;
+            return new StringBuilder(description).insert(insertAt, '%').toString();
+        });
+
+    /**
+     * Converts a number into a special integer that represents the number in normalized scientific
+     * notation for ICU's RB parser.
+     *
+     * <p>Resultant integers are in the form "xxyyyyyy", where "xx" is the exponent offset by 50
+     * and "yyyyyy" is the coefficient to 5 decimal places. Results may also have a leading '-' to
+     * denote negative values. A value of zero is returned as just "0".
+     *
+     * <p>For example:
+     * <pre>{@code
+     * 14660000000000 -> 1.466E13 -> 63146600
+     * 0.0001 -> 1E-4 -> 46100000
+     * -0.0123456 -> -1.23456E-2 -> -48123456
+     * }</pre>
+     *
+     * <p>The additional exponent offset is applied directly to the calculated exponent and is used
+     * to do things like converting percentages into their decimal representation (i.e. by passing
+     * a value of "-2").
+     *
+     * <ul>
+     *   <li>args[0] = number to be converted (double)
+     *   <li>args[1] = additional exponent offset (integer)
+     * </ul>
+     *
+     * <p>Throws {@code IllegalArgumentException} if the resulting exponent falls outside 0..99.
+     */
+    static final NamedFunction EXP_FN =
+        NamedFunction.create("exp", 2, args -> {
+            double value = Double.parseDouble(args.get(0));
+            if (value == 0) {
+                return "0";
+            }
+            int exponent = 50;
+            if (args.size() == 2) {
+                exponent += Integer.parseInt(args.get(1));
+            }
+            String sign = value >= 0 ? "" : "-";
+            value = Math.abs(value);
+            // Normalize so that 1 <= value < 10, tracking the power of ten in the exponent.
+            while (value >= 10) {
+                value /= 10;
+                exponent++;
+            }
+            while (value < 1) {
+                value *= 10;
+                exponent--;
+            }
+            long coefficient = Math.round(value * 100000);
+            if (coefficient >= 1000000) {
+                // Rounding carried into a seventh digit (e.g. 9.9999996 rounds to 1000000), which
+                // would corrupt the fixed-width form; renormalize to 100000 and bump the exponent.
+                coefficient /= 10;
+                exponent++;
+            }
+            if (exponent < 0 || exponent > 99) {
+                throw new IllegalArgumentException("Exponent out of bounds: " + exponent);
+            }
+            return sign + exponent + coefficient;
+        });
+
+ // Allow for single digit values in any part and negative year values.
+ private static final Pattern YMD = Pattern.compile("(-?[0-9]+)-([0-9]{1,2})-([0-9]{1,2})");
+
+ /**
+ * Converts an ISO date string (i.e. "YYYY-MM-DD") into an ICU date string, which is
+ * the same but with spaces instead of hyphens. Since functions are expanded before the
+ * resulting value is split, this function will result in 3 separate values being created,
+ * unless the function call is enclosed in quotes.
+ *
+ * <p>Note that for some cases (e.g. "eras") the year part can be negative (e.g. "-2165-1-1")
+ * so this is not as simple as "split by hyphen".
+ *
+ * <ul>
+ * <li>args[0] = ISO date string (e.g. "2019-05-23" or "-2165-1-1")
+ * </ul>
+ */
+ static final NamedFunction YMD_FN =
+ NamedFunction.create("ymd", 1, args -> {
+ Matcher m = YMD.matcher(args.get(0));
+ checkArgument(m.matches(), "invalid year-month-day string: %s", args.get(0));
+ // NOTE: Re-parsing is not optional since it removes leading zeros (needed for ICU).
+ return String.format("%s %s %s",
+ parseInt(m.group(1)), parseInt(m.group(2)), parseInt(m.group(3)));
+ });
+
+    // For transforming day-of-week identifiers. CLDR spells Tuesday as "tue"; the historical
+    // "tues" spelling is retained so that nothing which previously resolved stops doing so.
+    private static final ImmutableMap<String, String> WEEKDAY_MAP_ID =
+        ImmutableMap.<String, String>builder()
+            .put("sun", "1")
+            .put("mon", "2")
+            .put("tue", "3")
+            .put("tues", "3")
+            .put("wed", "4")
+            .put("thu", "5")
+            .put("fri", "6")
+            .put("sat", "7")
+            .build();
+
+    /**
+     * Converts a day-of-week identifier into its ordinal value (e.g. "sun" --> 1, "mon" --> 2 ...).
+     *
+     * <ul>
+     *   <li>args[0] = day-of-week identifier (e.g. "sun")
+     * </ul>
+     *
+     * <p>Throws {@code IllegalArgumentException} for an identifier that is not in the map.
+     */
+    static final NamedFunction DAY_NUMBER_FN =
+        NamedFunction.create("day_number", 1,
+            args -> {
+                String id = WEEKDAY_MAP_ID.get(args.get(0));
+                checkArgument(id != null, "unknown weekday: %s", args.get(0));
+                return id;
+            });
+
+    // Maps the transform types allowed in <contextTransform> elements to their ICU index.
+    private static final ImmutableMap<String, String> TRANSFORM_ID_MAP =
+        ImmutableMap.of("no-change", "0", "titlecase-firstword", "1");
+
+    /**
+     * Looks up the ICU index for the transform type of a {@code <contextTransform>} element
+     * (e.g. "titlecase-firstword" --> 1).
+     *
+     * <ul>
+     *   <li>args[0] = transform type (e.g. "no-change")
+     * </ul>
+     *
+     * <p>Throws {@code IllegalArgumentException} for a transform type that is not in the map.
+     */
+    static final NamedFunction CONTEXT_TRANSFORM_INDEX_FN =
+        NamedFunction.create("context_transform_index", 1,
+            args -> {
+                String transformType = args.get(0);
+                String icuIndex = TRANSFORM_ID_MAP.get(transformType);
+                checkArgument(icuIndex != null, "unknown contextTransform: %s", transformType);
+                return icuIndex;
+            });
+
+ // For DATE_FN only.
+ private enum DateFieldType {
+ from(LocalDate::atStartOfDay),
+ // Remember that atTime() takes nanoseconds, not micro or milli.
+ to(d -> d.atTime(23, 59, 59, 999_000_000));
+
+ private final Function<LocalDate, LocalDateTime> adjustFn;
+
+ DateFieldType(Function<LocalDate, LocalDateTime> adjustFn) {
+ this.adjustFn = adjustFn;
+ }
+
+ long toEpochMillis(LocalDate date) {
+ return adjustFn.apply(date).toInstant(ZoneOffset.UTC).toEpochMilli();
+ }
+
+ static DateFieldType toEnum(String value) {
+ switch (Ascii.toLowerCase(CharMatcher.whitespace().trimFrom(value))) {
+ case "from":
+ case "start":
+ return from;
+ case "to":
+ case "end":
+ return to;
+ default:
+ throw new IllegalArgumentException(value + " is not a valid date field type");
+ }
+ }
+ }
+
+ // This class only holds static constants, so it is never instantiated.
+ private IcuFunctions() {}
+}
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu;
+
+import static com.google.common.base.Preconditions.checkNotNull;
+import static java.util.stream.Collectors.joining;
+
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.Writer;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Writes an IcuData object to a text file. A lot of this class was copied directly from the
+ * original {@code IcuTextWriter} in the CLDR project and has a number of very idiosyncratic
+ * behaviours. The behaviour of this class is currently tuned to produce perfect parity with
+ * the original conversion tools, but once migration of the tools is complete, it should
+ * probably be revisited and tidied up.
+ */
+// TODO: Link to a definitive specification for the ICU data files and remove the hacks!
+final class IcuTextWriter {
+ // The text written once per level of nesting at the start of each new line.
+ private static final String INDENT = " ";
+ // Matches the escape sequences to be protected in values that look like UnicodeSets
+ // ('\' followed by any of '\', '[', ']', '{', '}', '-', '&', ':', '^', '=').
+ private static final Pattern UNICODESET_ESCAPE =
+ Pattern.compile("\\\\[\\\\\\[\\]\\{\\}\\-&:^=]");
+ // Used for all other strings.
+ // NOTE(review): as written this matches a pair of consecutive backslashes, and the leading
+ // lookahead can never fail because the next character is a backslash - confirm the intent.
+ private static final Pattern STRING_ESCAPE = Pattern.compile("(?!')\\\\\\\\(?!')");
+ // Matches a double quote together with a single backslash directly before it, if any.
+ private static final Pattern QUOTE_ESCAPE = Pattern.compile("\\\\?\"");
+
+    /**
+     * Writes the given data, in ICU data format and preceded by the specified header, to the file
+     * {@code <name>.txt} in the output directory (which is created if necessary).
+     *
+     * @param icuData the data to write; its name determines the file name.
+     * @param outDir the directory in which the file is written.
+     * @param header the lines written verbatim at the top of the file.
+     * @throws RuntimeException if the directory or file cannot be created or written.
+     */
+    static void writeToFile(IcuData icuData, Path outDir, List<String> header) {
+        try {
+            Files.createDirectories(outDir);
+            try (Writer w = Files.newBufferedWriter(outDir.resolve(icuData.getName() + ".txt"));
+                PrintWriter out = new PrintWriter(w)) {
+                new IcuTextWriter(icuData).writeTo(out, header);
+                // PrintWriter never throws on write failures; it only records them. Surface any
+                // recorded failure here instead of silently leaving a truncated file behind.
+                if (out.checkError()) {
+                    throw new IOException("error while writing output");
+                }
+            }
+        } catch (IOException e) {
+            throw new RuntimeException("cannot write ICU data file: " + icuData.getName(), e);
+        }
+    }
+
+ // The data being written.
+ private final IcuData icuData;
+ // Current nesting level; this is also the number of INDENT units written after a new line.
+ private int depth = 0;
+ // True when the most recently written value was emitted inline (on the same line as its
+ // opening brace), in which case the closing brace is written without a preceding new line.
+ private boolean valueWasInline = false;
+
+ // Creates a writer for the given (non-null) data.
+ IcuTextWriter(IcuData icuData) {
+ this.icuData = checkNotNull(icuData);
+ }
+
+ // TODO: Write a UTF-8 header (see https://unicode-org.atlassian.net/browse/ICU-10197).
+ // Writes the complete file: a byte order mark, the header and file comments, then the root
+ // element containing every path of the data in iteration order. Closes the given writer when
+ // done.
+ private void writeTo(PrintWriter out, List<String> header) throws IOException {
+ out.write('\uFEFF');
+ writeHeaderAndComments(out, header, icuData.getFileComment());
+
+ // Write the ICU data to file. This takes the form:
+ // ----
+ // <name>{
+ // foo{
+ // bar{baz}
+ // }
+ // }
+ // ----
+ // So it's like every RbPath has an implicit prefix of the IcuData name.
+ String root = icuData.getName();
+ if (!icuData.hasFallback()) {
+ root += ":table(nofallback)";
+ }
+ // TODO: Replace with "open(root, out)" once happy with differences (it adds a blank line).
+ out.print(root);
+ out.print("{");
+ // Account for the root element which was just opened by hand.
+ depth++;
+
+ RbPath lastPath = RbPath.empty();
+ for (RbPath path : icuData.getPaths()) {
+ // Close any blocks up to the common path length. Since paths are all distinct, the
+ // common length should always be shorter than either path. We add 1 since we must also
+ // account for the implicit root segment.
+ int commonDepth = RbPath.getCommonPrefixLength(lastPath, path) + 1;
+ // Before closing, the "cursor" is at the end of the last value written.
+ closeLastPath(lastPath, commonDepth, out);
+ // After opening the value will be ready for the next value to be written.
+ openNextPath(path, out);
+ valueWasInline = appendValues(icuData.getName(), path, icuData.get(path), out);
+ lastPath = path;
+ }
+ // A minimum depth of zero closes everything that is still open, including the root element.
+ closeLastPath(lastPath, 0, out);
+ out.println();
+ out.close();
+ }
+
+    // Closes open blocks until only 'minDepth' levels remain open.
+    // Before: Cursor is at the end of the previous line.
+    // After: Cursor is positioned immediately after the last closed '}'
+    private void closeLastPath(RbPath lastPath, int minDepth, PrintWriter out) {
+        boolean closeOnSameLine = valueWasInline;
+        valueWasInline = false;
+        if (closeOnSameLine) {
+            // An inline value is closed directly after the value, without a new line.
+            out.print('}');
+            depth--;
+        }
+        while (minDepth < depth) {
+            close(out);
+        }
+    }
+
+    // Opens one block per path segment which is not already open.
+    // Before: Cursor is at the end of the previous line.
+    // After: Cursor is positioned immediately after the newly opened '{'
+    private void openNextPath(RbPath path, PrintWriter out) {
+        while (path.length() >= depth) {
+            // The implicit root element means that the indentation (depth) is one greater than
+            // the index of the segment which is about to be written.
+            int segmentIndex = depth - 1;
+            open(path.getSegment(segmentIndex), out);
+        }
+    }
+
+    // Starts a new line and opens a block with the given label, increasing the depth by one.
+    // The label is omitted for the "magic" pseudo indexing segments described below, and an
+    // empty label simply produces an anonymous '{'.
+    private void open(String label, PrintWriter out) {
+        newLineAndIndent(out);
+        depth++;
+        // This handles the "magic" pseudo indexing paths that are added by RegexTransformer.
+        // These take the form of "<any-string>" and are used to ensure that path order can be
+        // well defined even for anonymous lists of items. Only a label with BOTH delimiters is
+        // such a placeholder; a label which merely starts with '<' or ends with '>' is a real
+        // name and must still be printed.
+        boolean isPseudoIndex = label.startsWith("<") && label.endsWith(">");
+        if (!isPseudoIndex) {
+            out.print(label);
+        }
+        out.print('{');
+    }
+
+    // Closes the innermost block: the '}' goes on a new line, indented to the level of the
+    // block being closed (hence the depth is reduced before the new line is written).
+    private void close(PrintWriter out) {
+        depth -= 1;
+        newLineAndIndent(out);
+        out.print('}');
+    }
+
+    // Ends the current line and writes one INDENT per level of the current depth.
+    private void newLineAndIndent(PrintWriter out) {
+        out.println();
+        int remaining = depth;
+        while (remaining > 0) {
+            out.print(INDENT);
+            remaining--;
+        }
+    }
+
+    // Writes the header lines verbatim (one per line) followed, if there are any comments, by a
+    // single block comment with one comment per line. Note that the "header" is expected to use
+    // '//' line comments whereas the comments are written as a block.
+    // TODO: Sort this out so there isn't a messy mix of comment styles in the data files.
+    private static void writeHeaderAndComments(
+        PrintWriter out, List<String> header, List<String> comments) {
+        for (String headerLine : header) {
+            out.println(headerLine);
+        }
+        if (comments.isEmpty()) {
+            return;
+        }
+        // TODO: Don't use /* */ block quotes, just use inline // quotes.
+        String commentBlock = "/**\n * " + String.join("\n * ", comments) + "\n */";
+        out.println(commentBlock);
+    }
+
+ /**
+ * Inserts padding and values between braces.
+ *
+ * <p>A path with exactly one value holding exactly one element is written as a single string
+ * (wrapped over several lines if it is too long), unless {@code mustBeArray()} forces the
+ * array form. Several values for one path are each written as an array, and any of them with
+ * more than one element is enclosed in its own anonymous braces.
+ *
+ * @param name the name of the ICU data being written (only used for the array-forcing hack).
+ * @param rbPath the path whose values are written.
+ * @param values the values for the path.
+ * @param out the writer to append to.
+ * @return true only if a single element was written inline (without any new line), so that
+ * the caller knows to close the block on the same line.
+ */
+ private boolean appendValues(
+ String name, RbPath rbPath, List<RbValue> values, PrintWriter out) {
+
+ RbValue onlyValue;
+ boolean wasSingular = false;
+ // Values on integer paths are written without quotes.
+ boolean quote = !rbPath.isIntPath();
+ // Elements of paths ending in "Sequence" are written without a trailing comma.
+ boolean isSequence = rbPath.endsWith(RB_SEQUENCE);
+ if (values.size() == 1 && !mustBeArray(true, name, rbPath)) {
+ onlyValue = values.get(0);
+ if (onlyValue.size() == 1 && !mustBeArray(false, name, rbPath)) {
+ // Value has a single element and is not being forced to be an array.
+ String onlyElement = onlyValue.getElement(0);
+ if (quote) {
+ onlyElement = quoteInside(onlyElement);
+ }
+ // The numbers below are simply tuned to match the line wrapping in the original
+ // CLDR code. The behaviour it produces is sometimes strange (wrapping a line just
+ // for a single character) and could definitely be improved.
+ // TODO: Simplify this and add hysteresis to ensure less "jarring" line wrapping.
+ int maxWidth = Math.max(68, 80 - Math.min(4, rbPath.length()) * INDENT.length());
+ if (onlyElement.length() <= maxWidth) {
+ // Single element for path: don't add newlines.
+ printValue(out, onlyElement, quote);
+ wasSingular = true;
+ } else {
+ // Element too long to fit in one line, so wrap.
+ int end;
+ for (int i = 0; i < onlyElement.length(); i = end) {
+ // goodBreak() may return less than the requested index so as not to split an
+ // escape sequence or surrogate pair.
+ end = goodBreak(onlyElement, i + maxWidth);
+ String part = onlyElement.substring(i, end);
+ newLineAndIndent(out);
+ printValue(out, part, quote);
+ }
+ }
+ } else {
+ // Only one array for the rbPath, so don't add an extra set of braces.
+ printArray(onlyValue, quote, isSequence, out);
+ }
+ } else {
+ for (RbValue value : values) {
+ if (value.size() == 1) {
+ // Single-value array: print normally.
+ printArray(value, quote, isSequence, out);
+ } else {
+ // Enclose this array in braces to separate it from other values.
+ open("", out);
+ printArray(value, quote, isSequence, out);
+ close(out);
+ }
+ }
+ }
+ return wasSingular;
+ }
+
+    // Well known paths which get special treatment when values are written.
+    private static final RbPath RB_SEQUENCE = RbPath.of("Sequence");
+    private static final RbPath RB_RULES = RbPath.of("rules");
+    private static final RbPath RB_LOCALE_SCRIPT = RbPath.of("LocaleScript");
+    private static final RbPath RB_ERAS = RbPath.of("eras");
+    private static final RbPath RB_NAMED = RbPath.of("named");
+    private static final RbPath RB_CALENDAR_PREFERENCE_DATA = RbPath.of("calendarPreferenceData");
+    private static final RbPath RB_METAZONE_INFO = RbPath.of("metazoneInfo");
+
+    /**
+     * Wrapper for a hack which decides whether the values for a path must always be written in
+     * array form.
+     *
+     * @param topValues true when asking about the list of values for the path as a whole, false
+     *     when asking about the elements of a single value.
+     * @param name the name of the ICU data being written.
+     * @param rbPath the path being written.
+     */
+    // TODO: Verify this is still needed, and either make it less hacky, or delete it.
+    private static boolean mustBeArray(boolean topValues, String name, RbPath rbPath) {
+        if (topValues) {
+            // matches "rules/setNN" (hence the mucking about with raw segments).
+            if (!name.equals("pluralRanges") || !rbPath.startsWith(RB_RULES)) {
+                return false;
+            }
+            return rbPath.getSegment(1).startsWith("set");
+        }
+        if (rbPath.equals(RB_LOCALE_SCRIPT)) {
+            return true;
+        }
+        if (rbPath.contains(RB_ERAS)) {
+            // Era paths are arrays unless they are aliases or end in "named".
+            String lastSegment = rbPath.getSegment(rbPath.length() - 1);
+            if (!lastSegment.endsWith(":alias") && !rbPath.endsWith(RB_NAMED)) {
+                return true;
+            }
+        }
+        return rbPath.startsWith(RB_CALENDAR_PREFERENCE_DATA)
+            || rbPath.startsWith(RB_METAZONE_INFO);
+    }
+
+    // Writes each element of the value on its own indented line; every element is followed by a
+    // comma unless the value belongs to a "Sequence" path.
+    private void printArray(RbValue rbValue, boolean quote, boolean isSequence, PrintWriter out) {
+        boolean appendComma = !isSequence;
+        for (int index = 0; index < rbValue.size(); index++) {
+            String element = rbValue.getElement(index);
+            newLineAndIndent(out);
+            printValue(out, quoteInside(element), quote);
+            if (appendComma) {
+                out.print(",");
+            }
+        }
+    }
+
+    // Writes the value as given, enclosed in double quotes if requested (no escaping is done).
+    private static void printValue(PrintWriter out, String value, boolean quote) {
+        if (!quote) {
+            out.print(value);
+            return;
+        }
+        out.print('"');
+        out.print(value);
+        out.print('"');
+    }
+
+ // Can a string be broken here? If not, backup until we can.
+ // Returns the index (at most 'end', and at most the length of the string) at which the string
+ // should be split, so that the split falls neither inside a Unicode escape sequence nor
+ // directly after a backslash or a UTF-16 surrogate.
+ // TODO: Either don't bother line wrapping or look at making this use a line-break iterator.
+ private static int goodBreak(String quoted, int end) {
+ // Asking for a break past the end of the string means that the rest fits as it is.
+ if (end > quoted.length()) {
+ return quoted.length();
+ }
+ // Don't break escaped Unicode characters.
+ // Need to handle both e.g. \u4E00 and \U00020000
+ // Scan backwards over at most 9 characters for as long as they are hex digits.
+ for (int i = end - 1; i > end - 10;) {
+ char current = quoted.charAt(i--);
+ if (!Character.toString(current).matches("[0-9A-Fa-f]")) {
+ // The run of hex digits ended; if it was introduced by a backslash followed by 'u' or
+ // 'U', break before the backslash (which is what 'i' now refers to).
+ if ((current == 'u' || current == 'U') && i > end - 10
+ && quoted.charAt(i) == '\\') {
+ return i;
+ }
+ break;
+ }
+ }
+ // Otherwise back up while the character before the break is a backslash or a surrogate.
+ while (end > 0) {
+ char ch = quoted.charAt(end - 1);
+ if (ch != '\\' && (ch < '\uD800' || ch > '\uDFFF')) {
+ break;
+ }
+ --end;
+ }
+ return end;
+ }
+
+ // Fix characters inside strings.
+ // Returns the item with every double quote (and any backslash directly before it) replaced by
+ // a Unicode escape, and with an extra backslash inserted in front of every match of the
+ // escape pattern that applies to the item.
+ private static String quoteInside(String item) {
+ // Unicode-escape all quotes.
+ item = QUOTE_ESCAPE.matcher(item).replaceAll("\\\\u0022");
+ // Double up on backslashes, ignoring Unicode-escaped characters.
+ // Items enclosed in square brackets are treated as UnicodeSets.
+ Pattern pattern =
+ item.startsWith("[") && item.endsWith("]") ? UNICODESET_ESCAPE : STRING_ESCAPE;
+ Matcher matcher = pattern.matcher(item);
+
+ // Nothing to escape, so the item can be returned as it is.
+ if (!matcher.find()) {
+ return item;
+ }
+ StringBuilder buffer = new StringBuilder();
+ int start = 0;
+ do {
+ // Copy the text between the previous match and this one unchanged.
+ buffer.append(item, start, matcher.start());
+ int punctuationChar = item.codePointAt(matcher.end() - 1);
+ // Every match gains one backslash, or two if the match itself ends in a backslash.
+ buffer.append("\\");
+ if (punctuationChar == '\\') {
+ buffer.append('\\');
+ }
+ buffer.append(matcher.group());
+ start = matcher.end();
+ } while (matcher.find());
+ buffer.append(item.substring(start));
+ return buffer.toString();
+ }
+}
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkNotNull;
+import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.BRKITR;
+import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.COLL;
+import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.CURR;
+import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.LANG;
+import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.LOCALES;
+import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.RBNF;
+import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.REGION;
+import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.UNIT;
+import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.ZONE;
+import static java.util.stream.Collectors.toList;
+import static org.unicode.cldr.api.CldrDataType.BCP47;
+import static org.unicode.cldr.api.CldrDataType.LDML;
+import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.function.Consumer;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import org.unicode.cldr.api.CldrData;
+import org.unicode.cldr.api.CldrDataSupplier;
+import org.unicode.cldr.api.CldrDataType;
+
+import com.google.common.base.CharMatcher;
+import com.google.common.collect.HashMultimap;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.LinkedListMultimap;
+import com.google.common.collect.ListMultimap;
+import com.google.common.collect.SetMultimap;
+import com.google.common.collect.Sets;
+import com.google.common.io.CharStreams;
+import org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir;
+import org.unicode.icu.tool.cldrtoicu.mapper.Bcp47Mapper;
+import org.unicode.icu.tool.cldrtoicu.mapper.BreakIteratorMapper;
+import org.unicode.icu.tool.cldrtoicu.mapper.CollationMapper;
+import org.unicode.icu.tool.cldrtoicu.mapper.DayPeriodsMapper;
+import org.unicode.icu.tool.cldrtoicu.mapper.LocaleMapper;
+import org.unicode.icu.tool.cldrtoicu.mapper.PluralRangesMapper;
+import org.unicode.icu.tool.cldrtoicu.mapper.PluralsMapper;
+import org.unicode.icu.tool.cldrtoicu.mapper.RbnfMapper;
+import org.unicode.icu.tool.cldrtoicu.mapper.SupplementalMapper;
+import org.unicode.icu.tool.cldrtoicu.mapper.TransformsMapper;
+import org.unicode.icu.tool.cldrtoicu.regex.RegexTransformer;
+
+/**
+ * The main converter tool for CLDR to ICU data. To run this tool, you need to supply a suitable
+ * {@link LdmlConverterConfig} instance. There is a simple {@code main()} method available in this
+ * class which can be invoked passing just the desired output directory and which relies on the
+ * presence of several system properties for the remainder of its parameters:
+ * <ul>
+ * <li>CLDR_DIR: The root of the CLDR release from which CLDR data is read.
+ * <li>ICU_DIR: The root of the ICU release from which additional "specials" XML data is read.
+ * <li>CLDR_DTD_CACHE: A temporary directory with the various DTDs cached (this is a legacy
+ * requirement from the underlying CLDR libraries and might go away one day).
+ * </ul>
+ */
+public final class LdmlConverter {
+    // TODO: Do all supplemental data in one go and split similarly to locale data (using RbPath).
+    // Each matcher below selects the children of <supplementalData> which are handed to
+    // processSupplemental() for a single output type.
+    private static final PathMatcher GENDER_LIST_PATHS = supplementalMatcher("gender");
+    private static final PathMatcher LIKELY_SUBTAGS_PATHS = supplementalMatcher("likelySubtags");
+    private static final PathMatcher METAZONE_PATHS =
+        supplementalMatcher("metaZones", "primaryZones");
+    private static final PathMatcher METADATA_PATHS = supplementalMatcher("metadata");
+    private static final PathMatcher SUPPLEMENTAL_DATA_PATHS = supplementalMatcher(
+        "calendarData",
+        "calendarPreferenceData",
+        "codeMappings",
+        "codeMappingsCurrency",
+        "idValidity",
+        "languageData",
+        "languageMatching",
+        "measurementData",
+        "parentLocales",
+        "subdivisionContainment",
+        "territoryContainment",
+        "territoryInfo",
+        "timeData",
+        "unitPreferenceData",
+        "weekData",
+        "weekOfPreference");
+    private static final PathMatcher CURRENCY_DATA_PATHS = supplementalMatcher("currencyData");
+    private static final PathMatcher NUMBERING_SYSTEMS_PATHS =
+        supplementalMatcher("numberingSystems");
+    private static final PathMatcher WINDOWS_ZONES_PATHS = supplementalMatcher("windowsZones");
+
+    // Special IDs which are not supported via CLDR, but for which synthetic data is injected.
+    // The "TRADITIONAL" variants are here because their calendar differs from the non-variant
+    // locale. However CLDR cannot represent this currently because calendar defaults are in
+    // supplemental data (rather than locale data) and are keyed only on territory.
+    private static final ImmutableSet<String> PHANTOM_LOCALE_IDS =
+        ImmutableSet.of("ja_JP_TRADITIONAL", "th_TH_TRADITIONAL");
+
+    // Special alias mappings (alias ID -> target ID) which exist in ICU even though, for
+    // example, "no_NO_NY" is simply not a structurally valid locale ID. These are injected
+    // manually when creating the alias map. This does mean that nobody can ever parse the
+    // _keys_ of the alias map, but so far there has been no need for that.
+    // Declared as ImmutableMap (like PHANTOM_LOCALE_IDS above) so the type itself documents
+    // that this constant cannot be modified.
+    // TODO: Get "ars" into CLDR and remove this hack.
+    private static final ImmutableMap<String, String> PHANTOM_ALIASES =
+        ImmutableMap.of("ars", "ar_SA", "no_NO_NY", "nn_NO");
+
+    /**
+     * Returns a matcher for the given child element names of {@code supplementalData}; each
+     * name is turned into the path specification {@code "supplementalData/<name>"}.
+     *
+     * @param spec one or more element names.
+     * @throws IllegalArgumentException if no names are given.
+     */
+    private static PathMatcher supplementalMatcher(String... spec) {
+        checkArgument(spec.length > 0, "must supply at least one matcher spec");
+        PathMatcher[] matchers = Arrays.stream(spec)
+            .map(s -> PathMatcher.of("supplementalData/" + s))
+            .toArray(PathMatcher[]::new);
+        // A single matcher is returned directly rather than being wrapped in a composite.
+        return matchers.length == 1 ? matchers[0] : PathMatcher.anyOf(matchers);
+    }
+
+ private static RbPath RB_PARENT = RbPath.of("%%Parent");
+ // The quotes below are only so we achieve parity with the manually written alias files.
+ // TODO: Remove unnecessary quotes once the migration to this code is complete.
+ private static RbPath RB_ALIAS = RbPath.of("\"%%ALIAS\"");
+ // Special path for adding to empty files which only exist to complete the parent chain.
+ // TODO: Confirm that this has no meaningful effect and unify "empty" file contents.
+ private static RbPath RB_EMPTY_ALIAS = RbPath.of("___");
+
+    /**
+     * Provisional entry point until better config support exists.
+     *
+     * @param args command line arguments; the first one is the output directory, any further
+     *     arguments are ignored.
+     * @throws IllegalArgumentException if no output directory was given.
+     */
+    public static void main(String... args) {
+        // Fail with a usage hint rather than a bare ArrayIndexOutOfBoundsException.
+        checkArgument(args.length > 0, "usage: LdmlConverter <output-directory>");
+        convert(IcuConverterConfig.builder()
+            .setOutputDir(Paths.get(args[0]))
+            .setEmitReport(true)
+            .build());
+    }
+
+ /**
+ * Output types defining specific subsets of the ICU data which can be converted separately.
+ * This closely mimics the original "NewLdml2IcuConverter" behaviour but could be simplified to
+ * hide what are essentially implementation specific data splits.
+ *
+ * <p>Each constant pairs the CLDR data type it reads with the converter method that produces
+ * its output. The declaration order matters: {@code processAll()} runs the constants of a
+ * given data type in {@code values()} order.
+ */
+ public enum OutputType {
+ // Locale based (LDML) outputs.
+ LOCALES(LDML, LdmlConverter::processLocales),
+ BRKITR(LDML, LdmlConverter::processBrkitr),
+ COLL(LDML, LdmlConverter::processCollation),
+ RBNF(LDML, LdmlConverter::processRbnf),
+
+ // Supplemental data outputs; the string passed to each process method is the output
+ // directory name, relative to the configured output directory.
+ DAY_PERIODS(
+ SUPPLEMENTAL,
+ c -> c.processDayPeriods("misc")),
+ GENDER_LIST(
+ SUPPLEMENTAL,
+ c -> c.processSupplemental("genderList", GENDER_LIST_PATHS, "misc", false)),
+ LIKELY_SUBTAGS(
+ SUPPLEMENTAL,
+ c -> c.processSupplemental("likelySubtags", LIKELY_SUBTAGS_PATHS, "misc", false)),
+ // Note: SUPPLEMENTAL_DATA and CURRENCY_DATA share the "supplementalData" label but are
+ // written to different directories ("misc" and "curr").
+ SUPPLEMENTAL_DATA(
+ SUPPLEMENTAL,
+ c -> c.processSupplemental("supplementalData", SUPPLEMENTAL_DATA_PATHS, "misc", true)),
+ CURRENCY_DATA(
+ SUPPLEMENTAL,
+ c -> c.processSupplemental("supplementalData", CURRENCY_DATA_PATHS, "curr", true)),
+ METADATA(
+ SUPPLEMENTAL,
+ c -> c.processSupplemental("metadata", METADATA_PATHS, "misc", false)),
+ META_ZONES(
+ SUPPLEMENTAL,
+ c -> c.processSupplemental("metaZones", METAZONE_PATHS, "misc", false)),
+ NUMBERING_SYSTEMS(
+ SUPPLEMENTAL,
+ c -> c.processSupplemental("numberingSystems", NUMBERING_SYSTEMS_PATHS, "misc", false)),
+ PLURALS(
+ SUPPLEMENTAL,
+ c -> c.processPlurals("misc")),
+ PLURAL_RANGES(
+ SUPPLEMENTAL,
+ c -> c.processPluralRanges("misc")),
+ WINDOWS_ZONES(
+ SUPPLEMENTAL,
+ c -> c.processSupplemental("windowsZones", WINDOWS_ZONES_PATHS, "misc", false)),
+ TRANSFORMS(
+ SUPPLEMENTAL,
+ c -> c.processTransforms("translit")),
+ KEY_TYPE_DATA(
+ BCP47,
+ c -> c.processKeyTypeData("misc")),
+
+ // Batching by type.
+ // The "DTD_" name prefix is significant: processAll() uses it to exclude these batch
+ // constants from the set of types it runs.
+ DTD_LDML(LDML, c -> c.processAll(LDML)),
+ DTD_SUPPLEMENTAL(SUPPLEMENTAL, c -> c.processAll(SUPPLEMENTAL)),
+ DTD_BCP47(BCP47, c -> c.processAll(BCP47));
+
+ /** The three batch types, which between them cover every CLDR data type used above. */
+ public static final ImmutableSet<OutputType> ALL =
+ ImmutableSet.of(DTD_BCP47, DTD_SUPPLEMENTAL, DTD_LDML);
+
+ // The CLDR data type from which this output is generated.
+ private final CldrDataType type;
+ // The conversion step to run against a converter instance.
+ private final Consumer<LdmlConverter> converterFn;
+
+ OutputType(CldrDataType type, Consumer<LdmlConverter> converterFn) {
+ this.type = checkNotNull(type);
+ this.converterFn = checkNotNull(converterFn);
+ }
+
+ /** Runs the conversion for this output type using the given converter. */
+ void convert(LdmlConverter converter) {
+ converterFn.accept(converter);
+ }
+
+ /** Returns the CLDR data type from which this output is generated. */
+ CldrDataType getCldrType() {
+ return type;
+ }
+ }
+
+    /**
+     * Runs a complete conversion: builds a data supplier over the configured CLDR directory,
+     * limited to the configured minimum draft status, and converts every configured output.
+     */
+    private static void convert(LdmlConverterConfig config) {
+        CldrDataSupplier supplier = CldrDataSupplier
+            .forCldrFilesIn(config.getCldrDirectory())
+            .withDraftStatusAtLeast(config.getMinimumDraftStatus());
+        LdmlConverter converter = new LdmlConverter(config, supplier);
+        converter.convertAll(config);
+    }
+
+ // The configuration controlling conversion behaviour (directories, target IDs, aliases).
+ private final LdmlConverterConfig config;
+ // The supplier for all data to be converted.
+ private final CldrDataSupplier src;
+ // The set of available (non-alias) locale IDs: "root", the target LOCALES IDs for which the
+ // supplier has data, and the phantom locale IDs.
+ // TODO: Make available IDs include specials files (or fail if specials are not available).
+ private final ImmutableSet<String> availableIds;
+ // Supplemental data available to mappers if needed.
+ private final SupplementalData supplementalData;
+ // Transformer for locale data (configured from the "/ldml2icu_locale.txt" resource).
+ private final PathValueTransformer localeTransformer;
+ // Transformer for supplemental data (configured from "/ldml2icu_supplemental.txt").
+ private final PathValueTransformer supplementalTransformer;
+ // Header lines (read from "/ldml2icu_header.txt") to go into every ICU data file.
+ private final ImmutableList<String> icuFileHeader;
+
+    /**
+     * Creates a converter for the given configuration and data supplier, loading supplemental
+     * data, the set of available locale IDs and the transformer configuration resources.
+     */
+    private LdmlConverter(LdmlConverterConfig config, CldrDataSupplier src) {
+        this.config = checkNotNull(config);
+        this.src = checkNotNull(src);
+        this.supplementalData = SupplementalData.create(src.getDataForType(SUPPLEMENTAL));
+        // The set of non-alias locale IDs to be processed: "root" always comes first, followed
+        // by the requested IDs for which data exists (in the iteration order of the supplier's
+        // available IDs) and finally the phantom IDs. Duplicates keep their first position.
+        this.availableIds = ImmutableSet.<String>builder()
+            .add("root")
+            .addAll(Sets.intersection(
+                src.getAvailableLocaleIds(), config.getTargetLocaleIds(LOCALES)))
+            .addAll(PHANTOM_LOCALE_IDS)
+            .build();
+
+        // Load the path value transformers from their configuration resources.
+        List<String> supplementalConfig = readLinesFromResource("/ldml2icu_supplemental.txt");
+        this.supplementalTransformer = RegexTransformer.fromConfigLines(
+            supplementalConfig,
+            IcuFunctions.ALGORITHM_FN,
+            IcuFunctions.DATE_FN,
+            IcuFunctions.DAY_NUMBER_FN,
+            IcuFunctions.EXP_FN,
+            IcuFunctions.YMD_FN);
+        List<String> localeConfig = readLinesFromResource("/ldml2icu_locale.txt");
+        this.localeTransformer = RegexTransformer.fromConfigLines(
+            localeConfig, IcuFunctions.CONTEXT_TRANSFORM_INDEX_FN);
+        List<String> headerLines = readLinesFromResource("/ldml2icu_header.txt");
+        this.icuFileHeader = ImmutableList.copyOf(headerLines);
+    }
+
+    /**
+     * Converts every output type requested by the configuration, grouped by CLDR data type
+     * (groups and their members are processed in first-seen order), then optionally prints a
+     * debug report of the two transformers.
+     */
+    private void convertAll(LdmlConverterConfig config) {
+        ListMultimap<CldrDataType, OutputType> typesByCldrType = LinkedListMultimap.create();
+        config.getOutputTypes().forEach(t -> typesByCldrType.put(t.getCldrType(), t));
+        // Iterate per key (rather than over values()) so that output types sharing a CLDR data
+        // type are converted together.
+        typesByCldrType.keySet().forEach(
+            cldrType -> typesByCldrType.get(cldrType).forEach(t -> t.convert(this)));
+
+        if (!config.emitReport()) {
+            return;
+        }
+        System.out.println("Supplemental Data Transformer=" + supplementalTransformer);
+        System.out.println("Locale Data Transformer=" + localeTransformer);
+    }
+
+ private static List<String> readLinesFromResource(String name) {
+ try (InputStream in = LdmlConverter.class.getResourceAsStream(name)) {
+ return CharStreams.readLines(new InputStreamReader(in));
+ } catch (IOException e) {
+ throw new RuntimeException("cannot read resource: " + name, e);
+ }
+ }
+
+    /** Returns the transformer used for locale data. */
+    private PathValueTransformer getLocaleTransformer() {
+        return this.localeTransformer;
+    }
+
+    /** Returns the transformer used for supplemental data. */
+    private PathValueTransformer getSupplementalTransformer() {
+        return this.supplementalTransformer;
+    }
+
+    /**
+     * Runs every non-batch output type (i.e. those whose name does not start with "DTD_")
+     * belonging to the given CLDR data type, in enum declaration order.
+     */
+    private void processAll(CldrDataType cldrType) {
+        List<OutputType> selected = Arrays.stream(OutputType.values())
+            .filter(t -> t.getCldrType().equals(cldrType) && !t.name().startsWith("DTD_"))
+            .collect(toList());
+        selected.forEach(t -> t.convert(this));
+    }
+
+    /**
+     * Loads the ICU "specials" data for a locale by searching the whole specials directory
+     * tree for regular files named {@code <localeId>.xml}.
+     *
+     * @return the data of all matching files, or empty if there are none.
+     * @throws RuntimeException if the specials directory cannot be traversed.
+     */
+    private Optional<CldrData> loadSpecialsData(String localeId) {
+        String xmlFileName = localeId + ".xml";
+        Path specialsDir = config.getSpecialsDir();
+        try (Stream<Path> walked = Files.walk(specialsDir)) {
+            Set<Path> matches = walked
+                .filter(f -> Files.isRegularFile(f))
+                .filter(f -> f.getFileName().toString().equals(xmlFileName))
+                .collect(Collectors.toSet());
+            if (matches.isEmpty()) {
+                return Optional.empty();
+            }
+            return Optional.of(
+                CldrDataSupplier.forCldrFiles(LDML, config.getMinimumDraftStatus(), matches));
+        } catch (IOException e) {
+            throw new RuntimeException(
+                "error processing specials directory: " + specialsDir, e);
+        }
+    }
+
+    /** Converts general locale data, split over the CURR, LANG, LOCALES, REGION, UNIT and ZONE directories. */
+    private void processLocales() {
+        // TODO: Pre-load specials files to avoid repeatedly re-loading them.
+        Function<String, IcuData> localeMapper = id -> LocaleMapper.process(
+            id, src, loadSpecialsData(id), getLocaleTransformer(), supplementalData);
+        processAndSplitLocaleFiles(localeMapper, CURR, LANG, LOCALES, REGION, UNIT, ZONE);
+    }
+
+    /** Converts break-iterator data into the BRKITR directory. */
+    private void processBrkitr() {
+        Function<String, IcuData> brkitrMapper =
+            id -> BreakIteratorMapper.process(id, src, loadSpecialsData(id));
+        processAndSplitLocaleFiles(brkitrMapper, BRKITR);
+    }
+
+    /** Converts collation data into the COLL directory. */
+    private void processCollation() {
+        Function<String, IcuData> collationMapper =
+            id -> CollationMapper.process(id, src, loadSpecialsData(id));
+        processAndSplitLocaleFiles(collationMapper, COLL);
+    }
+
+    /** Converts rule-based number format data into the RBNF directory. */
+    private void processRbnf() {
+        Function<String, IcuData> rbnfMapper =
+            id -> RbnfMapper.process(id, src, loadSpecialsData(id));
+        processAndSplitLocaleFiles(rbnfMapper, RBNF);
+    }
+
+ /**
+ * Generates ICU data for every available target locale ID, splits it between the given
+ * output directories according to the root name of each resource bundle path, and writes
+ * the resulting files. Afterwards, for each directory, it writes the alias files and any
+ * empty files needed to complete the locale parent chain.
+ *
+ * @param icuFn maps a locale ID to the (unsplit) ICU data for that locale.
+ * @param splitDirs the directories which may be written to.
+ */
+ private void processAndSplitLocaleFiles(
+ Function<String, IcuData> icuFn, IcuLocaleDir... splitDirs) {
+
+ // Records, per directory, the IDs of all files written (data files and alias files).
+ SetMultimap<IcuLocaleDir, String> writtenLocaleIds = HashMultimap.create();
+ Path baseDir = config.getOutputDir();
+
+ for (String id : config.getTargetLocaleIds(LOCALES)) {
+ // Skip "target" IDs that are aliases (they are handled later).
+ if (!availableIds.contains(id)) {
+ continue;
+ }
+ IcuData icuData = icuFn.apply(id);
+
+ // Group the paths by target directory; paths whose root name is not listed in
+ // LOCALE_SPLIT_INFO go to LOCALES.
+ ListMultimap<IcuLocaleDir, RbPath> splitPaths = LinkedListMultimap.create();
+ for (RbPath p : icuData.getPaths()) {
+ String rootName = getBaseSegmentName(p.getSegment(0));
+ splitPaths.put(LOCALE_SPLIT_INFO.getOrDefault(rootName, LOCALES), p);
+ }
+
+ // We always write base languages (even if empty).
+ boolean isBaseLanguage = !id.contains("_");
+ // Run through all directories (not just the keySet() of the split path map) since we
+ // sometimes write empty files.
+ for (IcuLocaleDir dir : splitDirs) {
+ Set<String> targetIds = config.getTargetLocaleIds(dir);
+ if (!targetIds.contains(id)) {
+ // Data which exists for a locale that is not a target of this directory is
+ // reported (but not written).
+ if (!splitPaths.get(dir).isEmpty()) {
+ System.out.format(
+ "target IDs for %s does not contain %s, but it has data: %s\n",
+ dir, id, splitPaths.get(dir));
+ }
+ continue;
+ }
+ Path outDir = baseDir.resolve(dir.getOutputDir());
+ IcuData splitData = new IcuData(icuData.getName(), icuData.hasFallback());
+ // The split data can still be empty for this directory, but that's expected.
+ splitPaths.get(dir).forEach(p -> splitData.add(p, icuData.get(p)));
+ // Adding a parent locale makes the data non-empty and forces it to be written.
+ supplementalData.getExplicitParentLocaleOf(splitData.getName())
+ .ifPresent(p -> splitData.add(RB_PARENT, p));
+ if (!splitData.isEmpty() || isBaseLanguage || dir.includeEmpty()) {
+ splitData.setVersion(CldrDataSupplier.getCldrVersionString());
+ write(splitData, outDir);
+ writtenLocaleIds.put(dir, id);
+ }
+ }
+ }
+
+ // Second pass: write alias files and then the empty files, per directory.
+ for (IcuLocaleDir dir : splitDirs) {
+ Path outDir = baseDir.resolve(dir.getOutputDir());
+ Set<String> targetIds = config.getTargetLocaleIds(dir);
+
+ Map<String, String> aliasMap = getAliasMap(targetIds, dir);
+ aliasMap.forEach((s, t) -> {
+ // It's only important to record which alias files are written because of forced
+ // aliases, but since it's harmless otherwise, we just do it unconditionally.
+ // Normal alias files don't affect the empty file calculation, but forced ones can.
+ writtenLocaleIds.put(dir, s);
+ writeAliasFile(s, t, outDir);
+ });
+
+ // Must happen after the alias files are recorded above, since the written IDs are an
+ // input to the empty file calculation.
+ calculateEmptyFiles(writtenLocaleIds.get(dir), aliasMap.values())
+ .forEach(id -> writeEmptyFile(id, outDir, aliasMap.values()));
+ }
+ }
+
+    /**
+     * Returns the map from alias locale ID to alias target for the given directory, in the
+     * iteration order of {@code localeIds} (with any new forced aliases added last).
+     *
+     * @throws IllegalArgumentException if a phantom alias is also an available locale, or if
+     *     an unsupported ID cannot be maximized to a different ID.
+     */
+    private Map<String, String> getAliasMap(Set<String> localeIds, IcuLocaleDir dir) {
+        // There are four reasons for treating a locale ID as an alias.
+        // 1: It contains deprecated subtags (e.g. "sr_YU", which should be "sr_Cyrl_RS").
+        // 2: It has no CLDR data but is missing a script subtag.
+        // 3: It is one of the special "phantom" aliases which cannot be represented normally
+        //    and must be manually mapped (e.g. legacy locale IDs which don't even parse).
+        // 4: It is a "super special" forced alias, which might replace existing aliases in
+        //    some output directories.
+        Map<String, String> aliasMap = new LinkedHashMap<>();
+        for (String id : localeIds) {
+            String phantomTarget = PHANTOM_ALIASES.get(id);
+            if (phantomTarget != null) {
+                checkArgument(!availableIds.contains(id),
+                    "phantom aliases should never be otherwise supported: %s\n"
+                        + "(maybe the phantom alias can now be removed?)", id);
+                aliasMap.put(id, phantomTarget);
+                continue;
+            }
+            String canonicalId = supplementalData.replaceDeprecatedTags(id);
+            if (!canonicalId.equals(id)) {
+                // If the canonical form of an ID differs from the requested ID, then this is
+                // an alias, and just needs to point to the canonical ID.
+                aliasMap.put(id, canonicalId);
+            } else if (!availableIds.contains(id)) {
+                // The ID is canonical but not supported (a canonical and supported ID is not
+                // an alias), so maximize it and alias to that.
+                String maximizedId = supplementalData.maximize(id)
+                    .orElseThrow(
+                        () -> new IllegalArgumentException("unsupported locale ID: " + id));
+                // We can't alias to ourselves and we shouldn't be here if the ID was already
+                // maximal.
+                checkArgument(!maximizedId.equals(id), "unsupported maximized locale ID: %s", id);
+                aliasMap.put(id, maximizedId);
+            }
+        }
+        // Important that we overwrite entries which might already exist here, since we might
+        // have already calculated a "natural" alias for something that we want to force (and
+        // we should replace the existing target, since that affects how we determine empty
+        // files later).
+        aliasMap.putAll(config.getForcedAliases(dir));
+        return aliasMap;
+    }
+
+    // Characters which introduce a variant ('%') or a type annotation (':') in a path element.
+    private static final CharMatcher PATH_MODIFIER = CharMatcher.anyOf(":%");
+
+    // Resource bundle path elements can have variants (e.g. "Currencies%narrow") or type
+    // annotations (e.g. "languages:intvector"). We strip these when considering the element
+    // name, returning everything before the first modifier character.
+    private static String getBaseSegmentName(String segment) {
+        int modifierIndex = PATH_MODIFIER.indexIn(segment);
+        if (modifierIndex >= 0) {
+            return segment.substring(0, modifierIndex);
+        }
+        return segment;
+    }
+
+    /** Converts day period data and writes it to the named output sub-directory. */
+    private void processDayPeriods(String dir) {
+        IcuData dayPeriods = DayPeriodsMapper.process(src);
+        write(dayPeriods, dir);
+    }
+
+    /** Converts plural rule data and writes it to the named output sub-directory. */
+    private void processPlurals(String dir) {
+        IcuData plurals = PluralsMapper.process(src);
+        write(plurals, dir);
+    }
+
+    /** Converts plural range data and writes it to the named output sub-directory. */
+    private void processPluralRanges(String dir) {
+        IcuData pluralRanges = PluralRangesMapper.process(src);
+        write(pluralRanges, dir);
+    }
+
+    /** Converts BCP47 key/type data, writing each result to the named output sub-directory. */
+    private void processKeyTypeData(String dir) {
+        Bcp47Mapper.process(src).forEach(icuData -> {
+            write(icuData, dir);
+        });
+    }
+
+    /**
+     * Converts transform data. The output directory is created up front because it is also
+     * passed to the mapper itself, in addition to being the location of the returned data.
+     */
+    private void processTransforms(String dir) {
+        Path transformDir = createDirectory(config.getOutputDir().resolve(dir));
+        IcuData transforms = TransformsMapper.process(src, transformDir);
+        write(transforms, transformDir);
+    }
+
+    // Path of the synthetic CLDR version value added to supplemental data.
+    private static final RbPath RB_CLDR_VERSION = RbPath.of("cldrVersion");
+
+    /**
+     * Converts the supplemental data selected by the given matcher and writes it to the named
+     * output sub-directory.
+     *
+     * @param label the label passed to the supplemental mapper for the generated data.
+     * @param paths selects the CLDR paths to be converted.
+     * @param dir the output directory name, relative to the configured output directory.
+     * @param addCldrVersion whether to add the CLDR version string to the data.
+     */
+    private void processSupplemental(
+        String label, PathMatcher paths, String dir, boolean addCldrVersion) {
+        PathValueTransformer transformer = getSupplementalTransformer();
+        IcuData supplemental = SupplementalMapper.process(src, transformer, label, paths);
+        if (addCldrVersion) {
+            // A hack for "supplementalData.txt" since the "cldrVersion" value doesn't come
+            // from the supplemental data XML files.
+            supplemental.add(RB_CLDR_VERSION, CldrDataSupplier.getCldrVersionString());
+        }
+        write(supplemental, dir);
+    }
+
+    /** Writes a file for {@code srcId} containing only an alias entry pointing to {@code destId}. */
+    private void writeAliasFile(String srcId, String destId, Path dir) {
+        IcuData aliasData = new IcuData(srcId, true);
+        aliasData.add(RB_ALIAS, destId);
+        write(aliasData, dir);
+    }
+
+    /**
+     * Writes a placeholder file for a locale ID which has no data of its own. Direct alias
+     * targets get a file comment and a dummy entry; all other IDs just get the CLDR version.
+     */
+    private void writeEmptyFile(String id, Path dir, Collection<String> aliasTargets) {
+        IcuData emptyData = new IcuData(id, true);
+        // TODO: Document the reason for this (i.e. why does it matter what goes into empty files?)
+        if (!aliasTargets.contains(id)) {
+            // These empty files only exist because the target of an alias has a parent locale
+            // which is itself not in the set of written ICU files. An "indirect alias target".
+            emptyData.setVersion(CldrDataSupplier.getCldrVersionString());
+        } else {
+            emptyData.setFileComment("generated alias target");
+            emptyData.add(RB_EMPTY_ALIAS, "");
+        }
+        write(emptyData, dir);
+    }
+
+    /** Writes the data into the named directory, resolved against the configured output directory. */
+    private void write(IcuData icuData, String dir) {
+        Path resolvedDir = config.getOutputDir().resolve(dir);
+        write(icuData, resolvedDir);
+    }
+
+    /** Writes the data (with the standard file header) into the directory, creating it if needed. */
+    private void write(IcuData icuData, Path dir) {
+        IcuTextWriter.writeToFile(icuData, createDirectory(dir), icuFileHeader);
+    }
+
+    /**
+     * Ensures that the given directory (and any missing parents) exists.
+     *
+     * @return the given directory, for call chaining.
+     * @throws RuntimeException if the directory cannot be created.
+     */
+    private Path createDirectory(Path dir) {
+        try {
+            Files.createDirectories(dir);
+            return dir;
+        } catch (IOException e) {
+            throw new RuntimeException("cannot create directory: " + dir, e);
+        }
+    }
+
+    // Determines which locale IDs need an "empty" file. The seed IDs are:
+    // * any file that was written
+    // * any alias target (not written)
+    //
+    // From these we generate the complete "closure" under the "getParent()" function (stopping
+    // at "root"). This set contains all files (written or not) which need to exist to complete
+    // the locale hierarchy.
+    //
+    // Then we remove all the written files to just leave the ones that need to be generated.
+    // This is a simple and robust approach that handles things like "gaps" in non-aliased
+    // locale IDs, where an intermediate parent is not present.
+    private ImmutableSet<String> calculateEmptyFiles(
+        Set<String> writtenIds, Collection<String> aliasTargetIds) {
+
+        Set<String> seedIds = new HashSet<>(writtenIds);
+        seedIds.addAll(aliasTargetIds);
+        // Be nice and sort the output (makes easier debugging).
+        Set<String> closure = new TreeSet<>();
+        for (String seedId : seedIds) {
+            // Walk up the parent chain until reaching "root" or an ID which was already seen
+            // (in which case the rest of its chain is already in the closure).
+            for (String id = seedId;
+                !id.equals("root") && closure.add(id);
+                id = supplementalData.getParent(id)) {
+            }
+        }
+        return ImmutableSet.copyOf(Sets.difference(closure, writtenIds));
+    }
+
+ // Maps the base name (without any variant or type annotation) of the first segment of a
+ // resource bundle path to the output directory into which that path is written. Names which
+ // are not listed here are written to the LOCALES directory by the code which uses this map.
+ private static final ImmutableMap<String, IcuLocaleDir> LOCALE_SPLIT_INFO =
+ ImmutableMap.<String, IcuLocaleDir>builder()
+ // BRKITR
+ .put("boundaries", BRKITR)
+ .put("dictionaries", BRKITR)
+ .put("exceptions", BRKITR)
+ // COLL
+ .put("collations", COLL)
+ .put("depends", COLL)
+ .put("UCARules", COLL)
+ // CURR
+ .put("Currencies", CURR)
+ .put("CurrencyPlurals", CURR)
+ .put("CurrencyUnitPatterns", CURR)
+ .put("currencySpacing", CURR)
+ // LANG
+ .put("Keys", LANG)
+ .put("Languages", LANG)
+ .put("Scripts", LANG)
+ .put("Types", LANG)
+ .put("Variants", LANG)
+ .put("characterLabelPattern", LANG)
+ .put("codePatterns", LANG)
+ .put("localeDisplayPattern", LANG)
+ // RBNF
+ .put("RBNFRules", RBNF)
+ // REGION
+ .put("Countries", REGION)
+ // UNIT
+ .put("durationUnits", UNIT)
+ .put("units", UNIT)
+ .put("unitsShort", UNIT)
+ .put("unitsNarrow", UNIT)
+ // ZONE
+ .put("zoneStrings", ZONE)
+ .build();
+}
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu;
+
+import java.nio.file.Path;
+import java.util.Map;
+import java.util.Set;
+
+import org.unicode.cldr.api.CldrDraftStatus;
+
+import com.google.common.base.Ascii;
+import org.unicode.icu.tool.cldrtoicu.LdmlConverter.OutputType;
+
+/** API for configuring the LDML converter. */
+public interface LdmlConverterConfig {
+    /** Output directories for ICU locale data (this is not used for supplemental data). */
+    enum IcuLocaleDir {
+        /** Data for the break-iterator library. */
+        BRKITR(true),
+        /** Data for the collations library. */
+        COLL(true),
+        /** Currency data. */
+        CURR(false),
+        /** Language data. */
+        LANG(false),
+        /** General locale data. */
+        LOCALES(true),
+        /** Rule-based number formatter data. */
+        RBNF(true),
+        /** Region data. */
+        REGION(false),
+        /** Measurement and units data. */
+        UNIT(false),
+        /** Timezone data. */
+        ZONE(false);
+
+        // The directory name is simply the lower-cased (ASCII) name of the enum constant.
+        private final String dirName;
+        private final boolean includeEmpty;
+
+        IcuLocaleDir(boolean includeEmpty) {
+            this.dirName = Ascii.toLowerCase(name());
+            this.includeEmpty = includeEmpty;
+        }
+
+        /** Returns the relative output directory name. */
+        String getOutputDir() {
+            return this.dirName;
+        }
+
+        /**
+         * Whether the directory is expected to contain empty data files (used to advertise
+         * the supported set of locales for the "service" provided by the data in that
+         * directory).
+         */
+        // TODO: Document why there's a difference between directories for empty directories.
+        boolean includeEmpty() {
+            return this.includeEmpty;
+        }
+    }
+
+ /**
+ * Returns the set of output types to be converted. Use {@link OutputType#ALL} to convert
+ * everything.
+ */
+ Set<OutputType> getOutputTypes();
+
+ /** Returns the root directory in which the CLDR release is located. */
+ Path getCldrDirectory();
+
+ /**
+ * Returns an additional "specials" directory containing additional ICU specific XML
+ * files depending on the given output type. This is where the converter finds any XML
+ * files using the "icu:" namespace.
+ */
+ Path getSpecialsDir();
+
+ /**
+ * Returns the root of the ICU output directory hierarchy into which ICU data files are
+ * written.
+ */
+ Path getOutputDir();
+
+ /** Returns the minimal draft status for CLDR data to be converted. */
+ CldrDraftStatus getMinimumDraftStatus();
+
+ /**
+ * Returns the set of locale IDs to be processed for the given directory.
+ *
+ * <p>This set can contain IDs which have no ICU data associated with them if they are
+ * suitable aliases (e.g. they are deprecated versions of locale IDs for which data does
+ * exist).
+ */
+ Set<String> getTargetLocaleIds(IcuLocaleDir dir);
+
+ /**
+ * Return a map of locale IDs which specifies aliases which are applied to the given
+ * directory in contradiction to the natural alias or parent ID which would otherwise
+ * be generated. This is a mechanism for restructuring the parent chain and linking
+ * locales together in non-standard and unexpected ways.
+ */
+ Map<String, String> getForcedAliases(IcuLocaleDir dir);
+
+ /**
+ * Whether to emit a summary report for debug purposes after conversion is complete.
+ */
+ boolean emitReport();
+}
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkNotNull;
+import static com.google.common.base.Preconditions.checkPositionIndex;
+import static com.google.common.base.Preconditions.checkState;
+import static com.google.common.collect.ImmutableMap.toImmutableMap;
+import static org.unicode.cldr.api.AttributeKey.keyOf;
+
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.function.Predicate;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.unicode.cldr.api.AttributeKey;
+import org.unicode.cldr.api.CldrPath;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
+
+/**
+ * An immutable matcher for {@link CldrPath} instances. A path matcher specification looks like
+ * {@code "foo/*[@x="z"]/bar[@y=*]"}, where element names and attribute values can be wildcards.
+ *
+ * <p>Note that the path fragment represented by the specification does not include either leading
+ * or trailing {@code '/'}. This is because matching can occur at any point in a {@code CldrPath}.
+ * The choice of where to match in the path is governed by the match method used (e.g.
+ * {@link PathMatcher#matchesSuffixOf(CldrPath)}).
+ */
+public abstract class PathMatcher {
+    /**
+     * Parses the path specification into a matcher.
+     *
+     * @throws IllegalArgumentException if the specification does not start with a valid element.
+     */
+    public static PathMatcher of(String pathSpec) {
+        // Supported so far: "a", "a/b", "a/b[@x=*]"
+        List<Predicate<CldrPath>> elementMatchers = parse(pathSpec);
+        return new BasicMatcher(elementMatchers);
+    }
+
+    /**
+     * Combines the given matchers into a single composite matcher which tests all the given
+     * matchers in order and matches if any one of them does. A single matcher is returned
+     * as-is.
+     *
+     * @throws IllegalArgumentException if no matchers are given.
+     */
+    public static PathMatcher anyOf(PathMatcher... matchers) {
+        checkArgument(matchers.length > 0, "must supply at least one matcher");
+        return matchers.length == 1
+            ? checkNotNull(matchers[0])
+            : new CompositeMatcher(ImmutableList.copyOf(matchers));
+    }
+
+ /** Attempts a full match against a given path. */
+ public abstract boolean matches(CldrPath path);
+
+ /** Attempts a suffix match against a given path. */
+ public abstract boolean matchesSuffixOf(CldrPath path);
+
+ /** Attempts a prefix match against a given path. */
+ public abstract boolean matchesPrefixOf(CldrPath path);
+
+    // A matcher that simply combines a sequence of other matchers, trying each in order and
+    // succeeding as soon as any one of them matches.
+    private static final class CompositeMatcher extends PathMatcher {
+        private final ImmutableList<PathMatcher> matchers;
+
+        private CompositeMatcher(ImmutableList<PathMatcher> matchers) {
+            checkArgument(matchers.size() > 1);
+            this.matchers = checkNotNull(matchers);
+        }
+
+        @Override
+        public boolean matches(CldrPath path) {
+            return matchers.stream().anyMatch(m -> m.matches(path));
+        }
+
+        @Override
+        public boolean matchesSuffixOf(CldrPath path) {
+            return matchers.stream().anyMatch(m -> m.matchesSuffixOf(path));
+        }
+
+        @Override
+        public boolean matchesPrefixOf(CldrPath path) {
+            return matchers.stream().anyMatch(m -> m.matchesPrefixOf(path));
+        }
+    }
+
+ private static final class BasicMatcher extends PathMatcher {
+ private final ImmutableList<Predicate<CldrPath>> elementMatchers;
+
+ private BasicMatcher(List<Predicate<CldrPath>> elementMatchers) {
+ this.elementMatchers = ImmutableList.copyOf(elementMatchers);
+ }
+
+ @Override
+ public boolean matches(CldrPath path) {
+ return elementMatchers.size() == path.getLength() && matchRegion(path, 0);
+ }
+
+ @Override
+ public boolean matchesSuffixOf(CldrPath path) {
+ int start = path.getLength() - elementMatchers.size();
+ return start >= 0 && matchRegion(path, start);
+ }
+
+ @Override
+ public boolean matchesPrefixOf(CldrPath path) {
+ return path.getLength() >= elementMatchers.size() && matchRegion(path, 0);
+ }
+
+ private boolean matchRegion(CldrPath path, int offset) {
+ // offset is the path element corresponding the the "top most" element matcher, it
+ // must be in the range 0 ... (path.length() - elementMatchers.size()).
+ checkPositionIndex(offset, path.getLength() - elementMatchers.size());
+ // First jump over the path parents until we find the last matcher.
+ int matchPathLength = offset + elementMatchers.size();
+ while (path.getLength() > matchPathLength) {
+ path = path.getParent();
+ }
+ return matchForward(path, elementMatchers.size() - 1);
+ }
+
+ private boolean matchForward(CldrPath path, int matcherIndex) {
+ if (matcherIndex < 0) {
+ return true;
+ }
+ return matchForward(path.getParent(), matcherIndex - 1)
+ && elementMatchers.get(matcherIndex).test(path);
+ }
+ }
+
+ // Make a new, non-interned, unique instance here which we can test by reference to
+ // determine if the argument is to be captured (needed as ImmutableMap prohibits null).
+ // DO NOT change this code to assign "*" as the value directly, it MUST be a new instance.
+ private static final String WILDCARD = new String("*");
+
+ // Matches the name at the start of an element (either '*' or a run of word characters, '-'
+ // and ':'), which must be followed by '/', '[' or the end of the input. Group 1 is the name.
+ private static final Pattern ELEMENT_START_REGEX =
+ Pattern.compile("(\\*|[-:\\w]+)(?:/|\\[|$)");
+ // Matches a single attribute of the form [@name=*] or [@name="value"]. Group 1 is the
+ // attribute name and group 2 is the quoted value (which is null for the '*' wildcard).
+ private static final Pattern ATTRIBUTE_REGEX =
+ Pattern.compile("\\[@([-:\\w]+)=(?:\\*|\"([^\"]*)\")\\]");
+
+    // Parses a complete path specification into one predicate per element, where:
+    // element := foo, foo[@bar="baz"], foo[@bar=*]
+    // pathspec := element{/element}*
+    private static List<Predicate<CldrPath>> parse(String pathSpec) {
+        List<Predicate<CldrPath>> elementSpecs = new ArrayList<>();
+        // At least one element is always parsed; -1 signals the end of the specification.
+        int nextStart = parse(pathSpec, 0, elementSpecs);
+        while (nextStart >= 0) {
+            nextStart = parse(pathSpec, nextStart, elementSpecs);
+        }
+        return elementSpecs;
+    }
+
+    // Parses the single element (name plus any attributes) which starts at the given position
+    // and appends a predicate for it to the given list.
+    //
+    // Returns the start index of the next element, or -1 if the end of the specification was
+    // reached. Any syntax error in the specification is the caller's mistake, so every check
+    // in this method fails with IllegalArgumentException.
+    private static int parse(String pathSpec, int pos, List<Predicate<CldrPath>> specs) {
+        Matcher m = ELEMENT_START_REGEX.matcher(pathSpec).region(pos, pathSpec.length());
+        checkArgument(m.lookingAt(), "invalid path specification (index=%s): %s", pos, pathSpec);
+        String name = m.group(1);
+        Map<String, String> attributes = ImmutableMap.of();
+        // Continue from the end of the name (not the whole match) so that a following '[' or
+        // '/' is still available to be examined below.
+        pos = m.end(1);
+        if (pos < pathSpec.length() && pathSpec.charAt(pos) == '[') {
+            // We have attributes to add.
+            attributes = new LinkedHashMap<>();
+            do {
+                m = ATTRIBUTE_REGEX.matcher(pathSpec).region(pos, pathSpec.length());
+                checkArgument(m.lookingAt(),
+                    "invalid path specification (index=%s): %s", pos, pathSpec);
+                // Null if we matched the '*' wildcard.
+                String value = m.group(2);
+                attributes.put(m.group(1), value != null ? value : WILDCARD);
+                pos = m.end();
+            } while (pos < pathSpec.length() && pathSpec.charAt(pos) == '[');
+        }
+        // Wildcard matching is less efficient because attribute keys cannot be made in advance, so
+        // since it's also very rare, we special case it.
+        Predicate<CldrPath> matcher = name.equals(WILDCARD)
+            ? new WildcardElementMatcher(attributes)::match
+            : new ElementMatcher(name, attributes)::match;
+        specs.add(matcher);
+        if (pos == pathSpec.length()) {
+            return -1;
+        }
+        // Anything other than a separator here (e.g. in "a[@x=*]b") is a malformed argument.
+        checkArgument(pathSpec.charAt(pos) == '/',
+            "invalid path specification (index=%s): %s", pos, pathSpec);
+        return pos + 1;
+    }
+
+    // Matcher for path elements like "foo[@bar=*]" where the name is known in advance, which
+    // allows the attribute keys to be created once, at construction time.
+    private static final class ElementMatcher {
+        private final String name;
+        private final ImmutableMap<AttributeKey, String> attributes;
+
+        private ElementMatcher(String name, Map<String, String> attributes) {
+            this.name = checkNotNull(name);
+            ImmutableMap.Builder<AttributeKey, String> keyed = ImmutableMap.builder();
+            attributes.forEach((attrName, value) -> keyed.put(keyOf(name, attrName), value));
+            this.attributes = keyed.build();
+        }
+
+        // Matches if the element name is equal and every expected attribute is present with
+        // an acceptable value.
+        boolean match(CldrPath path) {
+            return path.getName().equals(name)
+                && attributes.entrySet().stream()
+                    .allMatch(e -> valueMatches(e.getValue(), path.get(e.getKey())));
+        }
+
+        private static boolean valueMatches(String expected, String actual) {
+            // DO NOT change this to use expected.equals(WILDCARD), the wildcard is identified
+            // by reference so that a literal "*" value can still be matched exactly.
+            return actual != null && (expected == WILDCARD || expected.equals(actual));
+        }
+    }
+
+ // Matcher for path elements like "*[@bar=*]", where the name isn't known until match time.
+ private static final class WildcardElementMatcher {
+ private final ImmutableMap<String, String> attributes;
+
+ private WildcardElementMatcher(Map<String, String> attributes) {
+ this.attributes = ImmutableMap.copyOf(attributes);
+ }
+
+ private boolean match(CldrPath path) {
+ // The wildcard matcher never fails due to the element name but must create new key
+ // instances every time matching occurs (because the key name is dynamic). Since this
+ // is rare, it's worth making into a separate case.
+ for (Entry<String, String> attribute : attributes.entrySet()) {
+ String actual = path.get(keyOf(path.getName(), attribute.getKey()));
+ if (actual == null) {
+ return false;
+ }
+ String expected = attribute.getValue();
+ // DO NOT change this to use expected.equals(WILDCARD).
+ if (expected != WILDCARD && !expected.equals(actual)) {
+ return false;
+ }
+ }
+ return true;
+ }
+ }
+}
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu;
+
+import static com.google.common.base.Preconditions.checkNotNull;
+
+import java.util.function.Function;
+
+import org.unicode.cldr.api.CldrPath;
+import org.unicode.cldr.api.CldrValue;
+
+import com.google.common.collect.ImmutableList;
+
+/**
+ * API for transforming CLDR path/value pairs. Transformed results support grouping by their key
+ * and the ability to generate default "fallback" values to account for missing values in a group.
+ *
+ * <p>To transform some set of CLDR path/values:
+ * <ol>
+ * <li>Transform all desired path/value pairs into a set of matched results, discarding duplicates
+ * (see {@link #transform(CldrValue)}).
+ * <li>Group the results by key (e.g. into a {@code ListMultimap}).
+ * <li>For each group, add any fallback values which don't yet exist for that key (see
+ * {@link #getFallbackResultsFor(RbPath, DynamicVars)} and {@link Result#isFallbackFor(Result)}).
+ * <li>Sort elements within each group and flatten result values (see {@link Result#isGrouped()}).
+ * </ol>
+ *
+ * <p>For each unique key, this should yield a correctly ordered sequence of values (according to
+ * the semantics of the chosen transformer implementation).
+ */
+public abstract class PathValueTransformer {
+ /**
+ * A result either obtained by transforming a path/value pair, or as a potential fallback for
+ * some known key (see {@link PathValueTransformer#transform(CldrValue)} and
+ * {@link PathValueTransformer#getFallbackResultsFor(RbPath, DynamicVars)}).
+ *
+ * <p>Results are {@link Comparable}, but no ordering is defined here; {@code compareTo()} must
+ * be supplied by the concrete subclass.
+ */
+ public static abstract class Result implements Comparable<Result> {
+ private final RbPath key;
+
+ /**
+ * Creates a result for the given key.
+ *
+ * @param key the (non-null) key of this result.
+ */
+ protected Result(RbPath key) {
+ this.key = checkNotNull(key);
+ }
+
+ /**
+ * Returns the key of this result, used to group results and determine fallback values
+ * according to the semantics of the chosen transformer.
+ */
+ public RbPath getKey() {
+ return key;
+ }
+
+ /**
+ * Returns whether the values in this result should be grouped or not. Un-grouped values
+ * should be considered as individual values in a sequence and might be joined with values
+ * from other results in the same group. Grouped values cannot be split and must appear
+ * as a single value.
+ *
+ * <p>For example for the ordered results:
+ * <pre>
+ * Result X = { key=K, values=[ "a", "b" ], grouped=false }
+ * Result Y = { key=K, values=[ "c", "d" ], grouped=false }
+ * Result Z = { key=K, values=[ "e" ], grouped=false }
+ * </pre>
+ * the values for key {@code K} are conceptually {@code [ "a", "b", "c", "d", "e" ]}.
+ *
+ * <p>However if result {@code Y} has {@code grouped=true} then there are now 4 values
+ * {@code [ "a", "b", ["c", "d"], "e" ]}, and if {@code X} is also grouped, then it is
+ * {@code [ ["a", "b"], ["c", "d"], "e" ]}, producing only 3 top-level values.
+ */
+ public abstract boolean isGrouped();
+
+ /**
+ * Returns the transformed values of this result, which may or may not be grouped
+ * according to {@link #isGrouped()}.
+ */
+ public abstract ImmutableList<String> getValues();
+
+ /**
+ * Returns whether this result is a fallback for some existing matched result. A fallback
+ * result should only be used when it is not a fallback for any existing result.
+ *
+ * @param r an existing result to test against.
+ */
+ public abstract boolean isFallbackFor(Result r);
+
+ /** Debug only string representation. */
+ @Override
+ public final String toString() {
+ return String.format(
+ "Result{ key='%s', grouped=%s, values=%s }",
+ getKey(), isGrouped(), getValues());
+ }
+ }
+
+ /**
+ * A "typedef" for the function to do late binding of dynamic variables. This is used for edge
+ * cases where a %N variable in the rules config is bound to a CLDR path (e.g. "//foo/bar")
+ * which cannot be resolved until the rule is evaluated. Unfortunately the need to support late
+ * binding of variables incurs significant additional complexity in the code, despite being
+ * used in exactly one situation so far (the '%D' variable to represent the default numbering
+ * scheme).
+ */
+ // TODO: Figure out how to get rid of all of this mess.
+ public interface DynamicVars extends Function<CldrPath, String> {}
+
+ /**
+ * Transforms a CLDR value into a sequence of results (empty if the value was not matched by
+ * any rule).
+ *
+ * @param cldrValue the value to transform.
+ * @return the transformed result(s).
+ */
+ public abstract ImmutableList<Result> transform(CldrValue cldrValue);
+
+ /**
+ * Transforms a CLDR value into a sequence of results (empty if the value was not matched by
+ * any rule). The dynamic variable function provides any "late bound" CLDR path variables to be
+ * resolved from CLDR data during processing (e.g. "%D=//ldml/numbers/defaultNumberingSystem").
+ *
+ * @param cldrValue the value to transform.
+ * @param varFn a function for resolving "late bound" variables.
+ * @return the transformed result(s).
+ */
+ public abstract ImmutableList<Result> transform(CldrValue cldrValue, DynamicVars varFn);
+
+ /**
+ * Returns a possibly empty sequence of fallback results for a given key. A fallback result for
+ * a key should be used only if it is not a fallback for any other result with that key; see
+ * also {@link Result#isFallbackFor(Result)}.
+ *
+ * @param key the key for which fallback results are wanted.
+ * @param varFn a function for resolving "late bound" variables.
+ * @return the fallback result(s) for the key.
+ */
+ public abstract ImmutableList<Result> getFallbackResultsFor(RbPath key, DynamicVars varFn);
+}
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu;
+
+import static com.google.common.base.CharMatcher.whitespace;
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkState;
+import static com.google.common.collect.ImmutableList.toImmutableList;
+
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.Objects;
+import java.util.function.Function;
+
+import com.google.common.base.CharMatcher;
+import com.google.common.base.Splitter;
+import com.google.common.collect.Comparators;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Iterables;
+
+/**
+ * A resource bundle path, used to identify entries in ICU data.
+ *
+ * <p>A path is a sequence of non-empty segments. Each segment is validated on construction and
+ * is either a hidden label (of the form {@code <...>}), a double-quoted segment or an unquoted
+ * segment.
+ *
+ * <p>Immutable and thread safe.
+ */
+public final class RbPath implements Comparable<RbPath> {
+    private static final Splitter PATH_SPLITTER = Splitter.on('/').trimResults();
+
+    // This defines ordering of paths in IcuData instances and thus the order in ICU data files.
+    // If there's ever a reason to have a different "natural" order for paths, this Comparator
+    // should be moved into the ICU file writer class(es).
+    private static final Comparator<RbPath> ORDERING =
+        Comparator.comparing(
+            p -> p.segments,
+            Comparators.lexicographical(Comparator.<String>naturalOrder()));
+
+    // Matches the definition of invariant characters in "uinvchar.cpp". We can make this all much
+    // faster if needed with a custom matcher (it's just a 128 way bit lookup via 2 longs).
+    private static final CharMatcher INVARIANT_CHARS =
+        CharMatcher.ascii().and(CharMatcher.anyOf("!#$@[\\]^`{|}~").negate());
+
+    // Note that we must also prohibit double-quote from appearing anywhere other than surrounding
+    // segment values. This is because some segment values can contain special ICU data characters
+    // (e.g. ':') but must be treated as literals. There is no proper "escaping" mechanism in ICU
+    // data for key values (since '\' is not an invariant, things like \\uxxxx are not possible).
+    //
+    // Ideally quoting would be done when the file is written, but that would require additional
+    // complexity in RbPath, since suffixes like ":intvector" must not be quoted and must somehow
+    // be distinguished from timezone "metazone" names which also contain ':'.
+    private static final CharMatcher QUOTED_SEGMENT_CHARS =
+        INVARIANT_CHARS
+            .and(CharMatcher.javaIsoControl().negate())
+            .and(CharMatcher.isNot('"'));
+    private static final CharMatcher UNQUOTED_SEGMENT_CHARS =
+        QUOTED_SEGMENT_CHARS.and(whitespace().negate());
+
+    // Characters allowed in path segments which separate the "base name" from any suffix (e.g.
+    // the base name of "Foo:intvector" is "Foo").
+    private static final CharMatcher SEGMENT_SEPARATORS = CharMatcher.anyOf("%:");
+
+    private static final RbPath EMPTY = new RbPath(ImmutableList.of());
+
+    /** Returns the shared empty path (a path with no segments). */
+    public static RbPath empty() {
+        return EMPTY;
+    }
+
+    /**
+     * Returns a path with the given segments.
+     *
+     * @throws IllegalArgumentException if any segment is empty or otherwise invalid.
+     */
+    public static RbPath of(String... segments) {
+        return of(Arrays.asList(segments));
+    }
+
+    /**
+     * Returns a path with the given segments.
+     *
+     * @throws IllegalArgumentException if any segment is empty or otherwise invalid.
+     */
+    public static RbPath of(Iterable<String> segments) {
+        return new RbPath(segments);
+    }
+
+    /**
+     * Parses a '/' separated path string into a path. A single leading '/' is permitted and
+     * ignored, and whitespace around each segment is trimmed.
+     *
+     * @throws IllegalArgumentException if the string is empty, or if any resulting segment is
+     *     empty or otherwise invalid.
+     */
+    public static RbPath parse(String path) {
+        checkArgument(!path.isEmpty(), "cannot parse an empty path string");
+        // Allow leading '/', but don't allow empty segments anywhere else.
+        if (path.startsWith("/")) {
+            path = path.substring(1);
+        }
+        return new RbPath(PATH_SPLITTER.split(path));
+    }
+
+    /** Returns the number of leading segments which the two given paths have in common. */
+    static int getCommonPrefixLength(RbPath lhs, RbPath rhs) {
+        int maxLength = Math.min(lhs.length(), rhs.length());
+        int n = 0;
+        while (n < maxLength && lhs.getSegment(n).equals(rhs.getSegment(n))) {
+            n++;
+        }
+        return n;
+    }
+
+    private final ImmutableList<String> segments;
+    private final int hashCode;
+
+    // Copies and validates the given segments; all factory methods funnel through here.
+    private RbPath(Iterable<String> segments) {
+        this.segments = ImmutableList.copyOf(segments);
+        this.hashCode = Objects.hash(this.segments);
+        for (String segment : this.segments) {
+            checkArgument(!segment.isEmpty(),
+                "empty path segments not permitted: %s", this.segments);
+            // Either the label is quoted (e.g. "foo") or it is bare (e.g. foo) but it can only
+            // contain double quotes at either end, or not at all. If the string is quoted, only
+            // validate the content, and not the quotes themselves.
+            switch (segment.charAt(0)) {
+            case '<':
+                // Allow anything in hidden labels, since they will be removed later and never
+                // appear in the final ICU data.
+                checkArgument(segment.endsWith(">"),
+                    "mismatched quoting for hidden label: %s", segment);
+                break;
+
+            case '"':
+                // Both quotes must be present. Without the length test a lone '"' would pass
+                // the endsWith() check and then fail in substring() with an unrelated
+                // StringIndexOutOfBoundsException.
+                checkArgument(segment.length() > 1 && segment.endsWith("\""),
+                    "mismatched quoting for segment: %s", segment);
+                checkArgument(
+                    QUOTED_SEGMENT_CHARS.matchesAllOf(
+                        segment.substring(1, segment.length() - 1)),
+                    "invalid character in quoted resource bundle path segment: %s", segment);
+                break;
+
+            default:
+                checkArgument(
+                    UNQUOTED_SEGMENT_CHARS.matchesAllOf(segment),
+                    "invalid character in unquoted resource bundle path segment: %s", segment);
+                break;
+            }
+        }
+    }
+
+    /** Returns the number of segments in this path (zero for the empty path). */
+    public int length() {
+        return segments.size();
+    }
+
+    /** Returns the Nth (zero based) segment of this path. */
+    public String getSegment(int n) {
+        return segments.get(n);
+    }
+
+    /**
+     * Returns this path without its last segment.
+     *
+     * @throws IllegalStateException if this is the empty path.
+     */
+    public RbPath getParent() {
+        checkState(length() > 0, "cannot get parent of the empty path");
+        return length() > 1 ? new RbPath(segments.subList(0, length() - 1)) : EMPTY;
+    }
+
+    /** Returns whether the last segment of this path is a hidden label (starts with '<'). */
+    public boolean isAnonymous() {
+        return length() > 0 && segments.get(length() - 1).charAt(0) == '<';
+    }
+
+    /** Returns a new path consisting of this path followed by the given segments. */
+    public RbPath extendBy(String... parts) {
+        return new RbPath(Iterables.concat(segments, Arrays.asList(parts)));
+    }
+
+    /** Returns a new path consisting of this path followed by the segments of the given path. */
+    public RbPath extendBy(RbPath suffix) {
+        return new RbPath(Iterables.concat(segments, suffix.segments));
+    }
+
+    /**
+     * Returns a new path in which each segment is the result of applying the given function to
+     * the corresponding segment of this path. The new segments are validated in the same way as
+     * for any other path.
+     */
+    public RbPath mapSegments(Function<? super String, String> fn) {
+        return new RbPath(segments.stream().map(fn).collect(toImmutableList()));
+    }
+
+    /**
+     * Returns whether the first element of this path is prefixed by the given "base name".
+     *
+     * <p>Resource bundle paths relating to semantically similar data are typically grouped by the
+     * same first path element. This is not as simple as just comparing the first element, as in
+     * {@code path.startsWith(prefix)} however, since path elements can have suffixes, such as
+     * {@code "Foo:alias"} or {@code "Foo%subtype"}.
+     *
+     * @param baseName the base name to test for (non-empty, and without any '%' or ':').
+     * @return true if the "base name" of the first path element is the given prefix.
+     */
+    public boolean hasPrefix(String baseName) {
+        checkArgument(!baseName.isEmpty() && SEGMENT_SEPARATORS.matchesNoneOf(baseName));
+        if (length() == 0) {
+            return false;
+        }
+        String firstElement = getSegment(0);
+        // Slightly subtle (but safe) access to the separator character, since:
+        // (!a.equals(b) && a.startsWith(b)) ==> a.length() > b.length().
+        return firstElement.equals(baseName)
+            || (firstElement.startsWith(baseName)
+                && SEGMENT_SEPARATORS.matches(firstElement.charAt(baseName.length())));
+    }
+
+    /** Returns whether the leading segments of this path equal the given path. */
+    public boolean startsWith(RbPath prefix) {
+        return prefix.length() <= length() && matchesSublist(prefix, 0);
+    }
+
+    /** Returns whether the trailing segments of this path equal the given path. */
+    public boolean endsWith(RbPath suffix) {
+        return suffix.length() <= length() && matchesSublist(suffix, length() - suffix.length());
+    }
+
+    /** Returns whether the given path occurs as a contiguous run of segments in this path. */
+    public boolean contains(RbPath path) {
+        int maxOffset = length() - path.length();
+        for (int i = 0; i <= maxOffset; i++) {
+            if (matchesSublist(path, i)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    // Assume length check has been done.
+    private boolean matchesSublist(RbPath path, int offset) {
+        for (int i = 0; i < path.length(); i++) {
+            if (!path.getSegment(i).equals(getSegment(i + offset))) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    // Returns whether the last segment ends with ":int" or ":intvector". The empty path has no
+    // last segment and so is never an "int path".
+    boolean isIntPath() {
+        if (segments.isEmpty()) {
+            return false;
+        }
+        String lastElement = segments.get(segments.size() - 1);
+        return lastElement.endsWith(":int") || lastElement.endsWith(":intvector");
+    }
+
+    @Override public int compareTo(RbPath other) {
+        return ORDERING.compare(this, other);
+    }
+
+    @Override public boolean equals(Object other) {
+        return (other instanceof RbPath) && segments.equals(((RbPath) other).segments);
+    }
+
+    @Override public int hashCode() {
+        return hashCode;
+    }
+
+    /** Returns the segments of this path joined by '/' (with no leading '/'). */
+    @Override public String toString() {
+        return String.join("/", segments);
+    }
+}
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu;
+
+import static com.google.common.base.Preconditions.checkArgument;
+
+import java.util.Arrays;
+import java.util.Objects;
+import java.util.function.Function;
+
+import com.google.common.collect.ImmutableList;
+
+/**
+ * A resource bundle value containing a sequence of elements. This is a very thin wrapper over an
+ * immutable list, with a few additional constraints (e.g. cannot be empty).
+ *
+ * <p>Immutable and thread safe.
+ */
+public final class RbValue {
+ private final ImmutableList<String> elements;
+
+ /** Returns a resource bundle value of the given elements. */
+ public static RbValue of(String... elements) {
+ return of(Arrays.asList(elements));
+ }
+
+ /** Returns a resource bundle value of the given elements. */
+ public static RbValue of(Iterable<String> elements) {
+ return new RbValue(elements);
+ }
+
+ private RbValue(Iterable<String> elements) {
+ this.elements = ImmutableList.copyOf(elements);
+ checkArgument(!this.elements.isEmpty(), "Resource bundle values cannot be empty");
+ }
+
+ /** Returns the (non zero) number of elements in this value. */
+ public int size() {
+ return elements.size();
+ }
+
+ /** Returns the Nth element of this value. */
+ public String getElement(int n) {
+ return elements.get(n);
+ }
+
+ @Override public int hashCode() {
+ return Objects.hashCode(elements);
+ }
+
+ @Override public boolean equals(Object obj) {
+ return obj instanceof RbValue && elements.equals(((RbValue) obj).elements);
+ }
+
+ @Override public String toString() {
+ return elements.toString();
+ }
+}
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu;
+
+import static com.google.common.base.CharMatcher.whitespace;
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkNotNull;
+import static com.google.common.base.Preconditions.checkState;
+import static com.google.common.collect.ImmutableMap.toImmutableMap;
+import static java.util.function.Function.identity;
+import static org.unicode.cldr.api.AttributeKey.keyOf;
+import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.function.Function;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Stream;
+
+import org.unicode.cldr.api.AttributeKey;
+import org.unicode.cldr.api.CldrData;
+
+import com.google.common.base.Ascii;
+import com.google.common.base.Splitter;
+import com.google.common.base.Strings;
+import com.google.common.collect.HashBasedTable;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.ImmutableTable;
+import com.google.common.collect.Table;
+
+/**
+ * Auxiliary APIs for processing locale IDs and other supplemental data needed by business logic
+ * in some mapper classes.
+ *
+ * When a {@link SupplementalData} instance is used in a mapper class, it is imperative that it is
+ * built using the same underlying CLDR data. The only reason mapper classes do not create their
+ * own instances directly is the relative cost of processing all the supplemental data each time.
+ */
+// TODO: This should be moved into the API and leverage some of the existing utility functions.
+public final class SupplementalData {
+ // Matches a 4-letter subtag in "Xxxx" form (one uppercase letter followed by three lowercase).
+ private static final Pattern SCRIPT_SUBTAG = Pattern.compile("[A-Z][a-z]{3}");
+
+ // The path matchers and attribute keys below are used by create() to pick out the supplemental
+ // data values of interest and to read their attributes.
+
+ // Alias elements (any element name) carrying a "type" attribute.
+ private static final PathMatcher ALIAS =
+ PathMatcher.of("supplementalData/metadata/alias/*[@type=*]");
+
+ // Parent locale elements: "parent" is a single value, "locales" is read as a list.
+ private static final PathMatcher PARENT_LOCALE =
+ PathMatcher.of("supplementalData/parentLocales/parentLocale[@parent=*]");
+ private static final AttributeKey PARENT = keyOf("parentLocale", "parent");
+ private static final AttributeKey LOCALES = keyOf("parentLocale", "locales");
+
+ // Calendar preference elements: both "territories" and "ordering" are read as lists.
+ // NOTE(review): "CALENDER" is a misspelling of "CALENDAR"; the names are left unchanged here
+ // because they are referenced from other parts of this class.
+ private static final PathMatcher CALENDER_PREFERENCE =
+ PathMatcher.of("supplementalData/calendarPreferenceData/calendarPreference[@territories=*]");
+ private static final AttributeKey CALENDER_TERRITORIES =
+ keyOf("calendarPreference", "territories");
+ private static final AttributeKey CALENDER_ORDERING =
+ keyOf("calendarPreference", "ordering");
+
+ // Likely subtag elements, mapping a "from" locale ID to a "to" locale ID.
+ private static final PathMatcher LIKELY_SUBTAGS =
+ PathMatcher.of("supplementalData/likelySubtags/likelySubtag[@from=*]");
+ private static final AttributeKey SUBTAG_FROM = keyOf("likelySubtag", "from");
+ private static final AttributeKey SUBTAG_TO = keyOf("likelySubtag", "to");
+
+ // Splits whitespace separated lists, discarding any empty items.
+ private static final Splitter LIST_SPLITTER =
+ Splitter.on(whitespace()).omitEmptyStrings();
+
+ // Aliases come in three flavours. Note that the TERRITORY aliases map to a _list_ rather than
+ // a single value (it's structurally always a list, but only territory aliases have a need for
+ // more than one value).
+ private enum Alias {
+ LANGUAGE, SCRIPT, TERRITORY;
+
+ private static final ImmutableMap<String, Alias> TYPE_MAP =
+ Arrays.stream(values())
+ .collect(toImmutableMap(a -> Ascii.toLowerCase(a.name()) + "Alias", identity()));
+
+ private final String elementName = Ascii.toLowerCase(name()) + "Alias";
+ final AttributeKey typeKey = AttributeKey.keyOf(elementName, "type");
+ final AttributeKey replacementKey = AttributeKey.keyOf(elementName, "replacement");
+
+ static Optional<Alias> forElementName(String name) {
+ return Optional.ofNullable(TYPE_MAP.get(name));
+ }
+ }
+
+ /**
+  * Builds a supplemental data API instance by visiting the given raw CLDR supplemental data
+  * and collecting the alias, parent locale, calendar preference and likely subtag values.
+  *
+  * @param supplementalData the raw CLDR supplemental data instance.
+  * @return the supplemental data API.
+  */
+ static SupplementalData create(CldrData supplementalData) {
+     Table<Alias, String, String> aliases = HashBasedTable.create();
+     Map<String, String> parentLocales = new HashMap<>();
+     Map<String, String> defaultCalendars = new HashMap<>();
+     Map<String, String> likelySubtags = new HashMap<>();
+
+     supplementalData.accept(
+         ARBITRARY,
+         value -> {
+             if (ALIAS.matches(value.getPath())) {
+                 // Territory alias replacements can be a list of values (e.g. when countries
+                 // break up). The replacement attribute is stored here exactly as given and
+                 // is only split into its individual values when it is used. Lists don't
+                 // occur for languages or scripts, but could in theory.
+                 Alias.forElementName(value.getPath().getName()).ifPresent(
+                     alias -> {
+                         String type = alias.typeKey.valueFrom(value);
+                         String replacement = alias.replacementKey.valueFrom(value);
+                         aliases.put(alias, type, replacement);
+                     });
+                 return;
+             }
+             if (PARENT_LOCALE.matches(value.getPath())) {
+                 // Every listed locale maps to the same parent.
+                 String parent = PARENT.valueFrom(value);
+                 for (String child : LOCALES.listOfValuesFrom(value)) {
+                     parentLocales.put(child, parent);
+                 }
+                 return;
+             }
+             if (CALENDER_PREFERENCE.matches(value.getPath())) {
+                 // Only the first calendar in the ordering is kept for each territory.
+                 String calendar = CALENDER_ORDERING.listOfValuesFrom(value).get(0);
+                 for (String territory : CALENDER_TERRITORIES.listOfValuesFrom(value)) {
+                     defaultCalendars.put(territory, calendar);
+                 }
+                 return;
+             }
+             if (LIKELY_SUBTAGS.matches(value.getPath())) {
+                 likelySubtags.put(SUBTAG_FROM.valueFrom(value), SUBTAG_TO.valueFrom(value));
+             }
+         });
+
+     // WARNING: The original mapper code determines the full set of deprecated territories and
+     // then removes the following hard-coded list without any explanation as to why. While this
+     // is presumably to "undeprecate" them for the purposes of the locale processing, there's
+     // no explanation of where this list comes from, and thus no way to maintain it.
+     //
+     // asList("062", "172", "200", "830", "AN", "CS", "QU")
+     // .forEach(t -> aliasTable.remove(Alias.TERRITORY, t));
+     // TODO: Understand and document what on Earth this is all about or delete this comment.
+
+     return new SupplementalData(aliases, parentLocales, defaultCalendars, likelySubtags);
+ }
+
+ // A simple-as-possible, mutable, locale ID data "struct" to handle the IDs used during ICU
+ // data generation. Because this is mutable, it is thoroughly unsuitable for general use.
+ private static final class LocaleId {
+ // From: https://unicode.org/reports/tr35/#Identifiers
+ // Locale ID is:
+ // (<language>(_<script>)?|<script>)(_<region>)?(_<variant>)*
+ //
+ // However in CLDR data, there's always a language (even if it's "und"), and never more
+ // than one variant, so this can be simplified to:
+ // <language>(_<script>)?(_<region>)?(_<variant>)?
+ //
+ // * Required language is lowercase 2 or 3 letter language ID (e.g. "en", "gsw").
+ // Note that the specification allows for languages 5-8 characters long, but in reality
+ // this has never occurred yet, so it's ignored in this code.
+ //
+ // * Script is 4-letter Xxxx script identifier (e.g. "Latn").
+ // The specification permits any casing for script subtags, but since all the data uses
+ // the capitalized "Xxxx" form, that's what this code expects.
+ //
+ // * Region is the uppercase 2-letter CLDR region code ("GB") or the 3-digit numeric
+ // identifier (e.g. "001").
+ //
+ // * Variants are a bit complex; either 5-8 length alphanumerics, or length 4 but starting
+ // with a digit (this avoids any ambiguity with script subtags). However because ICU
+ // violates this rule by using "TRADITIONAL" (11-letters) the length restriction is
+ // merely "longer than 5".
+ //
+ // Finaly, CLDR data only uses an '_' as the separator, whereas the specification allows
+ // for either '-' or '_').
+ //
+ // The regex for unambiguously capturing the parts of a locale ID from the CLDR data is:
+ private static final Pattern LOCALE_ID =
+ Pattern.compile("([a-z]{2,3})"
+ + "(?:_([A-Z][a-z]{3}))?"
+ + "(?:_([A-Z]{2}|[0-9]{3}))?"
+ + "(?:_([a-zA-Z]{5,}|[0-9][a-zA-Z0-9]{3}))?");
+
+ static LocaleId parse(String localeId) {
+ Matcher m = LOCALE_ID.matcher(checkNotNull(localeId, "locale ID cannot be null"));
+ checkArgument(m.matches(), "invalid locale ID: %s", localeId);
+ return of(m.group(1), m.group(2), m.group(3)).setVariant(m.group(4));
+ }
+
+ static LocaleId of(String language, String script, String region) {
+ return new LocaleId().setLanguage(language).setScript(script).setRegion(region);
+ }
+
+ // Only the language subtag is non-nullable.
+ private String languageSubtag;
+ private String scriptSubtag;
+ private String regionSubtag;
+ private String variantSubtag;
+
+ String getLanguage() {
+ return languageSubtag;
+ }
+
+ String getScript() {
+ return scriptSubtag;
+ }
+
+ String getRegion() {
+ return regionSubtag;
+ }
+
+ String getVariant() {
+ return variantSubtag;
+ }
+
+ LocaleId setLanguage(String languageSubtag) {
+ checkNotNull(languageSubtag, "language subtag must not be null");
+ checkArgument(!languageSubtag.isEmpty(), "language subtag must not be empty");
+ this.languageSubtag = languageSubtag;
+ return this;
+ }
+
+ LocaleId setScript(String scriptSubtag) {
+ this.scriptSubtag = Strings.emptyToNull(scriptSubtag);
+ return this;
+ }
+
+ LocaleId setRegion(String regionSubtag) {
+ this.regionSubtag = Strings.emptyToNull(regionSubtag);
+ return this;
+ }
+
+ LocaleId setVariant(String variantSubtag) {
+ this.variantSubtag = Strings.emptyToNull(variantSubtag);
+ return this;
+ }
+
+ @Override public String toString() {
+ StringBuilder id = new StringBuilder(languageSubtag);
+ if (scriptSubtag != null) {
+ id.append("_").append(scriptSubtag);
+ }
+ if (regionSubtag != null) {
+ id.append("_").append(regionSubtag);
+ }
+ if (variantSubtag != null) {
+ id.append("_").append(variantSubtag);
+ }
+ return id.toString();
+ }
+
+ @Override public boolean equals(Object o) {
+ if (!(o instanceof LocaleId)) {
+ return false;
+ }
+ LocaleId other = (LocaleId) o;
+ return Objects.equals(languageSubtag, other.languageSubtag)
+ && Objects.equals(scriptSubtag, other.scriptSubtag)
+ && Objects.equals(regionSubtag, other.regionSubtag)
+ && Objects.equals(variantSubtag, other.variantSubtag);
+ }
+
+ @Override public int hashCode() {
+ return Objects.hash(languageSubtag, scriptSubtag, regionSubtag, variantSubtag);
+ }
+ }
+
+ // Alias kind and alias "type" value mapped to the raw "replacement" attribute value.
+ private final ImmutableTable<Alias, String, String> aliasTable;
+ // Locale ID mapped to its explicitly specified parent locale ID.
+ private final ImmutableMap<String, String> parentLocaleMap;
+ // Territory mapped to the first calendar in its calendar preference ordering.
+ private final ImmutableMap<String, String> defaultCalendarMap;
+ // Likely subtag "from" value mapped to its "to" value.
+ private final ImmutableMap<String, String> likelySubtagMap;
+
+ // Takes immutable copies of the data collected by create().
+ private SupplementalData(
+ Table<Alias, String, String> aliasTable,
+ Map<String, String> parentLocaleMap,
+ Map<String, String> defaultCalendarMap,
+ Map<String, String> likelySubtagMap) {
+ this.aliasTable = ImmutableTable.copyOf(aliasTable);
+ this.parentLocaleMap = ImmutableMap.copyOf(parentLocaleMap);
+ this.defaultCalendarMap = ImmutableMap.copyOf(defaultCalendarMap);
+ this.likelySubtagMap = ImmutableMap.copyOf(likelySubtagMap);
+ }
+
+ /**
+  * Returns the "maximized" form of the given locale ID, obtained by adding likely subtags where
+  * possible.
+  */
+ public Optional<String> maximize(String localeId) {
+     return addLikelySubtags(localeId).map(maximized -> maximized.toString());
+ }
+
+ /**
+  * Returns the given locale ID with any deprecated elements replaced. This implements the
+  * algorithm specified in
+  * <a href="http://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers">the LDML
+  * specification</a> but without any "minimizing" of the final result (as happens for
+  * canonicalization in the CLDR tools). The ID "root" is returned unchanged.
+  */
+ public String replaceDeprecatedTags(String localeId) {
+     if (localeId.equals("root")) {
+         return localeId;
+     }
+     LocaleId id = LocaleId.parse(localeId);
+
+     // ---- LDML Specification ----
+     // If the region subtag matches the type attribute of a territoryAlias element in
+     // Supplemental Data, replace the region subtag with the replacement value, as follows:
+     //
+     // * If there is a single territory in the replacement, use it.
+     // * If there are multiple territories:
+     //   * Look up the most likely territory for the base language code (and script, if there
+     //     is one).
+     //   * If that likely territory is in the list, use it.
+     //   * Otherwise, use the first territory in the list.
+     // ----
+     // However there is a footnote that says:
+     //   Formally, replacement of multiple territories uses Section 4.3 Likely Subtags.
+     //   However, there are a small number of cases of multiple territories, so the mappings
+     //   can be precomputed. This results in a faster lookup with a very small subset of the
+     //   likely subtags data.
+     //
+     // Note that (contrary to the order implied by the LDML specification) this step is
+     // performed _before_ the language alias lookup. This is to allow IDs such as "sr_YU" to
+     // work, where "YU" should be replaced with "RS" and _then_ "sr_RS" is expanded to
+     // "sr_Cyrl_RS" by the language alias lookup. In the other order, you just get "sr_RS" out.
+     //
+     // TODO: Can we simplify this by just using "addLikelySubtags()" when region is missing?
+     String region = id.getRegion();
+     String replacementRegions =
+         region != null ? aliasTable.get(Alias.TERRITORY, region) : null;
+     if (replacementRegions != null) {
+         List<String> regions = LIST_SPLITTER.splitToList(replacementRegions);
+         checkArgument(!regions.isEmpty(), "invalid empty region list for %s", localeId);
+         // Default to the first region; this is only overridden when there are several
+         // candidates and the likely region for the language (and script) is among them.
+         String newRegion = regions.get(0);
+         if (regions.size() > 1) {
+             LocaleId key = LocaleId.of(id.getLanguage(), id.getScript(), null);
+             String likelyId = likelySubtagMap.get(key.toString());
+             if (likelyId == null) {
+                 // Retry with the language alone.
+                 likelyId = likelySubtagMap.get(key.setScript(null).toString());
+             }
+             String likelyRegion =
+                 likelyId != null ? LocaleId.parse(likelyId).getRegion() : null;
+             if (regions.contains(likelyRegion)) {
+                 newRegion = likelyRegion;
+             }
+         }
+         id.setRegion(newRegion);
+     }
+
+     // While it's not mentioned in the LDML specification, there is data in the alias table for
+     // replacement scripts (currently it contains exactly one entry with one value). Because
+     // it's not clear if this is intended to only be single values or a list (and how to handle
+     // it if it were a list), there's a hard check to ensure it's only ever a single value.
+     String script = id.getScript();
+     String replacementScript = script != null ? aliasTable.get(Alias.SCRIPT, script) : null;
+     if (replacementScript != null) {
+         checkArgument(whitespace().matchesNoneOf(replacementScript),
+             "unexpected list of replacement scripts: %s", replacementScript);
+         id.setScript(replacementScript);
+     }
+
+     // ---- LDML Specification ----
+     // If the language subtag matches the type attribute of a languageAlias element in
+     // Supplemental Data, replace the language subtag with the replacement value.
+     //
+     // If there are additional subtags in the replacement value, add them to the result, but
+     // only if there is no corresponding subtag already in the tag.
+     // ----
+     // Contrary to the precise wording of the specification, we don't just check the language
+     // subtag, since language aliases can contain script and even region information. Instead
+     // we check the alias table using the same order as defined in subtag maximizing:
+     //
+     //   <language>_<script>_<region>
+     //   <language>_<region>
+     //   <language>_<script>
+     //   <language>
+     //
+     // There is no need to check for "und" however since that's not aliased to anything, but
+     // since it shares the same code it's harmless to do.
+     resolveLocaleId(id, aliasKey -> aliasTable.get(Alias.LANGUAGE, aliasKey))
+         .ifPresent(
+             resolvedId -> {
+                 id.setLanguage(
+                     checkNotNull(
+                         resolvedId.getLanguage(),
+                         "missing language subtag in language alias: %s",
+                         resolvedId));
+                 // Subtags from the alias only fill in gaps; they never override a subtag
+                 // which is already present in the ID.
+                 if (id.getScript() == null) {
+                     id.setScript(resolvedId.getScript());
+                 }
+                 if (id.getRegion() == null) {
+                     id.setRegion(resolvedId.getRegion());
+                 }
+                 if (id.getVariant() == null) {
+                     id.setVariant(resolvedId.getVariant());
+                 }
+             });
+     return id.toString();
+ }
+
+ /**
+  * Returns a suitable default calendar for a given locale if it's different from the default
+  * calendar inferred by the locale's parent.
+  *
+  * <p>Note that since the default calendar data is keyed from territory (region subtag) rather
+  * than the complete locale ID, it is impossible to encode some real life cases (e.g. the fact
+  * that "ja_JP_TRADITIONAL" has a different default calendar to "ja_JP"). This is currently
+  * handled with hard-coded special casing, but should probably be data driven eventually.
+  *
+  * @param localeId the locale ID (or "root") for which the default calendar is wanted.
+  * @return the special-case calendar if one applies, the root calendar for "root", otherwise
+  *     the calendar of the locale's territory if it differs from the nearest calendar found
+  *     via the parent chain; empty if the territory has no calendar or nothing differs.
+  * @throws IllegalStateException if the locale's territory has a default calendar but the root
+  *     territory "001" has a missing or empty one.
+  */
+ public Optional<String> getDefaultCalendar(String localeId) {
+     Optional<String> specialCase = getSpecialCaseCalendar(localeId);
+     if (specialCase.isPresent()) {
+         return specialCase;
+     }
+     String calendar = defaultCalendarMap.get(territoryOf(localeId));
+     if (calendar == null) {
+         return Optional.empty();
+     }
+     // Check for null explicitly so a missing root entry is reported with a useful message
+     // rather than as a bare NullPointerException.
+     String rootCalendar = defaultCalendarMap.get("001");
+     checkState(rootCalendar != null && !rootCalendar.isEmpty(), "missing root calendar");
+     if (localeId.equals("root")) {
+         return Optional.of(rootCalendar);
+     }
+     // All locales reach "root" eventually, and that maps to territory "001" which
+     // we already know has a value, so this loop *must* exit.
+     String ancestorId = localeId;
+     String parentCalendar;
+     do {
+         ancestorId = getParent(ancestorId);
+         parentCalendar = defaultCalendarMap.get(territoryOf(ancestorId));
+     } while (parentCalendar == null);
+     return parentCalendar.equals(calendar) ? Optional.empty() : Optional.of(calendar);
+ }
+
+ // Workaround for the fact that CLDR data cannot express a default calendar which depends on
+ // anything other than the territory. Only two such cases exist at the moment (and more are
+ // unlikely), so they are simply hard-coded here against the maximized locale ID.
+ private Optional<String> getSpecialCaseCalendar(String localeId) {
+     // A null here just means the ID could not be maximized, which matches neither case.
+     String maximizedId = maximize(localeId).orElse(null);
+     if ("ja_Jpan_JP_TRADITIONAL".equals(maximizedId)) {
+         return Optional.of("japanese");
+     }
+     if ("th_Thai_TH_TRADITIONAL".equals(maximizedId)) {
+         return Optional.of("buddhist");
+     }
+     return Optional.empty();
+ }
+
+ /**
+  * Returns the parent of a non-root locale ID. This is more complex than simple truncation for
+  * two reasons:
+  * <ul>
+  *   <li>There may be an explicit parent locale ID specified in the CLDR data.
+  *   <li>Removal of non-default script subtags makes the parent locale "root" (unless there
+  *       was an explicit parent specified).
+  * </ul>
+  *
+  * <p>Note that all valid locale ID parent "chains" must end up at "root" eventually.
+  *
+  * <p>For example (showing parent "chains"):
+  * <ul>
+  *   <li>{@code en_GB} &rarr; {@code en_001} &rarr; {@code en} &rarr; {@code root}
+  *   <li>{@code en_Cyrl_RU} &rarr; {@code en_Cyrl} &rarr; {@code root}
+  * </ul>
+  *
+  * @param localeId a locale ID other than "root".
+  * @return the explicit parent if one is set in the CLDR data, otherwise the truncated ID, or
+  *     "root" if there is nothing left to truncate or a non-default script was removed.
+  * @throws IllegalStateException if the given locale ID is "root".
+  */
+ public String getParent(String localeId) {
+     checkState(!localeId.equals("root"), "cannot ask for parent of 'root' locale");
+     // Always defer to an explicit parent locale set in the CLDR data.
+     Optional<String> explicitParent = getExplicitParentLocaleOf(localeId);
+     if (explicitParent.isPresent()) {
+         return explicitParent.get();
+     }
+     // Now look for the start of the last ID "part" in order to truncate.
+     int lastPartSeparatorIndex = localeId.lastIndexOf('_');
+     // The parent of a base language ID (e.g. "en" or "fr") is always "root".
+     if (lastPartSeparatorIndex == -1) {
+         return "root";
+     }
+     String parentId = localeId.substring(0, lastPartSeparatorIndex);
+
+     // However, if the script of the locale is what's being truncated and it's NOT the default
+     // script for the language, return "root" as the parent rather than truncating.
+     String lastPart = localeId.substring(lastPartSeparatorIndex + 1);
+     if (SCRIPT_SUBTAG.matcher(lastPart).matches() && !lastPart.equals(scriptOf(parentId))) {
+         return "root";
+     }
+     return !parentId.isEmpty() ? parentId : "root";
+ }
+
+ /**
+ * Returns the explicit parent of a locale ID if specified in the CLDR data.
+ *
+ * Note that this method will not return a value for most locale IDs, since they do not have
+ * an explicit parent set. If you just want "normal" parent of a locale ID, use {@link
+ * #getParent(String)}.
+ */
+ public Optional<String> getExplicitParentLocaleOf(String localeId) {
+ return Optional.ofNullable(parentLocaleMap.get(localeId));
+ }
+
+ // Returns the likely region of a locale ID: "001" for "root", and "ZZ" if no likely subtags
+ // could be determined.
+ private String territoryOf(String localeId) {
+     if (localeId.equals("root")) {
+         return "001";
+     }
+     return addLikelySubtags(localeId).map(LocaleId::getRegion).orElse("ZZ");
+ }
+
+ // Returns the likely script of a locale ID, or "Zzzz" if no likely subtags could be determined.
+ private String scriptOf(String localeId) {
+     Optional<String> likelyScript = addLikelySubtags(localeId).map(LocaleId::getScript);
+     return likelyScript.orElse("Zzzz");
+ }
+
+ // From: https://unicode.org/reports/tr35/#Likely_Subtags
+ //
+ // Add Likely Subtags
+ // ------------------
+ // Given a source locale X, to return a locale Y where the empty subtags have been filled in
+ // by the most likely subtags. A subtag is called empty if it is a missing script or region
+ // subtag, or it is a base language subtag with the value "und".
+ //
+ // Canonicalize
+ // ------------
+ // Make sure the input locale is in canonical form ...
+ // ...
+ // Remove the script code 'Zzzz' and the region code 'ZZ' if they occur.
+ //
+ // Note that this implementation does not need to handle "grandfathered" tags.
+ //
+ // Returns empty for "root" and for IDs for which no likely subtags data can be found.
+ private Optional<LocaleId> addLikelySubtags(String localeId) {
+     if (localeId.equals("root")) {
+         return Optional.empty();
+     }
+
+     LocaleId id = LocaleId.parse(localeId);
+     // Per the specification, the "unknown" script (Zzzz) and region (ZZ) count as missing.
+     if ("Zzzz".equals(id.getScript())) {
+         id.setScript(null);
+     }
+     if ("ZZ".equals(id.getRegion())) {
+         id.setRegion(null);
+     }
+     // Per the specification, a subtag is "empty" if it is a missing script or region, or if
+     // it is the base language "und". With no empty subtags there is nothing to fill in.
+     boolean hasLanguage = !id.getLanguage().equals("und");
+     if (hasLanguage && id.getScript() != null && id.getRegion() != null) {
+         return Optional.of(id);
+     }
+     LocaleId likely = resolveLocaleId(id, likelySubtagMap::get).orElse(null);
+     if (likely == null) {
+         return Optional.empty();
+     }
+     checkArgument(!likely.getLanguage().equals("und"), "invalid subtags: %s", likely);
+     // Fill in only those parts of the original ID which were empty.
+     if (!hasLanguage) {
+         id.setLanguage(likely.getLanguage());
+     }
+     if (id.getScript() == null) {
+         id.setScript(checkNotNull(likely.getScript()));
+     }
+     if (id.getRegion() == null) {
+         id.setRegion(checkNotNull(likely.getRegion()));
+     }
+     // At this point the language is not "und" and both script and region are set.
+     return Optional.of(id);
+ }
+
+ // From: https://unicode.org/reports/tr35/#Likely_Subtags
+ //
+ // Lookup
+ // ------
+ // Lookup each of the following in order, and stop on the first match:
+ // <language>_<script>_<region>
+ // <language>_<region>
+ // <language>_<script>
+ // <language>
+ // "und"_<script>
+ //
+ // The given function performs the actual lookup of each candidate ID string, returning null
+ // when there is no match.
+ private Optional<LocaleId> resolveLocaleId(LocaleId id, Function<String, String> fn) {
+     String lang = id.getLanguage();
+     String script = id.getScript();
+     String region = id.getRegion();
+     Stream<LocaleId> languageCandidates = Stream.of(
+         LocaleId.of(lang, script, region),
+         LocaleId.of(lang, null, region),
+         LocaleId.of(lang, script, null),
+         LocaleId.of(lang, null, null));
+     // "und"_<script> is only a candidate when there is a script, since otherwise the lookup
+     // would be for "und" on its own (which maximizes to "en_Latn_US" and is not intended).
+     Stream<LocaleId> scriptCandidate =
+         (script != null) ? Stream.of(LocaleId.of("und", script, null)) : Stream.empty();
+     return Stream.concat(languageCandidates, scriptCandidate)
+         // Drop repeated candidates, keeping the first occurrence of each.
+         .distinct()
+         .map(Object::toString)
+         .map(fn)
+         .filter(Objects::nonNull)
+         .findFirst()
+         .map(LocaleId::parse);
+ }
+}
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.mapper;
+
+import static com.google.common.base.Ascii.toLowerCase;
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkNotNull;
+import static com.google.common.base.Preconditions.checkState;
+import static org.unicode.cldr.api.AttributeKey.keyOf;
+import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY;
+import static org.unicode.cldr.api.CldrDataType.BCP47;
+
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Optional;
+import java.util.Set;
+
+import javax.annotation.Nullable;
+
+import org.unicode.cldr.api.AttributeKey;
+import org.unicode.cldr.api.CldrData.PrefixVisitor;
+import org.unicode.cldr.api.CldrData.ValueVisitor;
+import org.unicode.cldr.api.CldrDataSupplier;
+import org.unicode.cldr.api.CldrDataType;
+import org.unicode.cldr.api.CldrPath;
+import org.unicode.cldr.api.CldrValue;
+
+import com.google.common.base.Ascii;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Sets;
+import org.unicode.icu.tool.cldrtoicu.IcuData;
+import org.unicode.icu.tool.cldrtoicu.PathMatcher;
+import org.unicode.icu.tool.cldrtoicu.RbPath;
+
+/**
+ * A mapper to collect BCP-47 data from {@link CldrDataType#BCP47 BCP47} data under paths
+ * matching:
+ * <pre>{@code
+ * //ldmlBCP47/keyword/key[@name=*]/type[@name=*]
+ * }</pre>
+ */
+public final class Bcp47Mapper {
+ // Other attributes (e.g. "alias") are value attributes and don't need to be matched here.
+ private static final PathMatcher KEY = PathMatcher.of("ldmlBCP47/keyword/key[@name=*]");
+ private static final AttributeKey KEY_NAME = keyOf("key", "name");
+ private static final AttributeKey KEY_ALIAS = keyOf("key", "alias");
+ private static final AttributeKey KEY_VALUE_TYPE = keyOf("key", "valueType");
+
+ private static final PathMatcher TYPE = PathMatcher.of("type[@name=*]");
+ private static final AttributeKey TYPE_NAME = keyOf("type", "name");
+ private static final AttributeKey TYPE_ALIASES = keyOf("type", "alias");
+ private static final AttributeKey PREFERRED_TYPE_NAME = keyOf("type", "preferred");
+
+ // Deprecation of the data is not the same as deprecation of attributes themselves. This
+ // deprecation relates to identifying data which exists, but is not longer the right way to
+ // represent things (which means it can be important for clients to know about).
+ private static final AttributeKey KEY_DEPRECATED = keyOf("key", "deprecated");
+ private static final AttributeKey TYPE_DEPRECATED = keyOf("type", "deprecated");
+
+ // Attributes that can be emitted under the /keyInfo or /typeInfo paths for auxiliary
+ // information in the ICU data. If the value is equal to the declared default, it is ignored.
+ // NOTE: The need for hard-coded default values is a hack because there's not nice way (yet)
+ // to determine the default for implicit values via the DTD. Ideally this would be automatic
+ // and the AttributeKey class would be able to have a method like "isDefault(String value)".
+ private static final ImmutableMap<AttributeKey, String> INFO_ATTRIBUTES =
+ ImmutableMap.of(KEY_VALUE_TYPE, "", KEY_DEPRECATED, "false", TYPE_DEPRECATED, "false");
+
+ private static final RbPath RB_KEYMAP = RbPath.of("keyMap");
+ private static final RbPath RB_TYPE_ALIAS = RbPath.of("typeAlias", "timezone:alias");
+ private static final RbPath RB_MAP_ALIAS = RbPath.of("typeMap", "timezone:alias");
+ private static final RbPath RB_BCP_ALIAS = RbPath.of("bcpTypeAlias", "tz:alias");
+
+ /**
+ * Processes data from the given supplier to generate Timezone and BCP-47 ICU data.
+ *
+ * @param src the CLDR data supplier to process.
+ * @return A list of IcuData instances containing BCP-47 data to be written to files.
+ */
+ public static ImmutableList<IcuData> process(CldrDataSupplier src) {
+ Bcp47Visitor visitor = new Bcp47Visitor();
+ src.getDataForType(BCP47).accept(ARBITRARY, visitor);
+ visitor.addKeyMapValues();
+ return ImmutableList.of(visitor.keyTypeData.icuData, visitor.tzData.icuData);
+ }
+
+ // Outer visitor which handles "key" paths by installing sub-visitor methods to process
+ // each child "type" element. Depending on the key name, values are stored in different
+ // IcuData instances.
+ private static final class Bcp47Visitor implements PrefixVisitor {
+ private final ValueCollector tzData =
+ new ValueCollector(new IcuData("timezoneTypes", false));
+ private final ValueCollector keyTypeData =
+ new ValueCollector(new IcuData("keyTypeData", false));
+
+ // The current key name from the parent path element (set when a prefix is matched).
+ @Nullable private String keyName = null;
+ // A map collecting each key and values as they are visited.
+ // TODO: Convert this to a Map<RbPath, String> which involves removing the '@' prefix hack.
+ private Map<String, String> keyMap = new LinkedHashMap<>();
+
+ @Override
+ public void visitPrefixStart(CldrPath prefix, Context ctx) {
+ if (KEY.matches(prefix)) {
+ // Don't inline this since it also sets the field!!
+ keyName = Ascii.toLowerCase(KEY_NAME.valueFrom(prefix));
+
+ // How the data is visited is the same for both timezone and other BCP-47 data,
+ // it's just split into different data files, so we just install a different
+ // instance of the visitor class according to where the data in this sub-hierarchy
+ // should end up.
+ ctx.install(keyName.equals("tz") ? tzData : keyTypeData);
+ }
+ }
+
+ // Post processing to add additional captured attribute values and some special cases.
+ private void addKeyMapValues() {
+ IcuData keyData = keyTypeData.icuData;
+ // Add all the keyMap values into the IcuData file.
+ for (Entry<String, String> kmData : keyMap.entrySet()) {
+ String bcpKey = kmData.getKey();
+ String key = kmData.getValue();
+ if (bcpKey.startsWith("@")) {
+ // Undoing the weird hack in addInfoAttributes(). This can be done better.
+ // We use "parse()" because these are full paths, and not single elements.
+ keyData.add(RbPath.parse(bcpKey.substring(1)), key);
+ continue;
+ }
+ if (bcpKey.equals(key)) {
+ // An empty value indicates that the BCP47 key is same as the legacy key.
+ bcpKey = "";
+ }
+ keyData.add(RB_KEYMAP.extendBy(key), bcpKey);
+ }
+ // Add aliases for timezone data.
+ keyData.add(RB_TYPE_ALIAS, "/ICUDATA/timezoneTypes/typeAlias/timezone");
+ keyData.add(RB_MAP_ALIAS, "/ICUDATA/timezoneTypes/typeMap/timezone");
+ keyData.add(RB_BCP_ALIAS, "/ICUDATA/timezoneTypes/bcpTypeAlias/tz");
+ }
+
+ private final class ValueCollector implements ValueVisitor {
+ // Mutable ICU data collected into during visitation.
+ private final IcuData icuData;
+
+ ValueCollector(IcuData data) {
+ this.icuData = checkNotNull(data);
+ }
+
+ @Override
+ public void visit(CldrValue value) {
+ checkArgument(TYPE.matchesSuffixOf(value.getPath()),
+ "unexpected child element: %s", value.getPath());
+ String typeName = TYPE_NAME.valueFrom(value);
+ // Note that if a "preferred" type exists, we treat the value specially and add
+ // it only as an alias. We expected values with a preferred replacement to
+ // always be explicitly deprecated.
+ Optional<String> prefName = PREFERRED_TYPE_NAME.optionalValueFrom(value);
+ if (prefName.isPresent()) {
+ checkState(KEY_DEPRECATED.booleanValueFrom(value, false)
+ || TYPE_DEPRECATED.booleanValueFrom(value, false),
+ "unexpected 'preferred' attribute for non-deprecated value: %s", value);
+ icuData.add(RbPath.of("bcpTypeAlias", keyName, typeName), prefName.get());
+ return;
+ }
+ // Note: There are some deprecated values which don't have a preferred
+ // replacement and these will be processed below (in particular we need to emit
+ // the fact that they are deprecated).
+
+ // According to the old mapper code, it's an error not to have an alias, but
+ // it's emitted via debug logging and not actually enforced.
+ // TODO: Consider making this an error if possible.
+ String keyAlias = toLowerCase(KEY_ALIAS.valueFrom(value, keyName));
+
+ keyMap.put(keyName, keyAlias);
+ RbPath typeMapPrefix = RbPath.of("typeMap", keyAlias);
+ List<String> typeAliases = TYPE_ALIASES.listOfValuesFrom(value);
+ if (typeAliases.isEmpty()) {
+ // Generate type map entry using empty value (an empty value indicates same
+ // type name is used for both BCP47 and legacy type).
+ icuData.add(typeMapPrefix.extendBy(typeName), "");
+ } else {
+ String mainAlias = typeAliases.get(0);
+ icuData.add(typeMapPrefix.extendBy(quoteAlias(mainAlias)), typeName);
+ // Put additional aliases as secondary aliases referencing the main alias.
+ RbPath typeAliasPrefix = RbPath.of("typeAlias", keyAlias);
+ typeAliases.stream()
+ .skip(1)
+ .map(Bcp47Visitor::quoteAlias)
+ .forEach(a -> icuData.add(typeAliasPrefix.extendBy(a), mainAlias));
+ }
+ addInfoAttributes(keyName, typeName, value.getValueAttributes());
+ }
+
+ // Add any additional attributes present to the attribute map. Note that this code was
+ // copied from largely undocumented code, and the precise reasoning for why this is
+ // needed or why it's done this way is not completely clear. It is very likely that it
+ // can be simplified.
+ //
+ // The '@' symbol added here is just a magic token that gets stripped off again in the
+ // addKeyMapValues() method, it appears to just be a way to distinguish keys added via
+ // this method vs during the visit method. A better approach might just be to have two
+ // maps.
+ // TODO: Remove the use of '@' and simplify the logic for "info" attributes (infoMap?).
+ private void addInfoAttributes(
+ String keyName, String typeName, ImmutableMap<AttributeKey, String> attributes) {
+ // Only emit deprecation for the "key" level, even if all types below that are also
+ // marked as deprecated. Only do this for a subset of attributes (INFO_ATTRIBUTES).
+ Set<AttributeKey> keys =
+ Sets.intersection(attributes.keySet(), INFO_ATTRIBUTES.keySet());
+ for (AttributeKey a : keys) {
+ String value = attributes.get(a);
+ // Skip empty or default values in attributes.
+ if (value.isEmpty() || INFO_ATTRIBUTES.get(a).equals(value)) {
+ continue;
+ }
+ // The ID for the xxxInfo paths in ICU is the path fragment at which the
+ // attribute exists. Since we only process complete paths here, we must do a
+ // bit of reconstruction based on the element name of the attribute we are
+ // processing. This relies on explicit knowledge that the paths are "<key>" or
+ // "<key>/<type>". This all gets less messy if we switch to RbPath.
+ String id =
+ a.getElementName().equals("key") ? keyName : keyName + "/" + typeName;
+ keyMap.put(
+ "@" + a.getElementName() + "Info/" + a.getAttributeName() + "/" + id,
+ value);
+ }
+ }
+ }
+
+ /**
+ * Escapes alias values containing '/' so they can appear in resource bundle paths. This
+ * function replaces '/' with ':' and quotes the result (e.g. foo/bar -> "foo:bar").
+ *
+ * <p>This is needed for timezone "metazone" ID strings which are of the form 'Foo/Bar'
+ * in the CLDR data.
+ */
+ // TODO: Switch to RbPath and do quoting automatically when ICU data is written out.
+ private static String quoteAlias(String str) {
+ return str.indexOf('/') == -1 ? str : '"' + str.replace('/', ':') + '"';
+ }
+ }
+
+ private Bcp47Mapper() {}
+}
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.mapper;
+
+import static org.unicode.cldr.api.AttributeKey.keyOf;
+import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY;
+import static org.unicode.cldr.api.CldrData.PathOrder.DTD;
+import static org.unicode.cldr.api.CldrDataSupplier.CldrResolution.UNRESOLVED;
+
+import java.util.Optional;
+
+import org.unicode.cldr.api.AttributeKey;
+import org.unicode.cldr.api.CldrData;
+import org.unicode.cldr.api.CldrDataSupplier;
+import org.unicode.cldr.api.CldrDataType;
+import org.unicode.cldr.api.CldrPath;
+import org.unicode.cldr.api.CldrValue;
+
+import com.google.common.escape.UnicodeEscaper;
+import org.unicode.icu.tool.cldrtoicu.IcuData;
+import org.unicode.icu.tool.cldrtoicu.PathMatcher;
+import org.unicode.icu.tool.cldrtoicu.RbPath;
+
+/**
+ * A mapper to collect break-iterator data from {@link CldrDataType#LDML LDML} data under
+ * paths matching:
+ * <pre>{@code
+ * //ldml/segmentations/segmentation/suppressions/suppression
+ * //ldml/special/icu:breakIteratorData/...
+ * }</pre>
+ */
+// TODO: This class can almost certainly be replace with a small RegexTransformer config.
+public final class BreakIteratorMapper {
+ // The "type" attribute is not required here, so cannot appear in the matcher.
+ private static final PathMatcher SUPPRESSION =
+ PathMatcher.of("ldml/segmentations/segmentation/suppressions/suppression");
+ private static final AttributeKey SEGMENTATION_TYPE = keyOf("segmentation", "type");
+
+ // Note: This could be done with an intermediate matcher for
+ // "ldml/special/icu:breakIteratorData" but there are so few "special" values it's not worth it
+ private static final PathMatcher BOUNDARIES =
+ PathMatcher.of("ldml/special/icu:breakIteratorData/icu:boundaries/*");
+ private static final PathMatcher DICTIONARY =
+ PathMatcher.of("ldml/special/icu:breakIteratorData/icu:dictionaries/icu:dictionary");
+
+ private static final AttributeKey DICTIONARY_DEP = keyOf("icu:dictionary", "icu:dependency");
+ private static final AttributeKey DICTIONARY_TYPE = keyOf("icu:dictionary", "type");
+
+ /**
+ * Processes data from the given supplier to generate break-iterator data for a set of locale
+ * IDs.
+ *
+ * @param localeId the locale ID to generate data for.
+ * @param src the CLDR data supplier to process.
+ * @param icuSpecialData additional ICU data (in the "icu:" namespace)
+ * @return IcuData containing break-iterator data for the given locale ID.
+ */
+ public static IcuData process(
+ String localeId, CldrDataSupplier src, Optional<CldrData> icuSpecialData) {
+
+ BreakIteratorMapper mapper = new BreakIteratorMapper(localeId);
+ icuSpecialData.ifPresent(s -> s.accept(ARBITRARY, mapper::addSpecials));
+ src.getDataForLocale(localeId, UNRESOLVED).accept(DTD, mapper::addSuppression);
+ return mapper.icuData;
+ }
+
+ // The per-locale ICU data being collected by this visitor.
+ private final IcuData icuData;
+
+ private BreakIteratorMapper(String localeId) {
+ this.icuData = new IcuData(localeId, true);
+ }
+
+ private void addSuppression(CldrValue v) {
+ if (SUPPRESSION.matches(v.getPath())) {
+ String type = SEGMENTATION_TYPE.valueFrom(v);
+ // TODO: Understand and document why we escape values here, but not for collation data.
+ icuData.add(
+ RbPath.of("exceptions", type + ":array"),
+ ESCAPE_NON_ASCII.escape(v.getValue()));
+ }
+ }
+
+ private void addSpecials(CldrValue v) {
+ CldrPath p = v.getPath();
+ if (BOUNDARIES.matches(p)) {
+ addDependency(
+ getDependencyName(v),
+ getBoundaryType(v),
+ getBoundaryDependency(v));
+ } else if (DICTIONARY.matches(p)) {
+ addDependency(
+ getDependencyName(v),
+ DICTIONARY_TYPE.valueFrom(v),
+ DICTIONARY_DEP.optionalValueFrom(v));
+ }
+ }
+
+ private void addDependency(String name, String type, Optional<String> dependency) {
+ icuData.add(
+ RbPath.of(name, type + ":process(dependency)"),
+ dependency.orElseThrow(() -> new IllegalArgumentException("missing dependency")));
+ }
+
+ // Must match the BOUNDARIES or DICTIONARY path.
+ private static String getDependencyName(CldrValue value) {
+ return stripXmlNamespace(value.getPath().getParent().getName());
+ }
+
+ // Must match the BOUNDARIES path.
+ private static String getBoundaryType(CldrValue value) {
+ String elementName = value.getPath().getName();
+ String type = stripXmlNamespace(elementName);
+ return keyOf(elementName, "alt")
+ .optionalValueFrom(value).map(a -> type + "_" + a).orElse(type);
+ }
+
+ // Must match the BOUNDARIES path.
+ private static Optional<String> getBoundaryDependency(CldrValue value) {
+ return keyOf(value.getPath().getName(), "icu:dependency").optionalValueFrom(value);
+ }
+
+ // Strips the first prefix of the form "xxx:" from a string.
+ private static String stripXmlNamespace(String s) {
+ return s.substring(s.indexOf(':') + 1);
+ }
+
+ /*
+ * Convert characters outside the range U+0020 to U+007F to Unicode escapes, and convert
+ * backslash to a double backslash. This class is super slow for non-ASCII escaping due to
+ * using "String.format()", however there's < 100 values that need any escaping, so it's fine.
+ */
+ private static final UnicodeEscaper ESCAPE_NON_ASCII = new UnicodeEscaper() {
+ private final char[] DOUBLE_BACKSLASH = "\\\\".toCharArray();
+
+ @Override
+ protected char[] escape(int cp) {
+ // Returning null means "do not escape".
+ if (0x0020 <= cp && cp <= 0x007F) {
+ return cp == '\\' ? DOUBLE_BACKSLASH : null;
+ } else if (cp <= 0xFFFF) {
+ return String.format("\\u%04X", cp).toCharArray();
+ }
+ return String.format("\\U%08X", cp).toCharArray();
+ }
+ };
+}
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.mapper;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static org.unicode.cldr.api.AttributeKey.keyOf;
+import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY;
+import static org.unicode.cldr.api.CldrDataSupplier.CldrResolution.UNRESOLVED;
+
+import java.util.Optional;
+
+import org.unicode.cldr.api.AttributeKey;
+import org.unicode.cldr.api.CldrData;
+import org.unicode.cldr.api.CldrData.PrefixVisitor;
+import org.unicode.cldr.api.CldrDataSupplier;
+import org.unicode.cldr.api.CldrDataType;
+import org.unicode.cldr.api.CldrPath;
+import org.unicode.cldr.api.CldrValue;
+
+import com.google.common.base.CharMatcher;
+import com.google.common.base.Splitter;
+import org.unicode.icu.tool.cldrtoicu.IcuData;
+import org.unicode.icu.tool.cldrtoicu.PathMatcher;
+import org.unicode.icu.tool.cldrtoicu.RbPath;
+import org.unicode.icu.tool.cldrtoicu.RbValue;
+
+/**
+ * A mapper to collect collation data from {@link CldrDataType#LDML LDML} data via the paths:
+ * <pre>{@code
+ * //ldml/collations/*
+ * //ldml/special/icu:UCARules
+ * //ldml/special/icu:depends
+ * }</pre>
+ */
+public final class CollationMapper {
+ private static final PathMatcher COLLATIONS = PathMatcher.of("ldml/collations");
+
+ // Note that the 'type' attribute is optional, so cannot be in the path matcher.
+ // However since the CLDR data never actually omits the value, it would be easy to change the
+ // attribute metadata to stop it being an implicit attribute and then it could appear.
+ private static final PathMatcher COLLATION_RULE = PathMatcher.of("collation/cr");
+ private static final AttributeKey COLLATION_TYPE = keyOf("collation", "type");
+ private static final AttributeKey COLLATION_RULE_ALT = keyOf("cr", "alt");
+
+ private static final PathMatcher DEFAULT_COLLATION = PathMatcher.of("defaultCollation");
+
+ private static final PathMatcher SPECIAL = PathMatcher.of("ldml/special");
+ private static final AttributeKey SPECIAL_RULES = keyOf("icu:UCARules", "icu:uca_rules");
+ private static final AttributeKey SPECIAL_DEP = keyOf("icu:depends", "icu:dependency");
+
+ private static final RbPath RB_COLLATIONS_DEFAULT = RbPath.of("collations", "default");
+ private static final RbPath RB_STANDARD_SEQUENCE =
+ RbPath.of("collations", "standard", "Sequence");
+ private static final RbPath RB_STANDARD_VERSION =
+ RbPath.of("collations", "standard", "Version");
+
+ private static final Splitter LINE_SPLITTER =
+ Splitter.on('\n').trimResults().omitEmptyStrings();
+
+ /**
+ * Processes data from the given supplier to generate collation data for a set of locale IDs.
+ *
+ * @param localeId the locale ID to generate data for.
+ * @param src the CLDR data supplier to process.
+ * @param icuSpecialData additional ICU data (in the "icu:" namespace)
+ * @return IcuData containing RBNF data for the given locale ID.
+ */
+ public static IcuData process(
+ String localeId, CldrDataSupplier src, Optional<CldrData> icuSpecialData) {
+
+ CollationVisitor visitor = new CollationVisitor(localeId);
+ icuSpecialData.ifPresent(s -> s.accept(ARBITRARY, visitor));
+ src.getDataForLocale(localeId, UNRESOLVED).accept(ARBITRARY, visitor);
+ return visitor.icuData;
+ }
+
+ final static class CollationVisitor implements PrefixVisitor {
+ private final IcuData icuData;
+
+ CollationVisitor(String localeId) {
+ this.icuData = new IcuData(localeId, true);
+ // Super special hack case because the XML data is a bit broken for the root collation
+ // data (there's an empty <collation> element that's a non-leaf element and thus not
+ // visited, but we should add an empty sequence to the output data.
+ if (localeId.equals("root")) {
+ icuData.replace(RB_STANDARD_SEQUENCE, "");
+ // TODO: Collation versioning probably needs to be improved.
+ icuData.replace(RB_STANDARD_VERSION, CldrDataSupplier.getCldrVersionString());
+ }
+ }
+
+ @Override
+ public void visitPrefixStart(CldrPath prefix, Context ctx) {
+ if (COLLATIONS.matchesPrefixOf(prefix)) {
+ ctx.install(this::collectRules);
+ } else if (SPECIAL.matchesPrefixOf(prefix)) {
+ ctx.install(this::maybeAddSpecial);
+ }
+ }
+
+ private void collectRules(CldrValue v) {
+ CldrPath p = v.getPath();
+ if (COLLATION_RULE.matchesSuffixOf(p)) {
+ String type = COLLATION_TYPE.valueFrom(v);
+ RbPath rbPath = RbPath.of("collations", type, "Sequence");
+
+ // WARNING: This is almost certainly a bug, since while @type can have the value
+ // "short" it can also have other values. This code was copied from CollationMapper
+ // which has the line;
+ // isShort = attr.getValue("alt") != null;
+ boolean isShort = COLLATION_RULE_ALT.optionalValueFrom(v).isPresent();
+
+ // Note that it's not clear why there's a check for "contains()" here. The code
+ // from which this was derived is largely undocumented and this check could have
+ // been overly defensive (perhaps a duplicate key should be an error?).
+ if (isShort || !icuData.contains(rbPath)) {
+ RbValue rules = RbValue.of(
+ LINE_SPLITTER.splitToList(v.getValue()).stream()
+ .map(CollationMapper::removeComment)
+ .filter(s -> !s.isEmpty())::iterator);
+ icuData.replace(rbPath, rules);
+ icuData.replace(
+ RbPath.of("collations", type, "Version"),
+ CldrDataSupplier.getCldrVersionString());
+ }
+ } else if (DEFAULT_COLLATION.matchesSuffixOf(p)) {
+ icuData.add(RB_COLLATIONS_DEFAULT, v.getValue());
+ }
+ }
+
+ // Value visitor for the "icu:UCARules" and "icu:depends" elements; values on any other element
+ // are ignored. Unusually for the transformation classes, the attribute that gets read depends
+ // on which element is being visited.
+ private void maybeAddSpecial(CldrValue value) {
+     String elementName = value.getPath().getName();
+     AttributeKey key;
+     if (elementName.equals("icu:UCARules")) {
+         key = SPECIAL_RULES;
+     } else if (elementName.equals("icu:depends")) {
+         key = SPECIAL_DEP;
+     } else {
+         return;
+     }
+     // substring(4) just strips the "icu:" prefix (which we know is present in the key), giving
+     // a resource bundle path of the form "<element>:process(<attribute>)".
+     String element = key.getElementName().substring(4);
+     String attribute = key.getAttributeName().substring(4);
+     RbPath rbPath = RbPath.of(String.format("%s:process(%s)", element, attribute));
+     icuData.add(rbPath, key.valueFrom(value));
+ }
+ }
+
+ // Strips a trailing end-of-line comment (and the whitespace before it) from a line of collation
+ // data; a line without a comment is returned unchanged.
+ //
+ // A '#' marks the start of a comment, but '#' can also appear as data, in which case it must be
+ // inside a single-quoted string (e.g. 'x#y'). The precise semantics of the quoting rules are
+ // not particularly clear, so the scan assumes that:
+ // * a single quote (apostrophe) begins and ends quoting.
+ // * outside a quoted section, backslash has no special meaning.
+ // * inside a quoted section, backslash '\' escapes any single character (e.g \a, \', \\)
+ private static String removeComment(String s) {
+     int commentStart = findCommentStart(s);
+     return commentStart < 0
+         ? s
+         : CharMatcher.whitespace().trimTrailingFrom(s.substring(0, commentStart));
+ }
+
+ // Returns the index of the first unquoted '#' in the string, or -1 if there is none. If the
+ // end of the string is reached while still inside a quoted section, the checkArgument()
+ // precondition fails with a "mismatched quotes" message.
+ private static int findCommentStart(String s) {
+     boolean inQuotes = false;
+     int pos = 0;
+     while (pos < s.length()) {
+         char c = s.charAt(pos);
+         if (c == '\'') {
+             // An apostrophe toggles quoting on or off.
+             inQuotes = !inQuotes;
+         } else if (c == '\\' && inQuotes) {
+             // Inside quotes a backslash escapes the next character, so step over it too.
+             pos++;
+         } else if (c == '#' && !inQuotes) {
+             return pos;
+         }
+         pos++;
+     }
+     checkArgument(!inQuotes, "mismatched quotes in: %s", s);
+     return -1;
+ }
+
+ // Private, empty constructor: this class cannot be instantiated from outside.
+ private CollationMapper() {}
+}
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.mapper;
+
+import static org.unicode.cldr.api.AttributeKey.keyOf;
+import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY;
+import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL;
+
+import java.util.Optional;
+
+import org.unicode.cldr.api.AttributeKey;
+import org.unicode.cldr.api.CldrData;
+import org.unicode.cldr.api.CldrData.PrefixVisitor;
+import org.unicode.cldr.api.CldrDataSupplier;
+import org.unicode.cldr.api.CldrDataType;
+import org.unicode.cldr.api.CldrPath;
+import org.unicode.cldr.api.CldrValue;
+
+import org.unicode.icu.tool.cldrtoicu.IcuData;
+import org.unicode.icu.tool.cldrtoicu.PathMatcher;
+import org.unicode.icu.tool.cldrtoicu.RbPath;
+
+/**
+ * Collects day-period data from {@link CldrDataType#SUPPLEMENTAL SUPPLEMENTAL} data, reading
+ * the paths:
+ * <pre>{@code
+ *   //supplementalData/dayPeriodRuleSet/*
+ * }</pre>
+ */
+public final class DayPeriodsMapper {
+    private static final PathMatcher RULESET =
+        PathMatcher.of("supplementalData/dayPeriodRuleSet");
+    private static final AttributeKey RULESET_TYPE = keyOf("dayPeriodRuleSet", "type");
+
+    private static final PathMatcher RULES = PathMatcher.of("dayPeriodRules[@locales=*]");
+    private static final AttributeKey RULES_LOCALES = keyOf("dayPeriodRules", "locales");
+
+    private static final PathMatcher RULE = PathMatcher.of("dayPeriodRule[@type=*]");
+    private static final AttributeKey RULE_TYPE = keyOf("dayPeriodRule", "type");
+
+    private static final RbPath RB_LOCALES = RbPath.of("locales");
+
+    /**
+     * Visits the supplemental data of the given supplier and returns the day-period ICU data
+     * collected from it.
+     *
+     * @param src the CLDR data supplier to process.
+     * @return the IcuData instance to be written to a file.
+     */
+    public static IcuData process(CldrDataSupplier src) {
+        RuleSetVisitor visitor = new RuleSetVisitor();
+        CldrData supplemental = src.getDataForType(SUPPLEMENTAL);
+        supplemental.accept(ARBITRARY, visitor);
+        return visitor.icuData;
+    }
+
+    private static final class RuleSetVisitor implements PrefixVisitor {
+        // Output accumulated while the data is being visited.
+        private final IcuData icuData = new IcuData("dayPeriods", false);
+        // Number of the most recently started rule set (read and updated by the nested visitor).
+        private int setNum = 0;
+
+        @Override
+        public void visitPrefixStart(CldrPath prefix, Context ctx) {
+            if (!RULESET.matches(prefix)) {
+                return;
+            }
+            Optional<String> ruleSetType = RULESET_TYPE.optionalValueFrom(prefix);
+            ctx.install(new RuleVisitor(ruleSetType));
+        }
+
+        private final class RuleVisitor implements PrefixVisitor {
+            private final RbPath localePrefix;
+
+            private RuleVisitor(Optional<String> type) {
+                // A typed rule set goes under "locales_<type>", an untyped one under "locales".
+                this.localePrefix =
+                    type.isPresent() ? RbPath.of("locales_" + type.get()) : RB_LOCALES;
+            }
+
+            @Override
+            public void visitPrefixStart(CldrPath prefix, Context ctx) {
+                if (!RULES.matchesSuffixOf(prefix)) {
+                    return;
+                }
+                // Sets are arbitrarily identified by the string "setNN".
+                setNum++;
+                String setName = "set" + setNum;
+                for (String locale : RULES_LOCALES.listOfValuesFrom(prefix)) {
+                    icuData.add(localePrefix.extendBy(locale), setName);
+                }
+                ctx.install(this::visitRule);
+            }
+
+            // Adds every value attribute of a rule below "rules/set<N>/<rule type>".
+            private void visitRule(CldrValue value) {
+                if (!RULE.matchesSuffixOf(value.getPath())) {
+                    return;
+                }
+                RbPath rulePath =
+                    RbPath.of("rules", "set" + setNum, RULE_TYPE.valueFrom(value));
+                value.getValueAttributes().forEach(
+                    (attribute, attributeValue) ->
+                        icuData.add(
+                            rulePath.extendBy(attribute.getAttributeName()), attributeValue));
+            }
+        }
+    }
+
+    private DayPeriodsMapper() {}
+}
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.mapper;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkNotNull;
+import static com.google.common.base.Preconditions.checkState;
+import static com.google.common.collect.Ordering.natural;
+import static org.unicode.cldr.api.AttributeKey.keyOf;
+import static org.unicode.cldr.api.CldrData.PathOrder.DTD;
+import static org.unicode.cldr.api.CldrDataSupplier.CldrResolution.RESOLVED;
+import static org.unicode.cldr.api.CldrDataSupplier.CldrResolution.UNRESOLVED;
+
+import java.util.HashSet;
+import java.util.List;
+import java.util.Optional;
+import java.util.Set;
+
+import org.unicode.cldr.api.AttributeKey;
+import org.unicode.cldr.api.CldrData;
+import org.unicode.cldr.api.CldrData.ValueVisitor;
+import org.unicode.cldr.api.CldrDataSupplier;
+import org.unicode.cldr.api.CldrDataType;
+import org.unicode.cldr.api.CldrValue;
+
+import com.google.common.collect.ImmutableListMultimap;
+import com.google.common.collect.LinkedHashMultimap;
+import com.google.common.collect.SetMultimap;
+import org.unicode.icu.tool.cldrtoicu.IcuData;
+import org.unicode.icu.tool.cldrtoicu.PathMatcher;
+import org.unicode.icu.tool.cldrtoicu.PathValueTransformer;
+import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.DynamicVars;
+import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.Result;
+import org.unicode.icu.tool.cldrtoicu.RbPath;
+import org.unicode.icu.tool.cldrtoicu.RbValue;
+import org.unicode.icu.tool.cldrtoicu.SupplementalData;
+
+/**
+ * Generates locale {@link IcuData} by running {@link CldrDataType#LDML LDML} data through a
+ * {@link PathValueTransformer}.
+ *
+ * <p>This is currently driven by the {@code ldml2icu_locale.txt} configuration file via a
+ * {@code RegexTransformer}, but any {@link PathValueTransformer} implementation could be used.
+ */
+public final class LocaleMapper {
+    // Match territory paths so we can skip processing deprecated territories.
+    // NOTE(review): neither of these two constants is referenced elsewhere in this class.
+    private static final PathMatcher TERRITORY = PathMatcher.of(
+        "ldml/localeDisplayNames/territories/territory[@type=*]");
+    private static final AttributeKey TERRITORY_TYPE = keyOf("territory", "type");
+
+    // The default calendar (only set if different from the inherited parent value).
+    private static final RbPath RB_CALENDAR = RbPath.of("calendar", "default");
+
+    /**
+     * Processes data from the given supplier to generate general locale data for the given locale
+     * ID.
+     *
+     * @param localeId the locale ID to generate data for.
+     * @param src the CLDR data supplier to process.
+     * @param icuSpecialData additional ICU data (in the "icu:" namespace)
+     * @param transformer the transformer to match and transform each CLDR path/value pair.
+     * @param supplementalData additional necessary data derived from
+     *     {@link org.unicode.cldr.api.CldrDataType#SUPPLEMENTAL SUPPLEMENTAL} data.
+     * @return IcuData containing locale data for the given locale ID.
+     */
+    public static IcuData process(
+        String localeId,
+        CldrDataSupplier src,
+        Optional<CldrData> icuSpecialData,
+        PathValueTransformer transformer,
+        SupplementalData supplementalData) {
+
+        IcuData icuData = new IcuData(localeId, true);
+        // Collect the transformed results and write them into the IcuData instance, preserving
+        // result grouping and expanding path references as necessary.
+        ResultsCollector collector = new ResultsCollector(transformer);
+        ImmutableListMultimap<RbPath, Result> results =
+            collector.collectResultsFor(localeId, src, icuSpecialData);
+        icuData.addResults(results);
+        doDateTimeHack(icuData);
+        supplementalData.getDefaultCalendar(icuData.getName())
+            .ifPresent(calendar -> icuData.add(RB_CALENDAR, calendar));
+        return icuData;
+    }
+
+    // This is an awful hack for post-processing the date-time format patterns to inject a 13th
+    // pattern at index 8, which is just a duplicate of the "medium" date-time pattern. The reasons
+    // for this are lost in the mists of time, but essentially there's ICU library code that just
+    // expects the value at index 8 to be this "default" value, and reads the date-time values
+    // starting at index 9.
+    //
+    // Before the hack, the "medium" date-time pattern is at index 10, since there are 3 groups:
+    //   "time" -> "date" -> "date-time"
+    // with 4 patterns each:
+    //   "full" -> "long" -> "medium" -> "short"
+    private static void doDateTimeHack(IcuData icuData) {
+        for (RbPath rbPath : icuData.getPaths()) {
+            boolean isDateTimePatterns =
+                rbPath.length() == 3
+                    && rbPath.getSegment(0).equals("calendar")
+                    && rbPath.getSegment(2).equals("DateTimePatterns");
+            if (!isDateTimePatterns) {
+                continue;
+            }
+            // This cannot be null and should not be empty, since the path is in this data.
+            List<RbValue> patterns = icuData.get(rbPath);
+            checkArgument(patterns.size() == 12,
+                "unexpected number of date/time patterns for '%s': %s", rbPath, patterns);
+            RbValue mediumDateTime = patterns.get(10);
+            patterns.add(8, mediumDateTime);
+        }
+    }
+
+    private static final class ResultsCollector {
+        private final PathValueTransformer transformer;
+        // The resource bundle paths produced from unresolved data; results from resolved data are
+        // only kept when their path is in this set.
+        private final Set<RbPath> validRbPaths = new HashSet<>();
+
+        // WARNING: TreeMultimap() is NOT suitable here, even though it would sort the values for
+        // each key. The reason is that result comparison is not "consistent with equals", and
+        // TreeMultimap uses the comparator to decide if two elements are equal (not the equals()
+        // method), and it does this even if using the add() method of the sorted set (this is in
+        // fact in violation of the stated behaviour of Set#add).
+        private final SetMultimap<RbPath, Result> resultsByRbPath = LinkedHashMultimap.create();
+
+        ResultsCollector(PathValueTransformer transformer) {
+            this.transformer = checkNotNull(transformer);
+        }
+
+        // Collects the results for one locale: the valid paths from the unresolved data first,
+        // then the results from the resolved data, then any ICU special data, and finally any
+        // fallback results which are not covered by an existing result.
+        ImmutableListMultimap<RbPath, Result> collectResultsFor(
+            String localeId, CldrDataSupplier src, Optional<CldrData> icuSpecialData) {
+
+            CldrData unresolved = src.getDataForLocale(localeId, UNRESOLVED);
+            CldrData resolved = src.getDataForLocale(localeId, RESOLVED);
+            // Dynamic variables are looked up in the resolved data (null if there is no value).
+            DynamicVars varFn = p -> {
+                CldrValue cldrValue = resolved.get(p);
+                if (cldrValue == null) {
+                    return null;
+                }
+                return cldrValue.getValue();
+            };
+
+            collectPaths(unresolved, varFn);
+            collectResults(resolved, varFn);
+            icuSpecialData.ifPresent(special -> collectSpecials(special, varFn));
+
+            ImmutableListMultimap.Builder<RbPath, Result> out = ImmutableListMultimap.builder();
+            out.orderValuesBy(natural());
+            for (RbPath rbPath : resultsByRbPath.keySet()) {
+                Set<Result> existingResults = resultsByRbPath.get(rbPath);
+                out.putAll(rbPath, existingResults);
+                for (Result fallback : transformer.getFallbackResultsFor(rbPath, varFn)) {
+                    boolean alreadyCovered =
+                        existingResults.stream().anyMatch(fallback::isFallbackFor);
+                    if (alreadyCovered) {
+                        continue;
+                    }
+                    out.put(rbPath, fallback);
+                }
+            }
+            return out.build();
+        }
+
+        private void collectPaths(CldrData unresolved, DynamicVars varFn) {
+            ValueVisitor pathCollector = v -> {
+                for (Result result : transformer.transform(v, varFn)) {
+                    collectResultPath(result);
+                }
+            };
+            unresolved.accept(DTD, pathCollector);
+        }
+
+        // Marks the path of the result as valid, along with the parent of any anonymous path.
+        private void collectResultPath(Result result) {
+            RbPath rbPath = result.getKey();
+            validRbPaths.add(rbPath);
+            if (!rbPath.isAnonymous()) {
+                return;
+            }
+            RbPath parent = rbPath.getParent();
+            checkState(!parent.isAnonymous(),
+                "anonymous paths should not be nested: %s", rbPath);
+            validRbPaths.add(parent);
+        }
+
+        void collectResults(CldrData resolved, DynamicVars varFn) {
+            ValueVisitor resultCollector = v -> {
+                for (Result result : transformer.transform(v, varFn)) {
+                    RbPath rbPath = result.getKey();
+                    if (validRbPaths.contains(rbPath)) {
+                        resultsByRbPath.put(rbPath, result);
+                    }
+                }
+            };
+            resolved.accept(DTD, resultCollector);
+        }
+
+        // Unlike collectResults(), the special data is not filtered by the valid path set.
+        private void collectSpecials(CldrData cldrData, DynamicVars varFn) {
+            ValueVisitor specialCollector = v -> {
+                for (Result result : transformer.transform(v, varFn)) {
+                    resultsByRbPath.put(result.getKey(), result);
+                }
+            };
+            cldrData.accept(DTD, specialCollector);
+        }
+    }
+
+    private LocaleMapper() {}
+}
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.mapper;
+
+import static com.google.common.base.Preconditions.checkState;
+import static org.unicode.cldr.api.AttributeKey.keyOf;
+import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY;
+import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL;
+
+import org.unicode.cldr.api.AttributeKey;
+import org.unicode.cldr.api.CldrData;
+import org.unicode.cldr.api.CldrData.PrefixVisitor;
+import org.unicode.cldr.api.CldrDataSupplier;
+import org.unicode.cldr.api.CldrDataType;
+import org.unicode.cldr.api.CldrPath;
+import org.unicode.cldr.api.CldrValue;
+
+import org.unicode.icu.tool.cldrtoicu.IcuData;
+import org.unicode.icu.tool.cldrtoicu.PathMatcher;
+import org.unicode.icu.tool.cldrtoicu.RbPath;
+import org.unicode.icu.tool.cldrtoicu.RbValue;
+
+/**
+ * A mapper to collect plural range data from {@link CldrDataType#SUPPLEMENTAL SUPPLEMENTAL}
+ * data via the paths:
+ * <pre>{@code
+ *   //supplementalData/plurals/pluralRanges[@locales=*]/...
+ * }</pre>
+ */
+public final class PluralRangesMapper {
+    private static final PathMatcher RANGES =
+        PathMatcher.of("supplementalData/plurals/pluralRanges[@locales=*]");
+    private static final AttributeKey RANGES_LOCALES = keyOf("pluralRanges", "locales");
+
+    private static final PathMatcher RANGE = PathMatcher.of("pluralRange[@start=*][@end=*]");
+    private static final AttributeKey RANGE_START = keyOf("pluralRange", "start");
+    private static final AttributeKey RANGE_END = keyOf("pluralRange", "end");
+    private static final AttributeKey RANGE_RESULT = keyOf("pluralRange", "result");
+
+    private static final RbPath RB_RULES = RbPath.of("rules");
+    private static final RbPath RB_LOCALES = RbPath.of("locales");
+
+    /**
+     * Processes data from the given supplier to generate plural-range ICU data.
+     *
+     * @param src the CLDR data supplier to process.
+     * @return the IcuData instance to be written to a file.
+     */
+    public static IcuData process(CldrDataSupplier src) {
+        PluralRangesVisitor visitor = new PluralRangesVisitor();
+        CldrData data = src.getDataForType(SUPPLEMENTAL);
+        data.accept(ARBITRARY, visitor);
+        return visitor.icuData;
+    }
+
+    private static final class PluralRangesVisitor implements PrefixVisitor {
+        // Mutable ICU data collected into during visitation.
+        private final IcuData icuData = new IcuData("pluralRanges", false);
+
+        // Index used to label the next set of ranges ("set00", "set01", ...).
+        private int setIndex = 0;
+        // Label of the set currently being visited (null until the first set is seen).
+        private String ruleLabel = null;
+
+        @Override
+        public void visitPrefixStart(CldrPath prefix, Context ctx) {
+            if (RANGES.matches(prefix)) {
+                // Format with Locale.ROOT so the label always uses ASCII digits, regardless of
+                // the default locale of the JVM running the tool.
+                ruleLabel = String.format(java.util.Locale.ROOT, "set%02d", setIndex++);
+                // Every locale listed on the element shares the same set label.
+                RANGES_LOCALES.listOfValuesFrom(prefix)
+                    .forEach(l -> icuData.add(RB_LOCALES.extendBy(l), ruleLabel));
+                ctx.install(this::visitRange);
+            }
+        }
+
+        // Adds a (start, end, result) triple to the rules of the current set. Any value below a
+        // "pluralRanges" element which is not a "pluralRange" is treated as an error.
+        private void visitRange(CldrValue value) {
+            checkState(RANGE.matchesSuffixOf(value.getPath()),
+                "unexpected path: %s", value.getPath());
+            // Note: "pluralRange:start" and "pluralRange:end" are optional attributes, but the
+            // CLDR DTD specifies a default via comments. They should probably be changed to just
+            // have a default in the DTD (and possibly converted to use an enum here).
+            icuData.add(RB_RULES.extendBy(ruleLabel),
+                RbValue.of(
+                    RANGE_START.valueFrom(value, "all"),
+                    RANGE_END.valueFrom(value, "all"),
+                    RANGE_RESULT.valueFrom(value)));
+        }
+    }
+
+    private PluralRangesMapper() {}
+}
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.mapper;
+
+import static com.google.common.base.Preconditions.checkNotNull;
+import static com.google.common.base.Preconditions.checkState;
+import static org.unicode.cldr.api.AttributeKey.keyOf;
+import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY;
+import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL;
+
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.unicode.cldr.api.AttributeKey;
+import org.unicode.cldr.api.CldrData;
+import org.unicode.cldr.api.CldrData.PrefixVisitor;
+import org.unicode.cldr.api.CldrDataSupplier;
+import org.unicode.cldr.api.CldrDataType;
+import org.unicode.cldr.api.CldrPath;
+
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Iterables;
+import org.unicode.icu.tool.cldrtoicu.IcuData;
+import org.unicode.icu.tool.cldrtoicu.PathMatcher;
+import org.unicode.icu.tool.cldrtoicu.RbPath;
+
+/**
+ * Collects plural rule data from {@link CldrDataType#SUPPLEMENTAL SUPPLEMENTAL} data, reading
+ * the paths:
+ * <pre>{@code
+ *   //supplementalData/plurals[@type=*]/pluralRules[@locales=*]/pluralRule[@count=*]
+ * }</pre>
+ */
+public final class PluralsMapper {
+    private static final PathMatcher PLURALS = PathMatcher.of("supplementalData/plurals[@type=*]");
+    private static final AttributeKey PLURALS_TYPE = keyOf("plurals", "type");
+
+    private static final PathMatcher RULES = PathMatcher.of("pluralRules[@locales=*]");
+    private static final AttributeKey RULES_LOCALES = keyOf("pluralRules", "locales");
+
+    private static final PathMatcher RULE = PathMatcher.of("pluralRule[@count=*]");
+    private static final AttributeKey RULE_COUNT = keyOf("pluralRule", "count");
+
+    // Maps the plural type to the ICU path under which its locales are listed.
+    private static final ImmutableMap<String, RbPath> ICU_PREFIX_MAP =
+        ImmutableMap.of("cardinal", RbPath.of("locales"), "ordinal", RbPath.of("locales_ordinals"));
+
+    /**
+     * Processes data from the given supplier to generate plural ICU data.
+     *
+     * @param src the CLDR data supplier to process.
+     * @return the IcuData instance to be written to a file.
+     */
+    public static IcuData process(CldrDataSupplier src) {
+        PluralsVisitor visitor = new PluralsVisitor();
+        CldrData data = src.getDataForType(SUPPLEMENTAL);
+        // Note: The data is deliberately visited once per type (cardinal first) to mimic the
+        // order of the existing code, since this affects the set indices we generate during
+        // processing. Ideally this would all be immune to ordering (or just enforce DTD
+        // ordering) but right now it's very dependent on mimicking the order of the existing
+        // code to get identical output.
+        visitor.setType("cardinal");
+        data.accept(ARBITRARY, visitor);
+        visitor.setType("ordinal");
+        data.accept(ARBITRARY, visitor);
+        return visitor.icuData;
+    }
+
+    private static final class PluralsVisitor implements PrefixVisitor {
+        // Mutable ICU data collected into during visitation.
+        // In a post XML-aware API, is recording the XML file names really a good idea?
+        private final IcuData icuData = new IcuData("plurals", false);
+        // Filter for the type we are processing now (this could be removed if we don't mind which
+        // order the types are processed, and switching to DTD ordering would make it stable).
+        private String type = null;
+        // Every distinct set of rules seen so far; the list index is the set number.
+        private final List<ImmutableMap<String, String>> previousRules = new ArrayList<>();
+
+        // Hack method to allow a single type to be processed at a time (the visitor would otherwise
+        // happily handle both types in a single pass). We can't do this as two different visitors
+        // (one for each type) because the current behaviour relies on carrying over the calculated
+        // set numbers from one pass to the next. Once migration is complete we should revisit this
+        // and allow this visitor to work in a single pass (probably with DTD order for stability).
+        PluralsVisitor setType(String type) {
+            this.type = checkNotNull(type);
+            return this;
+        }
+
+        @Override
+        public void visitPrefixStart(CldrPath prefix, Context ctx) {
+            // Note: "plurals:type" is an optional attribute but the CLDR DTD specifies a default
+            // via comments. It should probably be changed to just have a default in the DTD.
+            boolean isCurrentType =
+                PLURALS.matches(prefix)
+                    && PLURALS_TYPE.valueFrom(prefix, "cardinal").equals(type);
+            if (isCurrentType) {
+                RbPath icuPrefix = ICU_PREFIX_MAP.get(type);
+                ctx.install(new RulesVisitor(icuPrefix));
+            }
+        }
+
+        private final class RulesVisitor implements PrefixVisitor {
+            private final RbPath icuPrefix;
+            // Locales and rules gathered between a prefix start and the matching prefix end; both
+            // are cleared again at the end of visitPrefixEnd().
+            private final List<String> locales = new ArrayList<>();
+            private final Map<String, String> rules = new LinkedHashMap<>();
+
+            RulesVisitor(RbPath icuPrefix) {
+                this.icuPrefix = checkNotNull(icuPrefix);
+            }
+
+            @Override
+            public void visitPrefixStart(CldrPath prefix, Context ctx) {
+                if (!RULES.matchesSuffixOf(prefix)) {
+                    return;
+                }
+                Iterables.addAll(locales, RULES_LOCALES.listOfValuesFrom(prefix));
+                ctx.install(this::collectRule);
+            }
+
+            // Records a single rule, keyed by its count attribute.
+            private void collectRule(CldrValue value) {
+                if (RULE.matchesSuffixOf(value.getPath())) {
+                    rules.put(RULE_COUNT.valueFrom(value), value.getValue());
+                }
+            }
+
+            @Override
+            public void visitPrefixEnd(CldrPath prefix) {
+                checkState(!locales.isEmpty(), "missing locale data for plurals: %s", prefix);
+                // Note: The original mapper code "sort of" coped with empty rules, but it's not
+                // completely well behaved (or documented), so since this doesn't happen in the
+                // current CLDR data, I decided to just prohibit it in the new code. Support can
+                // easily be added in once the expected semantics are clear.
+                checkState(!rules.isEmpty(), "missing rule data for plurals: %s", prefix);
+
+                // Have we seen this set of rules before? If so, reuse the existing index. Note
+                // that an IDE might report this call as suspicious because the key is not yet an
+                // immutable map (saves creating immutable maps just to check for inclusion) but
+                // this is fine because collection equality is based only on contents, not
+                // collection type.
+                int setIndex = previousRules.indexOf(rules);
+                if (setIndex < 0) {
+                    // A new set of rules: write it out under the next free set number.
+                    setIndex = previousRules.size();
+                    String newSetName = "set" + setIndex;
+                    for (Map.Entry<String, String> rule : rules.entrySet()) {
+                        icuData.add(RbPath.of("rules", newSetName, rule.getKey()), rule.getValue());
+                    }
+                    // Since "rules" is mutable and reused, we must take an immutable copy here.
+                    previousRules.add(ImmutableMap.copyOf(rules));
+                }
+                String setName = "set" + setIndex;
+                for (String locale : locales) {
+                    icuData.add(icuPrefix.extendBy(locale), setName);
+                }
+                rules.clear();
+                locales.clear();
+            }
+        }
+    }
+
+    private PluralsMapper() {}
+}
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.mapper;
+
+import static org.unicode.cldr.api.AttributeKey.keyOf;
+import static org.unicode.cldr.api.CldrData.PathOrder.DTD;
+import static org.unicode.cldr.api.CldrDataSupplier.CldrResolution.UNRESOLVED;
+
+import java.util.Optional;
+import java.util.concurrent.atomic.AtomicBoolean;
+
+import org.unicode.cldr.api.AttributeKey;
+import org.unicode.cldr.api.CldrData;
+import org.unicode.cldr.api.CldrData.PrefixVisitor;
+import org.unicode.cldr.api.CldrDataSupplier;
+import org.unicode.cldr.api.CldrDataType;
+import org.unicode.cldr.api.CldrPath;
+
+import com.google.common.escape.UnicodeEscaper;
+import org.unicode.icu.tool.cldrtoicu.IcuData;
+import org.unicode.icu.tool.cldrtoicu.PathMatcher;
+import org.unicode.icu.tool.cldrtoicu.RbPath;
+
+/**
+ * A mapper to collect RBNF data from {@link CldrDataType#LDML LDML} data via the paths:
+ * <pre>{@code
+ *   //ldml/rbnf/rulesetGrouping[@type=*]/ruleset[@type=*]
+ * }</pre>
+ */
+// TODO: This class can almost certainly be written using RegexTransformer and a small config.
+public final class RbnfMapper {
+    private static final PathMatcher RULE_SET =
+        PathMatcher.of("ldml/rbnf/rulesetGrouping[@type=*]/ruleset[@type=*]");
+    private static final AttributeKey GROUPING_TYPE = keyOf("rulesetGrouping", "type");
+    private static final AttributeKey RULESET_TYPE = keyOf("ruleset", "type");
+
+    private static final PathMatcher RBNF_RULE = PathMatcher.of("rbnfrule");
+    private static final AttributeKey RBNF_VALUE = keyOf("rbnfrule", "value");
+    private static final AttributeKey RBNF_RADIX = keyOf("rbnfrule", "radix");
+    private static final AttributeKey RULESET_ACCESS = keyOf("ruleset", "access");
+
+    private static final RbPath RB_PARENT = RbPath.of("%%Parent");
+    // This is the ICU path prefix, below which everything generated by this visitor will go.
+    private static final RbPath RB_ROOT = RbPath.of("RBNFRules");
+
+    /**
+     * Processes data from the given supplier to generate RBNF data for the given locale ID.
+     *
+     * @param localeId the locale ID to generate data for.
+     * @param src the CLDR data supplier to process.
+     * @param icuSpecialData additional ICU data (in the "icu:" namespace)
+     * @return IcuData containing RBNF data for the given locale ID.
+     */
+    public static IcuData process(
+        String localeId, CldrDataSupplier src, Optional<CldrData> icuSpecialData) {
+
+        // Using DTD order is essential here because the RBNF paths contain ordered elements,
+        // so we must ensure that they appear in sorted order (otherwise we'd have to do more
+        // work at this end to re-sort the results).
+        RulesetVisitor visitor = new RulesetVisitor(localeId);
+        // The special ICU data (if any) is visited before the unresolved locale data.
+        icuSpecialData.ifPresent(s -> s.accept(DTD, visitor));
+        src.getDataForLocale(localeId, UNRESOLVED).accept(DTD, visitor);
+        return visitor.icuData;
+    }
+
+    static final class RulesetVisitor implements PrefixVisitor {
+
+        private final IcuData icuData;
+
+        private RulesetVisitor(String localeId) {
+            this.icuData = new IcuData(localeId, true);
+        }
+
+        @Override public void visitPrefixStart(CldrPath prefix, Context context) {
+            if (RULE_SET.matchesPrefixOf(prefix)) {
+                RbPath rbPath = RB_ROOT.extendBy(GROUPING_TYPE.valueFrom(prefix));
+                String rulesetType = RULESET_TYPE.valueFrom(prefix);
+                // Rules in the "lenient-parse" rule set are written without a value prefix.
+                boolean isStrict = !"lenient-parse".equals(rulesetType);
+
+                // This is rather hacky because the access attribute lives on the parent path
+                // element, but we cannot use it until we visit the child values (because it's a
+                // value attribute and will not be in the prefix path). So we need to add the
+                // header only once, just before we start adding the values relating to the child
+                // elements, so we need a flag.
+                //
+                // This cannot be a boolean field since it must be "effectively final".
+                AtomicBoolean hasHeader = new AtomicBoolean(false);
+                context.install(
+                    value -> {
+                        if (RBNF_RULE.matchesSuffixOf(value.getPath())) {
+                            if (!hasHeader.get()) {
+                                // Header is "%%<type>:" for private rule sets, else "%<type>:".
+                                boolean isPrivate =
+                                    RULESET_ACCESS.valueFrom(value, "public").equals("private");
+                                icuData.add(rbPath, (isPrivate ? "%%" : "%") + rulesetType + ":");
+                                hasHeader.set(true);
+                            }
+                            // Strict rules are prefixed by "<value>: " or "<value>/<radix>: ".
+                            String rulePrefix = "";
+                            if (isStrict) {
+                                String basePrefix = RBNF_VALUE.valueFrom(value);
+                                rulePrefix = RBNF_RADIX.optionalValueFrom(value)
+                                    .map(r -> basePrefix + "/" + r)
+                                    .orElse(basePrefix);
+                                rulePrefix += ": ";
+                            }
+                            icuData.add(
+                                rbPath,
+                                rulePrefix + ESCAPE_RBNF_DATA.escape(value.getValue()));
+                        }
+                    });
+            }
+        }
+
+        /*
+         * Convert characters outside the range U+0020 to U+007F to Unicode escapes, convert
+         * backslash to a double backslash, and convert the arrows '←' and '→' to '<' and '>'
+         * respectively. This class is super slow for non-ASCII escaping due to using
+         * "String.format()", however there's < 100 values that need any escaping, so it's fine.
+         */
+        private static final UnicodeEscaper ESCAPE_RBNF_DATA = new UnicodeEscaper() {
+            private final char[] DOUBLE_BACKSLASH = "\\\\".toCharArray();
+            private final char[] LEFT_ANGLE = "<".toCharArray();
+            private final char[] RIGHT_ANGLE = ">".toCharArray();
+
+            @Override
+            protected char[] escape(int cp) {
+                // Returning null means "do not escape".
+                switch (cp) {
+                case '\\':
+                    return DOUBLE_BACKSLASH;
+                case '←':
+                    return LEFT_ANGLE;
+                case '→':
+                    return RIGHT_ANGLE;
+                default:
+                    if (0x0020 <= cp && cp <= 0x007F) {
+                        return null;
+                    } else if (cp <= 0xFFFF) {
+                        // BMP code points use the 4-digit "\\uXXXX" form.
+                        return String.format("\\u%04X", cp).toCharArray();
+                    }
+                    // Supplementary code points use the 8-digit "\\UXXXXXXXX" form.
+                    return String.format("\\U%08X", cp).toCharArray();
+                }
+            }
+        };
+    }
+
+    // This class only has static entry points and is not meant to be instantiated (this matches
+    // the other mapper classes).
+    private RbnfMapper() {}
+}
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.mapper;
+
+import static com.google.common.base.Preconditions.checkNotNull;
+import static com.google.common.collect.Ordering.natural;
+import static org.unicode.cldr.api.CldrData.PathOrder.NESTED_GROUPING;
+
+import java.util.Set;
+
+import org.unicode.cldr.api.CldrData;
+import org.unicode.cldr.api.CldrDataSupplier;
+import org.unicode.cldr.api.CldrDataType;
+import org.unicode.cldr.api.CldrValue;
+
+import com.google.common.collect.ImmutableListMultimap;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.LinkedHashMultimap;
+import com.google.common.collect.SetMultimap;
+import org.unicode.icu.tool.cldrtoicu.IcuData;
+import org.unicode.icu.tool.cldrtoicu.PathMatcher;
+import org.unicode.icu.tool.cldrtoicu.PathValueTransformer;
+import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.Result;
+import org.unicode.icu.tool.cldrtoicu.RbPath;
+
+/**
+ * Generate supplemental {@link IcuData} by transforming {@link CldrDataType#SUPPLEMENTAL
+ * SUPPLEMENTAL} data using a {@link PathValueTransformer}.
+ *
+ * <p>This is currently driven by the {@code ldml2icu_supplemental.txt} configuration file via a
+ * {@code RegexTransformer}, but could use any {@link PathValueTransformer} implementation.
+ */
+public final class SupplementalMapper {
+ private static final RbPath RB_FIFO = RbPath.of("<FIFO>");
+
+ /**
+ * Processes a subset of supplemental data from the given supplier.
+ *
+ * @param src the CLDR data supplier to process.
+ * @param transformer the transformer to match and transform each CLDR path/value pair.
+ * @param icuName the name for the generated IcuData.
+ * @param includePaths a matcher to select the CLDR paths to be transformed.
+ * @return An IcuData instance containing the specified subset of supplemental data with the
+ * given ICU name.
+ */
+ // TODO: Improve external data splitting and remove need for a PathMatcher here.
+ public static IcuData process(
+ CldrDataSupplier src, PathValueTransformer transformer, String icuName,
+ PathMatcher includePaths) {
+ ResultsCollector collector = new ResultsCollector(includePaths, transformer);
+ // Write out the results into the IcuData class, preserving result grouping and expanding
+ // path references as necessary.
+ IcuData icuData = new IcuData(icuName, false);
+ icuData.addResults(collector.getResults(src));
+ return icuData;
+ }
+
+ private static final class ResultsCollector {
+     private final PathMatcher pathMatcher;
+     private final PathValueTransformer transformer;
+
+     // WARNING: TreeMultimap() is NOT suitable here, even though it would sort the values for
+     // each key. The reason is that result comparison is not "consistent with equals", and
+     // TreeMultimap uses the comparator to decide if two elements are equal (not the equals()
+     // method), and it does this even if using the add() method of the sorted set (this is in
+     // fact in violation of the stated behaviour of Set#add).
+     private final SetMultimap<RbPath, Result> resultsByRbPath = LinkedHashMultimap.create();
+     // Incremented once per visited (matching) CLDR value; used to label <FIFO> path segments.
+     private int fifoCounter = 0;
+
+     ResultsCollector(PathMatcher pathMatcher, PathValueTransformer transformer) {
+         this.pathMatcher = checkNotNull(pathMatcher);
+         this.transformer = checkNotNull(transformer);
+     }
+
+     // Transforms a single CLDR value and records its results by resource bundle path.
+     private void visit(CldrValue value) {
+         if (!pathMatcher.matchesPrefixOf(value.getPath())) {
+             // Paths outside the included set are skipped and do not advance the counter.
+             return;
+         }
+         for (Result result : transformer.transform(value)) {
+             RbPath key = result.getKey();
+             if (key.contains(RB_FIFO)) {
+                 // The fifo counter needs to be formatted with leading zeros for sorting.
+                 key = key.mapSegments(
+                     seg -> seg.equals("<FIFO>") ? String.format("<%04d>", fifoCounter) : seg);
+             }
+             resultsByRbPath.put(key, result);
+         }
+         fifoCounter++;
+     }
+
+     // Visits all supplemental data and returns the collected results, with fallback results
+     // added for each resource bundle path where no existing result already covers them.
+     ImmutableListMultimap<RbPath, Result> getResults(CldrDataSupplier supplier) {
+         // DTD and NESTED_GROUPING order differ because of how the magic <FIFO> label works (it
+         // basically enforces "encounter order" onto things in unlabeled sequences, which matches
+         // the old behaviour). If it wouldn't break anything, it might be worth moving to DTD
+         // order to remove any lingering implicit dependencies on the CLDR data behaviour.
+         CldrData supplementalData = supplier.getDataForType(CldrDataType.SUPPLEMENTAL);
+         supplementalData.accept(NESTED_GROUPING, this::visit);
+
+         // Dynamic variables are resolved by looking up the value of the given CLDR path.
+         PathValueTransformer.DynamicVars varFn = path -> {
+             CldrValue found = supplementalData.get(path);
+             return (found == null) ? null : found.getValue();
+         };
+
+         ImmutableListMultimap.Builder<RbPath, Result> builder = ImmutableListMultimap.builder();
+         builder.orderValuesBy(natural());
+         for (RbPath rbPath : resultsByRbPath.keySet()) {
+             Set<Result> matched = resultsByRbPath.get(rbPath);
+             builder.putAll(rbPath, matched);
+             for (Result fallback : transformer.getFallbackResultsFor(rbPath, varFn)) {
+                 boolean alreadyCovered = matched.stream().anyMatch(fallback::isFallbackFor);
+                 if (!alreadyCovered) {
+                     builder.put(rbPath, fallback);
+                 }
+             }
+         }
+         return builder.build();
+     }
+ }
+
+ // Not instantiable: callers use the static process() method.
+ private SupplementalMapper() {}
+}
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.mapper;
+
+import static com.google.common.base.CharMatcher.whitespace;
+import static com.google.common.base.Preconditions.checkNotNull;
+import static java.nio.file.StandardOpenOption.CREATE;
+import static java.nio.file.StandardOpenOption.TRUNCATE_EXISTING;
+import static org.unicode.cldr.api.AttributeKey.keyOf;
+import static org.unicode.cldr.api.CldrData.PathOrder.DTD;
+import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL;
+
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Optional;
+import java.util.function.Function;
+
+import org.unicode.cldr.api.AttributeKey;
+import org.unicode.cldr.api.CldrData.ValueVisitor;
+import org.unicode.cldr.api.CldrDataSupplier;
+import org.unicode.cldr.api.CldrDataType;
+import org.unicode.cldr.api.CldrValue;
+
+import org.unicode.icu.tool.cldrtoicu.IcuData;
+import org.unicode.icu.tool.cldrtoicu.PathMatcher;
+import org.unicode.icu.tool.cldrtoicu.RbPath;
+import org.unicode.icu.tool.cldrtoicu.RbValue;
+import com.ibm.icu.text.Transliterator;
+
+/**
+ * A mapper to collect transliteration data from {@link CldrDataType#SUPPLEMENTAL SUPPLEMENTAL}
+ * data via the paths:
+ * <pre>{@code
+ * //supplementalData/transforms/transform/tRule
+ * }</pre>
+ *
+ * <p>This mapper also writes out the transform rule files into a specified directory.
+ */
+public final class TransformsMapper {
+    private static final PathMatcher TRULE =
+        PathMatcher.of("supplementalData/transforms/transform/tRule");
+    private static final AttributeKey TRANSFORM_SOURCE = keyOf("transform", "source");
+    private static final AttributeKey TRANSFORM_TARGET = keyOf("transform", "target");
+    private static final AttributeKey TRANSFORM_DIRECTION = keyOf("transform", "direction");
+    private static final AttributeKey TRANSFORM_VARIANT = keyOf("transform", "variant");
+    private static final AttributeKey TRANSFORM_VISIBILITY = keyOf("transform", "visibility");
+    private static final AttributeKey TRANSFORM_ALIAS = keyOf("transform", "alias");
+    private static final AttributeKey TRANSFORM_BACKALIAS = keyOf("transform", "backwardAlias");
+
+    private static final RbPath RB_TRANSLITERATOR_IDS = RbPath.of("RuleBasedTransliteratorIDs");
+
+    // This decomposes some accented characters with accents in the "Mn" (Mark, non-spacing)
+    // Unicode range by representing the accents in the \u1234 hex form. For example, it converts:
+    // "ɪ̈" to "ɪ\u0308" and "ɯ̽" to "ɯ\u033D". This does not affect all accented character (e.g.
+    // ä) and the precise reason this is done was never clearly documented in the code from which
+    // this code was derived (but it seems necessary to generate the expected output in the
+    // transliteration rules).
+    //
+    // This is one of the only, apparently necessary direct dependencies on the icu4j library.
+    // TODO: Make this depend icu4j from this project rather than the older version from CLDR.
+    private static final Transliterator FIXUP = Transliterator.getInstance("[:Mn:]any-hex/java");
+
+    // Don't rename these enum constants, they need to match the data directly.
+    private enum Direction { forward, backward, both }
+    private enum Visibility { internal, external }
+
+    /**
+     * Processes data from the given supplier to generate transliteration ICU data, writing
+     * auxiliary transliteration rule files in the process. This is a potentially destructive call
+     * and will overwrite existing transformation rule files in the specified directory.
+     *
+     * @param src the CLDR data supplier to process.
+     * @param ruleFileOutputDir the directory into which transliteration rule files will be written.
+     * @return the IcuData instance to be written to a file.
+     * @throws RuntimeException if a rule file cannot be opened or fully written.
+     */
+    public static IcuData process(CldrDataSupplier src, Path ruleFileOutputDir) {
+        RuleVisitor visitor = new RuleVisitor(p -> {
+            Path file = ruleFileOutputDir.resolve(p);
+            try {
+                return new PrintWriter(Files.newBufferedWriter(file, CREATE, TRUNCATE_EXISTING));
+            } catch (IOException e) {
+                throw new RuntimeException("error opening file: " + file, e);
+            }
+        });
+        src.getDataForType(SUPPLEMENTAL).accept(DTD, visitor);
+        return visitor.icuData;
+    }
+
+    /**
+     * Visitor which builds the "root" index data and writes one rule file per visited
+     * {@code tRule} value, using the supplied function to open each output file.
+     */
+    private static class RuleVisitor implements ValueVisitor {
+        private final IcuData icuData = new IcuData("root", false);
+        private final Function<Path, PrintWriter> outFn;
+
+        RuleVisitor(Function<Path, PrintWriter> outFn) {
+            this.outFn = checkNotNull(outFn);
+            icuData.setFileComment("File: root.txt");
+
+            // I have _no_ idea what any of this is about, I'm just trying to mimic the original
+            // (complex and undocumented) code in "ConvertTransforms.java".
+            icuData.add(RbPath.of("TransliteratorNamePattern"), "{0,choice,0#|1#{1}|2#{1}-{2}}");
+            // Note that this quoting of path segments is almost certainly unnecessary. It matches
+            // the old "ConvertTransforms" behaviour, but '%' is used elsewhere without quoting, so
+            // it seems very likely that it's not needed here.
+            // TODO: Once migration done, remove quotes here & check in RbPath for unwanted quotes.
+            icuData.add(RbPath.of("\"%Translit%Hex\""), "%Translit%Hex");
+            icuData.add(RbPath.of("\"%Translit%UnicodeName\""), "%Translit%UnicodeName");
+            icuData.add(RbPath.of("\"%Translit%UnicodeChar\""), "%Translit%UnicodeChar");
+            // Special case, where Latin is a no-op.
+            icuData.add(RbPath.of("TransliterateLATIN"), RbValue.of("", ""));
+            // Some hard-coded special case mappings.
+            icuData.add(
+                RB_TRANSLITERATOR_IDS.extendBy("Tone-Digit", "alias"),
+                "Pinyin-NumericPinyin");
+            icuData.add(
+                RB_TRANSLITERATOR_IDS.extendBy("Digit-Tone", "alias"),
+                "NumericPinyin-Pinyin");
+        }
+
+        @Override public void visit(CldrValue value) {
+            // The other possible element is "comment" but we currently ignore those.
+            if (TRULE.matches(value.getPath())) {
+                String source = getExpectedOptionalAttribute(value, TRANSFORM_SOURCE);
+                String target = getExpectedOptionalAttribute(value, TRANSFORM_TARGET);
+                Optional<String> variant = TRANSFORM_VARIANT.optionalValueFrom(value);
+                // Rule files are named "<source>_<target>[_<variant>].txt".
+                String baseFilename = source + "_" + target;
+                String filename =
+                    variant.map(v -> baseFilename + "_" + v).orElse(baseFilename) + ".txt";
+                writeRootIndexEntry(value, source, target, variant, filename);
+                writeDataFile(filename, value);
+            }
+        }
+
+        // Writes the rule file (header plus the trimmed, fixed-up rule text) for one value.
+        private void writeDataFile(String filename, CldrValue value) {
+            try (PrintWriter out = outFn.apply(Paths.get(filename))) {
+                out.println("\uFEFF# © 2016 and later: Unicode, Inc. and others.");
+                out.println("# License & terms of use: http://www.unicode.org/copyright.html#License");
+                out.println("#");
+                out.println("# File: " + filename);
+                out.println("# Generated from CLDR");
+                out.println("#");
+                out.println();
+                out.println(FIXUP.transliterate(whitespace().trimFrom(value.getValue())));
+                out.println();
+                // PrintWriter never throws IOException; it only records an error flag. Flush and
+                // check it here so that a failed write is reported instead of silently leaving
+                // a truncated rule file behind.
+                if (out.checkError()) {
+                    throw new RuntimeException("error writing file: " + filename);
+                }
+            }
+        }
+
+        // Adds the index entries (aliases, resource file and direction) for one transform to
+        // the root data, once for each direction in which the transform applies.
+        private void writeRootIndexEntry(
+            CldrValue value, String source, String target, Optional<String> variant, String filename) {
+            Visibility visibility = TRANSFORM_VISIBILITY.valueFrom(value, Visibility.class);
+            String status = visibility == Visibility.internal ? "internal" : "file";
+
+            Direction dir = TRANSFORM_DIRECTION.valueFrom(value, Direction.class);
+            if (dir != Direction.backward) {
+                String id = getId(source, target, variant);
+                TRANSFORM_ALIAS.listOfValuesFrom(value)
+                    .forEach(a -> icuData.add(RB_TRANSLITERATOR_IDS.extendBy(a, "alias"), id));
+                RbPath rbPrefix = RB_TRANSLITERATOR_IDS.extendBy(id, status);
+                icuData.add(rbPrefix.extendBy("resource:process(transliterator)"), filename);
+                icuData.add(rbPrefix.extendBy("direction"), "FORWARD");
+            }
+            if (dir != Direction.forward) {
+                // The reverse transform swaps source and target in its ID.
+                String id = getId(target, source, variant);
+                TRANSFORM_BACKALIAS.listOfValuesFrom(value)
+                    .forEach(a -> icuData.add(RB_TRANSLITERATOR_IDS.extendBy(a, "alias"), id));
+                RbPath rbPrefix = RB_TRANSLITERATOR_IDS.extendBy(id, status);
+                icuData.add(rbPrefix.extendBy("resource:process(transliterator)"), filename);
+                icuData.add(rbPrefix.extendBy("direction"), "REVERSE");
+            }
+        }
+    }
+
+    // Returns the transliterator ID "<from>-<to>[/<variant>]".
+    private static String getId(String from, String to, Optional<String> variant) {
+        String baseId = from + "-" + to;
+        return variant.map(v -> baseId + "/" + v).orElse(baseId);
+    }
+
+    // Returns the value of an attribute which is optional in the data but required here,
+    // throwing IllegalArgumentException if it is absent.
+    private static String getExpectedOptionalAttribute(CldrValue value, AttributeKey key) {
+        return key.optionalValueFrom(value).orElseThrow(() ->
+            new IllegalArgumentException(String.format("missing data for %s in: %s", key, value)));
+    }
+}
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.regex;
+
+import com.google.common.base.Ascii;
+
+/**
+ * The instructions which may appear in a result specification of the configuration file
+ * (e.g. "values=..." or "fallback=...").
+ */
+enum Instruction {
+    /** Defines processing and transformation of CLDR values. */
+    VALUES,
+    /** Defines fallback values to be used if no result was matched in a resource bundle. */
+    FALLBACK,
+    /** Defines an xpath used to hack result equality to make deduplication work. */
+    BASE_XPATH,
+    // TODO: Figure out how to remove this hack (probably by supporting partial matches).
+    /**
+     * Defines whether result values should be appended one at a time to a resource bundle
+     * (default) or grouped into a separate array.
+     */
+    GROUP;
+
+    /**
+     * Returns the instruction for the given ID as it appears in the configuration file. The ID
+     * is upper-cased (ASCII only) and must then match the name of one of the enum constants.
+     */
+    static Instruction forId(String id) {
+        String constantName = Ascii.toUpperCase(id);
+        return valueOf(constantName);
+    }
+}
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.regex;
+
+import static com.google.common.base.CharMatcher.whitespace;
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkNotNull;
+
+import java.util.List;
+import java.util.function.Function;
+
+import com.google.common.base.CharMatcher;
+import com.google.common.base.Splitter;
+
+/**
+ * A named, argument-count-limited function used by {@code RegexTransformer} to convert CLDR
+ * values in special ways. See also {@code IcuFunctions}.
+ */
+public final class NamedFunction implements Function<List<String>, String> {
+    // Function names may only contain lower-case ASCII letters and underscores.
+    private static final CharMatcher NAME_CHARS =
+        CharMatcher.inRange('a', 'z').or(CharMatcher.is('_'));
+    // Arguments are comma separated, with surrounding whitespace removed from each one.
+    private static final Splitter ARG_SPLITTER = Splitter.on(',').trimResults(whitespace());
+
+    /**
+     * Returns a new function with the given name, accepting at most {@code argCount} arguments
+     * and delegating to {@code fn} to calculate its result.
+     */
+    public static NamedFunction create(
+        String name, int argCount, Function<List<String>, String> fn) {
+        return new NamedFunction(name, argCount, fn);
+    }
+
+    private final String name;
+    private final int maxArgs;
+    private final Function<List<String>, String> fn;
+
+    private NamedFunction(String name, int argCount, Function<List<String>, String> fn) {
+        boolean isValidName = !name.isEmpty() && NAME_CHARS.matchesAllOf(name);
+        checkArgument(isValidName,
+            "invalid function name (must be lower_case_underscore): %s", name);
+        checkArgument(argCount >= 0, "invalid argument count: %s", argCount);
+        this.name = name;
+        this.maxArgs = argCount;
+        this.fn = checkNotNull(fn);
+    }
+
+    /**
+     * Splits the given comma separated argument list and invokes this function with it. It is an
+     * error to pass more arguments than the function accepts, or for the function to return
+     * null.
+     */
+    public String call(String argList) {
+        List<String> splitArgs = ARG_SPLITTER.splitToList(argList);
+        checkArgument(splitArgs.size() <= maxArgs,
+            "too many arguments for function '%s' (max=%s)", name, maxArgs);
+        String result = apply(splitArgs);
+        return checkNotNull(result,
+            "named functions must never return null: function=%s", name);
+    }
+
+    /** Returns the name by which this function is referenced. */
+    public String getName() {
+        return name;
+    }
+
+    /** Applies the underlying function directly to the given (already split) arguments. */
+    @Override
+    public String apply(List<String> args) {
+        return fn.apply(args);
+    }
+}
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.regex;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkNotNull;
+import static com.google.common.collect.ImmutableList.toImmutableList;
+import static com.google.common.collect.ImmutableListMultimap.toImmutableListMultimap;
+import static com.google.common.collect.ImmutableSetMultimap.toImmutableSetMultimap;
+import static java.util.function.Function.identity;
+
+import java.io.PrintWriter;
+import java.io.StringWriter;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Optional;
+import java.util.Set;
+import java.util.function.BiFunction;
+import java.util.function.Function;
+import java.util.regex.Pattern;
+
+import org.unicode.cldr.api.CldrDataType;
+import org.unicode.cldr.api.CldrPath;
+import org.unicode.cldr.api.CldrValue;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableListMultimap;
+import com.google.common.collect.ImmutableSetMultimap;
+import org.unicode.icu.tool.cldrtoicu.PathValueTransformer;
+import org.unicode.icu.tool.cldrtoicu.RbPath;
+
+/**
+ * Path/value transformer configured by {@code ldml2icu_xxx.txt} mapping and configuration files.
+ * See {@code ldml2icu_readme.txt} for details on the configuration file format and
+ * {@link PathValueTransformer} for the public API description and usage.
+ *
+ * <p>This class is intended to be thread safe: the rule tables are immutable and the only
+ * mutable state (the bookkeeping of unused rules) is held in a synchronized set.
+ */
+// TODO: Rewrite the readme to match current behaviour and describe edge cases properly.
+public final class RegexTransformer extends PathValueTransformer {
+    /**
+     * Returns a new transformer based on transformation rules defined in the given configuration
+     * file contents, and using the specified functions for resolving ICU values.
+     */
+    public static PathValueTransformer fromConfigLines(
+        List<String> lines, NamedFunction... functions) {
+        return new RegexTransformer(RuleParser.parseConfig(lines, Arrays.asList(functions)));
+    }
+
+    // Map of path prefixes grouped by DTD type (for early efficient filtering of paths).
+    private final ImmutableSetMultimap<CldrDataType, String> prefixMap;
+    // Transformation rules loading from the configuration file, grouped by path prefix.
+    private final ImmutableListMultimap<String, Rule> rulesMap;
+    // Functions which can generate a fallback value from a given resource bundle path.
+    private final ImmutableList<BiFunction<RbPath, DynamicVars, Optional<Result>>> fallbackFunctions;
+    // Records the total set of rules, removing them as they are matched. Used for reporting any
+    // unused rules for debugging purposes. This is mutated from transform(), so it must be a
+    // synchronized set for concurrent calls to be safe (insertion order is preserved).
+    private final Set<Rule> unusedRules = Collections.synchronizedSet(new LinkedHashSet<>());
+
+    private RegexTransformer(List<Rule> rules) {
+        this.prefixMap =
+            rules.stream().collect(toImmutableSetMultimap(Rule::getDataType, Rule::getPathPrefix));
+        this.rulesMap =
+            rules.stream().collect(toImmutableListMultimap(Rule::getPathPrefix, identity()));
+        this.fallbackFunctions =
+            rules.stream().flatMap(Rule::getFallbackFunctions).collect(toImmutableList());
+        // Add all rules first and remove as they are matched.
+        this.unusedRules.addAll(rules);
+    }
+
+    /** Transforms the given value without any dynamic variables (every lookup yields null). */
+    @Override
+    public ImmutableList<Result> transform(CldrValue value) {
+        return transform(value, p -> null);
+    }
+
+    /**
+     * Transforms the given value using the first rule which produces any results for it, or
+     * returns an empty list if no rule applies.
+     */
+    @Override
+    public ImmutableList<Result> transform(CldrValue value, DynamicVars varLookupFn) {
+        // This early rejection of non-matching paths, combined with "bucketing" the rules by
+        // path prefix for easy lookup dramatically reduces the transformation time.
+        String pathPrefix = getPathPrefix(value);
+        if (!prefixMap.get(value.getDataType()).contains(pathPrefix)) {
+            return ImmutableList.of();
+        }
+        // Even though this is just derived from the value, resolve it here and pass it into each
+        // rule to avoid recalculating the same thing every time.
+        String fullXPath = getFullXPathWithoutSortIndices(value);
+        // Bucketing the rules by the path prefix means that each incoming value is only tested
+        // against likely matches. This reduces the number of tests per value by about 10x.
+        for (Rule rule : rulesMap.get(pathPrefix)) {
+            // We break after the first matching rule, since there is an implicit assumption
+            // that no paths will match more than one rule.
+            // TODO: Add a debug mode that checks that only one rule matches any given CLDR path.
+            ImmutableList<Result> results = rule.transform(value, fullXPath, varLookupFn);
+            if (!results.isEmpty()) {
+                unusedRules.remove(rule);
+                return results;
+            }
+        }
+        return ImmutableList.of();
+    }
+
+    // All "leaf" paths must have at least two elements, so we can find the "prefix" which is
+    // the first element after the DTD root. This corresponds to the value extracted via
+    // PATH_SPEC_PREFIX in the parser.
+    private static String getPathPrefix(CldrValue value) {
+        CldrPath prefix = value.getPath();
+        checkArgument(prefix.getLength() >= 2, "unexpectedly short path: %s", prefix);
+        while (prefix.getLength() > 2) {
+            prefix = prefix.getParent();
+        }
+        return prefix.getName();
+    }
+
+    // A regex to capture any sort-indices in the full path string (which must be removed).
+    private static final Pattern SORT_INDEX = Pattern.compile("(/\\w+)#[0-9]+");
+
+    // Note that the full path we get here contains the "sort index" suffix for ORDERED
+    // elements. This means that some element names are "foo#N" where N is the sort index.
+    // Since the regex transformer works around "ordered elements" in a completely different
+    // way and doesn't have them in the regular expressions, we can just remove them.
+    private static String getFullXPathWithoutSortIndices(CldrValue v) {
+        String fullPath = v.getFullPath();
+        for (CldrPath p = v.getPath(); p != null; p = p.getParent()) {
+            if (p.getSortIndex() != -1) {
+                // Only do expensive regex stuff if there's an "ordered" element with a sort index.
+                return SORT_INDEX.matcher(fullPath).replaceAll("$1");
+            }
+        }
+        // No path parts have a sort index, so the original full path string is safe to return.
+        return fullPath;
+    }
+
+    /** Returns the results of every fallback function which applies to the given path. */
+    @Override
+    public ImmutableList<Result> getFallbackResultsFor(RbPath rbPath, DynamicVars varLookupFn) {
+        return fallbackFunctions.stream()
+            .map(f -> f.apply(rbPath, varLookupFn))
+            .filter(Optional::isPresent)
+            .map(Optional::get)
+            .collect(toImmutableList());
+    }
+
+    /** Returns a debug summary listing the rule count and any rules not yet matched. */
+    @Override public String toString() {
+        StringWriter buf = new StringWriter();
+        PrintWriter out = new PrintWriter(buf);
+        out.println(getClass().getName() + "{");
+        out.println(" Rules: " + rulesMap.size());
+        if (!unusedRules.isEmpty()) {
+            out.println(" Unused Rules:");
+            // forEach() on the synchronized set holds its lock for the whole iteration.
+            unusedRules.forEach(
+                r -> out.format(" [line=%3d] %s\n", r.getLineNumber(), r.getXpathSpec()));
+        }
+        out.println('}');
+        out.flush();
+        return buf.toString();
+    }
+
+    // Package use helper for substituting single-character place-holders like '$N' or '%X'.
+    // Each occurrence of the token plus the character following it is replaced by the value
+    // returned from the replacement function for that character. It is an error for the token
+    // to be the last character in the string, or for the function to return null.
+    static String substitute(String s, char token, Function<Character, String> replaceFn) {
+        if (s.indexOf(token) == -1) {
+            return s;
+        }
+        StringBuilder out = new StringBuilder();
+        int i = 0;
+        for (int j = s.indexOf(token); j != -1; i = j + 2, j = s.indexOf(token, i)) {
+            // Fail with a useful message rather than a raw StringIndexOutOfBoundsException.
+            checkArgument(j + 1 < s.length(),
+                "missing variable character after trailing '%s' in: %s", token, s);
+            char varChar = s.charAt(j + 1);
+            String replacement =
+                checkNotNull(replaceFn.apply(varChar), "no such variable %s%s", token, varChar);
+            out.append(s, i, j).append(replacement);
+        }
+        return out.append(s.substring(i)).toString();
+    }
+}
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.regex;
+
+import static com.google.common.base.CharMatcher.whitespace;
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkElementIndex;
+import static com.google.common.base.Preconditions.checkNotNull;
+import static com.google.common.base.Preconditions.checkState;
+import static com.google.common.collect.ImmutableList.toImmutableList;
+import static java.util.Comparator.comparing;
+import static java.util.Comparator.nullsLast;
+import static org.unicode.cldr.api.CldrPath.parseDistinguishingPath;
+
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.function.BiFunction;
+import java.util.function.Function;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Stream;
+
+import org.unicode.cldr.api.CldrPath;
+import org.unicode.cldr.api.CldrValue;
+
+import com.google.common.base.Splitter;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Lists;
+import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.DynamicVars;
+import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.Result;
+import org.unicode.icu.tool.cldrtoicu.RbPath;
+
+/**
+ * A specification for building a result from the arguments in a matched xpath. Results always
+ * hold a reference to their originating specification to allow them to be ordered in the same
+ * order as the corresponding specifications in the configuration file.
+ */
+final class ResultSpec {
+ // Subtle ordering for results to ensure "config file order" for things in the same
+ // resource bundle while being "friendly" towards a global ordering. This is NOT consistent
+ // with equals if duplicate results exist.
+ //
+ // This is ESSENTIAL for correct grouping and ordering within resource bundles.
+ //
+ // In normal use this is expected only to be used to reorder results within a resource
+ // bundle (i.e. those sharing the same resource bundle path "key"). Resource bundles
+ // themselves can just be managed in "visitation order" or similar.
+ //
+ // Ordering priority is:
+ // 1: Result key (resource bundle): Groups results by resource bundle.
+ // 2: Result specification line number: Orders resource bundle contents by "file order".
+ // 3: Result distinguishing xpath: Tie breaking if duplicates are not yet removed.
+ //
+ // Note that this currently uses the String representation of the resource bundle path (key)
+ // as the primary order to match legacy behaviour. However it would be better to use the
+ // natural lexicographical RbPath order (the difference relates to having '/' as the
+ // separator in the string representation of the path). The string form of a path is a bad
+ // choice because some paths can contain a literal '/', which makes ordering problematic in
+ // rare cases. However changing this will have the effect of reordering path elements, which
+ // while it should be safe, must be done with caution.
+ // TODO: Fix this to use RbPath ordering and NOT the String representation
+ //
+ // The tie-breaking path is optional, so "nulls last" must be applied to the extracted path
+ // itself. Wrapping the whole comparator in nullsLast() would only guard against null
+ // results and would still fail when comparing a result which has no path.
+ private static final Comparator<AbstractResult> RESULT_ORDERING =
+     Comparator.<AbstractResult, String>comparing(r -> r.getKey().toString())
+         .thenComparing(r -> r.getSpec().lineNumber)
+         .thenComparing(r -> r.getPath().orElse(null), nullsLast(Comparator.naturalOrder()));
+
+ // Splitter for any values (either in CLDR data or results specifications). The only time
+ // values are split differently is when quoting exists in the "values" instruction.
+ private static final Splitter VALUE_SPLITTER = Splitter.on(whitespace()).omitEmptyStrings();
+
+ // Matcher for "&foo_bar(a,b,c)" which captures function name and complete argument list.
+ private static final Pattern FUNCTION = Pattern.compile("\\&(\\w++)\\(([^\\)]++)\\)");
+
+ // Resource bundle path specification with placeholders (e.g. "/foo/$1/bar") exactly as it
+ // appears in the configuration file.
+ private final String rbPathSpec;
+
+ // Declared instructions with which to generate result values (see Instruction).
+ private final ImmutableMap<Instruction, VarString> instructions;
+
+ // The index of the xpath argument whose value should be split to create multiple results.
+ // This mechanism is used when an xpath attribute is a space separated list of values and
+ // one result should be created for each value (e.g. [@territories="AA BB CC"]) but you want
+ // a resource bundle for each region code (e.g. "foo/AA/bar", "foo/BB/bar", "foo/CC/bar").
+ // At most one argument is ever split (corresponding to the first unquoted placeholder in
+ // the resource bundle path specification). A value of -1 means that no argument is split.
+ private final int splitArgIndex;
+
+ // The line number of the result specification in the file which defines the ordering of
+ // results within a resource bundle. This needn't be a line number, but must be unique for
+ // each specification.
+ private final int lineNumber;
+
+ // The named functions available to the parser. Ideally the rules and result specifications
+ // would be an inner class of some kind of context/environment and just share this.
+ private final ImmutableMap<String, NamedFunction> icuFunctions;
+
+ // The map of dynamic variables (looked up from CldrPaths when a rule is resolved).
+ private final Function<Character, CldrPath> dynamicVarFn;
+
+ // Creates a result specification from a parsed configuration line. The instruction and
+ // function maps are copied into immutable maps, and the index of the splittable argument
+ // (if any) is derived from the resource bundle path specification.
+ ResultSpec(
+ String rbPathSpec,
+ Map<Instruction, VarString> instructions,
+ int lineNumber,
+ Map<String, NamedFunction> icuFunctions,
+ Function<Character, CldrPath> dynamicVarFn) {
+ this.rbPathSpec = checkNotNull(rbPathSpec);
+ this.instructions = ImmutableMap.copyOf(instructions);
+ this.splitArgIndex = getSplitArgIndex(rbPathSpec);
+ this.lineNumber = lineNumber;
+ this.icuFunctions = ImmutableMap.copyOf(icuFunctions);
+ this.dynamicVarFn = checkNotNull(dynamicVarFn);
+ }
+
+ /**
+  * Transforms a path/value into a sequence of results. The given matcher has successfully
+  * matched the path and contains the captured arguments corresponding to $1..$N in the
+  * various result specification strings.
+  *
+  * <p>If this specification has a splittable argument whose captured value contains more than
+  * one whitespace separated item, one result is produced per item; otherwise exactly one
+  * result is produced.
+  */
+ Stream<Result> transform(
+         CldrValue value, Matcher m, DynamicVars varLookupFn) {
+     // Discard group(0) since that's always the full xpath that was matched, and we don't
+     // need that any more (so "$N" is args.get(N - 1)).
+     List<String> args = new ArrayList<>();
+     for (int i = 1; i <= m.groupCount(); i++) {
+         // Important since we turn this into an ImmutableList (which is null-hostile).
+         args.add(checkNotNull(m.group(i),
+             "captured regex arguments must always be present\n"
+                 + "(use an non-capturing groups for optional arguments): %s", m.pattern()));
+     }
+
+     // The first unquoted argument in any resource bundle path declaration, is defined as
+     // being "splittable". Typically this happens if the value of the captured xpath
+     // argument is expected to be a list of items.
+     //
+     // In this case, we generate one result for each individual argument, replacing the
+     // appropriate captured list with each split value in turn. Thus with original
+     // arguments:
+     //   ["foo", "bar baz", "quux"]
+     // where splitArgIndex == 1, we get two results using the argument lists:
+     //   ["foo", "bar", "quux"]
+     //   ["foo", "baz", "quux"]
+     //
+     // Note also that since the splittability of the arguments is technically defined
+     // by the resource bundle path specification (not the xpath regular expression) it
+     // could differ per ResultSpec instance (but currently never does).
+     if (splitArgIndex != -1) {
+         List<String> splitArgs = VALUE_SPLITTER.splitToList(args.get(splitArgIndex));
+         // Only bother if there was more than one argument there anyway.
+         if (splitArgs.size() > 1) {
+             // The returned stream is evaluated lazily by the caller, so each element works
+             // on its own copy of the argument list rather than mutating a shared list from
+             // inside the lambda (which would only be correct for strictly sequential use).
+             return splitArgs.stream().map(a -> {
+                 List<String> argsForItem = new ArrayList<>(args);
+                 argsForItem.set(splitArgIndex, a);
+                 return matchedResult(value, argsForItem, varLookupFn);
+             });
+         }
+     }
+     // No splittable argument, or a splittable argument with only one value.
+     return Stream.of(matchedResult(value, args, varLookupFn));
+ }
+
+ // Builds a single matched result (resource bundle path, values and result xpath) for the
+ // given CLDR value and captured arguments.
+ private Result matchedResult(
+         CldrValue value, List<String> args, DynamicVars varLookupFn) {
+     RbPath rbPath = getRbPath(args);
+     ImmutableList<String> values = getValues(value.getValue(), args, varLookupFn);
+     CldrPath resultPath = getResultPath(value.getPath(), args, varLookupFn);
+     return new MatchedResult(rbPath, values, resultPath);
+ }
+
+ // Resource bundle paths are a bit special (unsurprisingly). The captured arguments can
+ // contain '/' and will extend the path structure. Thus "foo/$1/bar" might end up as
+ // "foo/x/y/bar" after argument substitution.
+ //
+ // However (a hack for timezone "metazone" paths) if the argument placeholder is quoted
+ // (e.g. "foo/"$1"/bar") then '/' in arguments is replaced by ':' and quotes are retained
+ // (e.g. "foo/"x:y"/bar").
+ // TODO: Replace hard coded hack here with an explicit function in the config file.
+ private RbPath getRbPath(List<String> args) {
+     // Without more careful parsing, it's hard to figure out if quotes in a resource bundle
+     // path specification are around a placeholder or not. Since quotes are only used in a
+     // small number of cases currently, and only for this purpose, we just assume that any
+     // quotes in the path specification should trigger this behaviour.
+     boolean specHasQuotes = rbPathSpec.contains("\"");
+     // The transformed list is a lazy view, which avoids char replacement in arguments that
+     // don't appear in the resource bundle path.
+     List<String> pathArgs =
+         specHasQuotes ? Lists.transform(args, s -> s.replace('/', ':')) : args;
+     return RbPath.parse(substituteArgs(rbPathSpec, pathArgs));
+ }
+
+ // Creates the list of output values according to the CLDR value (if present) and the
+ // "values" instruction in the result specification (if present). Any functions present in
+ // the "values" instruction are invoked here.
+ private ImmutableList<String> getValues(
+         String value, List<String> args, DynamicVars varLookupFn) {
+     VarString valuesSpec = instructions.get(Instruction.VALUES);
+     if (valuesSpec == null) {
+         // No "values" instruction, so just use the _unsplit_ CLDR value. To split a CLDR
+         // value use "values={value}" in the result specification.
+         return ImmutableList.of(value);
+     }
+     // The "value" instruction is not expected to have any dynamic %N variables in it,
+     // since those only represent CLDR path mappings, which should not be directly present
+     // in the ICU data. Hence the valueSpec should have been fully resolved by the static
+     // variables applied earlier and we should just need to resolve it into a String.
+     //
+     // The $N arguments are substituted first since they need to be passed to the functions.
+     //
+     // WARNING: This doesn't strictly work, since an argument or function result could
+     // (in theory) contain the string "{value}" which would then be substituted in an
+     // unexpected way. The better way to do this is with a single pass which handles
+     // arguments, function calling and the special "{value}" token together. This comes
+     // down to the fact that the mapping file syntax doesn't have a well defined concept
+     // of escaping or invocation order.
+     // TODO: Fix this, possibly by rewriting the whole transformer "language" to be consistent.
+     String withArgs = substituteArgs(valuesSpec.get(), args);
+
+     // Copy the specification across, replacing each function call with its result. If there
+     // are no function calls then this simply copies the whole string unchanged.
+     StringBuilder expanded = new StringBuilder();
+     int copiedUpTo = 0;
+     Matcher call = FUNCTION.matcher(withArgs);
+     while (call.find()) {
+         // Append up to the start of the function call.
+         expanded.append(withArgs, copiedUpTo, call.start());
+
+         String functionName = call.group(1);
+         NamedFunction fn = icuFunctions.get(functionName);
+         checkArgument(fn != null, "no such function: %s", functionName);
+         // Replace '{value}' here so functions can be called with the CLDR value as well
+         // as captured path arguments. We also have to replace it below, which is all a bit
+         // dodgy if a function ever returned '{value}'.
+         String functionArgs = call.group(2).replace("{value}", value);
+         expanded.append(fn.call(functionArgs));
+         copiedUpTo = call.end();
+     }
+     expanded.append(withArgs, copiedUpTo, withArgs.length());
+
+     // Having done function invocation, we handle the special "{value}" token and split
+     // the value (taking quoting into account).
+     return splitValues(expanded.toString().replace("{value}", value));
+ }
+
+ // Returns the path associated with a result, which takes part in result equality (the
+ // semantics of which are very subtle). It is either:
+ // * the original distinguishing path, or
+ // * the "base_xpath" given in the instructions (also a distinguishing xpath).
+ //
+ // "base_xpath" is a hack that exists because xpaths can only be matched in full and not by
+ // prefix. In some cases the "same" result is therefore created many times from different
+ // distinguishing xpaths, perhaps even by different result specifications. Giving all of
+ // these duplicates the same "fake" xpath allows them to be deduplicated.
+ private CldrPath getResultPath(CldrPath path, List<String> args, DynamicVars varLookupFn) {
+     VarString baseSpec = instructions.get(Instruction.BASE_XPATH);
+     if (baseSpec != null) {
+         // Resolve dynamic variables first, then fill in the captured $N arguments.
+         VarString resolvedSpec = baseSpec.apply(dynamicVarFn.andThen(varLookupFn));
+         String xpath = substituteArgs(resolvedSpec.get(), args);
+         return parseDistinguishingPath(xpath);
+     }
+     return path;
+ }
+
+ /**
+  * Returns the fallback function for this specification, if it has a "fallback="
+  * instruction. The function is given a resolved resource bundle path and returns the
+  * possible fallback values for it. Fallback values currently support neither quoting nor
+  * grouping (but they easily could).
+  */
+ Optional<BiFunction<RbPath, DynamicVars, Optional<Result>>> getFallbackFunction() {
+     VarString rawSpec = instructions.get(Instruction.FALLBACK);
+     if (rawSpec == null) {
+         return Optional.empty();
+     }
+     // The only place where regular expressions are hacked about. A fallback function must
+     // only produce a value when the resolved resource bundle path it is given could have
+     // been a match for the path specification.
+     //
+     // To avoid ambiguity between paths like "foo/$1/$2/bar" and "foo/$1/bar" (which should
+     // not both match), '/' is explicitly disallowed in argument values. In theory that is a
+     // problem, since '/' should be permitted, but ambiguous matching causes worse issues.
+     // TODO: Fix/replace all of this fallback mess with something cleaner.
+     Pattern rbPathMatcher = getRbPathMatcher(rbPathSpec);
+
+     // A further, frankly terrifying, hack to support fallback specifications that use $N
+     // argument substitution (this currently happens only once, but must be supported).
+     // Yet another reason to want the fallback mechanism replaced.
+     VarString spec = maybeRewriteFallbackSpec(rawSpec);
+
+     BiFunction<RbPath, DynamicVars, Optional<Result>> fallbackFn =
+             (rbPath, vars) -> getFallbackResult(rbPath, vars, rbPathMatcher, spec);
+     return Optional.of(fallbackFn);
+ }
+
+ // Returns the fallback result for the given resource bundle path, or empty if the path
+ // could not have been produced by this specification's path pattern.
+ private Optional<Result> getFallbackResult(
+         RbPath rbPath, DynamicVars varFn, Pattern rbPathMatcher, VarString fallbackSpec) {
+     // Most resource bundle paths are not associated with this fallback.
+     Matcher m = rbPathMatcher.matcher(rbPath.toString());
+     if (!m.matches()) {
+         return Optional.empty();
+     }
+     // With the dynamic variables supplied the fallback specification is expected to be
+     // resolvable, possibly still holding $N placeholders which are filled in from the
+     // groups captured in the resource bundle path.
+     String spec = fallbackSpec.apply(dynamicVarFn.andThen(varFn)).get();
+     int groupCount = m.groupCount();
+     if (groupCount > 0) {
+         spec = substituteArgs(spec, n -> m.group(n + 1), groupCount);
+     }
+
+     // The fallback value is split _without_ considering quoting. This matches the original
+     // behaviour but could cause all sorts of subtle issues if values contained quotes.
+     // TODO: Rework transformation rules to make quoting behaviour deterministic.
+     ImmutableList.Builder<String> values = ImmutableList.builder();
+     for (String v : VALUE_SPLITTER.splitToList(spec)) {
+         // Fallback values that "look like" CLDR paths are auto-magically resolved.
+         values.add(v.startsWith("//") ? varFn.apply(parseDistinguishingPath(v)) : v);
+     }
+     return Optional.of(new FallbackResult(rbPath, values.build()));
+ }
+
+ // Rewrites the "$N" placeholders of a fallback specification so that each one is numbered
+ // by the position of the matching placeholder in the resource bundle path specification.
+ // The given specification is returned unchanged if it cannot be resolved yet or if it
+ // contains no "$N" placeholders.
+ //
+ // WARNING: Another very hacky behaviour (used exactly once) is that "$N" argument
+ // substitutions are allowed in fallback values. This is highly problematic because,
+ // since the fallback value must be synthesized only from the resource bundle path,
+ // there's no way for this substitution to handle:
+ // 1: multi-valued list arguments
+ // 2: arguments that didn't appear in the resource bundle path
+ // 3: dynamic path variables (e.g. %D=//some/path)
+ //
+ // An example would be something like a resource bundle specification of:
+ // /Baz/$2/$1
+ // and a fallback value of:
+ // Foo$1/Bar$2
+ //
+ // Here the order of substitution is not maintained and the original path specification
+ // has values that are not naturally ordered (or possibly even duplicated). The pattern
+ // we calculate from the resource bundle path specification will match/capture groups in
+ // "natural order" (i.e. "/Baz/(...)/(...)") so we have to rewrite the order of the
+ // placeholders in the fallback specification to match (e.g. "Foo$2/Bar$1").
+ // TODO: Figure out a way to remove all of this extreme complexity.
+ private VarString maybeRewriteFallbackSpec(
+ VarString fallbackSpec) {
+ Optional<String> fallback = fallbackSpec.resolve();
+ // If the fallback string is not present, it's because the VarString still has
+ // unresolved "dynamic" variables for late binding. This is okay, but should not
+ // be mixed with argument substitution.
+ if (!fallback.isPresent() || !fallback.get().contains("$")) {
+ return fallbackSpec;
+ }
+ // After the quick rejection check for '$', do a proper search for $N variables (since
+ // '$' is permitted as a literal if not followed by a digit). Note that this find()
+ // positions the matcher on the first placeholder, which the rewrite loop below uses.
+ Matcher fallbackMatcher = ARG_PLACEHOLDER.matcher(fallback.get());
+ if (!fallbackMatcher.find()) {
+ return fallbackSpec;
+ }
+
+ // Fallback spec has $N in it, triggering super hacky behaviour.
+ Matcher pathMatcher = ARG_PLACEHOLDER.matcher(rbPathSpec);
+ checkState(pathMatcher.find(),
+ "$N arguments in fallback must be present in the resource bundle path: %s",
+ rbPathSpec);
+ // Explicit group characters ("1"..."9") in the order they appear in the
+ // resource bundle path. There can be duplicates (e.g. "/Foo/$1/Bar$1"), and each
+ // occurrence gets its own entry in the list.
+ List<Character> groupIds = new ArrayList<>();
+ do {
+ groupIds.add(pathMatcher.group().charAt(1));
+ } while (pathMatcher.find());
+
+ // Special check to avoid a horrible bug if we ever had more than 9 placeholders in
+ // the path (essentially impossible with current data). If it did happen, the index
+ // found below could be >= 9 and we would get "$X", where 'X' was not a numeric value.
+ checkState(groupIds.size() < 10,
+ "too many placeholders in resource bundle path: %s", rbPathSpec);
+
+ // Now find each placeholder in the fallback specification string and map it to
+ // the equivalent index for the path matcher we just created. Only single characters
+ // are overwritten, so the offsets reported by the matcher (which was created for the
+ // original string) remain valid for the buffer being modified.
+ StringBuilder rewrittenFallbackSpec = new StringBuilder(fallback.get());
+ do {
+ // Position of the digit that follows the '$'.
+ int placeholderPos = fallbackMatcher.start() + 1;
+ // The new ID is the index of the corresponding placeholder offset by '1'.
+ char placeholderDigit = rewrittenFallbackSpec.charAt(placeholderPos);
+ int newPlaceholderIndex = groupIds.indexOf(placeholderDigit);
+ checkState(newPlaceholderIndex != -1,
+ "fallback values may only contain arguments from the resource bundle path: %s",
+ fallback.get());
+ rewrittenFallbackSpec.setCharAt(placeholderPos, (char)('1' + newPlaceholderIndex));
+ } while (fallbackMatcher.find());
+ return VarString.of(rewrittenFallbackSpec.toString());
+ }
+
+ /** Common base class for results created by a path match or as a fallback. */
+ private abstract class AbstractResult extends Result {
+     // The split and resolved values of this result (see also "isGrouped()").
+     private final ImmutableList<String> values;
+
+     // The "source" CLDR path of a matched result, which is the resolved "base_xpath" if one
+     // was given in the instructions. Empty for fallback results.
+     private final Optional<CldrPath> basePath;
+
+     // Results are always expected to be deduplicated, so the hash is computed up front.
+     private final int hashCode;
+
+     AbstractResult(RbPath key, Iterable<String> values, Optional<CldrPath> path) {
+         super(key);
+         this.values = ImmutableList.copyOf(values);
+         this.basePath = checkNotNull(path);
+         // Must come last (it reads the fields assigned above) and must hash the same
+         // attributes, in the same order, as are tested by equals().
+         this.hashCode = Objects.hash(getKey(), getPath(), isGrouped(), getValues());
+     }
+
+     // Returns the specification which produced this result. This is essential for correct
+     // ordering and for determining fallback values, but it plays no direct part in result
+     // equality (duplicate results can be produced by different specifications).
+     final ResultSpec getSpec() {
+         return ResultSpec.this;
+     }
+
+     final Optional<CldrPath> getPath() {
+         return basePath;
+     }
+
+     final boolean wasMatched() {
+         // A boolean field would work equally well here.
+         return this instanceof MatchedResult;
+     }
+
+     @Override
+     public final ImmutableList<String> getValues() {
+         return values;
+     }
+
+     @Override
+     public final int compareTo(Result other) {
+         checkArgument(other instanceof AbstractResult,
+                 "unknown result type: %s", other.getClass());
+         AbstractResult that = (AbstractResult) other;
+         return RESULT_ORDERING.compare(this, that);
+     }
+
+     @Override
+     public final int hashCode() {
+         return hashCode;
+     }
+
+     // The equality semantics of results are ESSENTIAL for correct behaviour, in particular
+     // for deduplication. See also "getSpec()", "getPath()", and RESULT_ORDERING.
+     @Override
+     public final boolean equals(Object obj) {
+         if (obj == null) {
+             return false;
+         }
+         // Instances of different subclasses are never equal, hence no "instanceof" here.
+         if (!getClass().equals(obj.getClass())) {
+             return false;
+         }
+         AbstractResult that = (AbstractResult) obj;
+         // DO NOT compare the result specifications. Equal results can come from different
+         // specifications (if "base_xpath" was used).
+         if (!getKey().equals(that.getKey())) {
+             return false;
+         }
+         if (!getPath().equals(that.getPath())) {
+             return false;
+         }
+         if (isGrouped() != that.isGrouped()) {
+             return false;
+         }
+         // Alternatively this could assert that the values are equal if all else is.
+         return getValues().equals(that.getValues());
+     }
+ }
+
+ // A result produced by an explicit path match, built from the captured arguments.
+ private final class MatchedResult extends AbstractResult {
+     MatchedResult(RbPath key, Iterable<String> values, CldrPath path) {
+         super(key, values, Optional.of(path));
+     }
+
+     @Override
+     public boolean isFallbackFor(Result r) {
+         // A matched result is never a fallback for anything.
+         return false;
+     }
+
+     @Override
+     public boolean isGrouped() {
+         // Only the presence of the "group" instruction matters. Its value is not needed
+         // and can be removed from the configuration file at some point.
+         return instructions.containsKey(Instruction.GROUP);
+     }
+ }
+
+ // A result holding the possible fallback values for a given resource bundle path.
+ private final class FallbackResult extends AbstractResult {
+     FallbackResult(RbPath rbPath, Iterable<String> values) {
+         super(rbPath, values, Optional.empty());
+     }
+
+     // If grouping is ever allowed for fallback values (it's not clear that it's a good
+     // idea), delete this method and move the other implementation into AbstractResult.
+     @Override
+     public boolean isGrouped() {
+         return false;
+     }
+
+     @Override
+     public boolean isFallbackFor(Result r) {
+         checkArgument(r instanceof AbstractResult, "unsupported result type: %s", r);
+         AbstractResult other = (AbstractResult) r;
+         if (other.wasMatched()) {
+             // This is a fallback for any matched result from the same specification.
+             return getSpec().equals(other.getSpec());
+         }
+         // To prevent duplicated fallback results, this is also treated as a fallback for
+         // any result it is "equal()" to (equivalent fallback results can come from
+         // different input paths).
+         return equals(other);
+     }
+ }
+
+ // ==== Static helper functions ====
+
+ // Matches any "$N" placeholder without capturing.
+ private static final Pattern ARG_PLACEHOLDER = Pattern.compile("\\$[1-9]");
+
+ // Returns a pattern which matches resource bundle paths that could have been produced by
+ // the given path specification. Each "$N" placeholder becomes one capturing group (in the
+ // order the placeholders appear) and everything else is matched literally.
+ //
+ // Note that this code currently assumes that each "$N" placeholder matches a single path
+ // segment (i.e. the captured values cannot contain '/'). This is an artificial restriction
+ // since resource bundle paths can have quoting in, so we could detect quoted placeholders
+ // and allow any characters. However at the moment this isn't an issue, and none of the
+ // "$N" placeholders in the paths expects to match anything with '/' in.
+ //
+ // TODO: Fix this to handle quoted placeholders (e.g. "$N" or <$N>) properly.
+ private static Pattern getRbPathMatcher(String rbPathSpec) {
+     // An RbPath instance's toString() does not have a leading '/' on it, so we have to
+     // account for that here (or we could just remove the leading '/' from paths in the
+     // config file...).
+     if (rbPathSpec.startsWith("/")) {
+         rbPathSpec = rbPathSpec.substring(1);
+     }
+     // Literal text between placeholders is protected with Pattern.quote(), which wraps it
+     // in '\Q'...'\E' and also copes with a literal '\E' in the text. (Escaping that case by
+     // hand is easy to get wrong: a stray '\E' outside a quoted region is an illegal escape
+     // sequence and makes Pattern.compile() fail.)
+     //
+     // Remember that you could get "$1$2" here and the regex groups that replace them will
+     // abut. Reluctant matching (i.e. "+?") is used to avoid any backtracking in this case.
+     // We assume that the substituted arguments contained at least one character, and so we
+     // capture at least one character per group here.
+     StringBuilder regex = new StringBuilder();
+     Matcher placeholder = ARG_PLACEHOLDER.matcher(rbPathSpec);
+     int literalStart = 0;
+     while (placeholder.find()) {
+         regex.append(Pattern.quote(rbPathSpec.substring(literalStart, placeholder.start())));
+         regex.append("([^/]+?)");
+         literalStart = placeholder.end();
+     }
+     regex.append(Pattern.quote(rbPathSpec.substring(literalStart)));
+     return Pattern.compile(regex.toString());
+ }
+
+ // Substitutes "$N" (N = 1...9) placeholders with the corresponding elements of a list
+ // (i.e. "$N" --> args.get(N - 1)).
+ private static String substituteArgs(String spec, List<String> args) {
+     Function<Integer, String> lookup = args::get;
+     return substituteArgs(spec, lookup, args.size());
+ }
+
+ // Substitutes "$N" (N = 1...9) placeholders with values obtained from a zero-indexed
+ // function (i.e. "$N" --> args(N - 1)). The index is checked against the given size.
+ private static String substituteArgs(String spec, Function<Integer, String> args, int size) {
+     return RegexTransformer.substitute(
+             spec,
+             '$',
+             digit -> {
+                 int index = checkElementIndex(digit - '1', size, "argument index");
+                 return args.apply(index);
+             });
+ }
+
+ // Matches a "$N" argument together with any quote character on either side of it.
+ private static final Pattern ARGUMENT = Pattern.compile("[<\"]?\\$(\\d)[\">]?");
+
+ // Logic mostly copied from the original RegexManager class. Returns N-1 for the first
+ // unquoted $N (N=1..9) in the path, or -1 if there isn't one. $0 is rejected, even though
+ // the regex captures it, because it's just the entire path.
+ private static int getSplitArgIndex(String rbPath) {
+     // Each match is a $N placeholder, possibly including surrounding quoting.
+     for (Matcher m = ARGUMENT.matcher(rbPath); m.find(); ) {
+         char first = rbPath.charAt(m.start());
+         char last = rbPath.charAt(m.end() - 1);
+         // Placeholders written as "$N" or <$N> are quoted and never cause splitting.
+         // Q: Why two different "quoting" schemes?
+         // A: It's complex and relates to something called "hidden labels".
+         boolean quoted = (first == '"' && last == '"') || (first == '<' && last == '>');
+         if (quoted) {
+             continue;
+         }
+         // Permitted placeholders run from $1 to $9 ($0 is disallowed) and arguments are
+         // zero-indexed, so the returned index is between 0 and 8.
+         int n = Integer.parseInt(m.group(1));
+         checkArgument(n >= 1 && n <= 9, "invalid split argument: %s", n);
+         return n - 1;
+     }
+     return -1;
+ }
+
+ // Splits a value which may contain quoted regions, in which \" stands for a literal quote.
+ // This is a bit dubious since \\ is neither detected nor unescaped, so a quoted string
+ // cannot end in a single '\' (e.g. "$1" where the expansion of $1 has a trailing '\'). It
+ // is also impossible to have a value which should be split but which contains '"'.
+ //
+ // This mimics the original RegexManager behaviour, in which spaces and quotes in
+ // substituted values are _not_ escaped.
+ private static ImmutableList<String> splitValues(String value) {
+     int open = nextBareQuoteIndex(value, 0);
+     if (open == -1) {
+         // No quoting at all, so the whole value is simply split.
+         return ImmutableList.copyOf(VALUE_SPLITTER.split(value));
+     }
+     ImmutableList.Builder<String> out = ImmutableList.builder();
+     int unquotedStart = 0;
+     while (open != -1) {
+         // Split the unquoted text leading up to the opening quote.
+         out.addAll(VALUE_SPLITTER.split(value.substring(unquotedStart, open)));
+         int close = nextBareQuoteIndex(value, open + 1);
+         checkArgument(close != -1, "mismatched quotes in splittable value: %s", value);
+         // A quoted region is a single value, in which any escaped '"' is unescaped.
+         String quoted = value.substring(open + 1, close);
+         out.add(quoted.replace("\\\"", "\""));
+         unquotedStart = close + 1;
+         open = nextBareQuoteIndex(value, unquotedStart);
+     }
+     // Split whatever unquoted text follows the final closing quote.
+     out.addAll(VALUE_SPLITTER.split(value.substring(unquotedStart)));
+     return out.build();
+ }
+
+ // Returns the index of the first '"' character at or after index i which is not directly
+ // preceded by a '\', or -1 if there is no such character.
+ private static int nextBareQuoteIndex(String s, int i) {
+     // Step from quote to quote. After an escaped quote the search must continue with the
+     // next '"' (searching for the next '\' instead would either give up early or return the
+     // index of a backslash).
+     for (int q = s.indexOf('"', i); q >= 0; q = s.indexOf('"', q + 1)) {
+         // A quote at index 0 has no preceding character and so must be "bare".
+         if (q == 0 || s.charAt(q - 1) != '\\') {
+             return q;
+         }
+     }
+     return -1;
+ }
+}
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.regex;
+
+import static com.google.common.base.Preconditions.checkNotNull;
+import static com.google.common.collect.ImmutableList.toImmutableList;
+
+import java.util.Optional;
+import java.util.function.BiFunction;
+import java.util.function.Function;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Stream;
+
+import org.unicode.cldr.api.CldrDataType;
+import org.unicode.cldr.api.CldrPath;
+import org.unicode.cldr.api.CldrValue;
+
+import com.google.common.collect.ImmutableList;
+import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.DynamicVars;
+import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.Result;
+import org.unicode.icu.tool.cldrtoicu.RbPath;
+
+/*
+ * Each rule corresponds to a single target xpath specification in the configuration file
+ * (lines starting //) but may have more than one result specification. For example:
+ *
+ * //supplementalData/languageData/language[@type="(%W)"][@scripts="(%W)"][@territories="(%W)"]
+ * ; /languageData/$1/primary/scripts ; values=$2
+ * ; /languageData/$1/primary/territories; values=$3
+ *
+ * is represented by a single rule with two result specifications.
+ */
+abstract class Rule {
+    /**
+     * Returns a rule for which all '%X' arguments have been resolved (almost all cases).
+     *
+     * @param dtdType the CLDR DTD type of the paths the rule can match.
+     * @param prefix the name of the first path element below the path root.
+     * @param specs the result specifications to be processed for matching paths.
+     * @param pathRegex the fully resolved regular expression used to match CLDR paths.
+     * @param xpathSpec the original xpath specification (debugging only).
+     * @param lineNumber the line number of the rule in the configuration file (debugging only).
+     */
+    static Rule staticRule(
+            CldrDataType dtdType,
+            String prefix,
+            Iterable<ResultSpec> specs,
+            String pathRegex,
+            String xpathSpec,
+            int lineNumber) {
+
+        return new StaticRule(dtdType, prefix, specs, pathRegex, xpathSpec, lineNumber);
+    }
+
+    /**
+     * Returns a rule for which some '%X' arguments are unresolved until matching occurs.
+     *
+     * @param dtdType the CLDR DTD type of the paths the rule can match.
+     * @param prefix the name of the first path element below the path root.
+     * @param specs the result specifications to be processed for matching paths.
+     * @param varString the path regular expression with its unresolved variables.
+     * @param varFn maps the character of a dynamic variable to its CLDR path.
+     * @param xpathSpec the original xpath specification (debugging only).
+     * @param lineNumber the line number of the rule in the configuration file (debugging only).
+     */
+    static Rule dynamicRule(
+            CldrDataType dtdType,
+            String prefix,
+            Iterable<ResultSpec> specs,
+            VarString varString,
+            Function<Character, CldrPath> varFn,
+            String xpathSpec,
+            int lineNumber) {
+
+        return new DynamicRule(dtdType, prefix, specs, varString, varFn, xpathSpec, lineNumber);
+    }
+
+    // Type of CLDR path which can match this rule.
+    private final CldrDataType dtdType;
+    // The first path element below the root, used to do fast rejection of non-matching paths
+    // and to "bucket" rules by their prefix to speed up matching.
+    private final String pathPrefix;
+    // One or more result specifications to be processed for matching CLDR paths/values.
+    private final ImmutableList<ResultSpec> resultSpecs;
+    // Debug information only to help determine unused rules.
+    private final String xpathSpec;
+    private final int lineNumber;
+
+    private Rule(
+            CldrDataType dtdType,
+            String pathPrefix,
+            Iterable<ResultSpec> resultSpecs,
+            String xpathSpec,
+            int lineNumber) {
+
+        this.dtdType = checkNotNull(dtdType);
+        this.pathPrefix = checkNotNull(pathPrefix);
+        this.resultSpecs = ImmutableList.copyOf(resultSpecs);
+        this.xpathSpec = checkNotNull(xpathSpec);
+        this.lineNumber = lineNumber;
+    }
+
+    /** Returns the CLDR DTD type of the path that the rule can match. */
+    final CldrDataType getDataType() {
+        return dtdType;
+    }
+
+    /** Returns the name of the first path element below the path root. */
+    final String getPathPrefix() {
+        return pathPrefix;
+    }
+
+    /** Returns the regular expression against which CLDR path strings are matched. */
+    abstract Pattern getPathPattern(DynamicVars varLookupFn);
+
+    /**
+     * Attempts to match the incoming xpath and (if successful) use captured arguments to
+     * generate one result for each result specification. An empty list is returned if the
+     * path does not match.
+     */
+    final ImmutableList<Result> transform(CldrValue v, String fullXPath, DynamicVars varFn) {
+        Matcher m = getPathPattern(varFn).matcher(fullXPath);
+        return m.matches()
+                ? resultSpecs.stream()
+                        .flatMap(r -> r.transform(v, m, varFn))
+                        .collect(toImmutableList())
+                : ImmutableList.of();
+    }
+
+    /**
+     * Returns any fallback functions defined in results specifications. These are used to
+     * determine the set of possible fallback values for a given resource bundle path.
+     */
+    final Stream<BiFunction<RbPath, DynamicVars, Optional<Result>>> getFallbackFunctions() {
+        return resultSpecs.stream()
+                .map(ResultSpec::getFallbackFunction)
+                .filter(Optional::isPresent)
+                .map(Optional::get);
+    }
+
+    // Debugging only
+    final String getXpathSpec() {
+        return xpathSpec;
+    }
+
+    // Debugging only
+    final int getLineNumber() {
+        return lineNumber;
+    }
+
+    /** A rule whose path pattern is fully known when the rule is created. */
+    private static final class StaticRule extends Rule {
+        // The processed xpath specification yielding an xpath matching regular expression.
+        // This is only suitable for matching incoming xpaths and cannot be processed in any
+        // other way.
+        private final Pattern xpathPattern;
+
+        StaticRule(
+                CldrDataType dtdType,
+                String prefix,
+                Iterable<ResultSpec> specs,
+                String pathRegex,
+                String xpathSpec,
+                int lineNumber) {
+
+            super(dtdType, prefix, specs, xpathSpec, lineNumber);
+            this.xpathPattern = Pattern.compile(pathRegex);
+        }
+
+        @Override
+        Pattern getPathPattern(DynamicVars varLookupFn) {
+            // The pattern was compiled up front, so the lookup function is not needed.
+            return xpathPattern;
+        }
+    }
+
+    /** A rule whose path pattern can only be resolved once dynamic variables are known. */
+    private static final class DynamicRule extends Rule {
+        // The path regular expression in which dynamic variables are still unresolved. It
+        // is resolved, and compiled into a pattern, each time the path pattern is requested.
+        private final VarString varString;
+        // Maps the character of a dynamic variable to the CLDR path it stands for.
+        private final Function<Character, CldrPath> dynamicVarFn;
+
+        DynamicRule(
+                CldrDataType dtdType,
+                String prefix,
+                Iterable<ResultSpec> specs,
+                VarString varString,
+                Function<Character, CldrPath> varFn,
+                String xpathSpec,
+                int lineNumber) {
+
+            super(dtdType, prefix, specs, xpathSpec, lineNumber);
+            this.varString = checkNotNull(varString);
+            this.dynamicVarFn = checkNotNull(varFn);
+        }
+
+        @Override
+        Pattern getPathPattern(DynamicVars varLookupFn) {
+            // A variable's character is mapped to its CLDR path, and the path is then looked
+            // up using the given function to give the text substituted into the expression.
+            String pathRegex = varString.apply(dynamicVarFn.andThen(varLookupFn)).get();
+            return Pattern.compile(pathRegex);
+        }
+    }
+}
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.regex;
+
+import static com.google.common.base.CharMatcher.whitespace;
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.collect.ImmutableMap.toImmutableMap;
+import static com.google.common.collect.Maps.filterValues;
+import static com.google.common.collect.Maps.transformValues;
+import static java.util.function.Function.identity;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Optional;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.unicode.cldr.api.CldrDataType;
+import org.unicode.cldr.api.CldrPath;
+
+import com.google.common.base.Splitter;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.escape.CharEscaperBuilder;
+import com.google.common.escape.Escaper;
+
+/** Parser for rule specifications in the regex transformer configuration files. */
+final class RuleParser {
+    // Pattern to capture first two path elements (for the dtd type and path prefix).
+    private static final Pattern PATH_SPEC_PREFIX = Pattern.compile("//([^/]+)/([^/]+)/");
+
+    // Preprocessing replaces %X variables defined in the configuration file. This helps to
+    // keep the path specification a bit easier to read.
+    private static final Pattern VAR = Pattern.compile("^%([A-Z])=(.*)$");
+
+    // Multi-line rules start with " ; " for some optional amount of whitespace.
+    private static final Pattern RULE_PARTS_SEPARATOR = Pattern.compile("\\s*+;\\s*+");
+
+    // Splitter for the resource bundle / value declarations.
+    private static final Splitter RULE_PARTS_SPLITTER =
+            Splitter.on(RULE_PARTS_SEPARATOR).trimResults(whitespace()).omitEmptyStrings();
+
+    // Splitter for instruction name/expressions.
+    private static final Splitter INSTRUCTION_SPLITTER =
+            Splitter.on('=').trimResults(whitespace()).limit(2);
+
+    // Only '[',']' need escaping in path specifications (so we can write "foo[@bar="baz"]").
+    private static final Escaper SPECIAL_CHARS_ESCAPER =
+            new CharEscaperBuilder().addEscape('[', "\\[").addEscape(']', "\\]").toEscaper();
+
+    /**
+     * Parses a configuration file to create a sequence of transformation rules.
+     *
+     * @param configLines the lines of the configuration file.
+     * @param functions the functions which can be referenced by result specifications.
+     * @throws IllegalArgumentException if a line starting with '%' is not a valid "%X=value"
+     *     variable declaration.
+     */
+    static ImmutableList<Rule> parseConfig(
+            List<String> configLines, List<NamedFunction> functions) {
+        // Extract '%X' variable declarations in the first pass. A plain loop is used so that
+        // validation does not rely on a stream side effect and so that the offending line
+        // itself can be reported.
+        ImmutableMap.Builder<Character, String> varMap = ImmutableMap.builder();
+        for (String line : configLines) {
+            if (line.startsWith("%")) {
+                Matcher m = VAR.matcher(line);
+                checkArgument(m.matches(), "invalid argument declaration: %s", line);
+                varMap.put(m.group(1).charAt(0), m.group(2));
+            }
+        }
+        return new RuleParser(varMap.build(), functions).parseLines(configLines);
+    }
+
+    // Variables whose value is substituted directly when a specification is parsed.
+    private final ImmutableMap<Character, String> staticVarMap;
+    // Variables whose value starts with "//" and is parsed as a CLDR path.
+    private final ImmutableMap<Character, CldrPath> dynamicVarMap;
+    // The available functions, keyed by name.
+    private final ImmutableMap<String, NamedFunction> fnMap;
+
+    private RuleParser(ImmutableMap<Character, String> varMap, List<NamedFunction> functions) {
+        this.staticVarMap = ImmutableMap.copyOf(filterValues(varMap, s -> !s.startsWith("//")));
+        this.dynamicVarMap = ImmutableMap.copyOf(
+                transformValues(
+                        filterValues(varMap, s -> s.startsWith("//")),
+                        CldrPath::parseDistinguishingPath));
+        this.fnMap =
+                functions.stream().collect(toImmutableMap(NamedFunction::getName, identity()));
+    }
+
+    // Parses every rule (a line starting with "//" plus any continuation lines) in the
+    // configuration. Any failure is rethrown as a RuntimeException which identifies the line
+    // that was being parsed.
+    private ImmutableList<Rule> parseLines(List<String> configLines) {
+        List<Rule> rules = new ArrayList<>();
+        for (int lineIndex = 0; lineIndex < configLines.size(); lineIndex++) {
+            String line = configLines.get(lineIndex);
+            // Index of the line being parsed, so that the line number and the line text in an
+            // error message always refer to the same line (even for multi-line rules).
+            int errorIndex = lineIndex;
+            try {
+                if (line.startsWith("//")) {
+                    // Either it's "//xpath ; resource-bundle-path ; values"
+                    // Or "//xpath" with " ; resource-bundle-path ; values" on subsequent lines.
+                    int ruleLineNumber = lineIndex + 1;
+                    int xpathEnd = line.indexOf(";");
+                    String xpath;
+                    List<ResultSpec> specs = new ArrayList<>();
+                    if (xpathEnd != -1) {
+                        // Single line rule, extract result specification from trailing part.
+                        xpath = whitespace().trimFrom(line.substring(0, xpathEnd));
+                        // Keep leading " ; " in the transformation string since it matches the
+                        // multi-rule case and is handled the same.
+                        specs.add(parseResultSpec(line.substring(xpathEnd), lineIndex + 1));
+                    } else {
+                        xpath = line;
+                        while (++lineIndex < configLines.size()
+                                && RULE_PARTS_SEPARATOR
+                                        .matcher(configLines.get(lineIndex)).lookingAt()) {
+                            errorIndex = lineIndex;
+                            specs.add(parseResultSpec(configLines.get(lineIndex), lineIndex + 1));
+                        }
+                        // The loop above moved us past the last line of the rule, so readjust.
+                        lineIndex--;
+                        // From here on any failure relates to the xpath line of the rule.
+                        errorIndex = ruleLineNumber - 1;
+                    }
+                    rules.add(parseRule(xpath, specs, ruleLineNumber));
+                }
+            } catch (Exception e) {
+                throw new RuntimeException(
+                        String.format(
+                                "parse error at line %d: %s",
+                                errorIndex + 1,
+                                configLines.get(errorIndex)),
+                        e);
+            }
+        }
+        return ImmutableList.copyOf(rules);
+    }
+
+    // Parses one result specification, which is a resource bundle path followed by zero or
+    // more "name=expression" instructions, all separated by ';'.
+    private ResultSpec parseResultSpec(String spec, int lineNumber) {
+        // The result specifier still has leading separator (e.g. " ; /foo/bar/$1 ; value=$2"),
+        // but that's okay because the splitter ignores empty results.
+        List<String> rbPathAndInstructions = RULE_PARTS_SPLITTER.splitToList(spec);
+        String rbPathSpec = rbPathAndInstructions.get(0);
+
+        // An instruction without '=' is given an empty expression.
+        ImmutableMap<Instruction, VarString> instructions =
+                rbPathAndInstructions.stream()
+                        .skip(1)
+                        .map(INSTRUCTION_SPLITTER::splitToList)
+                        .collect(toImmutableMap(
+                                p -> Instruction.forId(p.get(0)),
+                                p -> VarString.of(p.size() > 1 ? p.get(1) : "", staticVarMap::get)));
+        return new ResultSpec(rbPathSpec, instructions, lineNumber, fnMap, dynamicVarMap::get);
+    }
+
+    // Creates the rule for an xpath specification and its result specifications.
+    private Rule parseRule(String xpathSpec, List<ResultSpec> resultSpecs, int lineNumber) {
+        // The escaped path is nearly a regular expression, but still contains '%X' variables.
+        String escapedPathSpec = SPECIAL_CHARS_ESCAPER.escape(xpathSpec);
+        Matcher m = PATH_SPEC_PREFIX.matcher(escapedPathSpec);
+        checkArgument(m.lookingAt(), "unexpected path spec: %s", escapedPathSpec);
+
+        // Extract the type and path prefix for rule grouping and fast rejection during matching.
+        CldrDataType dtdType = CldrDataType.forXmlName(m.group(1));
+        String pathPrefix = m.group(2);
+
+        // If the variable string contains a "dynamic" argument, it cannot be resolved yet and
+        // must result in a "dynamic" rule being created here (this is very rare though).
+        VarString varString = VarString.of(escapedPathSpec, staticVarMap::get);
+        Optional<String> resolved = varString.resolve();
+        // Don't turn this into a "map().orElse()" chain (despite what your IDE might suggest)
+        // because we don't want to create lots of unused dynamic rules!
+        return resolved.isPresent()
+                ? Rule.staticRule(
+                        dtdType, pathPrefix, resultSpecs, resolved.get(), xpathSpec, lineNumber)
+                : Rule.dynamicRule(
+                        dtdType, pathPrefix, resultSpecs, varString, dynamicVarMap::get, xpathSpec,
+                        lineNumber);
+    }
+}
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.regex;
+
+import static com.google.common.base.Preconditions.checkNotNull;
+import static com.google.common.base.Preconditions.checkState;
+
+import java.util.Optional;
+import java.util.function.Function;
+
+import com.google.common.base.CharMatcher;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.ImmutableSet;
+
+/**
+ * An immutable representation of a String with placeholders for variable substitution. A
+ * VarString can be "resolved" or "partially resolved" by providing a mapping from placeholder
+ * characters to strings, and any remaining unresolved variables are tracked. This is a very
+ * private bit of implementation detail with a far from ideal API, so it's probably best not to
+ * use it elsewhere without careful thought.
+ *
+ * <p>A placeholder is a '%' immediately followed by an upper-case ASCII letter (A-Z). The set of
+ * placeholder letters present in the string is computed once, when an instance is created via
+ * {@code of()}. Each call to {@code apply()} returns a new instance in which more letters may be
+ * bound to values; a letter that is already bound is never rebound. The string counts as fully
+ * resolved once the bound letters are exactly the placeholder letters found at creation.
+ */
+final class VarString {
+ private static final CharMatcher VAR_CHAR = CharMatcher.inRange('A', 'Z'); // Valid variable names.
+
+ static VarString of(String varString) { // Creates an instance with no variables bound yet.
+ ImmutableSet.Builder<Character> requiredChars = ImmutableSet.builder();
+ // Variable placeholders are any % followed by upper-case ASCII letter (A-Z).
+ // Other '%' chars are ignored.
+ for (int i = 0; i < varString.length() - 1; i++) { // A '%' in the last position is never examined.
+ if (varString.charAt(i) == '%') {
+ char c = varString.charAt(i + 1);
+ if (VAR_CHAR.matches(c)) {
+ requiredChars.add(c);
+ }
+ }
+ }
+ return new VarString(varString, requiredChars.build(), ImmutableMap.of());
+ }
+
+ static VarString of(String s, Function<Character, String> varFn) { // Same as of(s).apply(varFn).
+ return of(s).apply(varFn);
+ }
+
+ private final String varString; // The original, unsubstituted string.
+ private final ImmutableSet<Character> requiredChars; // Placeholder letters found in varString.
+ private final ImmutableMap<Character, String> varMap; // Values bound so far, keyed by letter.
+
+ private VarString( // Private: instances come from of() and apply(); no argument may be null.
+ String varString,
+ ImmutableSet<Character> requiredChars,
+ ImmutableMap<Character, String> varMap) {
+ this.varString = checkNotNull(varString);
+ this.requiredChars = checkNotNull(requiredChars);
+ this.varMap = checkNotNull(varMap);
+ }
+
+ /**
+ * Applies a variable function to produce a new, potentially resolved, VarString.
+ *
+ * <p>Only placeholder letters that are not already bound are passed to the function, so existing
+ * bindings are carried over unchanged. This instance is not modified.
+ *
+ * @param varFn maps a placeholder letter to its value, or to null to leave that letter unbound
+ * @return a new VarString holding the existing bindings plus any new ones
+ */
+ VarString apply(Function<Character, String> varFn) {
+ ImmutableMap.Builder<Character, String> newVarMap = ImmutableMap.builder();
+ newVarMap.putAll(this.varMap);
+ for (Character c : requiredChars) {
+ if (!varMap.containsKey(c)) {
+ // Allowed to return null if the function cannot resolve a variable.
+ String v = varFn.apply(c);
+ if (v != null) {
+ newVarMap.put(c, v);
+ }
+ }
+ }
+ return new VarString(varString, requiredChars, newVarMap.build());
+ }
+
+ /**
+ * Returns a resolved value if all variables are available for substitution.
+ *
+ * <p>Substitution is delegated to {@code RegexTransformer.substitute()} with '%' as the marker
+ * character. The lookup function falls back to "%" plus the character for anything not in the
+ * map; presumably this re-emits non-variable '%' sequences unchanged (TODO confirm against
+ * {@code substitute()}).
+ *
+ * @return the substituted string, or empty if any placeholder letter is still unbound
+ */
+ Optional<String> resolve() {
+ return varMap.keySet().equals(requiredChars)
+ ? Optional.of(
+ RegexTransformer.substitute(varString, '%', c -> varMap.getOrDefault(c, "%" + c)))
+ : Optional.empty();
+ }
+
+ /**
+ * Returns the resolved value or fails if not all variables are available.
+ *
+ * <p>Performs the same substitution as {@code resolve()}, for callers that already know every
+ * placeholder is bound.
+ *
+ * @throws IllegalStateException if any placeholder letter is still unbound
+ */
+ String get() {
+ checkState(varMap.keySet().equals(requiredChars), "unresolved variable string: %s", this);
+ return RegexTransformer.substitute(varString, '%', c -> varMap.getOrDefault(c, "%" + c));
+ }
+
+ @Override public String toString() { // Unsubstituted string, then ": ", then current bindings.
+ return varString + ": " + varMap;
+ }
+}
--- /dev/null
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
\ No newline at end of file
--- /dev/null
+# ldml2icu_locale.txt
+#
+# © 2016 and later: Unicode, Inc. and others.
+#
+# CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
+# For terms of use, see http://www.unicode.org/copyright.html
+#
+# Used by LdmlLocaleMapper.
+# Data-driven file for mapping LDML locale paths to ICU paths.
+# See ldml2icu_readme.txt for a detailed explanation of this file.
+
+# Variables
+# Attribute value
+%A=[^"']++
+# Word
+%W=[\w\-]++
+# Greedy word match
+%G=[\w\-]+
+# Number match
+%N=\d++
+# The default numbering system to be used.
+%D=//ldml/numbers/defaultNumberingSystem
+
+# Main locale data
+
+# Aliases
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/alias[@source="locale"][@path="../calendar[@type='(%A)']"]
+ ; /calendar/$1lo ; values=/LOCALE/calendar/$2
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/alias[@source="locale"][@path="../../calendar[@type='(%A)']/dayPeriods"]
+ ; /calendar/$1/AmPmMarkers:alias ; values=/LOCALE/calendar/$2/AmPmMarkers
+ ; /calendar/$1/AmPmMarkersNarrow:alias ; values=/LOCALE/calendar/$2/AmPmMarkersNarrow
+ ; /calendar/$1/NoonMarker:alias ; values=/LOCALE/calendar/$2/NoonMarker
+ ; /calendar/$1/NoonMarkerNarrow:alias ; values=/LOCALE/calendar/$2/NoonMarkerNarrow
+
+//ldml/dates/calendars/calendar[@type="gregorian"]/dayPeriods/dayPeriodContext[@type="format"]/dayPeriodWidth[@type="wide"]/alias[@source="locale"][@path="../dayPeriodWidth[@type='abbreviated']"]
+ ; /calendar/gregorian/AmPmMarkers:alias ; values=/LOCALE/calendar/gregorian/AmPmMarkersAbbr
+//ldml/dates/calendars/calendar[@type="gregorian"]/dayPeriods/dayPeriodContext[@type="format"]/dayPeriodWidth[@type="narrow"]/alias[@source="locale"][@path="../dayPeriodWidth[@type='abbreviated']"]
+ ; /calendar/gregorian/AmPmMarkersNarrow:alias ; values=/LOCALE/calendar/gregorian/AmPmMarkersAbbr
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/(eras|quarters|cyclicNameSets|monthPatterns)/alias[@source="locale"][@path="../../%W[@type='(%A)']/%W"]
+ ; /calendar/$1/$2:alias ; values=/LOCALE/calendar/$3/$2
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/eras/eraNarrow/alias[@source="locale"][@path="../eraAbbr"]
+ ; /calendar/$1/eras/narrow:alias ; values=/LOCALE/calendar/$1/eras/abbreviated
+//ldml/dates/calendars/calendar[@type="(%A)"]/eras/eraNames/alias[@source="locale"][@path="../eraAbbr"]
+ ; /calendar/$1/eras/wide:alias ; values=/LOCALE/calendar/$1/eras/abbreviated
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/(cyclicNameSet)s/\2[@type="(%W)"]/alias[@source="locale"][@path="../\2[@type='(%A)']"]
+ ; /calendar/$1/$2s/$3:alias ; values=/LOCALE/calendar/$1/$2s/$4
+//ldml/dates/calendars/calendar[@type="(%A)"]/(cyclicNameSet)s/\2[@type="(%W)"]/(cyclicName)Context[@type="(%W)"]/\4Width[@type="(%W)"]/alias[@source="locale"][@path="../../../\4Set[@type='(%A)']/\4Context[@type='(%A)']/\4Width[@type='(%A)']"]
+ ; /calendar/$1/$2s/$3/$5/$6:alias ; values=/LOCALE/calendar/$1/$2s/$7/$8/$9
+//ldml/dates/calendars/calendar[@type="(%A)"]/(cyclicNameSet)s/\2[@type="(%W)"]/(cyclicName)Context[@type="(%W)"]/\4Width[@type="(%W)"]/alias[@source="locale"][@path="../\4Width[@type='(%A)']"]
+ ; /calendar/$1/$2s/$3/$5/$6:alias ; values=/LOCALE/calendar/$1/$2s/$3/$5/$7
+//ldml/dates/calendars/calendar[@type="(%A)"]/(cyclicNameSet|monthPattern|quarter)s/\2Context[@type="(%W)"]/\2Width[@type="(%W)"]/alias[@source="locale"][@path="../../\2Context[@type='(%W)']/\2Width[@type='(%A)']"]
+ ; /calendar/$1/$2s/$3/$4:alias ; values=/LOCALE/calendar/$1/$2s/$5/$6
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/(day|month)s/alias[@source="locale"][@path="../../%W[@type='(%A)']/%W"]
+ ; /calendar/$1/$2Names:alias ; values=/LOCALE/calendar/$3/$2Names
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/(day|month)s/\2Context[@type="(%W)"]/\2Width[@type="(%W)"]/alias[@source="locale"][@path="../\2Width[@type='(%A)']"]
+ ; /calendar/$1/$2Names/$3/$4:alias ; values=/LOCALE/calendar/$1/$2Names/$3/$5
+//ldml/dates/calendars/calendar[@type="(%A)"]/(day|month)s/\2Context[@type="(%W)"]/\2Width[@type="(%W)"]/alias[@source="locale"][@path="../../\2Context[@type='(%W)']/\2Width[@type='(%A)']"]
+ ; /calendar/$1/$2Names/$3/$4:alias ; values=/LOCALE/calendar/$1/$2Names/$5/$6
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/(monthPattern|quarter)s/\2Context[@type="(%W)"]/\2Width[@type="(%W)"]/alias[@source="locale"][@path="../\2Width[@type='(%A)']"]
+ ; /calendar/$1/$2s/$3/$4:alias ; values=/LOCALE/calendar/$1/$2s/$3/$5
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/dateFormats/alias[@source="locale"][@path="../../calendar[@type='(%A)']/dateFormats"]
+ ; /calendar/$1/DateTimePatterns:alias ; values=/LOCALE/calendar/$2/DateTimePatterns
+//ldml/dates/calendars/calendar[@type="(%A)"]/dateTimeFormats/alias[@source="locale"][@path="../../calendar[@type='(%A)']/dateTimeFormats"]
+ ; /calendar/$1/availableFormats:alias ; values=/LOCALE/calendar/$2/availableFormats
+ ; /calendar/$1/appendItems:alias ; values=/LOCALE/calendar/$2/appendItems
+ ; /calendar/$1/intervalFormats:alias ; values=/LOCALE/calendar/$2/intervalFormats
+//ldml/dates/calendars/calendar[@type="(%A)"]/dateTimeFormats/(availableFormats|appendItems|intervalFormats)/alias[@source="locale"][@path="../../../calendar[@type='(%A)']/dateTimeFormats/\2"]
+ ; /calendar/$1/$2:alias ; values=/LOCALE/calendar/$3/$2
+
+//ldml/units/unitLength[@type="long"]/alias[@source="locale"][@path="../unitLength[@type='short']"]
+ ; /units:alias ; values=/LOCALE/unitsShort
+//ldml/units/unitLength[@type="narrow"]/alias[@source="locale"][@path="../unitLength[@type='short']"]
+ ; /unitsNarrow:alias ; values=/LOCALE/unitsShort
+
+//ldml/listPatterns/listPattern[@type="(%A)"]/alias[@source="locale"][@path="../listPattern"]
+ ; /listPattern/$1/start:alias ; values=/LOCALE/listPattern/standard/start
+ ; /listPattern/$1/middle:alias ; values=/LOCALE/listPattern/standard/middle
+ ; /listPattern/$1/end:alias ; values=/LOCALE/listPattern/standard/end
+ ; /listPattern/$1/2:alias ; values=/LOCALE/listPattern/standard/2
+//ldml/listPatterns/listPattern[@type="(%A)"]/alias[@source="locale"][@path="../listPattern[@type='(%A)']"]
+ ; /listPattern/$1/start:alias ; values=/LOCALE/listPattern/$2/start
+ ; /listPattern/$1/middle:alias ; values=/LOCALE/listPattern/$2/middle
+ ; /listPattern/$1/end:alias ; values=/LOCALE/listPattern/$2/end
+ ; /listPattern/$1/2:alias ; values=/LOCALE/listPattern/$2/2
+
+//ldml/numbers/currencyFormats[@numberSystem="(%A)"]/currencyFormatLength/currencyFormat[@type="accounting"]/alias[@source="locale"][@path="../(%W)[@type='standard']"] ; /NumberElements/$1/patterns/accountingFormat:alias ; values=/LOCALE/NumberElements/$1/patterns/$2
+
+# Characters
+
+//ldml/characters/exemplarCharacters[@type="auxiliary"] ; /AuxExemplarCharacters
+//ldml/characters/exemplarCharacters[@type="currencySymbol"] ; /ExemplarCharactersCurrency
+//ldml/characters/exemplarCharacters[@type="index"] ; /ExemplarCharactersIndex
+//ldml/characters/exemplarCharacters[@type="punctuation"] ; /ExemplarCharactersPunctuation
+//ldml/characters/exemplarCharacters[@type="numbers"] ; /ExemplarCharactersNumbers
+//ldml/characters/exemplarCharacters ; /ExemplarCharacters
+
+//ldml/characters/ellipsis[@type="(%A)"] ; /Ellipsis/$1
+//ldml/characters/moreInformation ; /MoreInformation
+//ldml/characters/special/icu:scripts/icu:script[@type="%N"] ; /LocaleScript
+
+//ldml/characters/parseLenients[@scope="(%A)"][@level="(%A)"]/parseLenient[@sample="%A"] ; /parse/$1/$2
+
+# Defaults
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/(monthPattern)s/\2Context[@type="(%W)"]/\2Width[@type="(%W)"]/\2[@type="(%W)"]
+ ; /calendar/$1/$2s/$3/$4/$5
+
+# Dates
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/cyclicNameSets/cyclicNameSet[@type="(%A)"]/cyclicNameContext[@type="(%A)"]/cyclicNameWidth[@type="(%A)"]/cyclicName[@type="(%A)"]
+ ; /calendar/$1/cyclicNameSets/$2/$3/$4 ;
+
+# ---- /calendar/xxx/DateTimePatterns
+# Rules are split to force manual ordering within the array produced by them (they share the same output path).
+#
+# Note that (like many other places) the uncaptured "type" attributes are just expected to be "standard", and the %A
+# variable is only used to save a bit of space. The final output array has 3 groups ("time" -> "date" -> "date-time")
+# each with 4 elements based on the pattern length ("full" -> "long" -> "medium" -> "short") giving 12 patterns in
+# total.
+#
+# However due to an awful hack, there end up being 13 values in the array, with the medium date-time value being
+# duplicated at index 8. However this hack is done later, because the regex transformer does not permit the same
+# CLDR path to emit values in different places in an array.
+
+# Time patterns (4 x values)
+//ldml/dates/calendars/calendar[@type="(%A)"]/(timeFormat)s/\2Length[@type="(%A)"]/\2[@type="%A"]/pattern[@type="%A"]
+ ; /calendar/$1/DateTimePatterns
+
+# Date patterns (4 x values)
+#
+# This is a weird edge case. When the number attribute is present in the xpath, its value needs to be grouped
+# together with the xpath value in its own special array, which is treated like just another value in
+# /DateTimePatterns. The group keyword is used here to specify that values from the same xpath should be grouped
+# into their own separate array. Since each possible pattern length can have patterns with and without the number
+# attribute, we must explicitly split the rules to enforce correct output order.
+#
+# So far (Jan 2014), this only happens in the Chinese calendar for ja/zh/zh_Hant and the Hebrew calendar for he,
+# and all calendars for haw (which has numbers="M=romanlow").
+//ldml/dates/calendars/calendar[@type="(%A)"]/(dateFormat)s/\2Length[@type="(full)"]/\2[@type="%A"]/pattern[@type="%A"]
+ ; /calendar/$1/DateTimePatterns
+//ldml/dates/calendars/calendar[@type="(%A)"]/(dateFormat)s/\2Length[@type="(full)"]/\2[@type="%A"]/pattern[@type="%A"][@numbers="(%A)"]
+ ; /calendar/$1/DateTimePatterns ; values="{value}" $4 ; group
+//ldml/dates/calendars/calendar[@type="(%A)"]/(dateFormat)s/\2Length[@type="(long)"]/\2[@type="%A"]/pattern[@type="%A"]
+ ; /calendar/$1/DateTimePatterns
+//ldml/dates/calendars/calendar[@type="(%A)"]/(dateFormat)s/\2Length[@type="(long)"]/\2[@type="%A"]/pattern[@type="%A"][@numbers="(%A)"]
+ ; /calendar/$1/DateTimePatterns ; values="{value}" $4 ; group
+//ldml/dates/calendars/calendar[@type="(%A)"]/(dateFormat)s/\2Length[@type="(medium)"]/\2[@type="%A"]/pattern[@type="%A"]
+ ; /calendar/$1/DateTimePatterns
+//ldml/dates/calendars/calendar[@type="(%A)"]/(dateFormat)s/\2Length[@type="(medium)"]/\2[@type="%A"]/pattern[@type="%A"][@numbers="(%A)"]
+ ; /calendar/$1/DateTimePatterns ; values="{value}" $4 ; group
+//ldml/dates/calendars/calendar[@type="(%A)"]/(dateFormat)s/\2Length[@type="(short)"]/\2[@type="%A"]/pattern[@type="%A"]
+ ; /calendar/$1/DateTimePatterns
+//ldml/dates/calendars/calendar[@type="(%A)"]/(dateFormat)s/\2Length[@type="(short)"]/\2[@type="%A"]/pattern[@type="%A"][@numbers="(%A)"]
+ ; /calendar/$1/DateTimePatterns ; values="{value}" $4 ; group
+
+# DateTime patterns (4 x values)
+//ldml/dates/calendars/calendar[@type="(%A)"]/(dateTimeFormat)s/\2Length[@type="(%A)"]/\2[@type="%A"]/pattern[@type="%A"]
+ ; /calendar/$1/DateTimePatterns
+# ----
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/dateTimeFormats/appendItems/appendItem[@request="(%A)"] ; /calendar/$1/appendItems/$2
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/dateTimeFormats/availableFormats/dateFormatItem[@id="(%A)"] ; /calendar/$1/availableFormats/$2
+//ldml/dates/calendars/calendar[@type="(%A)"]/dateTimeFormats/availableFormats/dateFormatItem[@id="(%A)"][@count="(%A)"] ; /calendar/$1/availableFormats/$2/$3
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/dateTimeFormats/intervalFormats/intervalFormatItem[@id="(%A)"]/greatestDifference[@id="(%A)"] ; /calendar/$1/intervalFormats/$2/$3
+//ldml/dates/calendars/calendar[@type="(%A)"]/dateTimeFormats/intervalFormats/intervalFormatFallback ; /calendar/$1/intervalFormats/fallback
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="format"]/dayPeriodWidth[@type="wide"]/dayPeriod[@type="(am|pm)"][@alt="(%A)"] ; /calendar/$1/AmPmMarkers%$3
+//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="format"]/dayPeriodWidth[@type="narrow"]/dayPeriod[@type="(am|pm)"][@alt="(%A)"] ; /calendar/$1/AmPmMarkersNarrow%$3
+//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="format"]/dayPeriodWidth[@type="abbreviated"]/dayPeriod[@type="(am|pm)"][@alt="(%A)"] ; /calendar/$1/AmPmMarkersAbbr%$3
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="format"]/dayPeriodWidth[@type="wide"]/dayPeriod[@type="(am|pm)"] ; /calendar/$1/AmPmMarkers
+//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="format"]/dayPeriodWidth[@type="abbreviated"]/dayPeriod[@type="(am|pm)"] ; /calendar/$1/AmPmMarkersAbbr
+//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="format"]/dayPeriodWidth[@type="narrow"]/dayPeriod[@type="(am|pm)"] ; /calendar/$1/AmPmMarkersNarrow
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="(stand-alone)"]/dayPeriodWidth[@type="(%A)"]/dayPeriod[@type="(am|pm)"][@alt="(%A)"] ; /calendar/$1/dayPeriod/$2/$3/$4%$5
+//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="(stand-alone)"]/dayPeriodWidth[@type="(%A)"]/dayPeriod[@type="(am|pm)"] ; /calendar/$1/dayPeriod/$2/$3/$4
+//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="(%A)"]/dayPeriodWidth[@type="(%A)"]/dayPeriod[@type="(?!am|pm)(%A)"][@alt="(%A)"] ; /calendar/$1/dayPeriod/$2/$3/$4%$5
+//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="(%A)"]/dayPeriodWidth[@type="(%A)"]/dayPeriod[@type="(?!am|pm)(%A)"] ; /calendar/$1/dayPeriod/$2/$3/$4
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/eras/eraNarrow/era[@type="(%A)"][@alt="(%A)"] ; /calendar/$1/eras/narrow%$3
+//ldml/dates/calendars/calendar[@type="(%A)"]/eras/eraAbbr/era[@type="(%A)"][@alt="(%A)"] ; /calendar/$1/eras/abbreviated%$3
+//ldml/dates/calendars/calendar[@type="(%A)"]/eras/eraNames/era[@type="(%A)"][@alt="(%A)"] ; /calendar/$1/eras/wide%$3
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/eras/eraNarrow/era[@type="(%A)"] ; /calendar/$1/eras/narrow
+//ldml/dates/calendars/calendar[@type="(%A)"]/eras/eraAbbr/era[@type="(%A)"] ; /calendar/$1/eras/abbreviated
+//ldml/dates/calendars/calendar[@type="(%A)"]/eras/eraNames/era[@type="(%A)"] ; /calendar/$1/eras/wide
+
+# Leap year names go after other month names.
+# "yeartype" is an #IMPLIED attribute in the DTD and it should implicitly default to "standard".
+# In practice "standard" is never explicitly given, but it could be (so must match it here).
+//ldml/dates/calendars/calendar[@type="(%A)"]/(day|month)s/%W[@type="(%A)"]/%W[@type="(%A)"]/%W[@type="(%A)"](?:[@yeartype="standard"])? ; /calendar/$1/$2Names/$3/$4
+//ldml/dates/calendars/calendar[@type="(%A)"]/(day|month)s/%W[@type="(%A)"]/%W[@type="(%A)"]/%W[@type="(%A)"][@yeartype="leap"] ; /calendar/$1/$2Names/$3/$4
+
+//ldml/dates/calendars/calendar[@type="(%A)"]/(quarters)/%W[@type="(%A)"]/%W[@type="(%A)"]/%W[@type="%A"] ; /calendar/$1/$2/$3/$4
+
+//ldml/dates/fields/field[@type="(%A)"]/displayName[@alt="(%A)"] ; /fields/$1/dn%$2
+//ldml/dates/fields/field[@type="(%A)"]/displayName ; /fields/$1/dn
+//ldml/dates/fields/field[@type="(%A)"]/relative[@type="(%A)"] ; /fields/$1/relative/"$2"
+//ldml/dates/fields/field[@type="(%A)"]/relativePeriod ; /fields/$1/relativePeriod
+//ldml/dates/fields/field[@type="(%A)"]/relativeTime[@type="(%A)"]/relativeTimePattern[@count="(%A)"] ; /fields/$1/relativeTime/$2/$3
+
+//ldml/dates/fields/field[@type="(%A)"]/alias[@source="locale"][@path="../field[@type='(%A)']"] ; /fields/$1:alias ; values=/LOCALE/fields/$2
+
+//ldml/dates/timeZoneNames/regionFormat[@type="daylight"] ; /zoneStrings/regionFormatDaylight
+//ldml/dates/timeZoneNames/regionFormat[@type="standard"] ; /zoneStrings/regionFormatStandard
+//ldml/dates/timeZoneNames/(%GFormat) ; /zoneStrings/$1
+
+//ldml/dates/timeZoneNames/metazone[@type="(%A)"]/(\w)%W/(\w)%W ; /zoneStrings/"meta:$1"/$2$3
+
+//ldml/dates/timeZoneNames/zone[@type="(%W)/(%W)"]/exemplarCity[@alt="(%A)"] ; /zoneStrings/"$1:$2"/ec%$3
+//ldml/dates/timeZoneNames/zone[@type="(%W)/(%W)"]/exemplarCity ; /zoneStrings/"$1:$2"/ec
+//ldml/dates/timeZoneNames/zone[@type="(%W)/(%W)"]/(\w)%W/(\w)%W ; /zoneStrings/"$1:$2"/$3$4
+//ldml/dates/timeZoneNames/zone[@type="(%W)/(%W)/(%W)"]/exemplarCity[@alt="(%A)"] ; /zoneStrings/"$1:$2:$3"/ec%$4
+//ldml/dates/timeZoneNames/zone[@type="(%W)/(%W)/(%W)"]/exemplarCity ; /zoneStrings/"$1:$2:$3"/ec
+
+# Locale Display Names
+
+//ldml/localeDisplayNames/codePatterns/codePattern[@type="(%A)"] ; /codePatterns/$1
+//ldml/localeDisplayNames/annotationPatterns/annotationPattern[@type="(%A)"] ; /codePatterns/$1
+
+//ldml/localeDisplayNames/keys/key[@type="(%A)"] ; /Keys/$1
+
+//ldml/localeDisplayNames/languages/language[@type="(%A)"][@alt="(%A)"] ; /Languages%$2/$1
+//ldml/localeDisplayNames/languages/language[@type="(%A)"] ; /Languages/$1
+
+//ldml/localeDisplayNames/localeDisplayPattern/localeKeyTypePattern ; /localeDisplayPattern/keyTypePattern
+//ldml/localeDisplayNames/localeDisplayPattern/localePattern ; /localeDisplayPattern/pattern
+//ldml/localeDisplayNames/localeDisplayPattern/localeSeparator ; /localeDisplayPattern/separator
+
+//ldml/localeDisplayNames/measurementSystemNames/measurementSystemName[@type="(%A)"] ; /measurementSystemNames/$1
+
+//ldml/localeDisplayNames/scripts/script[@type="(%A)"][@alt="(%A)"] ; /Scripts%$2/$1
+//ldml/localeDisplayNames/scripts/script[@type="(%A)"] ; /Scripts/$1
+
+//ldml/localeDisplayNames/territories/territory[@type="(%A)"][@alt="(%A)"] ; /Countries%$2/$1
+//ldml/localeDisplayNames/territories/territory[@type="(%A)"] ; /Countries/$1
+
+//ldml/localeDisplayNames/transformNames/transformName[@type="(%W)"] ; /transformNames/$1
+
+//ldml/localeDisplayNames/types/type[@key="(%A)"][@type="(%A)"][@alt="(%A)"] ; /Types%$3/$1/$2
+//ldml/localeDisplayNames/types/type[@key="(%A)"][@type="(%A)"] ; /Types/$1/$2
+
+//ldml/localeDisplayNames/variants/variant[@type="(%A)"][@alt="(%A)"] ; /Variants%$2/$1
+//ldml/localeDisplayNames/variants/variant[@type="(%A)"] ; /Variants/$1
+
+# Numbers
+
+//ldml/numbers/currencies/currency[@type="(%A)"]/displayName[@count="(%A)"] ; /CurrencyPlurals/$1/$2
+
+//ldml/numbers/currencies/currency[@type="(%W)"]/symbol[@alt="(%A)"] ; /Currencies%$2/$1
+
+# ---- /Currencies/XXX bundles
+# Ordering of rules is critical here since they write into the same resource bundle path and the
+# last 3 values are grouped together as a single value (via the special <FIFO> hidden label).
+#
+# Note that the <FIFO> label is needed here (not the "group" instruction) because the grouped
+# values must be seen as having a resource bundle path that is a child of the "/Currencies/$1"
+# path. This is so that the grouped values only appear when one of them is present rather than
+# whenever any of the other values in the main resource bundle path exist.
+#
+# Due to the optional nature of the final sub-array in the bundle, it would be very hard to ever
+# add more elements after it.
+//ldml/numbers/currencies/currency[@type="(%W)"]/symbol
+ ; /Currencies/$1 ; fallback=$1
+//ldml/numbers/currencies/currency[@type="(%W)"]/displayName
+ ; /Currencies/$1 ; fallback=$1
+//ldml/numbers/currencies/currency[@type="(%W)"]/pattern[@type="standard"]
+ ; /Currencies/$1/<FIFO> ; fallback=//ldml/numbers/currencyFormats[@numberSystem="%D"]/currencyFormatLength/currencyFormat[@type="standard"]/pattern[@type="standard"]
+//ldml/numbers/currencies/currency[@type="(%W)"]/decimal
+ ; /Currencies/$1/<FIFO> ; fallback=//ldml/numbers/symbols[@numberSystem="%D"]/decimal
+//ldml/numbers/currencies/currency[@type="(%W)"]/group
+ ; /Currencies/$1/<FIFO> ; fallback=//ldml/numbers/symbols[@numberSystem="%D"]/group
+# ----
+
+//ldml/numbers/currencyFormats[@numberSystem="%D"]/currencySpacing/(%W)/(%W) ; /currencySpacing/$1/$2
+//ldml/numbers/currencyFormats[@numberSystem="%D"]/unitPattern[@count="(%W)"] ; /CurrencyUnitPatterns/$1
+
+//ldml/numbers/defaultNumberingSystem[@alt="(%A)"] ; /NumberElements/default_$1
+//ldml/numbers/defaultNumberingSystem ; /NumberElements/default
+//ldml/numbers/minimumGroupingDigits ; /NumberElements/minimumGroupingDigits
+//ldml/numbers/otherNumberingSystems/(%W) ; /NumberElements/$1
+
+//ldml/numbers/symbols[@numberSystem="(%A)"]/(%W) ; /NumberElements/$1/symbols/$2
+//ldml/numbers/(%GFormat)s[@numberSystem="(%W)"]/\1Length/\1[@type="standard"]/pattern[@type="standard"] ; /NumberElements/$2/patterns/$1
+//ldml/numbers/currencyFormats[@numberSystem="(%W)"]/currencyFormatLength/currencyFormat[@type="accounting"]/pattern[@type="standard"] ; /NumberElements/$1/patterns/accountingFormat
+//ldml/numbers/currencyFormats[@numberSystem="(%W)"]/currencyFormatLength[@type="short"]/currencyFormat[@type="standard"]/pattern[@type="(%N)"][@count="(%W)"] ; /NumberElements/$1/patternsShort/currencyFormat/$2/$3
+//ldml/numbers/decimalFormats[@numberSystem="(%W)"]/decimalFormatLength[@type="short"]/decimalFormat[@type="standard"]/pattern[@type="(%N)"][@count="(%W)"] ; /NumberElements/$1/patternsShort/decimalFormat/$2/$3
+//ldml/numbers/decimalFormats[@numberSystem="(%W)"]/decimalFormatLength[@type="long"]/decimalFormat[@type="standard"]/pattern[@type="(%N)"][@count="(%W)"] ; /NumberElements/$1/patternsLong/decimalFormat/$2/$3
+
+//ldml/numbers/miscPatterns[@numberSystem="(%W)"]/pattern[@type="(%W)"] ; /NumberElements/$1/miscPatterns/$2
+//ldml/numbers/minimalPairs/ordinalMinimalPairs[@ordinal="(%A)"] ; /NumberElements/minimalPairs/ordinal/$1
+//ldml/numbers/minimalPairs/pluralMinimalPairs[@count="(%A)"] ; /NumberElements/minimalPairs/plural/$1
+
+# Misc
+
+# Ordering of rules is critical here since they write into the same resource bundle path.
+//ldml/contextTransforms/contextTransformUsage[@type="(%W)"]/contextTransform[@type="uiListOrMenu"] ; /contextTransforms/$1:intvector ; values=&context_transform_index({value}) ; fallback=0
+//ldml/contextTransforms/contextTransformUsage[@type="(%W)"]/contextTransform[@type="stand-alone"] ; /contextTransforms/$1:intvector ; values=&context_transform_index({value}) ; fallback=0
+
+//ldml/delimiters/(%W) ; /delimiters/$1
+
+//ldml/layout/orientation/(%G)Order ; /layout/$1s
+
+//ldml/listPatterns/listPattern/listPatternPart[@type="(%A)"] ; /listPattern/standard/$1
+//ldml/listPatterns/listPattern[@type="(%A)"]/listPatternPart[@type="(%A)"] ; /listPattern/$1/$2
+
+//ldml/units/unitLength[@type="narrow"]/unit[@type="(\w++)-(%A)"]/displayName ; /unitsNarrow/$1/$2/dnam
+//ldml/units/unitLength[@type="short"]/unit[@type="(\w++)-(%A)"]/displayName ; /unitsShort/$1/$2/dnam
+//ldml/units/unitLength[@type="long"]/unit[@type="(\w++)-(%A)"]/displayName ; /units/$1/$2/dnam
+
+//ldml/units/unitLength[@type="narrow"]/unit[@type="(\w++)-(%A)"]/unitPattern[@count="(%A)"] ; /unitsNarrow/$1/$2/$3
+//ldml/units/unitLength[@type="short"]/unit[@type="(\w++)-(%A)"]/unitPattern[@count="(%A)"] ; /unitsShort/$1/$2/$3
+//ldml/units/unitLength[@type="long"]/unit[@type="(\w++)-(%A)"]/unitPattern[@count="(%A)"] ; /units/$1/$2/$3
+
+//ldml/units/unitLength[@type="narrow"]/compoundUnit[@type="(%A)"]/compoundUnitPattern ; /unitsNarrow/compound/$1
+//ldml/units/unitLength[@type="short"]/compoundUnit[@type="(%A)"]/compoundUnitPattern ; /unitsShort/compound/$1
+//ldml/units/unitLength[@type="long"]/compoundUnit[@type="(%A)"]/compoundUnitPattern ; /units/compound/$1
+
+//ldml/units/unitLength[@type="narrow"]/coordinateUnit/displayName ; /unitsNarrow/coordinate/dnam
+//ldml/units/unitLength[@type="short"]/coordinateUnit/displayName ; /unitsShort/coordinate/dnam
+//ldml/units/unitLength[@type="long"]/coordinateUnit/displayName ; /units/coordinate/dnam
+
+//ldml/units/unitLength[@type="narrow"]/coordinateUnit/coordinateUnitPattern[@type="(%A)"] ; /unitsNarrow/coordinate/$1
+//ldml/units/unitLength[@type="short"]/coordinateUnit/coordinateUnitPattern[@type="(%A)"] ; /unitsShort/coordinate/$1
+//ldml/units/unitLength[@type="long"]/coordinateUnit/coordinateUnitPattern[@type="(%A)"] ; /units/coordinate/$1
+
+//ldml/units/unitLength[@type="narrow"]/unit[@type="(\w++)-(%A)"]/perUnitPattern ; /unitsNarrow/$1/$2/per
+//ldml/units/unitLength[@type="short"]/unit[@type="(\w++)-(%A)"]/perUnitPattern ; /unitsShort/$1/$2/per
+//ldml/units/unitLength[@type="long"]/unit[@type="(\w++)-(%A)"]/perUnitPattern ; /units/$1/$2/per
+
+//ldml/units/durationUnit[@type="(%A)"]/durationUnitPattern ; /durationUnits/$1
+
+//ldml/units/unitLength[@type="narrow"]/unit[@type="(\w++)-(%A)"]/alias[@source="locale"][@path="../unit[@type='(\w++)-(%A)']"] ; /unitsNarrow/$1/$2:alias ; values=/LOCALE/unitsNarrow/$3/$4
+//ldml/units/unitLength[@type="short"]/unit[@type="(\w++)-(%A)"]/alias[@source="locale"][@path="../unit[@type='(\w++)-(%A)']"] ; /unitsShort/$1/$2:alias ; values=/LOCALE/unitsShort/$3/$4
+//ldml/units/unitLength[@type="long"]/unit[@type="(\w++)-(%A)"]/alias[@source="locale"][@path="../unit[@type='(\w++)-(%A)']"] ; /units/$1/$2:alias ; values=/LOCALE/units/$3/$4
+
+//ldml/characterLabels/characterLabelPattern[@type="(%A)"][@count="(%A)"] ; /characterLabelPattern/$1/$2
+//ldml/characterLabels/characterLabelPattern[@type="(%A)"] ; /characterLabelPattern/$1
+//ldml/characterLabels/characterLabel[@type="(%A)"] ; /characterLabel/$1
--- /dev/null
+# README for configuration files used by org.unicode.icu.tool.cldrtoicu.regex.RegexTransformer.
+#
+# © 2019 and later: Unicode, Inc. and others.
+#
+# CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
+# For terms of use, see http://www.unicode.org/copyright.html
+
+======
+Basics
+======
+
+The RegexTransformer class converts CLDR paths and values to ICU Resource Bundle paths
+and values, based on a set of transformation rules typically loaded from a text file
+(e.g. ldml2icu_locale.txt).
+
+The basic format of transformation rules is:
+ <path-specification> ; <resource-bundle-specification> [; <instruction>=<argument>]*
+
+A simple example of a transformation rule is:
+
+ //ldml/localeDisplayNames/keys/key[@type="(%A)"] ; /Keys/$1
+
+which transforms CLDR values whose path matches the path specification, and emits:
+* A resource bundle path "/Keys/xx", where 'xx' is the captured type attribute.
+* A resource bundle value, which is just the CLDR value's base value.
+
+A path specification can be thought of as a regular expression which matches the CLDR
+path and can capture some element names or attribute values; however unlike a regular
+expression, the '[',']' characters are treated as literals, similar to XPath expressions.
+
+If a single CLDR value should produce more than one resource bundle path/value, then
+it should be written:
+
+ <path-specification>
+ ; <resource-bundle-1-specification> [; <instruction> ]*
+ ; <resource-bundle-2-specification> [; <instruction> ]*
+
+=====================
+Argument Substitution
+=====================
+
+Before a rule can be matched, any %-variables must be substituted. These are defined
+in the same configuration file as the rules, and look something like:
+ %W=[\w\-]++
+or:
+ %D=//ldml/numbers/defaultNumberingSystem
+
+The first case can be thought of as just a snippet of regular expression (in this case
+something that matches hyphen separated words) and, importantly, here '[' and ']' are
+treated as regular expression metacharacters. These arguments are static and will be
+substituted exactly as-is into the regular expression to be used for matching.
+
+The second case (used exactly once) is a dynamic argument which references a CLDR value
+in the set of data being transformed. This is simply indicated by the fact that it starts
+with '//'. This path is resolved and the value is substituted just prior to matching.
+
+Variable names are limited to a single upper-case letter (A-Z).
+
+===========================
+Implicit Argument Splitting
+===========================
+
+This is a (somewhat non-obvious) mechanism which allows for a single rule to generate
+multiple results from a single input path when an argument is a list of tokens.
+
+Consider the rule:
+
+//supplementalData/timeData/hours[@allowed="(%W)"][@preferred="(%W)"][@regions="(%W)"]
+ ; /timeData/$3/allowed ; values=$1
+ ; /timeData/$3/preferred ; values=$2
+
+where the "regions" attribute (which is captured as '$3') contains a whitespace separated
+list of region codes (e.g. "US GB AU NZ"). In this case the rule is applied once for each
+region, producing paths such as "/timeData/US/allowed" or "/timeData/NZ/preferred". Note
+that there is no explicit instruction to do this, it just happens.
+
+The rule is that the first unquoted argument in the resource bundle path is always treated
+as splittable.
+
+To suppress this behaviour, the argument must be quoted (e.g. /timeData/"$3"/allowed). Now,
+if there were another following unquoted argument, that would become implicitly splittable
+(but only one argument is ever splittable).
+
+============
+Instructions
+============
+
+Additional instructions can be supplied to control value transformation and specify fallback
+values. The set of instructions is:
+* values: The most common instruction which defines how values are transformed.
+* fallback: Defines a fallback value to be used if this rule was not matched.
+
+There are two other special case instructions which should (if at all possible) not be used,
+and might be removed at some point:
+* group: Causes values to be grouped as sub-arrays for very specific use cases
+ (prefer using "Hidden Labels" where possible).
+* base_xpath: Allows deduplication of results between multiple different rules (this is a
+ hack to work around limitations in how matching is performed).
+
+-------------------
+values=<expression>
+-------------------
+
+The "values" instruction defines an expression whose evaluated result becomes the output
+resource bundle value(s). Unless quoting is present, this evaluated expression is split
+on whitespace and can become multiple values in the resulting resource bundle.
+
+Examples:
+
+* values=$1 $2 $3
+
+ Produces three separate values in the resource bundle for the first three captured
+ arguments.
+
+* values="$1 $2" $3
+
+ Produces two values in the resource bundle, the first of which is two captured values
+ separated by a space character.
+
+* values={value}
+
+ Substitutes the CLDR value, but then performs whitespace splitting on the result. This
+ differs from the behaviour when no "values" instruction is present (which does not
+ split the results).
+
+* values="{value}" $1
+
+ Produces two values, the first of which is the unsplit CLDR value, and the second is a
+ captured argument.
+
+* values=&func($1, {value})
+
+ Invokes a transformation function, passing in a captured argument and the CLDR value,
+ and the result is then split. The set of functions available to a transformer is
+ configured when it is created.
+
+Note that in the above examples, it is assumed that the $N arguments do not contain spaces.
+If they did, it would result in more output values. To be strict about things, every value
+which should not be split must be quoted (e.g. values="$1" "$2" "$3") but since captured
+values are often IDs or other tokens, this is not what is seen in practice, so it is not
+reflected in these examples.
+
+---------------------
+fallback=<expression>
+---------------------
+
+The fallback instruction provides a way for default values to be emitted for a path that
+was not matched. Fallbacks are useful when several different rules produce values for the
+same resource bundle. In this case the output path produced by one rule can be used as
+the "key" for any unmatched rules with fallback values (to "fill in the gaps").
+
+Consider the two rules which can emit the same resource bundle path:
+
+//ldml/numbers/currencies/currency[@type="(%W)"]/symbol
+ ; /Currencies/$1 ; fallback=$1
+//ldml/numbers/currencies/currency[@type="(%W)"]/displayName
+ ; /Currencies/$1 ; fallback=$1
+
+These rules, if both matched, will produce two values for the same resource bundle path.
+Consider the CLDR values:
+
+//ldml/numbers/currencies/currency[@type="USD"]/symbol ==> "$"
+//ldml/numbers/currencies/currency[@type="USD"]/displayName ==> "US Dollar"
+
+After matching both of these paths, the values for the resource bundle "/Currencies/USD"
+will be the array { "$", "US Dollar" }.
+
+However, if only one value were present to be converted, the converter could use the
+matched path "/Currencies/XXX" and infer the missing fallback value, ensuring that the
+output array (if it was emitted at all) was always two values.
+
+Note that in order for this to work, the fallback value must be derivable only from the
+matched path. E.g. it cannot contain arguments that are not also present in the matched
+path, and obviously cannot reference the "{value}" at all. Thus the following would not
+be permitted:
+
+//ldml/foo/bar[@type="(%W)"][@region="(%A)"] ; /Foo/$1 ; fallback=$2
+
+However the fallback value can reference existing CLDR or resource bundle paths (expected
+to be present from other rules). For example:
+ fallback=/weekData/001:intvector[0]
+or:
+ fallback=//ldml/numbers/symbols[@numberSystem="%D"]/decimal
+
+The latter case is especially complex because it also uses the "dynamic" argument:
+ %D=//ldml/numbers/defaultNumberingSystem
+
+So determining the resulting value will require:
+1) resolving "//ldml/numbers/defaultNumberingSystem" to, for example, "arab"
+2) looking up the value of "//ldml/numbers/symbols[@numberSystem="arab"]/decimal"
+
+-----------------
+base_xpath=<path>
+-----------------
+
+The base_xpath instruction allows a rule to specify a proxy path which is used in place of
+the originally matched path in the returned result. This is a useful hack for cases where
+values are derived from information in a path prefix.
+
+Because path matching for transformation happens only on full paths, it is possible that
+several distinct CLDR paths might effectively generate the same result if they share the
+same prefix (i.e. paths in the same "sub hierarchy" of the CLDR data).
+
+If this happens, then you end up generating "the same" result from different paths. To
+fix this, a "surrogate" CLDR path can be specified as a proxy for the source path,
+allowing several results to appear to have come from the same source, which results in
+deduplication of the final value.
+
+For example, the two rules:
+
+//supplementalData/territoryInfo/territory[...][@writingPercent="(%N)"][@populationPercent="(%N)"][@officialStatus="(%W)"](?:[@references="%W"])?
+ ; /territoryInfo/$1/territoryF:intvector ; values=&exp($2) &exp($3,-2) &exp($4) ; base_xpath=//supplementalData/territoryInfo/territory[@type="$1"]
+
+//supplementalData/territoryInfo/territory[...][@writingPercent="(%N)"][@populationPercent="(%N)"](?:[@references="%W"])?
+ ; /territoryInfo/$1/territoryF:intvector ; values=&exp($2) &exp($3,-2) &exp($4) ; base_xpath=//supplementalData/territoryInfo/territory[@type="$1"]
+
+Produce the same results for different paths (with or without the "officialStatus"
+attribute) but only one such result is desired. By specifying the same base_xpath on
+both rules, the conversion logic can deduplicate these to produce only one result.
+
+When using base_xpath, it is worth noting that:
+1) Base xpaths must be valid "distinguishing" paths (but are never matched to any rule).
+2) Base xpaths can use arguments to achieve the necessary level of uniqueness.
+3) Rules which share the same base xpath must always produce the same values.
+
+Note however that this is still very much a hack because, since two rules are responsible
+for generating the same result, there is no well defined "line number" to use for ordering
+of values. Thus this mechanism should only be used for rules which produce "single"
+values, and must not be used in cases where the ordering of values in arrays is important.
+
+This mechanism only exists because there is currently no mechanism for partial matching
+or a way to match one path against multiple rules.
+
+-----
+group
+-----
+
+The "group" instruction should be considered a "last resort" hack for controlling value
+grouping, in cases where "hidden labels" are not suitable (see below).
+
+==============================
+Value Arrays and Hidden Labels
+==============================
+
+In the simplest case, one rule produces one or more output path/values per matched CLDR
+value (i.e. one-to-one or one-to-many). If that happens, then output ordering of the
+resource bundle paths is just the natural resource bundle path ordering.
+
+However it is also possible for several rules to produce values for a single output path
+(i.e. many-to-one). When this happens there are some important details about how results
+are grouped and ordered.
+
+------------
+Value Arrays
+------------
+
+If several rules produce results for the same resource bundle path, the values produced
+by the rules are always ordered according to the order of the rules in the configuration
+file (and it is best practice to group any such rules together for clarity).
+
+If each rule produces multiple values, then depending on grouping, those values can either
+be concatenated together in a single array or grouped individually to create an array
+of arrays.
+
+In the example below, there are four rules producing values for the same path
+("/weekData/$2:intvector"):
+
+//.../firstDay[@day="(%W)"][@territories="(%W)"] ; /weekData/$2:intvector ; values=&day_number($1)
+//.../minDays[@count="(%N)"][@territories="(%W)"] ; /weekData/$2:intvector ; values=$1
+//.../weekendStart[@day="(%W)"][@territories="(%W)"] ; /weekData/$2:intvector ; values=&day_number($1) 0
+//.../weekendEnd[@day="(%W)"][@territories="(%W)"] ; /weekData/$2:intvector ; values=&day_number($1) 86400000
+
+The first two rules produce one value each, and the last two produce two values each. This
+results in the resource bundle "/weekData/xxx:intvector" having a single array consisting
+of six values. In the real configuration, these rules also use fallback instructions to
+ensure that the resulting array of values is always six values, even if some CLDR paths are
+not present.
+
+-------------
+Hidden Labels
+-------------
+
+Sometimes rules should produce separate "sub-arrays" of values, rather than having all the
+values appended to a single array. Consider the following path/value pairs:
+
+x/y: a
+x/y: b
+x/y: c
+
+Which produce the resource bundle "x/y" with three values:
+
+x{
+ y{
+ "a",
+ "b",
+ "c"
+ }
+}
+
+Now suppose we want to make a resource bundle where the values are grouped into their
+own sub-array:
+
+x{
+ y{
+ { "a", "b", "c" }
+ }
+}
+
+We can think of this as coming from the path/value pairs:
+
+x/y/-: a
+x/y/-: b
+x/y/-: c
+
+where to represent the sub-array we introduce the idea of an empty path element '-'.
+
+In a transformation rule, these "empty elements" are represented as "hidden labels", and look
+like "<some-label>". They are treated as "normal" path elements for purposes of ordering and
+grouping, but are treated as empty when the paths are written to the ICU data files.
+
+For example the rule:
+
+//.../currencyCodes[@type="(%W)"][@numeric="(%N)"].* ; /codeMappingsCurrency/<$1> ; values=$1 $2
+
+Generates a series of grouped, 2-element sub-arrays split by the captured type attribute.
+
+ codeMappingsCurrency{
+ { type-1, numeric-1 }
+ { type-2, numeric-2 }
+ { type-3, numeric-3 }
+ }
+
+<FIFO> is a special hidden label which is replaced by an incrementing counter when
+sorting paths. It ensures that values in the same array are sorted in the order that they
+were encountered. However this mechanism imposes a strict requirement that the ordering
+of CLDR values to be transformed matches the expected ICU value order, so it should be
+avoided where possible to avoid this implicit, subtle dependency. Note that this mechanism
+is currently only enabled for the transformation of "supplemental data" and may eventually
+be removed.
+
+Hidden labels are a neat solution which permits the generation of sub-array values, but they
+don't quite work in every case. For example if you need to produce a resource bundle with a
+mix of values and sub-arrays, like:
+
+x{
+ y{
+ "a",
+ { "b", "c" },
+ "d"
+ }
+}
+
+which can be thought of as coming from the path/value pairs:
+
+x/y: a
+x/y/<z>: b
+x/y/<z>: c
+x/y: d
+
+we find that, after sorting the resource bundle paths, we end up with:
+
+x/y: a
+x/y: d
+x/y/<z>: b
+x/y/<z>: c
+
+which produces the wrong result. This happens because values with different paths are
+sorted primarily by their path. In cases like this, where a mix of values and sub-arrays
+are required, the "group" instruction can be used instead.
+
+For example:
+
+//ldml/numbers/currencies/currency[@type="(%W)"]/symbol ; /Currencies/$1
+//ldml/numbers/currencies/currency[@type="(%W)"]/displayName ; /Currencies/$1
+//ldml/numbers/currencies/currency[@type="(%W)"]/pattern ; /Currencies/$1 ; group
+//ldml/numbers/currencies/currency[@type="(%W)"]/decimal ; /Currencies/$1 ; group
+//ldml/numbers/currencies/currency[@type="(%W)"]/group ; /Currencies/$1 ; group
+
+Produces resource bundles which look like:
+
+Currencies{
+ xxx{
+ "<symbol>",
+ "<display name>",
+ { "<pattern>", "<decimal>", "<group>" }
+ }
+}
--- /dev/null
+# ldml2icu_supplemental.txt
+#
+# © 2016 and later: Unicode, Inc. and others.
+#
+# CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
+# For terms of use, see http://www.unicode.org/copyright.html
+#
+# Used by SupplementalMapper.
+# Data-driven file for mapping supplemental LDML paths to ICU paths.
+# See ldml2icu_readme.txt for a detailed explanation of this file.
+
+# Attribute value
+%A=[^"']++
+# Attribute value, no underscore
+%B=[^"'_]++
+# Word/Zone match
+%W=[\s\w\-/]++
+# Greedy word match
+%G=[\s\w\-]+
+# Number match
+%N=[\d\.]++
+
+# supplementalData.xml
+//supplementalData/currencyData/region[@iso3166="(%W)"]/currency[@iso4217="(%W)"]
+ ; /CurrencyMap/$1/<FIFO>/id ; values=$2
+//supplementalData/currencyData/region[@iso3166="(%W)"]/currency[@iso4217="(%W)"][@tender="false"]
+ ; /CurrencyMap/$1/<FIFO>/id ; values=$2
+ ; /CurrencyMap/$1/<FIFO>/tender ; values=false
+//supplementalData/currencyData/region[@iso3166="(%W)"]/currency[@from="(%W)"][@iso4217="(%W)"]
+ ; /CurrencyMap/$1/<FIFO>/id ; values=$3
+ ; /CurrencyMap/$1/<FIFO>/from:intvector ; values=&date($2, from)
+//supplementalData/currencyData/region[@iso3166="(%W)"]/currency[@from="(%W)"][@iso4217="(%W)"][@tender="false"]
+ ; /CurrencyMap/$1/<FIFO>/id ; values=$3
+ ; /CurrencyMap/$1/<FIFO>/from:intvector ; values=&date($2, from)
+ ; /CurrencyMap/$1/<FIFO>/tender ; values=false
+//supplementalData/currencyData/region[@iso3166="(%W)"]/currency[@from="(%W)"][@to="(%W)"][@iso4217="(%W)"]
+ ; /CurrencyMap/$1/<FIFO>/id ; values=$4
+ ; /CurrencyMap/$1/<FIFO>/from:intvector ; values=&date($2, from)
+ ; /CurrencyMap/$1/<FIFO>/to:intvector ; values=&date($3, to)
+//supplementalData/currencyData/region[@iso3166="(%W)"]/currency[@from="(%W)"][@to="(%W)"][@iso4217="(%W)"][@tender="false"]
+ ; /CurrencyMap/$1/<FIFO>/id ; values=$4
+ ; /CurrencyMap/$1/<FIFO>/from:intvector ; values=&date($2, from)
+ ; /CurrencyMap/$1/<FIFO>/to:intvector ; values=&date($3, to)
+ ; /CurrencyMap/$1/<FIFO>/tender ; values=false
+//supplementalData/currencyData/region[@iso3166="(%W)"]/currency[@to="(%W)"][@iso4217="(%W)"][@tender="false"]
+ ; /CurrencyMap/$1/<FIFO>/id ; values=$3
+ ; /CurrencyMap/$1/<FIFO>/to:intvector ; values=&date($2, to)
+ ; /CurrencyMap/$1/<FIFO>/tender ; values=false
+//supplementalData/currencyData/fractions/info[@iso4217="(%W)"][@digits="(%N)"][@rounding="(%N)"][@cashDigits="(%N)"][@cashRounding="(%N)"] ; /CurrencyMeta/$1:intvector ; values=$2 $3 $4 $5
+//supplementalData/currencyData/fractions/info[@iso4217="(%W)"][@digits="(%N)"][@rounding="(%N)"][@cashRounding="(%N)"] ; /CurrencyMeta/$1:intvector ; values=$2 $3 $2 $4
+//supplementalData/currencyData/fractions/info[@iso4217="(%W)"][@digits="(%N)"][@rounding="(%N)"] ; /CurrencyMeta/$1:intvector ; values=$2 $3 $2 $3
+
+//supplementalData/calendarPreferenceData/calendarPreference[@territories="(%A)"][@ordering="(%A)"] ; /calendarPreferenceData/$1 ; values=$2
+//supplementalData/codeMappings/territoryCodes[@type="(%W)"][@numeric="(%N)"][@alpha3="(%W)"].* ; /codeMappings/<$1> ; values=$1 $2 $3
+
+//supplementalData/codeMappings/currencyCodes[@type="(%W)"][@numeric="(%N)"].* ; /codeMappingsCurrency/<$1> ; values=$1 $2
+
+//supplementalData/languageData/language[@type="(%W)"][@scripts="(%W)"][@territories="(%W)"][@alt="secondary"]
+ ; /languageData/$1/secondary/scripts ; values=$2
+ ; /languageData/$1/secondary/territories ; values=$3
+//supplementalData/languageData/language[@type="(%W)"][@scripts="(%W)"][@alt="secondary"] ; /languageData/$1/secondary/scripts ; values=$2
+//supplementalData/languageData/language[@type="(%W)"][@territories="(%G)"][@alt="secondary"] ; /languageData/$1/secondary/territories ; values=$2
+
+//supplementalData/languageData/language[@type="(%W)"][@scripts="(%W)"][@territories="(%W)"]
+ ; /languageData/$1/primary/scripts ; values=$2
+ ; /languageData/$1/primary/territories; values=$3
+//supplementalData/languageData/language[@type="(%W)"][@scripts="(%W)"] ; /languageData/$1/primary/scripts ; values=$2
+//supplementalData/languageData/language[@type="(%W)"][@territories="(%W)"] ; /languageData/$1/primary/territories ; values=$2
+
+//supplementalData/territoryContainment/group[@type="(%W)"][@contains="(%A)"][@status="deprecated"] ; /territoryContainment/deprecated/$1 ; values=$2
+//supplementalData/territoryContainment/group[@type="(%W)"][@contains="(%A)"][@status="grouping"] ; /territoryContainment/containedGroupings/$1 ; values=$2
+//supplementalData/territoryContainment/group[@type="(%W)"][@contains="(%A)"][@grouping="true"] ; /territoryContainment/grouping/$1 ; values=$2
+//supplementalData/territoryContainment/group[@type="(%W)"][@contains="(%A)"]; /territoryContainment/$1 ; values=$2
+
+//supplementalData/subdivisionContainment/subgroup[@type="(%W)"][@contains="(%A)"]; /subdivisionContainment/$1 ; values=$2
+//supplementalData/subdivisionContainment/subgroup[@type="(%W)"][@subtype="(%W)"][@contains="(%A)"]; /subdivisionContainment/$1-$2 ; values=$3
+
+//supplementalData/weekData/firstDay[@day="(%W)"][@territories="(%W)"](?:[@references="(?:%A)"])?[@alt="(%A)"] ; /weekData%$3/$2:intvector ; values=&day_number($1) ; fallback=/weekData/001:intvector[0]
+
+//supplementalData/weekData/firstDay[@day="(%W)"][@territories="(%W)"] ; /weekData/$2:intvector ; values=&day_number($1) ; fallback=/weekData/001:intvector[0]
+//supplementalData/weekData/minDays[@count="(%N)"][@territories="(%W)"] ; /weekData/$2:intvector ; values=$1 ; fallback=/weekData/001:intvector[1]
+//supplementalData/weekData/weekendStart[@day="(%W)"][@territories="(%W)"] ; /weekData/$2:intvector ; values=&day_number($1) 0 ; fallback=/weekData/001:intvector[2] /weekData/001:intvector[3]
+//supplementalData/weekData/weekendEnd[@day="(%W)"][@territories="(%W)"] ; /weekData/$2:intvector ; values=&day_number($1) 86400000 ; fallback=/weekData/001:intvector[4] /weekData/001:intvector[5]
+
+//supplementalData/weekData/weekOfPreference[@locales="(%A)"][@ordering="(%A)"] ; /weekOfPreference/$1 ; values=$2
+
+//supplementalData/timeData/hours[@allowed="(%W)"][@preferred="(%W)"][@regions="(%W)"]
+ ; /timeData/$3/allowed ; values=$1
+ ; /timeData/$3/preferred ; values=$2
+
+//supplementalData/measurementData/measurementSystem[@type="metric"][@category="(%W)"][@territories="(%W)"] ; /measurementData/$2/MeasurementSystemCategory/$1:int ; values=0
+//supplementalData/measurementData/measurementSystem[@type="US"][@category="(%W)"][@territories="(%W)"] ; /measurementData/$2/MeasurementSystemCategory/$1:int ; values=1
+//supplementalData/measurementData/measurementSystem[@type="UK"][@category="(%W)"][@territories="(%W)"] ; /measurementData/$2/MeasurementSystemCategory/$1:int ; values=2
+
+//supplementalData/measurementData/measurementSystem[@type="metric"][@territories="(%W)"] ; /measurementData/$1/MeasurementSystem:int ; values=0
+//supplementalData/measurementData/measurementSystem[@type="US"][@territories="(%W)"] ; /measurementData/$1/MeasurementSystem:int ; values=1
+//supplementalData/measurementData/measurementSystem[@type="UK"][@territories="(%W)"] ; /measurementData/$1/MeasurementSystem:int ; values=2
+//supplementalData/measurementData/paperSize[@type="A4"][@territories="(%W)"] ; /measurementData/$1/PaperSize:intvector ; values=297 210
+//supplementalData/measurementData/paperSize[@type="US-Letter"][@territories="(%W)"] ; /measurementData/$1/PaperSize:intvector ; values=279 216
+
+//supplementalData/unitPreferenceData/unitPreferences[@category="(%W)"][@usage="(%A)"][@scope="small"]/unitPreference[@regions="(%A)"][@alt="informal"] ; /unitPreferenceData/$3/$1-$2-small-informal
+//supplementalData/unitPreferenceData/unitPreferences[@category="(%W)"][@usage="(%A)"][@scope="small"]/unitPreference[@regions="(%A)"] ; /unitPreferenceData/$3/$1-$2-small
+//supplementalData/unitPreferenceData/unitPreferences[@category="(%W)"][@usage="(%A)"]/unitPreference[@regions="(%A)"][@alt="informal"] ; /unitPreferenceData/$3/$1-$2-informal
+//supplementalData/unitPreferenceData/unitPreferences[@category="(%W)"][@usage="(%A)"]/unitPreference[@regions="(%A)"] ; /unitPreferenceData/$3/$1-$2
+
+//supplementalData/territoryInfo/territory[@type="(%W)"][@gdp="(%N)"][@literacyPercent="(%N)"][@population="(%N)"]/languagePopulation[@type="(%W)"][@writingPercent="(%N)"][@populationPercent="(%N)"][@officialStatus="(%W)"](?:[@references="%W"])?
+ ; /territoryInfo/$1/territoryF:intvector ; values=&exp($2) &exp($3,-2) &exp($4) ; base_xpath=//supplementalData/territoryInfo/territory[@type="$1"]
+ ; /territoryInfo/$1/$5/writingShareF:int ; values=&exp($6,-2)
+ ; /territoryInfo/$1/$5/populationShareF:int ; values=&exp($7,-2)
+ ; /territoryInfo/$1/$5/officialStatus ; values=$8
+
+//supplementalData/territoryInfo/territory[@type="(%W)"][@gdp="(%N)"][@literacyPercent="(%N)"][@population="(%N)"]/languagePopulation[@type="(%W)"][@writingPercent="(%N)"][@populationPercent="(%N)"](?:[@references="%W"])?
+ ; /territoryInfo/$1/territoryF:intvector ; values=&exp($2) &exp($3,-2) &exp($4) ; base_xpath=//supplementalData/territoryInfo/territory[@type="$1"]
+ ; /territoryInfo/$1/$5/writingShareF:int ; values=&exp($6,-2)
+ ; /territoryInfo/$1/$5/populationShareF:int ; values=&exp($7,-2)
+
+//supplementalData/territoryInfo/territory[@type="(%W)"][@gdp="(%N)"][@literacyPercent="(%N)"][@population="(%N)"]/languagePopulation[@type="(%W)"][@literacyPercent="(%N)"][@populationPercent="(%N)"][@officialStatus="(%W)"](?:[@references="%W"])?
+ ; /territoryInfo/$1/territoryF:intvector ; values=&exp($2) &exp($3,-2) &exp($4) ; base_xpath=//supplementalData/territoryInfo/territory[@type="$1"]
+ ; /territoryInfo/$1/$5/literacyShareF:int ; values=&exp($6,-2)
+ ; /territoryInfo/$1/$5/populationShareF:int ; values=&exp($7,-2)
+ ; /territoryInfo/$1/$5/officialStatus ; values=$8
+
+//supplementalData/territoryInfo/territory[@type="(%W)"][@gdp="(%N)"][@literacyPercent="(%N)"][@population="(%N)"]/languagePopulation[@type="(%W)"][@literacyPercent="(%N)"][@populationPercent="(%N)"](?:[@references="%W"])?
+ ; /territoryInfo/$1/territoryF:intvector ; values=&exp($2) &exp($3,-2) &exp($4) ; base_xpath=//supplementalData/territoryInfo/territory[@type="$1"]
+ ; /territoryInfo/$1/$5/literacyShareF:int ; values=&exp($6,-2)
+ ; /territoryInfo/$1/$5/populationShareF:int ; values=&exp($7,-2)
+
+# Language population with a population share and an official status, but no writing or literacy share.
+//supplementalData/territoryInfo/territory[@type="(%W)"][@gdp="(%N)"][@literacyPercent="(%N)"][@population="(%N)"]/languagePopulation[@type="(%W)"][@populationPercent="(%N)"][@officialStatus="(%W)"](?:[@references="%W"])?
+ ; /territoryInfo/$1/territoryF:intvector ; values=&exp($2) &exp($3,-2) &exp($4) ; base_xpath=//supplementalData/territoryInfo/territory[@type="$1"]
+ ; /territoryInfo/$1/$5/populationShareF:int ; values=&exp($6,-2)
+ ; /territoryInfo/$1/$5/officialStatus ; values=$7
+
+//supplementalData/territoryInfo/territory[@type="(%W)"][@gdp="(%N)"][@literacyPercent="(%N)"][@population="(%N)"]/languagePopulation[@type="(%W)"][@populationPercent="(%N)"](?:[@references="%W"])?
+ ; /territoryInfo/$1/territoryF:intvector ; values=&exp($2) &exp($3,-2) &exp($4) ; base_xpath=//supplementalData/territoryInfo/territory[@type="$1"]
+ ; /territoryInfo/$1/$5/populationShareF:int ; values=&exp($6,-2)
+
+# This only exists right now for 'ZZ', which has no <languagePopulation> child elements.
+//supplementalData/territoryInfo/territory[@type="(%W)"][@gdp="(%N)"][@literacyPercent="(%N)"][@population="(%N)"]
+ ; /territoryInfo/$1/territoryF:intvector ; values=&exp($2) &exp($3,-2) &exp($4) ; base_xpath=//supplementalData/territoryInfo/territory[@type="$1"]
+
+//supplementalData/calendarData/calendar[@type="(%W)"]/calendarSystem[@type="(%W)"] ; /calendarData/$1/system ; values=$2
+//supplementalData/calendarData/calendar[@type="(%W)"]/eras/era[@type="(%W)"][@(start|end)="(%A)"][@named="(%W)"]
+ ; /calendarData/$1/eras/$2/$3:intvector ; values=&ymd($4)
+ ; /calendarData/$1/eras/$2/named ; values=$5
+//supplementalData/calendarData/calendar[@type="(%W)"]/eras/era[@type="(%W)"][@(start|end)="(%A)"]
+ ; /calendarData/$1/eras/$2/$3:intvector ; values=&ymd($4)
+
+# languageInfo.xml
+
+//supplementalData/languageMatching/languageMatches[@type="(%B)_new"]/paradigmLocales[@locales="(%A)"] ; /languageMatchingInfo/$1/paradigmLocales ; values=$2
+//supplementalData/languageMatching/languageMatches[@type="(%B)_new"]/matchVariable[@id="\$(%A)"][@value="(%A)"] ; /languageMatchingInfo/$1/matchVariable/$2 ; values=$3
+
+//supplementalData/languageMatching/languageMatches[@type="(%B)_new"]/languageMatch[@desired="(%A)"][@supported="(%A)"][@distance="(%N)"][@oneway="true"] ; /languageMatchingNew/$1/<FIFO> ; values=$2 $3 $4 1
+//supplementalData/languageMatching/languageMatches[@type="(%B)_new"]/languageMatch[@desired="(%A)"][@supported="(%A)"][@distance="(%N)"] ; /languageMatchingNew/$1/<FIFO> ; values=$2 $3 $4 0
+
+//supplementalData/languageMatching/languageMatches[@type="(%B)"]/languageMatch[@desired="(%A)"][@supported="(%A)"][@percent="(%N)"][@oneway="true"] ; /languageMatching/$1/<FIFO> ; values=$2 $3 $4 1
+//supplementalData/languageMatching/languageMatches[@type="(%B)"]/languageMatch[@desired="(%A)"][@supported="(%A)"][@percent="(%N)"] ; /languageMatching/$1/<FIFO> ; values=$2 $3 $4 0
+
+# likelySubtags.xml
+//supplementalData/likelySubtags/likelySubtag[@from="(%A)"][@to="(%A)"] ; /$1 ; values=$2
+
+# metaZones.xml - metaZones.txt
+//supplementalData/metaZones/mapTimezones[@type="metazones"]/mapZone[@type="(%A)"][@other="(%W)"][@territory="(%W)"] ; /mapTimezones/$2/$3 ; values=$1
+//supplementalData/metaZones/metazoneInfo/timezone[@type="(%W)"]/usesMetazone[@mzone="(%W)"] ; /metazoneInfo/"$1"/<$2> ; values=$2
+//supplementalData/metaZones/metazoneInfo/timezone[@type="(%W)"]/usesMetazone[@from="(%A)"][@mzone="(%W)"] ; /metazoneInfo/"$1"/<$2> ; values=$3 "$2" "9999-12-31 23:59"
+//supplementalData/metaZones/metazoneInfo/timezone[@type="(%W)"]/usesMetazone[@from="(%A)"][@to="(%A)"][@mzone="(%W)"] ; /metazoneInfo/"$1"/<$2> ; values=$4 "$2" "$3"
+//supplementalData/metaZones/metazoneInfo/timezone[@type="(%W)"]/usesMetazone[@to="(%A)"][@mzone="(%W)"] ; /metazoneInfo/"$1"/<1970-01-01 00:00> ; values=$3 "1970-01-01 00:00" "$2"
+
+//supplementalData/primaryZones/primaryZone[@iso3166="(%W)"] ; /primaryZones/$1 ; values={value}
+
+# numberingSystems.txt
+//supplementalData/numberingSystems/numberingSystem[@type="algorithmic"][@id="(%W)"][@rules="(%A)"]
+ ; /numberingSystems/$1/algorithmic:int ; values=1
+ ; /numberingSystems/$1/desc ; values=&algorithm($2)
+ ; /numberingSystems/$1/radix:int ; values=10
+
+//supplementalData/numberingSystems/numberingSystem[@type="numeric"][@id="(%W)"][@digits="(%A)"]
+ ; /numberingSystems/$1/algorithmic:int ; values=0
+ ; /numberingSystems/$1/desc ; values=$2
+ ; /numberingSystems/$1/radix:int ; values=10
+
+# windowsZones.txt
+//supplementalData/windowsZones/mapTimezones/mapZone[@type="(%A)"][@other="(%A)"][@territory="(%W)"] ; /mapTimezones/"$2"/$3 ; values="$1"
+
+# genderList.txt
+//supplementalData/gender/personList[@type="(%W)"][@locales="(%W)"] ; /genderList/$2 ; values=$1
+
+# locale info
+//supplementalData/parentLocales/parentLocale[@parent="(%A)"][@locales="(%A)"] ; /parentLocales/$1 ; values=$2
+
+# supplementalMetadata.xml (metadata.txt)
+//supplementalData/metadata/defaultContent[@locales="(%A)"] ; /defaultContent ; values=$1
+//supplementalData/metadata/alias/(language|script|territory|subdivision|variant)Alias[@type="(%A)"][@replacement="(%A)"][@reason="(%A)"]
+ ; /alias/$1/$2/reason ; values="$4"
+ ; /alias/$1/$2/replacement ; values="$3"
+
+# Region codes used by ICU's Region class
+# Specify the value explicitly so that the LDMLConverter will split it.
+//supplementalData/metadata/validity/variable[@type="choice"][@id="\$territory"] ; /regionCodes ; values={value}
+
+# validity
+//supplementalData/idValidity/id[@type="(%A)"][@idStatus="(%A)"] ; /idValidity/$1/$2 ; values={value}
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu;
+
+import static com.google.common.truth.Truth.assertThat;
+import static com.google.common.truth.Truth8.assertThat;
+import static org.junit.Assert.fail;
+import static org.unicode.cldr.api.CldrPath.parseDistinguishingPath;
+import static org.unicode.icu.tool.cldrtoicu.testing.AssertUtils.assertThrows;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+import org.unicode.cldr.api.CldrPath;
+
+@RunWith(JUnit4.class)
+public class PathMatcherTest {
+ @Test
+ public void testMatcher() {
+ CldrPath calEra = parseDistinguishingPath(
+ "//ldml/dates/calendars/calendar[@type=\"buddhist\"]/eras/eraAbbr/era[@type=\"0\"]");
+ CldrPath chineseMon1 = monthInfo("chinese", "format", "abbreviated", 1);
+ CldrPath chineseMon2 = monthInfo("chinese", "format", "abbreviated", 2);
+ CldrPath genericMon1 = monthInfo("generic", "stand-alone", "narrow", 1);
+ CldrPath genericMon2 = monthInfo("generic", "stand-alone", "narrow", 2);
+ List<CldrPath> calPaths =
+ Arrays.asList(calEra, chineseMon1, chineseMon2, genericMon1, genericMon2);
+
+ PathMatcher anyCalendarPaths = PathMatcher.of("ldml/dates/calendars/calendar");
+ assertThat(calPaths.stream().allMatch(anyCalendarPaths::matchesPrefixOf)).isTrue();
+ assertThat(calPaths.stream().noneMatch(anyCalendarPaths::matches)).isTrue();
+ assertThat(calPaths.stream().noneMatch(anyCalendarPaths::matchesSuffixOf)).isTrue();
+
+ PathMatcher chineseCalendars =
+ PathMatcher.of("ldml/dates/calendars/calendar[@type=\"chinese\"]");
+ assertThat(calPaths.stream().filter(chineseCalendars::matchesPrefixOf))
+ .containsExactly(chineseMon1, chineseMon2);
+
+ PathMatcher anyMonth = PathMatcher.of("monthWidth[@type=*]/month[@type=*]");
+ assertThat(calPaths.stream().filter(anyMonth::matchesSuffixOf))
+ .containsExactly(chineseMon1, chineseMon2, genericMon1, genericMon2);
+
+ PathMatcher narrowMonth = PathMatcher.of("monthWidth[@type=\"narrow\"]/month[@type=*]");
+ assertThat(calPaths.stream().filter(narrowMonth::matchesSuffixOf))
+ .containsExactly(genericMon1, genericMon2);
+ assertThat(calPaths.stream().filter(narrowMonth::matches)).isEmpty();
+
+ PathMatcher firstMonth = PathMatcher.of("month[@type=\"1\"]");
+ assertThat(calPaths.stream().filter(firstMonth::matchesSuffixOf))
+ .containsExactly(chineseMon1, genericMon1);
+
+ PathMatcher fullMatch = PathMatcher.of("ldml/dates"
+ + "/calendars/calendar[@type=\"generic\"]"
+ + "/months/monthContext[@type=\"stand-alone\"]"
+ + "/monthWidth[@type=\"narrow\"]"
+ + "/month[@type=\"2\"]");
+ assertThat(calPaths.stream().filter(fullMatch::matches)).containsExactly(genericMon2);
+ }
+
+ @Test
+ public void testWildcardSegment() {
+ PathMatcher wildcard = PathMatcher.of("ldml/dates"
+ + "/calendars/calendar[@type=\"generic\"]"
+ + "/*/*[@type=\"format\"]/*[@type=\"narrow\"]/*[@type=*]");
+
+ assertThat(wildcard.matches(monthInfo("generic", "format", "narrow", 1))).isTrue();
+ assertThat(wildcard.matches(monthInfo("generic", "format", "narrow", 9))).isTrue();
+ assertThat(wildcard.matches(dayInfo("generic", "format", "narrow", "sun"))).isTrue();
+
+ assertThat(wildcard.matches(monthInfo("chinese", "format", "narrow", 1))).isFalse();
+ assertThat(wildcard.matches(monthInfo("generic", "stand-alone", "narrow", 1))).isFalse();
+ assertThat(wildcard.matches(dayInfo("generic", "format", "wide", "mon"))).isFalse();
+ }
+
+ /** Checks that a combined matcher accepts exactly the paths that one of its parts accepts. */
+ @Test
+ public void testAnyOf() {
+     PathMatcher narrowMonths = PathMatcher.of("monthWidth[@type=\"narrow\"]/month[@type=*]");
+     PathMatcher narrowDays = PathMatcher.of("dayWidth[@type=\"narrow\"]/day[@type=*]");
+     PathMatcher either = PathMatcher.anyOf(narrowMonths, narrowDays);
+
+     // Narrow months and narrow days are each covered by one of the combined matchers.
+     assertThat(either.matchesSuffixOf(monthInfo("generic", "format", "narrow", 1))).isTrue();
+     assertThat(either.matchesSuffixOf(dayInfo("generic", "format", "narrow", "sun"))).isTrue();
+
+     // Wide months and days are covered by neither.
+     assertThat(either.matchesSuffixOf(monthInfo("generic", "format", "wide", 1))).isFalse();
+     assertThat(either.matchesSuffixOf(dayInfo("generic", "format", "wide", "mon"))).isFalse();
+ }
+
+ @Test
+ public void testBadSpecifiers() {
+ assertInvalidPathSpecification("");
+ // Leading and trailing '/' are not permitted (they imply empty segments.
+ assertInvalidPathSpecification("/foo/");
+ assertInvalidPathSpecification("foo//bar");
+ assertInvalidPathSpecification("foo/bad segment name");
+ assertInvalidPathSpecification("foo/bar[type=*]");
+ assertInvalidPathSpecification("foo/bar[@type=**]");
+ assertInvalidPathSpecification("foo/bar[@type='double-quotes-only']");
+ }
+
+ /**
+  * Asserts that creating a matcher from {@code spec} throws an IllegalArgumentException whose
+  * message starts with "invalid path specification" and contains the rejected specification.
+  */
+ private void assertInvalidPathSpecification(String spec) {
+     IllegalArgumentException failure =
+         assertThrows(IllegalArgumentException.class, () -> PathMatcher.of(spec));
+     assertThat(failure).hasMessageThat().startsWith("invalid path specification");
+     assertThat(failure).hasMessageThat().contains(spec);
+ }
+
+ /** Returns the calendar "month" path for the given calendar type, context, width and number. */
+ private static CldrPath monthInfo(String type, String context, String width, int number) {
+     String template =
+         "//ldml/dates/calendars/calendar[@type=\"%s\"]"
+             + "/months/monthContext[@type=\"%s\"]"
+             + "/monthWidth[@type=\"%s\"]"
+             + "/month[@type=\"%d\"]";
+     String path = String.format(template, type, context, width, number);
+     return CldrPath.parseDistinguishingPath(path);
+ }
+
+ /** Returns the calendar "day" path for the given calendar type, context, width and day ID. */
+ private static CldrPath dayInfo(String type, String context, String width, String id) {
+     String template =
+         "//ldml/dates/calendars/calendar[@type=\"%s\"]"
+             + "/days/dayContext[@type=\"%s\"]"
+             + "/dayWidth[@type=\"%s\"]"
+             + "/day[@type=\"%s\"]";
+     String path = String.format(template, type, context, width, id);
+     return CldrPath.parseDistinguishingPath(path);
+ }
+}
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu;
+
+import static org.unicode.icu.tool.cldrtoicu.testing.RbPathSubjectFactory.assertThat;
+import static org.unicode.icu.tool.cldrtoicu.testing.AssertUtils.assertThrows;
+import static com.google.common.truth.Truth.assertThat;
+import static com.google.common.truth.Truth8.assertThat;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+ /** Tests for creating {@code RbPath} instances from segments and from path strings. */
+ @RunWith(JUnit4.class)
+ public class RbPathTest {
+     @Test
+     public void testEmpty() {
+         RbPath none = RbPath.empty();
+         assertThat(none).hasSegments();
+         assertThat(none).hasLength(0);
+     }
+
+     @Test
+     public void testParseVsOf() {
+         // of() takes each argument as one segment, so a '/' in an argument is not a separator.
+         assertThat(RbPath.of("foo", "bar")).hasSegments("foo", "bar");
+         assertThat(RbPath.of("foo/bar")).hasSegments("foo/bar");
+         // parse() splits its argument into segments at '/'.
+         assertThat(RbPath.parse("foo/bar")).hasSegments("foo", "bar");
+     }
+
+     @Test
+     public void testBadArgs() {
+         assertBadPath("", "empty path string");
+         assertBadPath("foo//bar", "empty path segment");
+         assertBadPath("foo/<bar/baz", "mismatched quoting");
+         assertBadPath("foo/\"bar", "mismatched quoting");
+         assertBadPath("foo/\"bar\"baz\"", "invalid character");
+         assertBadPath("foo/bar baz", "invalid character");
+     }
+
+     /**
+      * Asserts that parsing {@code path} throws an IllegalArgumentException whose message
+      * contains {@code errorSnippet}.
+      */
+     private static void assertBadPath(String path, String errorSnippet) {
+         IllegalArgumentException failure =
+             assertThrows(IllegalArgumentException.class, () -> RbPath.parse(path));
+         assertThat(failure).hasMessageThat().contains(errorSnippet);
+     }
+ }
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu;
+
+import static com.google.common.truth.Truth.assertThat;
+import static com.google.common.truth.Truth.assertWithMessage;
+import static com.google.common.truth.Truth8.assertThat;
+import static java.util.Arrays.asList;
+import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL;
+import static org.unicode.cldr.api.CldrValue.parseValue;
+
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.function.Function;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+import org.unicode.cldr.api.CldrDataSupplier;
+import org.unicode.cldr.api.CldrValue;
+import org.unicode.cldr.tool.LikelySubtags;
+import org.unicode.cldr.util.LanguageTagCanonicalizer;
+import org.unicode.cldr.util.LocaleIDParser;
+import org.unicode.cldr.util.SupplementalDataInfo;
+
+import com.google.common.base.Joiner;
+import com.google.common.collect.ImmutableSet;
+
+/**
+ * Unit tests for the supplemental data API. These tests either use fake data for unit testing, or
+ * compare behaviour between this API and the equivalent CLDR utility tool for regression testing.
+ */
+@RunWith(JUnit4.class)
+public class SupplementalDataTest {
+ private static SupplementalData regressionData;
+ private static LikelySubtags likelySubtags;
+
+ /**
+  * Loads the supplemental data from the CLDR release named by the {@code CLDR_DIR} system
+  * property, both via the API under test and via the CLDR tooling used as the regression
+  * baseline.
+  *
+  * @throws IllegalStateException if the {@code CLDR_DIR} system property is not set.
+  */
+ @BeforeClass
+ public static void loadRegressionData() {
+     String cldrDir = System.getProperty("CLDR_DIR");
+     if (cldrDir == null) {
+         // Fail with an actionable message rather than a bare NullPointerException.
+         throw new IllegalStateException(
+             "system property CLDR_DIR must be set to run these tests"
+                 + " (e.g. -DCLDR_DIR='<CLDR_DIR>')");
+     }
+     Path cldrRoot = Paths.get(cldrDir);
+     regressionData = SupplementalData
+         .create(CldrDataSupplier.forCldrFilesIn(cldrRoot).getDataForType(SUPPLEMENTAL));
+     SupplementalDataInfo sdi =
+         SupplementalDataInfo.getInstance(cldrRoot.resolve("common/supplemental").toString());
+     likelySubtags = new LikelySubtags(sdi);
+ }
+
+ @Test
+ public void testGetParent_explicit() {
+     // Locales with an explicit (non truncation) parent (a.k.a "English is weird").
+     CldrValue englishParents = parentLocales("en_001", "en_AU", "en_GB");
+     SupplementalData data = fakeSupplementalData(englishParents);
+
+     // Only the locales listed in the parentLocale element have an explicit parent.
+     assertThat(data.getExplicitParentLocaleOf("en_GB")).hasValue("en_001");
+     assertThat(data.getExplicitParentLocaleOf("en_AU")).hasValue("en_001");
+     assertThat(data.getExplicitParentLocaleOf("en_US")).isEmpty();
+     assertThat(data.getExplicitParentLocaleOf("en")).isEmpty();
+
+     // Every locale has a parent, whether it is explicit or not.
+     assertThat(data.getParent("en_GB")).isEqualTo("en_001");
+     assertThat(data.getParent("en_AU")).isEqualTo("en_001");
+     assertThat(data.getParent("en_001")).isEqualTo("en");
+     assertThat(data.getParent("en_US")).isEqualTo("en");
+     assertThat(data.getParent("en")).isEqualTo("root");
+ }
+
+ @Test
+ public void testGetParent_likelyScript() {
+     // To figure out default scripts we use likely subtags.
+     CldrValue chineseLikelySubtag = likelySubtag("zh", "zh_Hans_CN");
+     SupplementalData data = fakeSupplementalData(chineseLikelySubtag);
+
+     // When removing a non-default script, the parent becomes "root".
+     assertThat(data.getParent("zh_Hant")).isEqualTo("root");
+     // "Hans" is recognized as the default script, so the parent is obtained via truncation.
+     assertThat(data.getParent("zh_Hans")).isEqualTo("zh");
+ }
+
+ @Test
+ public void testMaximize() {
+     CldrValue english = likelySubtag("en", "en_Latn_US");
+     CldrValue portuguese = likelySubtag("pt", "pt_Latn_BR");
+     CldrValue undetermined = likelySubtag("und", "en_Latn_US");
+     SupplementalData data = fakeSupplementalData(english, portuguese, undetermined);
+
+     // You cannot maximize "root".
+     assertThat(data.maximize("root")).isEmpty();
+     // Existing subtags preserved.
+     assertThat(data.maximize("en")).hasValue("en_Latn_US");
+     assertThat(data.maximize("en_GB")).hasValue("en_Latn_GB");
+     assertThat(data.maximize("en_VARIANT")).hasValue("en_Latn_US_VARIANT");
+     // Some other similar examples.
+     assertThat(data.maximize("pt")).hasValue("pt_Latn_BR");
+     assertThat(data.maximize("pt_PT")).hasValue("pt_Latn_PT");
+     assertThat(data.maximize("und")).hasValue("en_Latn_US");
+ }
+
+ @Test
+ public void testReplaceDeprecatedTags_iAmRoot() {
+     // With no alias data at all, "root" is returned as-is.
+     SupplementalData emptyData = fakeSupplementalData();
+     String replaced = emptyData.replaceDeprecatedTags("root");
+     assertThat(replaced).isEqualTo("root");
+ }
+
+ @Test
+ public void testReplaceDeprecatedTags_sameSubtags() {
+     CldrValue english = likelySubtag("en", "en_Latn_US");
+     SupplementalData data = fakeSupplementalData(english);
+
+     // Replacement does not minimize or maximize results (even though "Latn" is likely).
+     assertThat(data.replaceDeprecatedTags("en_Latn_GB")).isEqualTo("en_Latn_GB");
+     assertThat(data.replaceDeprecatedTags("en_GB")).isEqualTo("en_GB");
+ }
+
+ @Test
+ public void testReplaceDeprecatedTags_subtagReplacement() {
+     CldrValue welsh = languageAlias("cym", "cy");
+     CldrValue inherited = scriptAlias("Qaai", "Zinh");
+     CldrValue yugoslavia = territoryAlias("YU", "RS");
+     SupplementalData data = fakeSupplementalData(welsh, inherited, yugoslavia);
+
+     // Region is deprecated
+     assertThat(data.replaceDeprecatedTags("en_YU")).isEqualTo("en_RS");
+     // Script is deprecated
+     assertThat(data.replaceDeprecatedTags("ar_Qaai_IR")).isEqualTo("ar_Zinh_IR");
+     // Language is deprecated
+     assertThat(data.replaceDeprecatedTags("cym_GB")).isEqualTo("cy_GB");
+ }
+
+ @Test
+ public void testReplaceDeprecatedTags_complex() {
+     SupplementalData data = fakeSupplementalData(
+         languageAlias("sh", "sr_Latn"),
+         languageAlias("zh_TW", "zh_Hant_TW"),
+         languageAlias("tzm_Latn_MA", "tzm_MA"),
+         territoryAlias("YU", "RS"),
+         likelySubtag("sr", "sr_Cyrl_RS"),
+         likelySubtag("zh_Hant", "zh_Hant_TW"));
+
+     // "sh" -> "sr_Latn", taking precedence over the fact that "sr" maximizes to "sr_Cyrl_RS".
+     String serboCroatian = data.replaceDeprecatedTags("sh_YU");
+     assertThat(serboCroatian).isEqualTo("sr_Latn_RS");
+
+     // Alias lookup can add tags however depending on the situation.
+     String taiwanChinese = data.replaceDeprecatedTags("zh_TW");
+     assertThat(taiwanChinese).isEqualTo("zh_Hant_TW");
+
+     // But it will NOT remove tags (even though the languageAlias table contains an entry from
+     // "tzm_Latn_MA" to "tzm_MA").
+     String tamazight = data.replaceDeprecatedTags("tzm_Latn_MA");
+     assertThat(tamazight).isEqualTo("tzm_Latn_MA");
+ }
+
+ @Test
+ public void testGetDefaultCalendar() {
+     SupplementalData data = fakeSupplementalData(
+         defaultCalendar("gregorian", "001"),
+         defaultCalendar("persian", "AF"),
+         likelySubtag("uz", "uz_Latn_UZ"),
+         likelySubtag("uz_AF", "uz_Arab_AF"),
+         likelySubtag("uz_Arab", "uz_Arab_AF"));
+
+     assertThat(data.getDefaultCalendar("root")).hasValue("gregorian");
+
+     // Empty because "gregorian" is the default found in the parent locale.
+     assertThat(data.getDefaultCalendar("en_US")).isEmpty();
+     assertThat(data.getDefaultCalendar("uz")).isEmpty();
+
+     assertThat(data.getDefaultCalendar("uz_AF")).hasValue("persian");
+     assertThat(data.getDefaultCalendar("uz_Arab")).hasValue("persian");
+
+     // Empty because "uz_Arab" defines the persian calendar.
+     assertThat(data.getDefaultCalendar("uz_Arab_AF")).isEmpty();
+ }
+
+ @Test
+ public void testGetDefaultCalendar_secretHacks() {
+     SupplementalData data = fakeSupplementalData(
+         defaultCalendar("gregorian", "001"),
+         likelySubtag("ja", "ja_Jpan_JP"),
+         likelySubtag("th", "th_Thai_TH"));
+
+     // Empty because "gregorian" is the default found in the parent locale.
+     assertThat(data.getDefaultCalendar("ja_US")).isEmpty();
+     assertThat(data.getDefaultCalendar("ja")).isEmpty();
+
+     // The territory-only calendar mapping in the CLDR data cannot express a traditional
+     // calendar for a region, so these cases are hard coded as "hacks" in SupplementalData.
+     // They could be moved into the configuration API, but ideally they would simply be
+     // derived from CLDR data directly.
+     assertThat(data.getDefaultCalendar("ja_JP_TRADITIONAL")).hasValue("japanese");
+     assertThat(data.getDefaultCalendar("ja_TRADITIONAL")).hasValue("japanese");
+     assertThat(data.getDefaultCalendar("th_TH_TRADITIONAL")).hasValue("buddhist");
+     assertThat(data.getDefaultCalendar("th_TRADITIONAL")).hasValue("buddhist");
+ }
+
+ /** Compares the parent chain of each test locale with the one given by the CLDR tooling. */
+ @Test
+ public void testGetParent_regression() {
+     for (String id : TEST_LOCALE_IDS) {
+         Iterable<String> actualChain = getIdChain(id, regressionData::getParent);
+         Iterable<String> expectedChain = getIdChain(id, LocaleIDParser::getParent);
+         assertWithMessage("id=%s", id).that(actualChain).isEqualTo(expectedChain);
+     }
+ }
+
+ @Test
+ public void testMaximize_regression() {
+ for (String id : TEST_LOCALE_IDS) {
+ assertWithMessage("id=%s", id)
+ .that(regressionData.maximize(id).orElse(null))
+ .isEqualTo(likelySubtags.maximize(id));
+ }
+
+ // ars currently a special case since it's in the ICU data as an alias, but not in the CLDR
+ // data at all. This while it's a structurally valid language code, it cannot be maximized.
+ assertThat(regressionData.maximize("ars")).isEmpty();
+ }
+
+ /**
+  * Compares deprecated tag replacement for each test locale with the result of the CLDR
+  * language tag canonicalizer. Both sides are maximized before being compared.
+  */
+ @Test
+ public void testReplaceDeprecatedTags_regression() {
+     LanguageTagCanonicalizer ltc = new LanguageTagCanonicalizer();
+     for (String id : TEST_LOCALE_IDS) {
+         // Canonicalize once and reuse the result for the comparison below.
+         String canonicalId;
+         // Work around:
+         // https://unicode-org.atlassian.net/projects/CLDR/issues/CLDR-13194
+         try {
+             canonicalId = ltc.transform(id);
+         } catch (NullPointerException e) {
+             // Skip (but report) IDs which the CLDR canonicalizer cannot process.
+             System.out.println("--> " + id);
+             continue;
+         }
+         // Need to maximize to work around:
+         // https://unicode-org.atlassian.net/projects/CLDR/issues/CLDR-13196
+         assertWithMessage("id=%s", id)
+             .that(regressionData.maximize(regressionData.replaceDeprecatedTags(id)).orElse(null))
+             .isEqualTo(likelySubtags.maximize(canonicalId));
+     }
+ }
+
+ /**
+  * Returns the sequence of locale IDs obtained by repeatedly applying {@code fn}, starting at
+  * {@code id} and ending with (and including) "root".
+  */
+ private static Iterable<String> getIdChain(String id, Function<String, String> fn) {
+     List<String> ids = new ArrayList<>();
+     String current = id;
+     while (true) {
+         ids.add(current);
+         if (current.equals("root")) {
+             // "root" is always the final element; the function is never applied to it.
+             return ids;
+         }
+         current = fn.apply(current);
+     }
+ }
+
+ private static final ImmutableSet<String> TEST_LOCALE_IDS = ImmutableSet.of(
+ "af", "af_NA", "af_ZA", "agq", "agq_CM", "ak", "ak_GH", "am", "am_ET", "ar", "ar_001",
+ "ar_AE", "ar_BH", "ar_DJ", "ar_DZ", "ar_EG", "ar_EH", "ar_ER", "ar_IL", "ar_IQ", "ar_JO",
+ "ar_KM", "ar_KW", "ar_LB", "ar_LY", "ar_MA", "ar_MR", "ar_OM", "ar_PS", "ar_QA", "ar_SA",
+ "ar_SD", "ar_SO", "ar_SS", "ar_SY", "ar_TD", "ar_TN", "ar_YE", "ars", "as", "as_IN",
+ "asa", "asa_TZ", "ast", "ast_ES", "az", "az_AZ", "az_Cyrl", "az_Cyrl_AZ", "az_Latn",
+ "az_Latn_AZ", "bas", "bas_CM", "be", "be_BY", "bem", "bem_ZM", "bez", "bez_TZ", "bg",
+ "bg_BG", "bm", "bm_ML", "bn", "bn_BD", "bn_IN", "bo", "bo_CN", "bo_IN", "br", "br_FR",
+ "brx", "brx_IN", "bs", "bs_Cyrl", "bs_Cyrl_BA", "bs_Latn", "bs_Latn_BA", "bs_BA", "ca",
+ "ca_AD", "ca_ES", "ca_FR", "ca_IT", "ccp", "ccp_BD", "ccp_IN", "ce", "ce_RU", "ceb",
+ "ceb_PH", "cgg", "cgg_UG", "chr", "chr_US", "ckb", "ckb_IQ", "ckb_IR", "cs", "cs_CZ", "cy",
+ "cy_GB", "da", "da_DK", "da_GL", "dav", "dav_KE", "de", "de_AT", "de_BE", "de_CH", "de_DE",
+ "de_IT", "de_LI", "de_LU", "dje", "dje_NE", "dsb", "dsb_DE", "dua", "dua_CM", "dyo",
+ "dyo_SN", "dz", "dz_BT", "ebu", "ebu_KE", "ee", "ee_GH", "ee_TG", "el", "el_CY", "el_GR",
+ "en", "en_001", "en_150", "en_AE", "en_AG", "en_AI", "en_AS", "en_AT", "en_AU", "en_BB",
+ "en_BE", "en_BI", "en_BM", "en_BS", "en_BW", "en_BZ", "en_CA", "en_CC", "en_CH", "en_CK",
+ "en_CM", "en_CX", "en_CY", "en_DE", "en_DG", "en_DK", "en_DM", "en_ER", "en_FI", "en_FJ",
+ "en_FK", "en_FM", "en_GB", "en_GD", "en_GG", "en_GH", "en_GI", "en_GM", "en_GU", "en_GY",
+ "en_HK", "en_IE", "en_IL", "en_IM", "en_IN", "en_IO", "en_JE", "en_JM", "en_KE", "en_KI",
+ "en_KN", "en_KY", "en_LC", "en_LR", "en_LS", "en_MG", "en_MH", "en_MO", "en_MP", "en_MS",
+ "en_MT", "en_MU", "en_MW", "en_MY", "en_NA", "en_NF", "en_NG", "en_NL", "en_NR", "en_NU",
+ "en_NZ", "en_PG", "en_PH", "en_PK", "en_PN", "en_PR", "en_PW", "en_RH", "en_RW", "en_SB",
+ "en_SC", "en_SD", "en_SE", "en_SG", "en_SH", "en_SI", "en_SL", "en_SS", "en_SX", "en_SZ",
+ "en_TC", "en_TK", "en_TO", "en_TT", "en_TV", "en_TZ", "en_UG", "en_UM", "en_US",
+ "en_US_POSIX", "en_VC", "en_VG", "en_VI", "en_VU", "en_WS", "en_ZA", "en_ZM", "en_ZW", "eo",
+ "eo_001", "es", "es_003", "es_419", "es_AR", "es_BO", "es_BR", "es_BZ", "es_CL", "es_CO",
+ "es_CR", "es_CU", "es_DO", "es_EA", "es_EC", "es_ES", "es_GQ", "es_GT", "es_HN", "es_IC",
+ "es_MX", "es_NI", "es_PA", "es_PE", "es_PH", "es_PR", "es_PY", "es_SV", "es_US", "es_UY",
+ "es_VE", "et", "et_EE", "eu", "eu_ES", "ewo", "ewo_CM", "fa", "fa_AF", "fa_IR", "ff",
+ "ff_CM", "ff_GN", "ff_Latn", "ff_Latn_BF", "ff_Latn_CM", "ff_Latn_GH", "ff_Latn_GM",
+ "ff_Latn_GN", "ff_Latn_GW", "ff_Latn_LR", "ff_Latn_MR", "ff_Latn_NE", "ff_Latn_NG",
+ "ff_Latn_SL", "ff_Latn_SN", "ff_MR", "ff_SN", "fi", "fi_FI", "fil", "fil_PH", "fo", "fo_DK",
+ "fo_FO", "fr", "fr_BE", "fr_BF", "fr_BI", "fr_BJ", "fr_BL", "fr_CA", "fr_CD", "fr_CF",
+ "fr_CG", "fr_CH", "fr_CI", "fr_CM", "fr_DJ", "fr_DZ", "fr_FR", "fr_GA", "fr_GF", "fr_GN",
+ "fr_GP", "fr_GQ", "fr_HT", "fr_KM", "fr_LU", "fr_MA", "fr_MC", "fr_MF", "fr_MG", "fr_ML",
+ "fr_MQ", "fr_MR", "fr_MU", "fr_NC", "fr_NE", "fr_PF", "fr_PM", "fr_RE", "fr_RW", "fr_SC",
+ "fr_SN", "fr_SY", "fr_TD", "fr_TG", "fr_TN", "fr_VU", "fr_WF", "fr_YT", "fur", "fur_IT",
+ "fy", "fy_NL", "ga", "ga_IE", "gd", "gd_GB", "gl", "gl_ES", "gsw", "gsw_CH", "gsw_FR",
+ "gsw_LI", "gu", "gu_IN", "guz", "guz_KE", "gv", "gv_IM", "ha", "ha_GH", "ha_NE", "ha_NG",
+ "haw", "haw_US", "he", "he_IL", "hi", "hi_IN", "hr", "hr_BA", "hr_HR", "hsb", "hsb_DE",
+ "hu", "hu_HU", "hy", "hy_AM", "ia", "ia_001", "id", "id_ID", "ig", "ig_NG", "ii", "ii_CN",
+ "in", "in_ID", "is", "is_IS", "it", "it_CH", "it_IT", "it_SM", "it_VA", "iw", "iw_IL", "ja",
+ "ja_JP", "jgo", "jgo_CM", "jmc", "jmc_TZ", "jv", "jv_ID", "ka", "ka_GE", "kab", "kab_DZ",
+ "kam", "kam_KE", "kde", "kde_TZ", "kea", "kea_CV", "khq", "khq_ML", "ki", "ki_KE", "kk",
+ "kk_KZ", "kkj", "kkj_CM", "kl", "kl_GL", "kln", "kln_KE", "km", "km_KH", "kn", "kn_IN",
+ "ko", "ko_KP", "ko_KR", "kok", "kok_IN", "ks", "ks_IN", "ksb", "ksb_TZ", "ksf", "ksf_CM",
+ "ksh", "ksh_DE", "ku", "ku_TR", "kw", "kw_GB", "ky", "ky_KG", "lag", "lag_TZ", "lb",
+ "lb_LU", "lg", "lg_UG", "lkt", "lkt_US", "ln", "ln_AO", "ln_CD", "ln_CF", "ln_CG", "lo",
+ "lo_LA", "lrc", "lrc_IQ", "lrc_IR", "lt", "lt_LT", "lu", "lu_CD", "luo", "luo_KE", "luy",
+ "luy_KE", "lv", "lv_LV", "mas", "mas_KE", "mas_TZ", "mer", "mer_KE", "mfe", "mfe_MU", "mg",
+ "mg_MG", "mgh", "mgh_MZ", "mgo", "mgo_CM", "mi", "mi_NZ", "mk", "mk_MK", "ml", "ml_IN",
+ "mn", "mn_MN", "mo", "mr", "mr_IN", "ms", "ms_BN", "ms_MY", "ms_SG", "mt", "mt_MT", "mua",
+ "mua_CM", "my", "my_MM", "mzn", "mzn_IR", "naq", "naq_NA", "nb", "nb_NO", "nb_SJ", "nd",
+ "nd_ZW", "nds", "nds_DE", "nds_NL", "ne", "ne_IN", "ne_NP", "nl", "nl_AW", "nl_BE", "nl_BQ",
+ "nl_CW", "nl_NL", "nl_SR", "nl_SX", "nmg", "nmg_CM", "nn", "nn_NO", "nnh", "nnh_CM", "no",
+ "no_NO", "nus", "nus_SS", "nyn", "nyn_UG", "om", "om_ET", "om_KE", "or", "or_IN", "os",
+ "os_GE", "os_RU", "pa", "pa_Arab", "pa_Arab_PK", "pa_Guru", "pa_Guru_IN", "pa_IN", "pa_PK",
+ "pl", "pl_PL", "ps", "ps_AF", "ps_PK", "pt", "pt_AO", "pt_BR", "pt_CH", "pt_CV", "pt_GQ",
+ "pt_GW", "pt_LU", "pt_MO", "pt_MZ", "pt_PT", "pt_ST", "pt_TL", "qu", "qu_BO", "qu_EC",
+ "qu_PE", "rm", "rm_CH", "rn", "rn_BI", "ro", "ro_MD", "ro_RO", "rof", "rof_TZ", "ru",
+ "ru_BY", "ru_KG", "ru_KZ", "ru_MD", "ru_RU", "ru_UA", "rw", "rw_RW", "rwk", "rwk_TZ", "sah",
+ "sah_RU", "saq", "saq_KE", "sbp", "sbp_TZ", "sd", "sd_PK", "se", "se_FI", "se_NO", "se_SE",
+ "seh", "seh_MZ", "ses", "ses_ML", "sg", "sg_CF", "sh", "sh_BA", "sh_CS", "sh_YU", "shi",
+ "shi_Latn", "shi_Latn_MA", "shi_Tfng", "shi_Tfng_MA", "shi_MA", "si", "si_LK", "sk",
+ "sk_SK", "sl", "sl_SI", "smn", "smn_FI", "sn", "sn_ZW", "so", "so_DJ", "so_ET", "so_KE",
+ "so_SO", "sq", "sq_AL", "sq_MK", "sq_XK", "sr", "sr_Cyrl", "sr_Cyrl_BA", "sr_Cyrl_ME",
+ "sr_Cyrl_RS", "sr_Cyrl_CS", "sr_Cyrl_XK", "sr_Cyrl_YU", "sr_Latn", "sr_Latn_BA",
+ "sr_Latn_ME", "sr_Latn_RS", "sr_Latn_CS", "sr_Latn_XK", "sr_Latn_YU", "sr_BA", "sr_ME",
+ "sr_RS", "sr_CS", "sr_YU", "sv", "sv_AX", "sv_FI", "sv_SE", "sw", "sw_CD", "sw_KE", "sw_TZ",
+ "sw_UG", "ta", "ta_IN", "ta_LK", "ta_MY", "ta_SG", "te", "te_IN", "teo", "teo_KE", "teo_UG",
+ "tg", "tg_TJ", "th", "th_TH", "ti", "ti_ER", "ti_ET", "tk", "tk_TM", "tl", "tl_PH", "to",
+ "to_TO", "tr", "tr_CY", "tr_TR", "tt", "tt_RU", "twq", "twq_NE", "tzm", "tzm_MA", "ug",
+ "ug_CN", "uk", "uk_UA", "ur", "ur_IN", "ur_PK", "uz", "uz_AF", "uz_Arab", "uz_Arab_AF",
+ "uz_Cyrl", "uz_Cyrl_UZ", "uz_Latn", "uz_Latn_UZ", "uz_UZ", "vai", "vai_Latn", "vai_Latn_LR",
+ "vai_LR", "vai_Vaii", "vai_Vaii_LR", "vi", "vi_VN", "vun", "vun_TZ", "wae", "wae_CH", "wo",
+ "wo_SN", "xh", "xh_ZA", "xog", "xog_UG", "yav", "yav_CM", "yi", "yi_001", "yo", "yo_BJ",
+ "yo_NG", "yue", "yue_Hans", "yue_Hans_CN", "yue_Hant", "yue_Hant_HK", "zgh", "zgh_MA", "zh",
+ "zh_Hans", "zh_Hans_CN", "zh_Hans_HK", "zh_Hans_MO", "zh_Hans_SG", "zh_Hant", "zh_Hant_HK",
+ "zh_Hant_MO", "zh_Hant_TW", "zh_CN", "zh_HK", "zh_MO", "zh_SG", "zh_TW", "zu", "zu_ZA");
+
+ /** Returns a fake "parentLocale" value; the given locales are joined with spaces. */
+ private static CldrValue parentLocales(String parent, String... locales) {
+     String localeList = Joiner.on(' ').join(locales);
+     return supplementalData(
+         "parentLocales/parentLocale[@parent=\"%s\"][@locales=\"%s\"]", parent, localeList);
+ }
+
+ /** Returns a fake "calendarPreference" value; the given territories are joined with spaces. */
+ private static CldrValue defaultCalendar(String calendar, String... territories) {
+     String territoryList = Joiner.on(' ').join(territories);
+     return supplementalData(
+         "calendarPreferenceData/calendarPreference[@territories=\"%s\"][@ordering=\"%s\"]",
+         territoryList,
+         calendar);
+ }
+
+ /** Returns a fake "likelySubtag" value with the given "from" and "to" attributes. */
+ private static CldrValue likelySubtag(String from, String to) {
+     String template = "likelySubtags/likelySubtag[@from=\"%s\"][@to=\"%s\"]";
+     return supplementalData(template, from, to);
+ }
+
+ /** Returns a fake "languageAlias" value with the given type and replacement. */
+ private static CldrValue languageAlias(String type, String replacement) {
+     String template = "metadata/alias/languageAlias[@type=\"%s\"][@replacement=\"%s\"]";
+     return supplementalData(template, type, replacement);
+ }
+
+ /** Returns a fake "scriptAlias" value with the given type and replacement. */
+ private static CldrValue scriptAlias(String type, String replacement) {
+     String template = "metadata/alias/scriptAlias[@type=\"%s\"][@replacement=\"%s\"]";
+     return supplementalData(template, type, replacement);
+ }
+
+ /** Returns a fake "territoryAlias" value with the given type and replacement. */
+ private static CldrValue territoryAlias(String type, String replacement) {
+     String template = "metadata/alias/territoryAlias[@type=\"%s\"][@replacement=\"%s\"]";
+     return supplementalData(template, type, replacement);
+ }
+
+ /**
+  * Formats {@code path} (prefixed with "//supplementalData/") using {@code args} and parses
+  * the result as a CLDR value with an empty value string.
+  */
+ private static CldrValue supplementalData(String path, Object... args) {
+     String fullPath = String.format("//supplementalData/" + path, args);
+     return parseValue(fullPath, "");
+ }
+
+ /** Returns supplemental data created from only the given fake CLDR values. */
+ private static SupplementalData fakeSupplementalData(CldrValue... values) {
+     List<CldrValue> valueList = asList(values);
+     return SupplementalData.create(CldrDataSupplier.forValues(valueList));
+ }
+}
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.regex;
+
+import static com.google.common.truth.Truth.assertThat;
+import static java.util.Arrays.asList;
+import static org.unicode.icu.tool.cldrtoicu.testing.AssertUtils.assertThrows;
+import static org.unicode.icu.tool.cldrtoicu.testing.ResultSubjectFactory.assertThat;
+
+import java.util.List;
+
+import javax.annotation.concurrent.Immutable;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+import org.unicode.cldr.api.CldrPath;
+import org.unicode.cldr.api.CldrValue;
+import org.unicode.icu.tool.cldrtoicu.PathValueTransformer;
+import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.Result;
+import org.unicode.icu.tool.cldrtoicu.RbPath;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Iterables;
+
+/**
+ * Tests for the regex transformer class. Note that in most cases, the rules used here are taken
+ * directly from one of the config files, simply because it avoids having to invent valid paths
+ * for testing (and we still need "real" CLDR paths since the path parsing verifies attributes
+ * against the DTD metadata). Basing tests on real rules illustrates that all of these tests are
+ * asserting about relied-upon behaviour, however there is nothing inherently special about these
+ * paths.
+ */
+@RunWith(JUnit4.class)
+public class RegexTransformerTest {
+ @Test
+ public void testSingleResults_singleCapture() {
+     PathValueTransformer rules = transformer(
+         "%A=[^\"']++",
+         "%W=[\\w\\-]++",
+         "//ldml/numbers/defaultNumberingSystem[@alt=\"(%A)\"] ; /NumberElements/default_$1",
+         "//ldml/numbers/defaultNumberingSystem ; /NumberElements/default",
+         "//ldml/numbers/otherNumberingSystems/(%W) ; /NumberElements/$1");
+
+     // A rule with no captured groups produces a fixed output path.
+     ImmutableList<Result> plain = rules.transform(
+         CldrValue.parseValue("//ldml/numbers/defaultNumberingSystem", "foobar"));
+     assertSingleResult(plain, "NumberElements/default", "foobar");
+
+     // A captured attribute value is substituted into the output path.
+     ImmutableList<Result> alt = rules.transform(
+         CldrValue.parseValue("//ldml/numbers/defaultNumberingSystem[@alt=\"foo\"]", "bar"));
+     assertSingleResult(alt, "NumberElements/default_foo", "bar");
+
+     // A captured element name is substituted into the output path.
+     ImmutableList<Result> other = rules.transform(
+         CldrValue.parseValue("//ldml/numbers/otherNumberingSystems/finance", "foo bar"));
+     assertSingleResult(other, "NumberElements/finance", "foo bar");
+ }
+
+ @Test
+ public void testSingleResults_multipleCapture() {
+     PathValueTransformer rules = transformer(
+         "%A=[^\"']++",
+         "//ldml/characters"
+             + "/parseLenients[@scope=\"(%A)\"][@level=\"(%A)\"]"
+             + "/parseLenient[@sample=\"%A\"]"
+             + " ; /parse/$1/$2");
+
+     // Scope and level are captured; the sample attribute is matched but not captured.
+     String lenientPath =
+         "//ldml/characters"
+             + "/parseLenients[@scope=\"general\"][@level=\"lenient\"]"
+             + "/parseLenient[@sample=\"ignored\"]";
+     ImmutableList<Result> lenient = rules.transform(CldrValue.parseValue(lenientPath, "foo"));
+     assertSingleResult(lenient, "/parse/general/lenient", "foo");
+
+     String stricterPath =
+         "//ldml/characters"
+             + "/parseLenients[@scope=\"number\"][@level=\"stricter\"]"
+             + "/parseLenient[@sample=\"ignored\"]";
+     ImmutableList<Result> stricter = rules.transform(CldrValue.parseValue(stricterPath, "bar"));
+     assertSingleResult(stricter, "/parse/number/stricter", "bar");
+ }
+
+ @Test
+ public void testMultipleResults() {
+     // One input path followed by three output rules, each of which produces its own result.
+     PathValueTransformer rules = transformer(
+         "%A=[^\"']++",
+         "%W=[\\s\\w\\-/]++",
+         "//supplementalData/numberingSystems"
+             + "/numberingSystem[@type=\"numeric\"][@id=\"(%W)\"][@digits=\"(%A)\"]",
+         " ; /numberingSystems/$1/algorithmic:int ; values=0",
+         " ; /numberingSystems/$1/desc ; values=$2",
+         " ; /numberingSystems/$1/radix:int ; values=10");
+
+     String path =
+         "//supplementalData/numberingSystems"
+             + "/numberingSystem[@type=\"numeric\"][@id=\"foo\"][@digits=\"bar\"]";
+     ImmutableList<Result> results = rules.transform(CldrValue.parseValue(path, ""));
+     assertThat(results).hasSize(3);
+
+     Result algorithmic = results.get(0);
+     assertThat(algorithmic).hasKey("/numberingSystems/foo/algorithmic:int");
+     assertThat(algorithmic).hasValues("0");
+     assertThat(algorithmic).isGrouped(false);
+
+     Result desc = results.get(1);
+     assertThat(desc).hasKey("/numberingSystems/foo/desc");
+     assertThat(desc).hasValues("bar");
+     assertThat(desc).isGrouped(false);
+
+     Result radix = results.get(2);
+     assertThat(radix).hasKey("/numberingSystems/foo/radix:int");
+     assertThat(radix).hasValues("10");
+     assertThat(radix).isGrouped(false);
+ }
+
+ /**
+  * Checks that a captured argument containing spaces produces one result per space separated
+  * part, and that quoting a placeholder in the output path suppresses this for that argument.
+  */
+ @Test
+ public void testImplicitArgumentSplitting() {
+     PathValueTransformer transformer = transformer(
+         "%A=[^\"']++",
+         "%W=[\\s\\w\\-/]++",
+         "//supplementalData/gender/personList[@type=\"(%W)\"][@locales=\"(%W)\"]"
+             + " ; /genderList/$2 ; values=$1",
+         "//supplementalData/windowsZones/mapTimezones"
+             + "/mapZone[@type=\"(%A)\"][@other=\"(%A)\"][@territory=\"(%W)\"]"
+             + " ; /mapTimezones/\"$2\"/$3 ; values=\"$1\"");
+
+     // Implicit splitting is based on the first unquoted placeholder in the output path ($2 in
+     // this case) and not the first captured group of the input path.
+     CldrValue personList = CldrValue.parseValue(
+         "//supplementalData/gender/personList[@type=\"neutral\"][@locales=\"xx yy zz\"]", "");
+     ImmutableList<Result> results = transformer.transform(personList);
+     assertThat(results).hasSize(3);
+     assertThat(results.get(0)).hasKey("/genderList/xx");
+     assertThat(results.get(0)).hasValues("neutral");
+     assertThat(results.get(1)).hasKey("/genderList/yy");
+     assertThat(results.get(1)).hasValues("neutral");
+     assertThat(results.get(2)).hasKey("/genderList/zz");
+     assertThat(results.get(2)).hasValues("neutral");
+
+     // Quoting prevents the first captured argument with spaces from triggering multiple
+     // results (it will trigger on the first un-quoted argument in the output path). This
+     // quoting must appear in the output however since spaces are "structural" in paths in
+     // ICU data files.
+     CldrValue mapZone = CldrValue.parseValue(
+         "//supplementalData/windowsZones/mapTimezones/mapZone"
+             + "[@type=\"foo\"]"
+             + "[@other=\"not split\"]"
+             + "[@territory=\"XX YY ZZ\"]",
+         "");
+     results = transformer.transform(mapZone);
+     assertThat(results).hasSize(3);
+     // Each result is checked for both its own key and its own value.
+     assertThat(results.get(0)).hasKey("/mapTimezones/\"not split\"/XX");
+     assertThat(results.get(0)).hasValues("foo");
+     assertThat(results.get(1)).hasKey("/mapTimezones/\"not split\"/YY");
+     assertThat(results.get(1)).hasValues("foo");
+     assertThat(results.get(2)).hasKey("/mapTimezones/\"not split\"/ZZ");
+     assertThat(results.get(2)).hasValues("foo");
+ }
+
+ @Test
+ public void testValueSplitting() {
+     PathValueTransformer rules = transformer(
+         "%A=[^\"']++",
+         "%W=[\\s\\w\\-/]++",
+         "//supplementalData/parentLocales/parentLocale[@parent=\"(%A)\"][@locales=\"(%A)\"]"
+             + " ; /parentLocales/$1 ; values=$2",
+         "//supplementalData/windowsZones/mapTimezones"
+             + "/mapZone[@type=\"(%A)\"][@other=\"(%A)\"][@territory=\"(%W)\"]"
+             + " ; /mapTimezones/\"$2\"/$3 ; values=\"$1\"");
+
+     // Because the value is expressed via an explicit values instruction, it is split by space.
+     String parentLocalePath =
+         "//supplementalData/parentLocales"
+             + "/parentLocale[@parent=\"foo\"][@locales=\"value is split\"]";
+     ImmutableList<Result> split = rules.transform(CldrValue.parseValue(parentLocalePath, ""));
+     assertSingleResult(split, "/parentLocales/foo", "value", "is", "split");
+
+     // However if a placeholder is quoted in the value instruction, it is not split.
+     String mapZonePath =
+         "//supplementalData/windowsZones/mapTimezones/mapZone"
+             + "[@type=\"value is not split\"]"
+             + "[@other=\"foo\"]"
+             + "[@territory=\"XX\"]";
+     ImmutableList<Result> unsplit = rules.transform(CldrValue.parseValue(mapZonePath, ""));
+     assertSingleResult(unsplit, "/mapTimezones/\"foo\"/XX", "value is not split");
+ }
+
+ @Test
+ public void testResultFunctionCalling() {
+     List<String> rules = asList(
+         "%A=[^\"']++",
+         "%W=[\\s\\w\\-/]++",
+         "//supplementalData/numberingSystems"
+             + "/numberingSystem[@type=\"(%W)\"][@id=\"(%W)\"][@rules=\"(%A)\"]",
+         " ; /numberingSystems/foo ; values=&swap( $1 , $2 ) $3",
+         " ; /numberingSystems/bar ; values=\"&swap( $1, quux )\"",
+         " ; /numberingSystems/baz ; values=\"&swap( $1-$2, $3{value} )\"");
+
+     String path =
+         "//supplementalData/numberingSystems"
+             + "/numberingSystem[@type=\"foo\"][@id=\"bar\"][@rules=\"baz\"]";
+     CldrValue input = CldrValue.parseValue(path, "-VALUE");
+
+     // Swapping two arguments and joining them with a space is a trivial function, but it
+     // shows that the output of a function is still subject to value splitting unless it is
+     // quoted. In fact a common function (&ymd) splits year/month/day strings using spaces
+     // exactly so that they are treated as separate values.
+     // Note also that the spaces around the arguments to the function are ignored.
+     NamedFunction swap =
+         NamedFunction.create("swap", 2, args -> args.get(1) + " " + args.get(0));
+     PathValueTransformer transformer = RegexTransformer.fromConfigLines(rules, swap);
+     ImmutableList<Result> results = transformer.transform(input);
+
+     assertThat(results).hasSize(3);
+     // Unquoted: the function output and "$3" are split into separate values.
+     assertThat(results.get(0)).hasValues("bar", "foo", "baz");
+     // Quoted: the function output stays a single value.
+     assertThat(results.get(1)).hasValues("quux foo");
+     assertThat(results.get(2)).hasValues("baz-VALUE foo-bar");
+ }
+
+ @Test
+ public void testResultFunctionCalling_edgeCases() {
+     List<String> rules = asList(
+         "%A=[^\"']++",
+         "%W=[\\s\\w\\-/]++",
+         "//supplementalData/numberingSystems"
+             + "/numberingSystem[@type=\"(%W)\"][@id=\"(%W)\"][@rules=\"(%A)\"]",
+         " ; /numberingSystems/foo ; values=\"&join( {value} , $1 $2 $3, {value} )\"");
+
+     // This illustrates a fundamental problem with the way that quoting and splitting is
+     // defined in this config language. Splitting is always done after value substitution,
+     // which is just done as a single pass. Thus, if a value has a double-quote in it, it can
+     // upset the quoting behaviour in odd ways. Here it prevents the outermost quoting from
+     // working and results in multiple values where there should be one.
+     //
+     // To fix this, the implicit splitting should be replaced by a "split()" function and the
+     // rules should be parsed into something approximating a proper expression AST.
+     String path =
+         "//supplementalData/numberingSystems"
+             + "/numberingSystem[@type=\"foo\"][@id=\"bar\"][@rules=\"baz\"]";
+     CldrValue valueWithQuote = CldrValue.parseValue(path, "<< \" >>");
+
+     NamedFunction join =
+         NamedFunction.create("join", 3, args -> args.get(0) + args.get(1) + args.get(2));
+     PathValueTransformer transformer = RegexTransformer.fromConfigLines(rules, join);
+     ImmutableList<Result> results = transformer.transform(valueWithQuote);
+     // If outer quoting worked, this would be a single value, not five.
+     assertSingleResult(results, "/numberingSystems/foo", "<< ", ">>foo", "bar", "baz<<", " >>");
+ }
+
+ @Test
+ public void testDynamicVars() {
+     // %D is defined as a CLDR path (not a regex); the lookup function passed to transform()
+     // is expected to be asked to resolve that path.
+     PathValueTransformer underTest = transformer(
+         "%W=[\\w\\-]++",
+         "%D=//ldml/numbers/defaultNumberingSystem",
+         "//ldml/numbers/currencyFormats[@numberSystem=\"%D\"]/currencySpacing/(%W)/(%W)",
+         " ; /currencySpacing/$1/$2");
+     CldrValue spacing = CldrValue.parseValue(
+         "//ldml/numbers/currencyFormats[@numberSystem=\"latn\"]"
+             + "/currencySpacing/beforeCurrency/currencyMatch",
+         "format");
+
+     // The only path the dynamic variable function should be asked to resolve.
+     CldrPath lookupPath =
+         CldrPath.parseDistinguishingPath("//ldml/numbers/defaultNumberingSystem");
+     ImmutableList<Result> results = underTest.transform(spacing, requested -> {
+         assertThat(requested).isEqualTo(lookupPath);
+         return "latn";
+     });
+
+     assertSingleResult(results, "/currencySpacing/beforeCurrency/currencyMatch", "format");
+ }
+
+ @Test
+ public void testFallbacks_simple() {
+     // Two distinct rules which write to the same resource bundle path and declare the same
+     // fallback value.
+     PathValueTransformer underTest = transformer(
+         "%W=[\\w\\-/]++",
+         "//ldml/numbers/currencies/currency[@type=\"(%W)\"]/symbol"
+             + " ; /Currencies/$1 ; fallback=$1",
+         "//ldml/numbers/currencies/currency[@type=\"(%W)\"]/displayName"
+             + " ; /Currencies/$1 ; fallback=$1");
+
+     CldrValue symbolValue = CldrValue.parseValue(
+         "//ldml/numbers/currencies/currency[@type=\"Foo\"]/symbol", "symbol");
+     ImmutableList<Result> symbol = underTest.transform(symbolValue);
+     assertSingleResult(symbol, "Currencies/Foo", "symbol");
+
+     CldrValue nameValue = CldrValue.parseValue(
+         "//ldml/numbers/currencies/currency[@type=\"Foo\"]/displayName", "name");
+     ImmutableList<Result> name = underTest.transform(nameValue);
+     assertSingleResult(name, "Currencies/Foo", "name");
+
+     RbPath rbPath = RbPath.of("Currencies", "Foo");
+     ImmutableList<Result> fallbacks = underTest.getFallbackResultsFor(rbPath, p -> null);
+     assertThat(fallbacks).hasSize(2);
+
+     Result firstFallback = fallbacks.get(0);
+     Result secondFallback = fallbacks.get(1);
+     Result symbolResult = symbol.get(0);
+     Result nameResult = name.get(0);
+
+     // Both fallbacks look like they are equal, but they didn't come from the same rule...
+     assertThat(firstFallback).hasKey(rbPath);
+     assertThat(firstFallback).hasValues("Foo");
+     assertThat(secondFallback).hasKey(rbPath);
+     assertThat(secondFallback).hasValues("Foo");
+
+     // ... so they correspond to different matched results.
+     assertThat(firstFallback.isFallbackFor(symbolResult)).isTrue();
+     assertThat(secondFallback.isFallbackFor(symbolResult)).isFalse();
+
+     assertThat(firstFallback.isFallbackFor(nameResult)).isFalse();
+     assertThat(secondFallback.isFallbackFor(nameResult)).isTrue();
+
+     // And they are ordered by their appearance in the configuration file.
+     assertThat(firstFallback).isLessThan(secondFallback);
+
+     // BUT (and this is important) the fallback results are "equal". This is necessary for
+     // other situations where results are generated from different rules but should be
+     // considered "equal" for purposes of deduplication. Deduplication doesn't affect this
+     // situation though (but it's worth being explicit in this test). This is all a bit
+     // subtle and should be fixed properly at some point. See also "testBaseXpath()".
+     assertThat(firstFallback).isEqualTo(secondFallback);
+ }
+
+ @Test
+ public void testFallbacks_multipleArgs() {
+     // NOTE(review): the path pattern below references %A, but only %W is defined in this
+     // config. Presumably harmless since only the fallback is exercised here; confirm.
+     PathValueTransformer underTest = transformer(
+         "%W=[\\s\\w\\-/]++",
+         "//supplementalData/calendarData"
+             + "/calendar[@type=\"(%W)\"]/eras/era[@type=\"(%W)\"][@(start|end)=\"(%A)\"]",
+         " ; /fake/$2/$4/$1/$3 ; fallback=$1 $2 $3 $4 $3 $2 $1");
+
+     // Path elements are named after the $N indices so the reordering is easy to follow.
+     RbPath rbPath = RbPath.of("fake", "two", "four", "one", "three");
+     ImmutableList<Result> fallbacks = underTest.getFallbackResultsFor(rbPath, p -> null);
+
+     // The arguments captured from the resource bundle path must be put back into $N order
+     // for the fallback value. This many reordered arguments in a fallback does not really
+     // happen in the actual config files currently, but the logic is complex and needs to be
+     // tested. Note also that a captured argument can appear multiple times in the result.
+     assertSingleResult(
+         fallbacks, rbPath, "one", "two", "three", "four", "three", "two", "one");
+ }
+
+ @Test
+ public void testFallbacks_valueSplitting() {
+     PathValueTransformer underTest = transformer(
+         "%A=[^\"']++",
+         "//supplementalData/likelySubtags/likelySubtag[@from=\"(%A)\"][@to=\"(%A)\"]",
+         " ; /fake/$1/$2 ; fallback=$1 and $2");
+
+     // The fallback "$1 and $2" is expected to come back as three separate values.
+     RbPath target = RbPath.of("fake", "Foo", "Bar");
+     assertSingleResult(
+         underTest.getFallbackResultsFor(target, p -> null), target, "Foo", "and", "Bar");
+ }
+
+ @Test
+ public void testFallbacks_missingArgs() {
+     // The fallback uses $2, which does not appear in the resource bundle path "/$1", so
+     // building the transformer is expected to fail.
+     IllegalStateException failure = assertThrows(
+         IllegalStateException.class,
+         () -> transformer(
+             "%A=[^\"']++",
+             "//supplementalData/likelySubtags/likelySubtag[@from=\"(%A)\"][@to=\"(%A)\"]",
+             " ; /$1 ; fallback=$2"));
+
+     // A bit brittle, but this message is important for debugging.
+     assertThat(failure)
+         .hasMessageThat()
+         .contains("fallback values may only contain arguments from the resource bundle path");
+     assertThat(failure)
+         .hasMessageThat()
+         .contains("$2");
+ }
+
+ @Test
+ public void testFallbacks_noValueSubstitution() {
+     PathValueTransformer underTest = transformer(
+         "%A=[^\"']++",
+         "//supplementalData/likelySubtags/likelySubtag[@from=\"(%A)\"][@to=\"(%A)\"]",
+         " ; /$1 ; fallback=$1-{value}");
+
+     RbPath target = RbPath.of("Foo");
+     // The {value} token is not substituted in a fallback because there is no value.
+     // TODO: Make this into an error (since it's only ever going to happen by mistake)!
+     assertSingleResult(
+         underTest.getFallbackResultsFor(target, p -> null), target, "Foo-{value}");
+ }
+
+ @Test
+ public void testFallbacks_noQuotingSupport() {
+     PathValueTransformer underTest = transformer(
+         "%A=[^\"']++",
+         "//supplementalData/likelySubtags/likelySubtag[@from=\"(%A)\"][@to=\"(%A)\"]",
+         " ; /fake/$1 ; fallback=\"$1\"");
+
+     RbPath target = RbPath.of("fake", "Foo");
+     // Fallbacks could support quoting of placeholders, but to match legacy behaviour they
+     // don't yet, so the quotes are kept as part of the value. As it is, you cannot prevent
+     // fallback values being split on spaces.
+     assertSingleResult(
+         underTest.getFallbackResultsFor(target, p -> null), target, "\"Foo\"");
+ }
+
+ @Test
+ public void testHiddenLabelsAndMetazones() {
+     // Common prefix of every CLDR path (and path pattern) used in this test.
+     String zoneInfo = "//supplementalData/metaZones/metazoneInfo";
+     PathValueTransformer underTest = transformer(
+         "%A=[^\"']++",
+         "%W=[\\s\\w\\-/]++",
+         zoneInfo
+             + "/timezone[@type=\"(%W)\"]/usesMetazone[@mzone=\"(%W)\"]"
+             + " ; /metazoneInfo/\"$1\"/<$2> ; values=$2",
+         zoneInfo
+             + "/timezone[@type=\"(%W)\"]/usesMetazone[@to=\"(%A)\"][@mzone=\"(%W)\"]"
+             + " ; /metazoneInfo/\"$1\"/<1970-01-01 00:00> ; values=$3 \"1970-01-01 00:00\" \"$2\"");
+
+     ImmutableList<Result> parisResults = transformPath(
+         underTest,
+         zoneInfo + "/timezone[@type=\"Europe/Paris\"]/usesMetazone[@mzone=\"Europe_Central\"]");
+
+     // The conversion from "Europe/Paris" to "Europe:Paris" is a built in special case when
+     // quoting values with '/' in. It's only actually necessary for these timezone
+     // identifiers, but the code is applied everywhere since that's easier. Ideally there'd
+     // be something like the function calling mechanism to make this transformation explicit,
+     // but at the moment, the output resource bundle paths have no way to control the
+     // transformation of substituted arguments, so it has to be built in.
+     assertSingleResult(
+         parisResults, "/metazoneInfo/\"Europe:Paris\"/<Europe_Central>", "Europe_Central");
+
+     ImmutableList<Result> londonResults = transformPath(
+         underTest,
+         zoneInfo
+             + "/timezone[@type=\"Europe/London\"]"
+             + "/usesMetazone[@to=\"1971-10-31 02:00\"][@mzone=\"Europe_Central\"]");
+
+     // This example demonstrates that things like ' ' or ':' (normally prohibited in
+     // resource bundle path elements) are acceptable in hidden labels, since those will be
+     // stripped out while writing the resulting data file. The date-time values are quoted in
+     // the rule to ensure they are not split.
+     assertSingleResult(
+         londonResults,
+         "/metazoneInfo/\"Europe:London\"/<1970-01-01 00:00>",
+         "Europe_Central", "1970-01-01 00:00", "1971-10-31 02:00");
+ }
+
+ @Test
+ public void testBaseXpath() {
+     // In the real data, these rules define multiple results which reflect the actual
+     // differences in the child elements, but the one tested is only based on the
+     // <territory> path prefix, which is the same for many child elements (which is all
+     // that's ever actually transformed).
+     //
+     // So for a single path prefix you'll generate multiple identical results which need
+     // to be de-duplicated, which can only happen if they are considered to have come
+     // from the same source (since duplicate results happen all the time in general).
+     //
+     // This is what the base xpath does, it fakes a different source CLDR path which makes
+     // the results "equal" (even though they came from different CLDR path sources).
+     String territoryPattern =
+         "//supplementalData/territoryInfo"
+             + "/territory[@type=\"(%W)\"][@gdp=\"(%N)\"][@literacyPercent=\"(%N)\"][@population=\"(%N)\"]";
+     // Both rules share exactly the same output specification.
+     String outputSpec =
+         " ; /territoryInfo/$1/territoryF:intvector"
+             + " ; values=$2 $3 $4"
+             + " ; base_xpath=//supplementalData/territoryInfo/territory[@type=\"$1\"]";
+
+     PathValueTransformer underTest = transformer(
+         "%W=[\\s\\w\\-/]++",
+         "%N=[\\d\\.]++",
+         territoryPattern
+             + "/languagePopulation[@type=\"(%W)\"][@populationPercent=\"(%N)\"]",
+         outputSpec,
+         // Same thing but with child element containing "writingPercent".
+         territoryPattern
+             + "/languagePopulation[@type=\"(%W)\"][@writingPercent=\"(%N)\"][@populationPercent=\"(%N)\"]",
+         outputSpec);
+
+     String territoryPrefix =
+         "//supplementalData/territoryInfo"
+             + "/territory[@type=\"CI\"][@gdp=\"97160000000\"][@literacyPercent=\"57\"][@population=\"26260600\"]";
+
+     ImmutableList<Result> withoutWriting = transformPath(
+         underTest,
+         territoryPrefix + "/languagePopulation[@type=\"kfo\"][@populationPercent=\"0.3\"]");
+     ImmutableList<Result> withWriting = transformPath(
+         underTest,
+         territoryPrefix
+             + "/languagePopulation[@type=\"sef\"][@writingPercent=\"5\"][@populationPercent=\"4\"]");
+
+     String expectedKey = "/territoryInfo/CI/territoryF:intvector";
+     assertSingleResult(withoutWriting, expectedKey, "97160000000", "57", "26260600");
+     assertSingleResult(withWriting, expectedKey, "97160000000", "57", "26260600");
+
+     // Even though they come from different rules, these results are treated as
+     // interchangeably equal because the base path is the same. Without the base path this
+     // would not be equal.
+     assertThat(withoutWriting).isEqualTo(withWriting);
+ }
+
+ @Test
+ public void testResultGrouping() {
+     // Only the second rule carries the "group" instruction.
+     PathValueTransformer underTest = transformer(
+         "%W=[\\w\\-/]++",
+         "//ldml/numbers/currencies/currency[@type=\"(%W)\"]/symbol ; /Currencies/$1",
+         "//ldml/numbers/currencies/currency[@type=\"(%W)\"]/decimal ; /Currencies/$1 ; group");
+
+     Result symbolResult = transformSingleResult(
+         underTest, "//ldml/numbers/currencies/currency[@type=\"USD\"]/symbol", "$");
+     Result decimalResult = transformSingleResult(
+         underTest, "//ldml/numbers/currencies/currency[@type=\"USD\"]/decimal", ".");
+
+     // Note that grouping is important for some data, but isn't very interesting at the basic
+     // transformation level (it's just a bit). It's only interesting when the converter
+     // combines multiple results together.
+     assertThat(symbolResult).isGrouped(false);
+     assertThat(decimalResult).isGrouped(true);
+ }
+
+ // Builds a transformer from the given config lines, with no extra named functions.
+ private static PathValueTransformer transformer(String... configLines) {
+     List<String> lines = asList(configLines);
+     return RegexTransformer.fromConfigLines(lines);
+ }
+
+ // Transforms the given CLDR path paired with an empty value (for tests where only the
+ // path attributes matter).
+ private static ImmutableList<Result> transformPath(
+     PathValueTransformer transformer, String cldrPath) {
+
+     CldrValue emptyValue = CldrValue.parseValue(cldrPath, "");
+     return transformer.transform(emptyValue);
+ }
+
+ // Transforms a single path/value pair, asserts that exactly one result was produced and
+ // returns it.
+ private static Result transformSingleResult(
+     PathValueTransformer transformer, String path, String value) {
+
+     CldrValue cldrValue = CldrValue.parseValue(path, value);
+     ImmutableList<Result> transformed = transformer.transform(cldrValue);
+     assertThat(transformed).hasSize(1);
+     return transformed.get(0);
+ }
+
+ // Asserts that there is exactly one result, that it is ungrouped, and that it has the
+ // expected resource bundle path and values.
+ private static void assertSingleResult(List<Result> results, RbPath path, String... values) {
+     assertThat(results).hasSize(1);
+     Result only = results.get(0);
+     assertThat(only).isGrouped(false);
+     assertThat(only).hasKey(path);
+     assertThat(only).hasValues(values);
+ }
+
+ // Convenience overload which parses the expected resource bundle path from a string.
+ private static void assertSingleResult(List<Result> results, String path, String... values) {
+     RbPath expectedPath = RbPath.parse(path);
+     assertSingleResult(results, expectedPath, values);
+ }
+}
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.testing;
+
+import static org.junit.Assert.fail;
+
+/** Static assertion helpers (some of which can be removed if JUnit version is updated). */
+public final class AssertUtils {
+    /**
+     * Functional interface acting as a lambda target for code which is expected to throw.
+     *
+     * @param <T> the type of throwable the code is declared to throw.
+     */
+    @FunctionalInterface
+    public interface CheckedRunnable<T extends Throwable> {
+        void run() throws T;
+    }
+
+    /**
+     * Asserts that an exception of the given type is thrown by a given runnable.
+     *
+     * @param cls the expected type of the thrown exception (subtypes are accepted).
+     * @param fn the code which is expected to throw.
+     * @return the thrown exception, cast to the expected type, for further assertions.
+     * @throws AssertionError if nothing is thrown, or if something which is not an instance
+     *     of {@code cls} is thrown (in which case it is attached as the cause).
+     */
+    public static <T extends Throwable> T assertThrows(Class<T> cls, CheckedRunnable<T> fn) {
+        try {
+            fn.run();
+        } catch (Throwable t) {
+            if (cls.isInstance(t)) {
+                return cls.cast(t);
+            }
+            // Chain the unexpected throwable as the cause so that its message and stack trace
+            // are not lost (it may even be an unrelated assertion failure from inside "fn").
+            throw new AssertionError(
+                "expected " + cls.getName() + " but got " + t.getClass().getName(), t);
+        }
+        throw new AssertionError("expected " + cls.getName() + " but nothing was thrown");
+    }
+
+    private AssertUtils() {}
+}
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.testing;
+
+import static com.google.common.base.Preconditions.checkArgument;
+
+import com.google.common.truth.FailureMetadata;
+import com.google.common.truth.Subject;
+import org.unicode.icu.tool.cldrtoicu.RbPath;
+
+/** Truth subject for making assertions about {@link RbPath} instances. */
+public final class RbPathSubject extends Subject {
+    /** Returns a factory for this subject, for use when chaining from other subjects. */
+    public static Subject.Factory<RbPathSubject, RbPath> rbPaths() {
+        return (metadata, path) -> new RbPathSubject(metadata, path);
+    }
+
+    // The path under test.
+    private final RbPath path;
+
+    protected RbPathSubject(FailureMetadata metadata, RbPath actual) {
+        super(metadata, actual);
+        this.path = actual;
+    }
+
+    /** Asserts the value of the path, as segments (use this if a segment can contain '/'). */
+    public final void hasSegments(String... segments) {
+        RbPath expected = RbPath.of(segments);
+        check("<segments>").that(path).isEqualTo(expected);
+    }
+
+    /**
+     * Asserts that the path has the given length.
+     *
+     * @param n the expected length, which must not be negative.
+     */
+    public final void hasLength(int n) {
+        checkArgument(n >= 0, "invalid path length: %s", n);
+        check("length()")
+            .that(path.length())
+            .isEqualTo(n);
+    }
+}
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.testing;
+
+import com.google.common.truth.FailureMetadata;
+import com.google.common.truth.Subject;
+import com.google.common.truth.Truth;
+import org.unicode.icu.tool.cldrtoicu.RbPath;
+
+/**
+ * Truth subject factory for resource bundle paths, providing the static {@code assertThat()}
+ * entry point used by tests (makes tests much more readable).
+ */
+public final class RbPathSubjectFactory implements Subject.Factory<RbPathSubject, RbPath> {
+    RbPathSubjectFactory() {}
+
+    /** Starts an assertion about the given resource bundle path. */
+    public static RbPathSubject assertThat(RbPath result) {
+        RbPathSubjectFactory factory = new RbPathSubjectFactory();
+        return Truth.assertAbout(factory).that(result);
+    }
+
+    @Override
+    public RbPathSubject createSubject(FailureMetadata failureMetadata, RbPath that) {
+        return new RbPathSubject(failureMetadata, that);
+    }
+}
\ No newline at end of file
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.testing;
+
+import static com.google.common.base.Preconditions.checkNotNull;
+
+import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.Result;
+import org.unicode.icu.tool.cldrtoicu.RbPath;
+
+import com.google.common.truth.ComparableSubject;
+import com.google.common.truth.FailureMetadata;
+import com.google.common.truth.IterableSubject;
+import com.google.common.truth.Subject;
+
+/** Truth subject for making assertions about transformation {@link Result} instances. */
+public final class ResultSubject extends ComparableSubject<Result> {
+    /** Returns a factory for this subject, for use when chaining from other subjects. */
+    public static Subject.Factory<ResultSubject, Result> results() {
+        return ResultSubject::new;
+    }
+
+    private final Result actual;
+
+    // Note that the result under test must not be null (this is checked eagerly).
+    protected ResultSubject(FailureMetadata metadata, Result result) {
+        super(metadata, checkNotNull(result));
+        this.actual = result;
+    }
+
+    /** Asserts whether the result is grouped or not. */
+    public final void isGrouped(boolean grouped) {
+        check("isGrouped()").that(actual.isGrouped()).isEqualTo(grouped);
+    }
+
+    /** Returns a subject for making arbitrary assertions about the values of the result. */
+    public final IterableSubject hasValueListThat() {
+        return check("getValues()").that(actual.getValues());
+    }
+
+    /**
+     * Asserts that the result has exactly the given values, in the given order.
+     *
+     * <p>Order is checked explicitly because tests use this method to verify that arguments
+     * are reordered correctly; an order-insensitive comparison would let such bugs through.
+     */
+    public final void hasValues(String... values) {
+        hasValueListThat().containsExactlyElementsIn(values).inOrder();
+    }
+
+    /** Returns a subject for making arbitrary assertions about the key of the result. */
+    public final RbPathSubject hasKeyThat() {
+        return check("getKey()").about(RbPathSubject.rbPaths()).that(actual.getKey());
+    }
+
+    /** Asserts that the key of the result is equal to the given resource bundle path. */
+    public final void hasKey(RbPath path) {
+        hasKeyThat().isEqualTo(path);
+    }
+
+    /** Asserts that the key of the result is equal to the path parsed from the given string. */
+    public final void hasKey(String path) {
+        hasKey(RbPath.parse(path));
+    }
+}
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.testing;
+
+import com.google.common.truth.FailureMetadata;
+import com.google.common.truth.Subject;
+import com.google.common.truth.Truth;
+import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.Result;
+
+/**
+ * Truth subject factory for transformation results, providing the static {@code assertThat()}
+ * entry point used by tests (makes tests much more readable).
+ */
+public final class ResultSubjectFactory implements Subject.Factory<ResultSubject, Result> {
+    /** Starts an assertion about the given transformation result. */
+    public static ResultSubject assertThat(Result result) {
+        return Truth.assertAbout(new ResultSubjectFactory()).that(result);
+    }
+
+    @Override
+    public ResultSubject createSubject(FailureMetadata failureMetadata, Result that) {
+        return new ResultSubject(failureMetadata, that);
+    }
+
+    // Instances are only ever created via assertThat().
+    private ResultSubjectFactory() {}
+}
\ No newline at end of file