</classpath>
</taskdef>
<convert cldrDir="${cldrDir}" outputDir="${outDir}" specialsDir="${specialsDir}"
- minimalDraftStatus="${minDraftStatus}" emitReport="${emitReport}">
+ outputTypes="${outputTypes}" minimalDraftStatus="${minDraftStatus}" emitReport="${emitReport}">
- <!-- It is not at all clear why this is being done (we expect "sr_Latn_ME" normally).
- TODO: Find out and document this properly. -->
- <forcedAlias dir="coll" source="sr_ME" target="sr_Cyrl_ME"/>
-
- <!-- This appears to be a hack to avoid needing to copy and maintain the same "zh"
- data for "yue". The files for "yue" in this directory should be empty otherwise.
- The maximized versions of "yue_Hans" is "yue_Hans_CN" (vs "zh_Hans_CN"), and for
- "yue" it's "yue_Hant_HK" (vs "zh_Hant_HK"), so the aliases are effectively just
- rewriting the base language. -->
- <forcedAlias dir="coll" source="yue_Hans" target="zh_Hans"/>
- <forcedAlias dir="coll" source="yue" target="zh_Hant"/>
+ <!-- The primary set of locale IDs to be generated by default. The IDs in this list are
+ automatically expanded to include default scripts and all available regions. The
+ rules are:
- <!-- It is not at all clear why this is being done. It's certainly not exactly the same
- as above, since (a) the alias is reversed (b) "zh_Hant" does exist, with different
- data than "yue", so this alias is not just rewriting the base language.
- TODO: Find out and document this properly. -->
- <forcedAlias dir="rbnf" source="zh_Hant_HK" target="yue"/>
+ 1) Base languages are expanded to include default scripts (e.g. "en" -> "en_Latn").
+ 2) All region and variant subtags are added for any base language or language+script
+ (e.g. "en" -> "en_GB" or "shi_Latn" -> "shi_Latn_MA").
- <!-- The primary set of locale IDs to be generated. Other, directory specific, sets exist
- and do not have to be subsets of this. Some of these ID are aliases, so XML files
- may not exist for all of them. -->
- <!-- TODO: Add locale ID inference to reduce this list considerably. -->
- <localeIds dirs="curr,lang,locales,region,unit,zone">
- root,
+ If a non-default script is desired it should be listed explicitly (e.g. "sr_Latn").
+ Locale IDs with deprecated subtags (which become aliases) must still be listed in
+ full (e.g. "en_RH" or "sr_Latn_YU").
+ -->
+ <localeIds>
// A
- af, af_NA, af_ZA, agq, agq_CM, ak, ak_GH, am, am_ET, ar, ar_001,
- ar_AE, ar_BH, ar_DJ, ar_DZ, ar_EG, ar_EH, ar_ER, ar_IL, ar_IQ,
- ar_JO, ar_KM, ar_KW, ar_LB, ar_LY, ar_MA, ar_MR, ar_OM, ar_PS,
- ar_QA, ar_SA, ar_SD, ar_SO, ar_SS, ar_SY, ar_TD, ar_TN, ar_YE, ars,
- as, as_IN, asa, asa_TZ, ast, ast_ES, az, az_AZ, az_Cyrl, az_Cyrl_AZ,
- az_Latn, az_Latn_AZ,
+ af, agq, agq_CM, ak, am, ar, ars, as, asa, asa_TZ, ast, ast_ES, az, az_AZ, az_Cyrl
// B
- bas, bas_CM, be, be_BY, bem, bem_ZM, bez, bez_TZ, bg, bg_BG, bm,
- bm_ML, bn, bn_BD, bn_IN, bo, bo_CN, bo_IN, br, br_FR, brx, brx_IN,
- bs, bs_Cyrl, bs_Cyrl_BA, bs_Latn, bs_Latn_BA, bs_BA,
+ bas, bas_CM, be, bem, bem_ZM, bez, bez_TZ, bg, bm, bn, bo, br, brx, brx_IN, bs, bs_BA
+ bs_Cyrl
// C
- ca, ca_AD, ca_ES, ca_FR, ca_IT, ccp, ccp_BD, ccp_IN, ce, ce_RU,
- ceb, ceb_PH, cgg, cgg_UG, chr, chr_US, ckb, ckb_IQ, ckb_IR, cs,
- cs_CZ, cy, cy_GB,
+ ca, ccp, ccp_BD, ccp_IN, ce, ceb, ceb_PH, cgg, cgg_UG, chr, chr_US, ckb, ckb_IQ, ckb_IR, cs, cy
// D
- da, da_DK, da_GL, dav, dav_KE, de, de_AT, de_BE, de_CH, de_DE,
- de_IT, de_LI, de_LU, dje, dje_NE, dsb, dsb_DE, dua, dua_CM, dyo,
- dyo_SN, dz, dz_BT,
+ da, dav, dav_KE, de, dje, dje_NE, dsb, dsb_DE, dua, dua_CM, dyo, dyo_SN, dz
// E
- ebu, ebu_KE, ee, ee_GH, ee_TG, el, el_CY, el_GR, en, en_001,
- en_150, en_AE, en_AG, en_AI, en_AS, en_AT, en_AU, en_BB, en_BE,
- en_BI, en_BM, en_BS, en_BW, en_BZ, en_CA, en_CC, en_CH, en_CK,
- en_CM, en_CX, en_CY, en_DE, en_DG, en_DK, en_DM, en_ER, en_FI,
- en_FJ, en_FK, en_FM, en_GB, en_GD, en_GG, en_GH, en_GI, en_GM,
- en_GU, en_GY, en_HK, en_IE, en_IL, en_IM, en_IN, en_IO, en_JE,
- en_JM, en_KE, en_KI, en_KN, en_KY, en_LC, en_LR, en_LS, en_MG,
- en_MH, en_MO, en_MP, en_MS, en_MT, en_MU, en_MW, en_MY, en_NA,
- en_NF, en_NG, en_NH, en_NL, en_NR, en_NU, en_NZ, en_PG, en_PH,
- en_PK, en_PN, en_PR, en_PW, en_RH, en_RW, en_SB, en_SC, en_SD,
- en_SE, en_SG, en_SH, en_SI, en_SL, en_SS, en_SX, en_SZ, en_TC,
- en_TK, en_TO, en_TT, en_TV, en_TZ, en_UG, en_UM, en_US, en_US_POSIX,
- en_VC, en_VG, en_VI, en_VU, en_WS, en_ZA, en_ZM, en_ZW, eo,
- eo_001, es, es_419, es_AR, es_BO, es_BR, es_BZ, es_CL, es_CO,
- es_CR, es_CU, es_DO, es_EA, es_EC, es_ES, es_GQ, es_GT, es_HN,
- es_IC, es_MX, es_NI, es_PA, es_PE, es_PH, es_PR, es_PY, es_SV,
- es_US, es_UY, es_VE, et, et_EE, eu, eu_ES, ewo, ewo_CM,
+ ebu, ebu_KE, ee, el, en, en_NH, en_RH, eo, es, et, eu, ewo, ewo_CM
// F
- fa, fa_AF, fa_IR, ff, ff_CM, ff_GN, ff_Latn, ff_Latn_BF, ff_Latn_CM,
- ff_Latn_GH, ff_Latn_GM, ff_Latn_GN, ff_Latn_GW, ff_Latn_LR, ff_Latn_MR,
- ff_Latn_NE, ff_Latn_NG, ff_Latn_SL, ff_Latn_SN, ff_MR, ff_SN, fi,
- fi_FI, fil, fil_PH, fo, fo_DK, fo_FO, fr, fr_BE, fr_BF, fr_BI,
- fr_BJ, fr_BL, fr_CA, fr_CD, fr_CF, fr_CG, fr_CH, fr_CI, fr_CM,
- fr_DJ, fr_DZ, fr_FR, fr_GA, fr_GF, fr_GN, fr_GP, fr_GQ, fr_HT,
- fr_KM, fr_LU, fr_MA, fr_MC, fr_MF, fr_MG, fr_ML, fr_MQ, fr_MR,
- fr_MU, fr_NC, fr_NE, fr_PF, fr_PM, fr_RE, fr_RW, fr_SC, fr_SN,
- fr_SY, fr_TD, fr_TG, fr_TN, fr_VU, fr_WF, fr_YT, fur, fur_IT,
- fy, fy_NL,
+ fa, ff, ff_CM, ff_GN, ff_MR, ff_SN, fi, fil, fil_PH, fo, fr, fur, fur_IT, fy
// G
- ga, ga_IE, gd, gd_GB, gl, gl_ES, gsw, gsw_CH, gsw_FR, gsw_LI,
- gu, gu_IN, guz, guz_KE, gv, gv_IM,
+ ga, gd, gl, gsw, gsw_CH, gsw_FR, gsw_LI, gu, guz, guz_KE, gv
// H
- ha, ha_GH, ha_NE, ha_NG, haw, haw_US, he, he_IL, hi, hi_IN,
- hr, hr_BA, hr_HR, hsb, hsb_DE, hu, hu_HU, hy, hy_AM,
+ ha, haw, haw_US, he, hi, hr, hsb, hsb_DE, hu, hy
// I
- ia, ia_001, id, id_ID, ig, ig_NG, ii, ii_CN, in, in_ID, is,
- is_IS, it, it_CH, it_IT, it_SM, it_VA, iw, iw_IL,
+ ia, id, ig, ii, in, in_ID, is, it, iw, iw_IL
// J
- ja, ja_JP, ja_JP_TRADITIONAL, jgo, jgo_CM, jmc, jmc_TZ, jv, jv_ID,
+ ja, jgo, jgo_CM, jmc, jmc_TZ, jv
// K
- ka, ka_GE, kab, kab_DZ, kam, kam_KE, kde, kde_TZ, kea, kea_CV,
- khq, khq_ML, ki, ki_KE, kk, kk_KZ, kkj, kkj_CM, kl, kl_GL, kln,
- kln_KE, km, km_KH, kn, kn_IN, ko, ko_KP, ko_KR, kok, kok_IN,
- ks, ks_IN, ksb, ksb_TZ, ksf, ksf_CM, ksh, ksh_DE, ku, ku_TR,
- kw, kw_GB, ky, ky_KG,
+ ka, kab, kab_DZ, kam, kam_KE, kde, kde_TZ, kea, kea_CV, khq, khq_ML, ki, kk, kkj, kkj_CM, kl
+ kln, kln_KE, km, kn, ko, kok, kok_IN, ks, ksb, ksb_TZ, ksf, ksf_CM, ksh, ksh_DE, ku, kw
+ ky
// L
- lag, lag_TZ, lb, lb_LU, lg, lg_UG, lkt, lkt_US, ln, ln_AO,
- ln_CD, ln_CF, ln_CG, lo, lo_LA, lrc, lrc_IQ, lrc_IR, lt, lt_LT,
- lu, lu_CD, luo, luo_KE, luy, luy_KE, lv, lv_LV,
+ lag, lag_TZ, lb, lg, lkt, lkt_US, ln, lo, lrc, lrc_IQ, lrc_IR, lt, lu, luo, luo_KE, luy
+ luy_KE, lv
// M
- mas, mas_KE, mas_TZ, mer, mer_KE, mfe, mfe_MU, mg, mg_MG, mgh,
- mgh_MZ, mgo, mgo_CM, mi, mi_NZ, mk, mk_MK, ml, ml_IN, mn,
- mn_MN, mo, mr, mr_IN, ms, ms_BN, ms_MY, ms_SG, mt, mt_MT, mua,
- mua_CM, my, my_MM, mzn, mzn_IR,
+ mas, mas_KE, mas_TZ, mer, mer_KE, mfe, mfe_MU, mg, mgh, mgh_MZ, mgo, mgo_CM, mi, mk, ml, mn
+ mo, mr, ms, mt, mua, mua_CM, my, mzn, mzn_IR
// N
- naq, naq_NA, nb, nb_NO, nb_SJ, nd, nd_ZW, nds, nds_DE, nds_NL,
- ne, ne_IN, ne_NP, nl, nl_AW, nl_BE, nl_BQ, nl_CW, nl_NL, nl_SR,
- nl_SX, nmg, nmg_CM, nn, nn_NO, nnh, nnh_CM, no, no_NO, no_NO_NY,
- nus, nus_SS, nyn, nyn_UG,
+ naq, naq_NA, nb, nd, nds, nds_DE, nds_NL, ne, nl, nmg, nmg_CM, nn, nnh, nnh_CM, no, no_NO
+ no_NO_NY, nus, nus_SS, nyn, nyn_UG
// O
- om, om_ET, om_KE, or, or_IN, os, os_GE, os_RU,
+ om, or, os
// P
- pa, pa_Arab, pa_Arab_PK, pa_Guru, pa_Guru_IN, pa_IN, pa_PK, pl,
- pl_PL, ps, ps_AF, ps_PK, pt, pt_AO, pt_BR, pt_CH, pt_CV, pt_GQ,
- pt_GW, pt_LU, pt_MO, pt_MZ, pt_PT, pt_ST, pt_TL,
+ pa, pa_Arab, pa_IN, pa_PK, pl, ps, pt
// Q
- qu, qu_BO, qu_EC, qu_PE,
+ qu
// R
- rm, rm_CH, rn, rn_BI, ro, ro_MD, ro_RO, rof, rof_TZ, ru,
- ru_BY, ru_KG, ru_KZ, ru_MD, ru_RU, ru_UA, rw, rw_RW, rwk, rwk_TZ,
+ rm, rn, ro, rof, rof_TZ, ru, rw, rwk, rwk_TZ
// S
- sah, sah_RU, saq, saq_KE, sbp, sbp_TZ, sd, sd_PK, se, se_FI,
- se_NO, se_SE, seh, seh_MZ, ses, ses_ML, sg, sg_CF, sh, sh_BA,
- sh_CS, sh_YU, shi, shi_Latn, shi_Latn_MA, shi_Tfng, shi_Tfng_MA,
- shi_MA, si, si_LK, sk, sk_SK, sl, sl_SI, smn, smn_FI, sn, sn_ZW,
- so, so_DJ, so_ET, so_KE, so_SO, sq, sq_AL, sq_MK, sq_XK, sr,
- sr_Cyrl, sr_Cyrl_BA, sr_Cyrl_ME, sr_Cyrl_RS, sr_Cyrl_CS, sr_Cyrl_XK,
- sr_Cyrl_YU, sr_Latn, sr_Latn_BA, sr_Latn_ME, sr_Latn_RS, sr_Latn_CS,
- sr_Latn_XK, sr_Latn_YU, sr_BA, sr_ME, sr_RS, sr_CS, sr_XK, sr_YU,
- sv, sv_AX, sv_FI, sv_SE, sw, sw_CD, sw_KE, sw_TZ, sw_UG,
+ sah, sah_RU, saq, saq_KE, sbp, sbp_TZ, sd, se, seh, seh_MZ, ses, ses_ML, sg, sh, sh_BA, sh_CS
+ sh_YU, shi, shi_Latn, shi_Latn_MA, shi_MA, shi_Tfng, shi_Tfng_MA, si, sk, sl, smn, smn_FI, sn, so, sq, sr
+ sr_BA, sr_CS, sr_Cyrl_CS, sr_Cyrl_YU, sr_Latn, sr_Latn_CS, sr_Latn_YU, sr_ME, sr_RS, sr_XK, sr_YU, sv, sw
// T
- ta, ta_IN, ta_LK, ta_MY, ta_SG, te, te_IN, teo, teo_KE, teo_UG,
- tg, tg_TJ, th, th_TH, th_TH_TRADITIONAL, ti, ti_ER, ti_ET, tk,
- tk_TM, tl, tl_PH, to, to_TO, tr, tr_CY, tr_TR, tt, tt_RU,
- twq, twq_NE, tzm, tzm_MA,
+ ta, te, teo, teo_KE, teo_UG, tg, th, ti, tk, tl, tl_PH, to, tr, tt, twq, twq_NE
+ tzm, tzm_MA
// U
- ug, ug_CN, uk, uk_UA, ur, ur_IN, ur_PK, uz, uz_AF, uz_Arab,
- uz_Arab_AF, uz_Cyrl, uz_Cyrl_UZ, uz_Latn, uz_Latn_UZ, uz_UZ,
+ ug, uk, ur, uz, uz_AF, uz_Arab, uz_Cyrl, uz_UZ
// V
- vai, vai_Latn, vai_Latn_LR, vai_LR, vai_Vaii, vai_Vaii_LR, vi,
- vi_VN, vun, vun_TZ,
+ vai, vai_LR, vai_Latn, vai_Latn_LR, vai_Vaii, vai_Vaii_LR, vi, vun, vun_TZ
// W
- wae, wae_CH, wo, wo_SN,
+ wae, wae_CH, wo
// X
- xh, xh_ZA, xog, xog_UG,
+ xh, xog, xog_UG
// Y
- yav, yav_CM, yi, yi_001, yo, yo_BJ, yo_NG, yue, yue_CN, yue_HK,
- yue_Hans, yue_Hans_CN, yue_Hant, yue_Hant_HK,
+ yav, yav_CM, yi, yo, yue, yue_CN, yue_HK, yue_Hans, yue_Hans_CN, yue_Hant, yue_Hant_HK
// Z
- zgh, zgh_MA, zh, zh_Hans, zh_Hans_CN, zh_Hans_HK, zh_Hans_MO,
- zh_Hans_SG, zh_Hant, zh_Hant_HK, zh_Hant_MO, zh_Hant_TW, zh_CN,
- zh_HK, zh_MO, zh_SG, zh_TW, zu, zu_ZA
+ zgh, zgh_MA, zh, zh_CN, zh_HK, zh_Hant, zh_MO, zh_SG, zh_TW, zu
</localeIds>
+ <!-- The following elements configure directories in which a subset of the available
+ locale IDs should be generated. Unlike the main <localeIds> element, these
+ filters must specify all locale IDs in full (but since they mostly select base
+ languages, this isn't a big deal). -->
<!-- TODO: Explain why these special cases are needed/different. -->
- <localeIds dirs="coll">
+
+ <directoryFilter dir="coll">
root,
// A-B
// U-Z
ug, uk, ur, uz, vi, wae, wo, xh, yi, yo, yue_CN, yue_Hans,
yue, zh_CN, zh_Hant, zh_HK, zh_MO, zh_SG, zh_TW, zh, zu
- </localeIds>
+ </directoryFilter>
- <localeIds dirs="rbnf">
+ <directoryFilter dir="rbnf">
root,
// A-E
// Q-Z
qu, ro, ru, se, sh, sk, sl, sq, sr_Latn, sr, sv, sw, ta, th, tr,
uk, vi, yue_Hans, yue, zh_Hant_HK, zh_Hant, zh_HK, zh_MO, zh_TW, zh
- </localeIds>
+ </directoryFilter>
- <localeIds dirs="brkitr">
+ <directoryFilter dir="brkitr">
root,
de, el, en, en_US_POSIX, en_US, es, fr, it, ja, pt, ru, zh_Hant, zh
- </localeIds>
+ </directoryFilter>
+
+ <!-- The following elements configure some very special case locale alias behaviour,
+ mainly to support situations where the natural alias relationship is not wanted
+ for a particular type of data. -->
+
+ <!-- GLOBAL ALIASES -->
+
+ <!-- Some spoken languages (e.g. "ars") inherit all their data from a written language
+ (e.g. "ar_SA"). However CLDR doesn't currently support a way to represent that
+ relationship. Unlike deprecated languages for which an alias can be inferred from
+ the "languageAlias" element, there's no way in CLDR to represent the fact that we
+ want "ars" (a non-deprecated language) to inherit the data of "ar_SA".
+
+ This alias is the first example of potentially many cases where ICU needs to
+ generate an alias in order to effect "sideways inheritance" for spoken languages,
+ and at some stage it should be supported properly in the CLDR data. -->
+ <forcedAlias source="ars" target="ar_SA"/>
+
+ <!-- A legacy global alias (note that "no_NO_NY" is not even structurally valid). -->
+ <forcedAlias source="no_NO_NY" target="nn_NO"/>
+
+ <!-- PER-DIRECTORY ALIASES (these are really special cases) -->
+
+ <!-- It is not at all clear why this is being done (we expect "sr_Latn_ME" normally). -->
+ <!-- TODO: Find out and document this properly. -->
+ <forcedAlias dir="coll" source="sr_ME" target="sr_Cyrl_ME"/>
+
+ <!-- This alias is to avoid needing to copy and maintain the same "zh" data for "yue".
+ The maximized version of "yue_Hans" is "yue_Hans_CN" (vs "zh_Hans_CN"), and for
+ "yue" it's "yue_Hant_HK" (vs "zh_Hant_HK"), so the aliases are effectively just
+ rewriting the base language.
+
+ This is similar to the case for "ars"/"ar_SA" but it is not done globally, since
+ CLDR data does exist for "yue" and "yue_Hans" which is NOT the same as "zh_Hant"
+ and "zh_Hans"/"zh". This mapping is a bit more of a "hack" for the purposes of
+ reducing data duplication in ICU. -->
+ <forcedAlias dir="coll" source="yue_Hans" target="zh_Hans"/>
+ <forcedAlias dir="coll" source="yue" target="zh_Hant"/>
+
+ <!-- It is not at all clear why this is being done. It's certainly not exactly the same
+ as above, since (a) the alias is reversed (b) "zh_Hant" does exist, with different
+ data than "yue", so this alias is not just rewriting the base language. -->
+ <!-- TODO: Find out and document this properly. -->
+ <forcedAlias dir="rbnf" source="zh_Hant_HK" target="yue"/>
</convert>
</target>
-</project>
\ No newline at end of file
+</project>
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
-import java.util.Map;
import java.util.Optional;
import java.util.Set;
import org.unicode.cldr.api.CldrDraftStatus;
import org.unicode.icu.tool.cldrtoicu.LdmlConverter.OutputType;
+import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.ImmutableSetMultimap;
import com.google.common.collect.ImmutableTable;
* that was configured by text files such as "icu-locale-deprecates.xml" and "icu-config.
*/
public final class IcuConverterConfig implements LdmlConverterConfig {
-
- private static final Optional<Path> DEFAULT_CLDR_DIR =
- Optional.ofNullable(System.getProperty("CLDR_DIR", null))
- .map(d -> Paths.get(d).toAbsolutePath());
-
private static final Optional<Path> DEFAULT_ICU_DIR =
Optional.ofNullable(System.getProperty("ICU_DIR", null))
.map(d -> Paths.get(d).toAbsolutePath());
/** The builder with which to specify configuration for the {@link LdmlConverter}. */
@SuppressWarnings("UnusedReturnValue")
public static final class Builder {
- private Path cldrDir = DEFAULT_CLDR_DIR.orElse(null);
private Path outputDir =
DEFAULT_ICU_DIR.map(d -> d.resolve("icu4c/source/data")).orElse(null);
private Path specialsDir =
DEFAULT_ICU_DIR.map(d -> d.resolve("icu4c/source/data/xml")).orElse(null);
private ImmutableSet<OutputType> outputTypes = OutputType.ALL;
- private CldrDraftStatus minimalDraftStatus = CldrDraftStatus.CONTRIBUTED;
+ private CldrDraftStatus minimumDraftStatus = CldrDraftStatus.CONTRIBUTED;
private boolean emitReport = false;
private final SetMultimap<IcuLocaleDir, String> localeIdsMap = TreeMultimap.create();
private final Table<IcuLocaleDir, String, String> forcedAliases = TreeBasedTable.create();
- /**
- * Sets the CLDR base directory from which to load all CLDR data. This is optional if the
- * {@code CLDR_DIR} environment variable is set, which will be used instead.
- */
- public Builder setCldrDir(Path cldrDir) {
- this.cldrDir = checkNotNull(cldrDir.toAbsolutePath());
- return this;
- }
-
/**
* Sets the output directory in which the ICU data directories and files will go. This is
* optional if the {@code ICU_DIR} system property is set, which will be used to generate
return this;
}
- /**
- * Sets the minimum draft status for CLDR data to be converted (paths below this status are
- * ignored during conversion). This is optional and defaults to {@link
- * CldrDraftStatus#CONTRIBUTED}.
- */
- public Builder setMinimalDraftStatus(CldrDraftStatus minimalDraftStatus) {
- this.minimalDraftStatus = checkNotNull(minimalDraftStatus);
- return this;
+ public void setMinimumDraftStatus(CldrDraftStatus minimumDraftStatus) {
+ this.minimumDraftStatus = checkNotNull(minimumDraftStatus);
}
public Builder setEmitReport(boolean emitReport) {
}
}
- private final Path cldrDir;
private final Path outputDir;
private final Path specialsDir;
private final ImmutableSet<OutputType> outputTypes;
- private final CldrDraftStatus minimalDraftStatus;
+ private final CldrDraftStatus minimumDraftStatus;
private final boolean emitReport;
+ private final ImmutableSet<String> allLocaleIds;
private final ImmutableSetMultimap<IcuLocaleDir, String> localeIdsMap;
private final ImmutableTable<IcuLocaleDir, String, String> forcedAliases;
private IcuConverterConfig(Builder builder) {
- this.cldrDir = checkNotNull(builder.cldrDir,
- "must set a CLDR directory, or the CLDR_DIR system property");
- if (DEFAULT_CLDR_DIR.isPresent() && !this.cldrDir.equals(DEFAULT_CLDR_DIR.get())) {
- System.err.format(
- "Warning: Specified CLDR base directory does not appear to match the"
- + " directory inferred by the 'CLDR_DIR' system property.\n"
- + "Specified: %s\n"
- + "Inferred: %s\n",
- this.cldrDir, DEFAULT_CLDR_DIR.get());
- }
this.outputDir = checkNotNull(builder.outputDir);
checkArgument(!Files.isRegularFile(outputDir),
"specified output directory if not a directory: %s", outputDir);
checkArgument(!this.outputTypes.isEmpty(),
"must specify at least one output type to be generated (possible values are: %s)",
Arrays.asList(OutputType.values()));
- this.minimalDraftStatus = builder.minimalDraftStatus;
+ this.minimumDraftStatus = checkNotNull(builder.minimumDraftStatus);
this.emitReport = builder.emitReport;
+ // getAllLocaleIds() returns the union of all the specified IDs in the map.
+ this.allLocaleIds = ImmutableSet.copyOf(builder.localeIdsMap.values());
this.localeIdsMap = ImmutableSetMultimap.copyOf(builder.localeIdsMap);
this.forcedAliases = ImmutableTable.copyOf(builder.forcedAliases);
}
return new Builder();
}
- @Override
- public Path getCldrDirectory() {
- return cldrDir;
- }
-
@Override
public Path getOutputDir() {
return outputDir;
}
@Override
- public CldrDraftStatus getMinimumDraftStatus() {
- return minimalDraftStatus;
+ public Path getSpecialsDir() {
+ return specialsDir;
}
@Override
- public Path getSpecialsDir() {
- return specialsDir;
+ public CldrDraftStatus getMinimumDraftStatus() {
+ return minimumDraftStatus;
}
@Override
}
@Override
- public Map<String, String> getForcedAliases(IcuLocaleDir dir) {
+ public ImmutableMap<String, String> getForcedAliases(IcuLocaleDir dir) {
return forcedAliases.row(dir);
}
+ @Override public ImmutableSet<String> getAllLocaleIds() {
+ return allLocaleIds;
+ }
+
@Override public ImmutableSet<String> getTargetLocaleIds(IcuLocaleDir dir) {
return localeIdsMap.get(dir);
}
// TODO: Sort this out so there isn't a messy mix of comment styles in the data files.
private static void writeHeaderAndComments(
PrintWriter out, List<String> header, List<String> comments) {
- header.forEach(out::println);
+
+ header.forEach(s -> out.println("// " + s));
if (!comments.isEmpty()) {
// TODO: Don't use /* */ block quotes, just use inline // quotes.
out.println(
import java.util.Collection;
import java.util.HashSet;
import java.util.LinkedHashMap;
-import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.LinkedListMultimap;
import com.google.common.collect.ListMultimap;
+import com.google.common.collect.Maps;
import com.google.common.collect.SetMultimap;
import com.google.common.collect.Sets;
import com.google.common.io.CharStreams;
private static final PathMatcher WINDOWS_ZONES_PATHS =
supplementalMatcher("windowsZones");
- // Special IDs which are not supported via CLDR, but for which synthetic data is injected.
- // The "TRADITIONAL" variants are here because their calendar differs from the non-variant
- // locale. However CLDR cannot represent this currently because calendar defaults are in
- // supplemental data (rather than locale data) and are keyed only on territory.
- private static final ImmutableSet<String> PHANTOM_LOCALE_IDS =
- ImmutableSet.of("ja_JP_TRADITIONAL", "th_TH_TRADITIONAL");
-
- // Special alias mapping which exists in ICU even though "no_NO_NY" is simply not a
- // structurally valid locale ID. This is injected manually when creating the alias map.
- // This does mean that nobody can ever parse the _keys_ of the alias map, but so far there
- // has been no need for that.
- // TODO: Get "ars" into CLDR and remove this hack.
- private static final Map<String, String> PHANTOM_ALIASES =
- ImmutableMap.of("ars", "ar_SA", "no_NO_NY", "nn_NO");
-
private static PathMatcher supplementalMatcher(String... spec) {
checkArgument(spec.length > 0, "must supply at least one matcher spec");
if (spec.length == 1) {
}
/** Converts CLDR data according to the given configuration. */
- public static void convert(LdmlConverterConfig config) {
- CldrDataSupplier src = CldrDataSupplier
- .forCldrFilesIn(config.getCldrDirectory())
- .withDraftStatusAtLeast(config.getMinimumDraftStatus());
- new LdmlConverter(config, src).convertAll(config);
+ public static void convert(
+ CldrDataSupplier src, SupplementalData supplementalData, LdmlConverterConfig config) {
+ new LdmlConverter(src, supplementalData, config).convertAll();
}
- // The configuration controlling conversion behaviour.
- private final LdmlConverterConfig config;
// The supplier for all data to be converted.
private final CldrDataSupplier src;
- // The set of available locale IDs.
- // TODO: Make available IDs include specials files (or fail if specials are not available).
- private final ImmutableSet<String> availableIds;
// Supplemental data available to mappers if needed.
private final SupplementalData supplementalData;
+ // The configuration controlling conversion behaviour.
+ private final LdmlConverterConfig config;
+ // The set of expanded target locale IDs.
+ // TODO: Make available IDs include specials files (or fail if specials are not available).
+ private final ImmutableSet<String> availableIds;
// Transformer for locale data.
private final PathValueTransformer localeTransformer;
// Transformer for supplemental data.
private final PathValueTransformer supplementalTransformer;
- // Header string to go into every ICU data file.
- private final ImmutableList<String> icuFileHeader;
+ // Header string to go into every ICU data and transliteration rule file (comment prefixes
+ // are not present and must be added by the code writing the file).
+ private final ImmutableList<String> fileHeader;
- private LdmlConverter(LdmlConverterConfig config, CldrDataSupplier src) {
- this.config = checkNotNull(config);
+ private LdmlConverter(
+ CldrDataSupplier src, SupplementalData supplementalData, LdmlConverterConfig config) {
this.src = checkNotNull(src);
- this.supplementalData = SupplementalData.create(src.getDataForType(SUPPLEMENTAL));
- // Sort the set of available locale IDs but add "root" at the front. This is the
- // set of non-alias locale IDs to be processed.
- Set<String> localeIds = new LinkedHashSet<>();
- localeIds.add("root");
- localeIds.addAll(
- Sets.intersection(src.getAvailableLocaleIds(), config.getTargetLocaleIds(LOCALES)));
- localeIds.addAll(PHANTOM_LOCALE_IDS);
- this.availableIds = ImmutableSet.copyOf(localeIds);
-
+ this.supplementalData = checkNotNull(supplementalData);
+ this.config = checkNotNull(config);
+ this.availableIds = ImmutableSet.copyOf(
+ Sets.intersection(supplementalData.getAvailableLocaleIds(), config.getAllLocaleIds()));
// Load the remaining path value transformers.
this.supplementalTransformer =
RegexTransformer.fromConfigLines(readLinesFromResource("/ldml2icu_supplemental.txt"),
this.localeTransformer =
RegexTransformer.fromConfigLines(readLinesFromResource("/ldml2icu_locale.txt"),
IcuFunctions.CONTEXT_TRANSFORM_INDEX_FN);
- this.icuFileHeader = ImmutableList.copyOf(readLinesFromResource("/ldml2icu_header.txt"));
+ this.fileHeader = ImmutableList.copyOf(readLinesFromResource("/ldml2icu_header.txt"));
}
- private void convertAll(LdmlConverterConfig config) {
+ private void convertAll() {
ListMultimap<CldrDataType, OutputType> groupByType = LinkedListMultimap.create();
for (OutputType t : config.getOutputTypes()) {
groupByType.put(t.getCldrType(), t);
SetMultimap<IcuLocaleDir, String> writtenLocaleIds = HashMultimap.create();
Path baseDir = config.getOutputDir();
- for (String id : config.getTargetLocaleIds(LOCALES)) {
+ for (String id : config.getAllLocaleIds()) {
// Skip "target" IDs that are aliases (they are handled later).
if (!availableIds.contains(id)) {
continue;
// and must be manually mapped (e.g. legacy locale IDs which don't even parse).
// 4: It is a "super special" forced alias, which might replace existing aliases in
// some output directories.
+
+ // Even forced aliases only apply if they are in the set of locale IDs for the directory.
+ Map<String, String> forcedAliases =
+ Maps.filterKeys(config.getForcedAliases(dir), localeIds::contains);
+
Map<String, String> aliasMap = new LinkedHashMap<>();
for (String id : localeIds) {
- if (PHANTOM_ALIASES.keySet().contains(id)) {
- checkArgument(!availableIds.contains(id),
- "phantom aliases should never be otherwise supported: %s\n"
- + "(maybe the phantom alias can now be removed?)", id);
- aliasMap.put(id, PHANTOM_ALIASES.get(id));
+ if (forcedAliases.keySet().contains(id)) {
+ // Forced aliases will be added later and don't need to be processed here. This
+ // is especially necessary if the ID is not structurally valid (e.g. "no_NO_NY")
+ // since that cannot be processed by the code below.
continue;
}
String canonicalId = supplementalData.replaceDeprecatedTags(id);
// Important that we overwrite entries which might already exist here, since we might have
// already calculated a "natural" alias for something that we want to force (and we should
// replace the existing target, since that affects how we determine empty files later).
- aliasMap.putAll(config.getForcedAliases(dir));
+ aliasMap.putAll(forcedAliases);
return aliasMap;
}
private void processTransforms() {
Path transformDir = createDirectory(config.getOutputDir().resolve("translit"));
- write(TransformsMapper.process(src, transformDir), transformDir);
+ write(TransformsMapper.process(src, transformDir, fileHeader), transformDir);
}
private static final RbPath RB_CLDR_VERSION = RbPath.of("cldrVersion");
private void write(IcuData icuData, Path dir) {
createDirectory(dir);
- IcuTextWriter.writeToFile(icuData, dir, icuFileHeader);
+ IcuTextWriter.writeToFile(icuData, dir, fileHeader);
}
private Path createDirectory(Path dir) {
*/
Set<OutputType> getOutputTypes();
- /** Returns the root directory in which the CLDR release is located. */
- Path getCldrDirectory();
-
/**
* Returns an additional "specials" directory containing additional ICU specific XML
* files depending on the given output type. This is where the converter finds any XML
CldrDraftStatus getMinimumDraftStatus();
/**
- * Returns the set of locale IDs to be processed for the given directory.
+ * Returns the complete set of locale IDs which should be considered for processing for this
+ * configuration.
*
- * <p>This set can contain IDs which have noICU data associated with them if they are
- * suitable aliases (e.g. they are deprecated versions of locale IDs for which data does
+ * <p>Note that this set can contain IDs which have no CLDR data associated with them if they
+ * are suitable aliases (e.g. they are deprecated versions of locale IDs for which data does
* exist).
*/
+ Set<String> getAllLocaleIds();
+
+ /**
+ * Returns the set of locale IDs to be processed for the given directory. This set must always
+ * be a subset of {@link #getAllLocaleIds()}.
+ */
Set<String> getTargetLocaleIds(IcuLocaleDir dir);
/**
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
+import java.util.Set;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;
import org.unicode.cldr.api.AttributeKey;
-import org.unicode.cldr.api.CldrData;
+import org.unicode.cldr.api.CldrDataSupplier;
+import org.unicode.cldr.api.CldrDataType;
import com.google.common.base.Ascii;
import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import com.google.common.collect.HashBasedTable;
import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.ImmutableSet;
import com.google.common.collect.ImmutableTable;
+import com.google.common.collect.Sets;
import com.google.common.collect.Table;
/**
*/
// TODO: This should be moved into the API and leverage some of the existing utility functions.
public final class SupplementalData {
+ // Special IDs which are not supported via CLDR, but for which synthetic data is injected.
+ // The "TRADITIONAL" variants are here because their calendar differs from the non-variant
+ // locale. However CLDR cannot represent this currently because calendar defaults are in
+ // supplemental data (rather than locale data) and are keyed only on territory.
+ private static final ImmutableSet<String> PHANTOM_LOCALE_IDS =
+ ImmutableSet.of("ja_JP_TRADITIONAL", "th_TH_TRADITIONAL");
+
private static final Pattern SCRIPT_SUBTAG = Pattern.compile("[A-Z][a-z]{3}");
private static final PathMatcher ALIAS =
}
/**
- * Creates a supplemental data API instance from the given CLDR data.
+ * Creates a supplemental data API instance from the given CLDR data supplier.
*
- * @param supplementalData the raw CLDR supplemental data instance.
+ * @param src the CLDR data supplier.
* @return the supplemental data API.
*/
- public static SupplementalData create(CldrData supplementalData) {
+ public static SupplementalData create(CldrDataSupplier src) {
Table<Alias, String, String> aliasTable = HashBasedTable.create();
Map<String, String> parentLocaleMap = new HashMap<>();
Map<String, String> defaultCalendarMap = new HashMap<>();
Map<String, String> likelySubtagMap = new HashMap<>();
- supplementalData.accept(
+ src.getDataForType(CldrDataType.SUPPLEMENTAL).accept(
ARBITRARY,
v -> {
if (ALIAS.matches(v.getPath())) {
}
});
- // WARNING: The original mapper code determines the full set of deprecated territories and
- // then removes the following hard-coded list without any explanation as to why. While this
- // is presumably to "undeprecate" them for the purposes of the locale processing, there's
- // no explanation of where this list comes from, and thus no way to maintain it.
- //
- // asList("062", "172", "200", "830", "AN", "CS", "QU")
- // .forEach(t -> aliasTable.remove(Alias.TERRITORY, t));
- // TODO: Understand and document what on Earth this is all about or delete this comment.
-
+ Set<String> availableIds = Sets.union(src.getAvailableLocaleIds(), PHANTOM_LOCALE_IDS);
return new SupplementalData(
- aliasTable, parentLocaleMap, defaultCalendarMap, likelySubtagMap);
+ availableIds, aliasTable, parentLocaleMap, defaultCalendarMap, likelySubtagMap);
}
// A simple-as-possible, mutable, locale ID data "struct" to handle the IDs used during ICU
}
}
+ private final ImmutableSet<String> availableIds;
private final ImmutableTable<Alias, String, String> aliasTable;
private final ImmutableMap<String, String> parentLocaleMap;
private final ImmutableMap<String, String> defaultCalendarMap;
private final ImmutableMap<String, String> likelySubtagMap;
private SupplementalData(
+ Set<String> availableIds,
Table<Alias, String, String> aliasTable,
Map<String, String> parentLocaleMap,
Map<String, String> defaultCalendarMap,
Map<String, String> likelySubtagMap) {
+
+ this.availableIds = ImmutableSet.copyOf(availableIds);
this.aliasTable = ImmutableTable.copyOf(aliasTable);
this.parentLocaleMap = ImmutableMap.copyOf(parentLocaleMap);
this.defaultCalendarMap = ImmutableMap.copyOf(defaultCalendarMap);
this.likelySubtagMap = ImmutableMap.copyOf(likelySubtagMap);
}
+ /**
+  * Returns the set of available locale IDs held by this instance (the immutable copy, made by
+  * the constructor, of the IDs it was given).
+  */
+ public ImmutableSet<String> getAvailableLocaleIds() {
+     return this.availableIds;
+ }
+
/**
* Returns the "maximized" form of a given locale ID, by adding likely subtags where possible.
*/
import java.nio.file.Path;
import java.util.Arrays;
+import java.util.Optional;
import org.apache.tools.ant.BuildException;
import org.apache.tools.ant.Task;
+import org.unicode.cldr.api.CldrDataSupplier;
import org.unicode.cldr.api.CldrDraftStatus;
import org.unicode.icu.tool.cldrtoicu.IcuConverterConfig;
import org.unicode.icu.tool.cldrtoicu.LdmlConverter;
+import org.unicode.icu.tool.cldrtoicu.LdmlConverter.OutputType;
import org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir;
+import org.unicode.icu.tool.cldrtoicu.SupplementalData;
import com.google.common.base.Ascii;
import com.google.common.base.CaseFormat;
import com.google.common.base.CharMatcher;
import com.google.common.base.Splitter;
+import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.SetMultimap;
// Note: Auto-magical Ant methods are listed as "unused" by IDEs, unless the warning is suppressed.
public final class ConvertIcuDataTask extends Task {
private static final CharMatcher LOWER_UNDERSCORE = inRange('a', 'z').or(DIGIT_OR_UNDERSCORE);
private static final CharMatcher VALID_ENUM_CHAR = LOWER_UNDERSCORE.or(UPPER_UNDERSCORE);
+ private Path cldrPath;
+ private CldrDraftStatus minimumDraftStatus;
+ // Set of default locale ID specifiers (wildcard IDs which are expanded).
+ private ImmutableSet<String> localeIdSpec;
+ // Per directory overrides (fully specified locale IDs).
+ private final SetMultimap<IcuLocaleDir, String> perDirectoryIds = HashMultimap.create();
private final IcuConverterConfig.Builder config = IcuConverterConfig.builder();
@SuppressWarnings("unused")
@SuppressWarnings("unused")
public void setCldrDir(Path path) {
- config.setCldrDir(path);
+ this.cldrPath = checkNotNull(path);
}
@SuppressWarnings("unused")
public void setMinimalDraftStatus(String status) {
- config.setMinimalDraftStatus(resolve(CldrDraftStatus.class, status));
+ minimumDraftStatus = resolve(CldrDraftStatus.class, status);
}
@SuppressWarnings("unused")
public void setOutputTypes(String types) {
- config.setOutputTypes(
+ ImmutableList<OutputType> typeList =
LIST_SPLITTER
.splitToList(types).stream()
- .map(s -> resolve(LdmlConverter.OutputType.class, s))
- .collect(toImmutableList()));
+ .map(s -> resolve(OutputType.class, s))
+ .collect(toImmutableList());
+ if (!typeList.isEmpty()) {
+ config.setOutputTypes(typeList);
+ }
}
@SuppressWarnings("unused")
}
public static final class LocaleIds extends Task {
- private ImmutableList<IcuLocaleDir> dirs = ImmutableList.of();
- private ImmutableList<String> ids = ImmutableList.of();
+ private ImmutableSet<String> ids;
@SuppressWarnings("unused")
- public void setDirs(String directories) {
- this.dirs = LIST_SPLITTER.splitToList(directories).stream()
- .map(s -> resolve(IcuLocaleDir.class, s))
- .collect(toImmutableList());
+ public void addText(String localeIds) {
+ this.ids = parseLocaleIds(localeIds);
+ }
+
+ /**
+  * Checks that a non-empty set of locale IDs was supplied as this element's text.
+  *
+  * @throws BuildException if no locale IDs are present (including when no text was added).
+  */
+ @Override
+ public void init() throws BuildException {
+     // "ids" is only assigned by addText(), so it is still null if no text has been added;
+     // guard against that so the failure is a build error rather than a NullPointerException.
+     checkBuild(ids != null && !ids.isEmpty(), "Locale IDs must be specified");
+ }
+ }
+
+ public static final class DirectoryFilter extends Task {
+ private IcuLocaleDir dir;
+ private ImmutableSet<String> ids;
+
+ @SuppressWarnings("unused")
+ public void setDir(String directory) {
+ this.dir = resolve(IcuLocaleDir.class, directory);
}
@SuppressWarnings("unused")
public void addText(String localeIds) {
- // Need to filter out '//' style end-of-line comments first (replace with \n to avoid
- // inadvertantly joining two elements.
- localeIds = localeIds.replaceAll("//[^\n]*\n", "\n");
- this.ids = ImmutableList.copyOf(LIST_SPLITTER.splitToList(localeIds));
+ this.ids = parseLocaleIds(localeIds);
+ }
+
+ /**
+  * Checks that both a directory and a non-empty set of locale IDs were supplied.
+  *
+  * @throws BuildException if the directory is unset or no locale IDs are present.
+  */
+ @Override
+ public void init() throws BuildException {
+     checkBuild(dir != null, "Directory must be specified");
+     // "ids" is only assigned by addText(), so it is still null if no text has been added;
+     // guard against that so the failure is a build error rather than a NullPointerException.
+     checkBuild(ids != null && !ids.isEmpty(), "Locale IDs must be specified");
}
}
public static final class ForcedAlias extends Task {
- private IcuLocaleDir dir;
- private String source;
- private String target;
+ private Optional<IcuLocaleDir> dir = Optional.empty();
+ private String source = "";
+ private String target = "";
@SuppressWarnings("unused")
public void setDir(String directory) {
- this.dir = resolve(IcuLocaleDir.class, directory);
+ this.dir = resolveOpt(IcuLocaleDir.class, directory);
}
@SuppressWarnings("unused")
public void setSource(String source) {
- this.source = checkNotNull(source);
+ this.source = whitespace().trimFrom(source);
}
@SuppressWarnings("unused")
public void setTarget(String target) {
- this.target = checkNotNull(target);
+ this.target = whitespace().trimFrom(target);
+ }
+
+ @Override
+ public void init() throws BuildException {
+ checkBuild(!source.isEmpty(), "Alias source must not be empty");
+ checkBuild(!target.isEmpty(), "Alias target must not be empty");
}
}
@SuppressWarnings("unused")
public void addConfiguredLocaleIds(LocaleIds localeIds) {
- localeIds.dirs.forEach(d -> config.addLocaleIds(d, localeIds.ids));
+ // Only one default locale ID specification may be configured; a second <localeIds>
+ // element is rejected rather than silently replacing the first.
+ checkBuild(this.localeIdSpec == null, "Cannot add more than one <localeIds> element");
+ this.localeIdSpec = localeIds.ids;
+ }
+
+ @SuppressWarnings("unused")
+ public void addConfiguredDirectoryFilter(DirectoryFilter filter) {
+ perDirectoryIds.putAll(filter.dir, filter.ids);
}
@SuppressWarnings("unused")
public void addConfiguredForcedAlias(ForcedAlias alias) {
- config.addForcedAlias(alias.dir, alias.source, alias.target);
+ if (alias.dir.isPresent()) {
+ config.addForcedAlias(alias.dir.get(), alias.source, alias.target);
+ } else {
+ for (IcuLocaleDir dir : IcuLocaleDir.values()) {
+ config.addForcedAlias(dir, alias.source, alias.target);
+ }
+ }
}
@SuppressWarnings("unused")
public void execute() throws BuildException {
- LdmlConverter.convert(config.build());
+ // Without a configured <localeIds> element the ID specification is still null, which would
+ // otherwise only surface as a NullPointerException while expanding the IDs below.
+ checkBuild(this.localeIdSpec != null, "Locale IDs must be specified");
+ CldrDataSupplier src =
+     CldrDataSupplier.forCldrFilesIn(cldrPath).withDraftStatusAtLeast(minimumDraftStatus);
+ SupplementalData supplementalData = SupplementalData.create(src);
+ ImmutableSet<String> defaultTargetIds =
+     LocaleIdResolver.expandTargetIds(this.localeIdSpec, supplementalData);
+ // Directories with per-directory overrides use exactly those IDs; every other directory
+ // falls back to the expanded default set.
+ for (IcuLocaleDir dir : IcuLocaleDir.values()) {
+     config.addLocaleIds(dir, perDirectoryIds.asMap().getOrDefault(dir, defaultTargetIds));
+ }
+ config.setMinimumDraftStatus(minimumDraftStatus);
+ LdmlConverter.convert(src, supplementalData, config.build());
+ }
+
+ /** Throws a {@link BuildException} carrying {@code message} unless {@code condition} holds. */
+ private static void checkBuild(boolean condition, String message) {
+     if (condition) {
+         return;
+     }
+     throw new BuildException(message);
+ }
+
+ /**
+  * Parses the text content of a locale ID list, ignoring '//' style end-of-line comments.
+  *
+  * @param localeIds the raw text to parse.
+  * @return the set of elements obtained by splitting the comment-free text with LIST_SPLITTER.
+  */
+ private static ImmutableSet<String> parseLocaleIds(String localeIds) {
+     // Filter out '//' style end-of-line comments first. Only the comment itself is removed
+     // (the line terminator is left in place) to avoid inadvertently joining two elements,
+     // and so that a comment on the final line is stripped even if no newline follows it.
+     localeIds = localeIds.replaceAll("//[^\n]*", "");
+     return ImmutableSet.copyOf(LIST_SPLITTER.splitToList(localeIds));
+ }
+
+ /**
+  * Resolves {@code name} to a constant of {@code enumClass} via {@code resolve()}, or returns
+  * an empty optional when {@code name} is the empty string.
+  */
+ private static <T extends Enum<T>> Optional<T> resolveOpt(Class<T> enumClass, String name) {
+     if (name.isEmpty()) {
+         return Optional.empty();
+     }
+     return Optional.of(resolve(enumClass, name));
}
private static <T extends Enum<T>> T resolve(Class<T> enumClass, String name) {
--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+package org.unicode.icu.tool.cldrtoicu.ant;
+
+import static com.google.common.base.Preconditions.checkNotNull;
+import static com.google.common.collect.ImmutableSet.toImmutableSet;
+
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.function.Predicate;
+import java.util.regex.Pattern;
+
+import org.unicode.icu.tool.cldrtoicu.SupplementalData;
+
+import com.google.common.base.Ascii;
+import com.google.common.collect.ImmutableListMultimap;
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Multimaps;
+import com.google.common.collect.Sets;
+
+/** Helper class to reslove ID configuration. */
+final class LocaleIdResolver {
+ /** Returns the expanded set of target locale IDs based on the given ID specifications. */
+ public static ImmutableSet<String> expandTargetIds(
+ Set<String> idSpecs, SupplementalData supplementalData) {
+ return new LocaleIdResolver(supplementalData).resolve(idSpecs);
+ }
+
+ private final SupplementalData supplementalData;
+
+ private LocaleIdResolver(SupplementalData supplementalData) {
+ this.supplementalData = checkNotNull(supplementalData);
+ }
+
+ // ---- Code below here is to expand the incoming set of locale IDs ----
+
+ private static final Pattern WILDCARD_LOCALE = Pattern.compile("[a-z]{2}(?:_[A-Z][a-z]{3})?");
+
+ private ImmutableSet<String> resolve(Set<String> idSpecs) {
+ ImmutableSet<String> allAvailableIds = supplementalData.getAvailableLocaleIds();
+ // Get the minimized wildcard set, converting things like "en_Latn" --> "en".
+ ImmutableSet<String> wildcardIds = idSpecs.stream()
+ .filter(supplementalData.getAvailableLocaleIds()::contains)
+ .filter(id -> WILDCARD_LOCALE.matcher(id).matches())
+ .map(this::removeDefaultScript)
+ .collect(toImmutableSet());
+
+ // Get the set of IDs which are implied by the wildcard IDs.
+ Set<String> targetIds = new TreeSet<>();
+ allAvailableIds.forEach(id -> addWildcardMatches(id, wildcardIds::contains, targetIds));
+
+ // Get the IDs which don't need to be in the config (because they are implied).
+ Set<String> redundant = Sets.intersection(idSpecs, targetIds);
+ if (!redundant.isEmpty()) {
+ System.err.println("Configuration lists redundant locale IDs");
+ System.err.println("The following IDs should be removed from the configuration:");
+ Iterables.partition(redundant, 16)
+ .forEach(ids -> System.err.println(String.join(", ", ids)));
+
+ // Note that the minimal configuration includes aliases.
+ Set<String> minimalConfigIds = new TreeSet<>(Sets.difference(idSpecs, targetIds));
+ minimalConfigIds.remove("root");
+ ImmutableListMultimap<Character, String> idsByFirstChar =
+ Multimaps.index(minimalConfigIds, s -> s.charAt(0));
+
+ System.err.println("Canonical ID list is:");
+ for (char c: idsByFirstChar.keySet()) {
+ System.err.println(" // " + Ascii.toUpperCase(c));
+ Iterables.partition(idsByFirstChar.get(c), 16)
+ .forEach(ids -> System.err.println(" " + String.join(", ", ids)));
+ System.err.println();
+ }
+ System.err.flush();
+ throw new IllegalStateException("Non-canonical configuration");
+ }
+
+ // We return the set of IDs made up of:
+ // 1: The original IDs specified by the configuration (and any parent IDs).
+ // 2: IDs expanded from wildcard IDs (e.g. "en_Latn_GB" & "en_Latn" from "en").
+ // (this is what's already in targetIds).
+ // 3: The "root" ID.
+ idSpecs.forEach(id -> addRecursively(id, targetIds));
+ return ImmutableSet.<String>builder().add("root").addAll(targetIds).build();
+ }
+
+ // E.g. "xx_Fooo" --> "xx" --> "xx_Baar_YY" ==> "xx_Fooo"
+ // E.g. "xx_Fooo" --> "xx" --> "xx_Fooo_YY" ==> "xx"
+ private String removeDefaultScript(String id) {
+ if (id.contains("_")) {
+ String lang = id.substring(0, 2);
+ String maxId = supplementalData.maximize(lang)
+ .orElseThrow(
+ () -> new IllegalStateException("cannot maximize language subtag: " + lang));
+ if (maxId.startsWith(id)) {
+ return lang;
+ }
+ }
+ return id;
+ }
+
+ private void addRecursively(String id, Set<String> dst) {
+ while (!id.equals("root") && dst.add(id)) {
+ id = supplementalData.getParent(id);
+ }
+ }
+
+ private boolean addWildcardMatches(
+ String id, Predicate<String> isWildcard, Set<String> dst) {
+ if (id.equals("root")) {
+ return false;
+ }
+ String parentId = supplementalData.getParent(id);
+ if (isWildcard.test(parentId) || addWildcardMatches(parentId, isWildcard, dst)) {
+ dst.add(id);
+ return true;
+ }
+ return false;
+ }
+}
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
+import java.util.List;
import java.util.Optional;
import java.util.function.Function;
import org.unicode.icu.tool.cldrtoicu.RbValue;
import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.ImmutableList;
import com.ibm.icu.text.Transliterator;
/**
* @param ruleFileOutputDir the directory into which transliteration rule files will be written.
* @return the IcuData instance to be written to a file.
*/
- public static IcuData process(CldrDataSupplier src, Path ruleFileOutputDir) {
+ public static IcuData process(
+ CldrDataSupplier src, Path ruleFileOutputDir, List<String> header) {
+
Function<Path, PrintWriter> fileWriterFn = p -> {
Path file = ruleFileOutputDir.resolve(p);
try {
}
};
CldrData cldrData = src.getDataForType(SUPPLEMENTAL);
- return process(cldrData, fileWriterFn);
+ return process(cldrData, fileWriterFn, header);
}
@VisibleForTesting // It's easier to supply a fake data instance than a fake supplier.
- static IcuData process(CldrData cldrData, Function<Path, PrintWriter> fileWriterFn) {
- RuleVisitor visitor = new RuleVisitor(fileWriterFn);
+ static IcuData process(
+ CldrData cldrData, Function<Path, PrintWriter> fileWriterFn, List<String> header) {
+
+ RuleVisitor visitor = new RuleVisitor(fileWriterFn, header);
cldrData.accept(DTD, visitor);
addSpecialCaseValues(visitor.icuData);
return visitor.icuData;
private static class RuleVisitor implements ValueVisitor {
private final IcuData icuData = new IcuData("root", false);
private final Function<Path, PrintWriter> outFn;
+ private final ImmutableList<String> header;
- RuleVisitor(Function<Path, PrintWriter> outFn) {
+ RuleVisitor(Function<Path, PrintWriter> outFn, List<String> header) {
this.outFn = checkNotNull(outFn);
+ this.header = ImmutableList.copyOf(header);
icuData.setFileComment("File: root.txt");
}
private void writeDataFile(String filename, CldrValue value) {
try (PrintWriter out = outFn.apply(Paths.get(filename))) {
- out.println("\uFEFF# © 2016 and later: Unicode, Inc. and others.");
- out.println("# License & terms of use: http://www.unicode.org/copyright.html#License");
+ out.print("\uFEFF");
+ header.forEach(s -> out.println("# " + s));
out.println("#");
out.println("# File: " + filename);
out.println("# Generated from CLDR");
-// © 2016 and later: Unicode, Inc. and others.
-// License & terms of use: http://www.unicode.org/copyright.html#License
\ No newline at end of file
+© 2016 and later: Unicode, Inc. and others.
+License & terms of use: http://www.unicode.org/copyright.html#License
import static com.google.common.truth.Truth.assertThat;
import static com.google.common.truth.Truth.assertWithMessage;
import static com.google.common.truth.Truth8.assertThat;
-import static java.util.Arrays.asList;
-import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL;
import static org.unicode.cldr.api.CldrValue.parseValue;
import java.nio.file.Path;
import org.unicode.cldr.util.LanguageTagCanonicalizer;
import org.unicode.cldr.util.LocaleIDParser;
import org.unicode.cldr.util.SupplementalDataInfo;
+import org.unicode.icu.tool.cldrtoicu.testing.FakeDataSupplier;
import com.google.common.base.Joiner;
import com.google.common.collect.ImmutableSet;
@BeforeClass
public static void loadRegressionData() {
Path cldrRoot = Paths.get(System.getProperty("CLDR_DIR"));
- regressionData = SupplementalData
- .create(CldrDataSupplier.forCldrFilesIn(cldrRoot).getDataForType(SUPPLEMENTAL));
+ regressionData = SupplementalData.create(CldrDataSupplier.forCldrFilesIn(cldrRoot));
SupplementalDataInfo sdi =
SupplementalDataInfo.getInstance(cldrRoot.resolve("common/supplemental").toString());
likelySubtags = new LikelySubtags(sdi);
}
private static SupplementalData fakeSupplementalData(CldrValue... values) {
- return SupplementalData.create(CldrDataSupplier.forValues(asList(values)));
+ return SupplementalData.create(new FakeDataSupplier().addSupplementalData(values));
}
}
import java.util.Map;
import java.util.TreeMap;
import java.util.function.Function;
-import java.util.stream.Stream;
import org.junit.Test;
import org.junit.runner.RunWith;
@RunWith(JUnit4.class)
public class TransformsMapperTest {
- private static final ImmutableList<String> FILE_HEADER = ImmutableList.of(
- "\uFEFF# © 2016 and later: Unicode, Inc. and others.",
- "# License & terms of use: http://www.unicode.org/copyright.html#License",
- "#");
+ private static final ImmutableList<String> HEADER_LINES = ImmutableList.of(
+ "First header line",
+ "Second header line");
+
+ private static final String FILE_HEADER =
+ "\uFEFF# First header line\n"
+ + "# Second header line\n"
+ + "#\n";
private static final int DEFAULT_PATH_COUNT = 7;
@Test
public void testDefaultContent() {
Map<String, String> fileMap = new TreeMap<>();
- IcuData icuData = TransformsMapper.process(cldrData(), wrap(fileMap));
+ IcuData icuData = TransformsMapper.process(cldrData(), wrap(fileMap), HEADER_LINES);
assertThat(fileMap).isEmpty();
cldrData(oneWay("foo", "bar", FORWARD, null, INTERNAL, "first second third", ++idx));
Map<String, String> fileMap = new TreeMap<>();
- IcuData icuData = TransformsMapper.process(cldrData, wrap(fileMap));
+ IcuData icuData = TransformsMapper.process(cldrData, wrap(fileMap), HEADER_LINES);
assertThat(icuData).getPaths().hasSize(DEFAULT_PATH_COUNT + 5);
assertThat(icuData).hasValuesFor("RuleBasedTransliteratorIDs/first/alias", "foo-bar");
cldrData(oneWay("foo", "bar", BACKWARD, "variant", EXTERNAL, "one two three", ++idx));
Map<String, String> fileMap = new TreeMap<>();
- IcuData icuData = TransformsMapper.process(cldrData, wrap(fileMap));
+ IcuData icuData = TransformsMapper.process(cldrData, wrap(fileMap), HEADER_LINES);
assertThat(icuData).getPaths().hasSize(DEFAULT_PATH_COUNT + 5);
assertThat(icuData).hasValuesFor("RuleBasedTransliteratorIDs/one/alias", "bar-foo/variant");
both("foo", "bar", null, INTERNAL, "forward-alias", "backward-alias", ++idx));
Map<String, String> fileMap = new TreeMap<>();
- IcuData icuData = TransformsMapper.process(cldrData, wrap(fileMap));
+ IcuData icuData = TransformsMapper.process(cldrData, wrap(fileMap), HEADER_LINES);
// 3 for each direction.
assertThat(icuData).getPaths().hasSize(DEFAULT_PATH_COUNT + 6);
private String headerPlusLines(String... lines) {
// For now the files always contain a blank line at the end (to match legacy behaviour) but
// this can, and probably should be changed.
- return Stream
- .concat(FILE_HEADER.stream(), Arrays.stream(lines))
- .collect(joining("\n", "", "\n\n"));
+ return Arrays.stream(lines).collect(joining("\n", FILE_HEADER, "\n\n"));
}
private static CldrData cldrData(CldrValue... values) {