From 498abf69611ca32ed41c5811f82ab7dfcb244757 Mon Sep 17 00:00:00 2001 From: Rich Gillam <62772518+richgillam@users.noreply.github.com> Date: Thu, 4 Aug 2022 16:01:04 -0700 Subject: [PATCH] ICU-21125 Improvements to resource fallback: - Added code to use the parentLocales data in supplementalData.xml to determine the "parent locale ID" to use when the requested resource bundle is not present (ICU-21126). - Added code to change the parent-chain search path to handle the script better (ICU-21125; algorithm was described in CLDR-15265): - The base search path is now ll_Ssss_RR -> ll_RR -> ll_Ssss -> ll -> root - If the requested script is not the default script for the requested language and region, we automatically avoid fallbacks that will implicitly change the script. - Added new code to the CLDR-to-ICU data generation tool to generate source code, and used it to generate the lookup tables for the new resource-fallback logic (we can't use the existing resource files for this, since that would involve opening a resource bundle while trying to open another resource bundle). The data-generation stuff is intended to be generic enough to allow for us to generate more static data tables in the future. - Commented out a few collator tests, and changed one resource bundle test, because they're incompatible with the new fallback logic (specifically, the default-script logic). 
--- icu4c/source/common/localefallback_data.h | 631 ++++++++++++++++++ icu4c/source/common/uresbund.cpp | 254 +++++-- icu4c/source/test/cintltst/crestst.c | 61 ++ .../com/ibm/icu/impl/ICUResourceBundle.java | 117 +++- .../com/ibm/icu/impl/LocaleFallbackData.java | 574 ++++++++++++++++ .../dev/test/util/ULocaleCollationTest.java | 40 +- .../dev/test/util/ICUResourceBundleTest.java | 35 + tools/cldr/cldr-to-icu/README.txt | 8 +- tools/cldr/cldr-to-icu/build-icu-data.xml | 33 +- .../icu/tool/cldrtoicu/CodeGenerator.java | 10 + .../tool/cldrtoicu/ant/GenerateCodeTask.java | 92 +++ .../ResourceFallbackCodeGenerator.java | 194 ++++++ 12 files changed, 1970 insertions(+), 79 deletions(-) create mode 100644 icu4c/source/common/localefallback_data.h create mode 100644 icu4j/main/classes/core/src/com/ibm/icu/impl/LocaleFallbackData.java create mode 100644 tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/CodeGenerator.java create mode 100644 tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/ant/GenerateCodeTask.java create mode 100644 tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/generator/ResourceFallbackCodeGenerator.java diff --git a/icu4c/source/common/localefallback_data.h b/icu4c/source/common/localefallback_data.h new file mode 100644 index 00000000000..18cbec78c31 --- /dev/null +++ b/icu4c/source/common/localefallback_data.h @@ -0,0 +1,631 @@ +// © 2022 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +// +// Internal static data tables used by uresbund.cpp +// WARNING: This file is mechanically generated by the CLDR-to-ICU tool +// (see tools/cldr/cldr-to-icu/src/main/java/org/unicode/tool/cldrtoicu/generator/ResourcFallbackCodeGenerator.java). +// DO NOT HAND EDIT!!! 
+ +#ifdef INCLUDED_FROM_URESBUND_CPP + +//====================================================================== +// Default script table +const char scriptCodeChars[] = + "Aghb\0Ahom\0Arab\0Armi\0Armn\0Avst\0Bamu\0Bass\0Beng\0Brah\0Cakm\0" + "Cans\0Cari\0Cham\0Cher\0Chrs\0Copt\0Cprt\0Cyrl\0Deva\0Egyp\0Ethi\0" + "Geor\0Gong\0Gonm\0Goth\0Grek\0Gujr\0Guru\0Hans\0Hant\0Hebr\0Hluw\0" + "Hmnp\0Ital\0Jpan\0Kali\0Kana\0Kawi\0Khar\0Khmr\0Kits\0Knda\0Kore\0" + "Lana\0Laoo\0Lepc\0Lina\0Lisu\0Lyci\0Lydi\0Mand\0Mani\0Medf\0Merc\0" + "Mlym\0Mong\0Mroo\0Mymr\0Narb\0Nkoo\0Nshu\0Ogam\0Olck\0Orkh\0Orya\0" + "Osge\0Ougr\0Pauc\0Phli\0Phnx\0Plrd\0Prti\0Rohg\0Runr\0Samr\0Sarb\0" + "Saur\0Sgnw\0Sinh\0Sogd\0Sora\0Soyo\0Syrc\0Tale\0Talu\0Taml\0Tang\0" + "Tavt\0Telu\0Tfng\0Thaa\0Thai\0Tibt\0Tnsa\0Toto\0Ugar\0Vaii\0Wcho\0" + "Xpeo\0Xsux\0Yiii\0"; + +const char dsLocaleIDChars[] = + "ab\0abq\0adp\0ady\0ae\0aeb\0aho\0akk\0alt\0am\0apc\0apd\0ar\0" + "arc\0arq\0ars\0ary\0arz\0as\0ase\0av\0avl\0awa\0az_IQ\0az_IR\0" + "az_RU\0ba\0bal\0bap\0bax\0bcq\0be\0bej\0bfq\0bft\0bfy\0bg\0bgc\0" + "bgn\0bgx\0bhb\0bhi\0bho\0bji\0bjj\0blt\0bn\0bo\0bpy\0bqi\0bra\0" + "brh\0brx\0bsq\0bst\0btv\0bua\0byn\0ccp\0ce\0chm\0chr\0cja\0cjm\0" + "ckb\0cmg\0cop\0cr\0crh\0crk\0crl\0csw\0ctd\0cu\0cv\0dar\0dcc\0" + "dgl\0dmf\0doi\0drh\0drs\0dty\0dv\0dz\0egy\0eky\0el\0esg\0ett\0" + "fa\0fia\0fub\0gan\0gbm\0gbz\0gez\0ggn\0gjk\0gju\0glk\0gmv\0gof\0" + "gom\0gon\0got\0grc\0grt\0gu\0gvr\0gwc\0gwt\0ha_CM\0ha_SD\0hak\0" + "haz\0hdy\0he\0hi\0hlu\0hmd\0hnd\0hne\0hnj\0hno\0hoc\0hoj\0hsn\0" + "hy\0ii\0inh\0iu\0iw\0ja\0ji\0jml\0ka\0kaa\0kaw\0kbd\0kby\0kdt\0" + "kfr\0kfy\0khb\0khn\0kht\0khw\0kjg\0kk\0kk_AF\0kk_CN\0kk_IR\0kk_MN\0" + "km\0kn\0ko\0koi\0kok\0kqy\0krc\0kru\0ks\0ktb\0ku_LB\0kum\0kv\0" + "kvx\0kxc\0kxl\0kxm\0kxp\0ky\0ky_CN\0kzh\0lab\0lad\0lah\0lbe\0" + "lcp\0lep\0lez\0lif\0lis\0lki\0lmn\0lo\0lrc\0luz\0lwl\0lzh\0mag\0" + "mai\0man_GN\0mde\0mdf\0mdx\0mfa\0mgp\0mk\0mki\0ml\0mn\0mn_CN\0" + 
"mni\0mnw\0mr\0mrd\0mrj\0mro\0ms_CC\0mtr\0mvy\0mwr\0mww\0my\0mym\0" + "myv\0myz\0mzn\0nan\0ne\0new\0nnp\0nod\0noe\0non\0nqo\0nsk\0nst\0" + "oj\0ojs\0or\0oru\0os\0osa\0ota\0otk\0oui\0pa\0pa_PK\0pal\0peo\0" + "phl\0phn\0pka\0pnt\0ppa\0pra\0prd\0ps\0raj\0rhg\0rif\0rjs\0rkt\0" + "rmt\0ru\0rue\0ryu\0sa\0sah\0sat\0saz\0sck\0scl\0sd\0sd_IN\0sdh\0" + "sga\0sgw\0shi\0shn\0shu\0si\0skr\0smp\0sog\0sou\0sr\0srb\0srx\0" + "swb\0swv\0syl\0syr\0ta\0taj\0tcy\0tdd\0tdg\0tdh\0te\0tg\0tg_PK\0" + "th\0thl\0thq\0thr\0ti\0tig\0tkt\0trw\0tsd\0tsf\0tsj\0tt\0tts\0" + "txg\0txo\0tyv\0udi\0udm\0ug\0ug_KZ\0ug_MN\0uga\0uk\0unr\0unr_NP\0" + "unx\0ur\0uz_AF\0uz_CN\0vai\0wal\0wbq\0wbr\0wni\0wsg\0wtm\0wuu\0" + "xco\0xcr\0xlc\0xld\0xmf\0xmn\0xmr\0xna\0xnr\0xpr\0xsa\0xsr\0yi\0" + "yue\0yue_CN\0zdj\0zgh\0zh\0zh_AU\0zh_BN\0zh_GB\0zh_GF\0zh_HK\0" + "zh_ID\0zh_MO\0zh_PA\0zh_PF\0zh_PH\0zh_SR\0zh_TH\0zh_TW\0zh_US\0" + "zh_VN\0zhx\0zkt\0"; + +const int32_t defaultScriptTable[] = { + 0, 90, // ab -> Cyrl + 3, 90, // abq -> Cyrl + 7, 465, // adp -> Tibt + 11, 90, // ady -> Cyrl + 15, 25, // ae -> Avst + 18, 10, // aeb -> Arab + 22, 5, // aho -> Ahom + 26, 500, // akk -> Xsux + 30, 90, // alt -> Cyrl + 34, 105, // am -> Ethi + 37, 10, // apc -> Arab + 41, 10, // apd -> Arab + 45, 10, // ar -> Arab + 48, 15, // arc -> Armi + 52, 10, // arq -> Arab + 56, 10, // ars -> Arab + 60, 10, // ary -> Arab + 64, 10, // arz -> Arab + 68, 40, // as -> Beng + 71, 390, // ase -> Sgnw + 75, 90, // av -> Cyrl + 78, 10, // avl -> Arab + 82, 95, // awa -> Deva + 86, 10, // az_IQ -> Arab + 92, 10, // az_IR -> Arab + 98, 90, // az_RU -> Cyrl + 104, 90, // ba -> Cyrl + 107, 10, // bal -> Arab + 111, 95, // bap -> Deva + 115, 30, // bax -> Bamu + 119, 105, // bcq -> Ethi + 123, 90, // be -> Cyrl + 126, 10, // bej -> Arab + 130, 430, // bfq -> Taml + 134, 10, // bft -> Arab + 138, 95, // bfy -> Deva + 142, 90, // bg -> Cyrl + 145, 95, // bgc -> Deva + 149, 10, // bgn -> Arab + 153, 130, // bgx -> Grek + 157, 95, // bhb -> Deva + 
161, 95, // bhi -> Deva + 165, 95, // bho -> Deva + 169, 105, // bji -> Ethi + 173, 95, // bjj -> Deva + 177, 440, // blt -> Tavt + 181, 40, // bn -> Beng + 184, 465, // bo -> Tibt + 187, 40, // bpy -> Beng + 191, 10, // bqi -> Arab + 195, 95, // bra -> Deva + 199, 10, // brh -> Arab + 203, 95, // brx -> Deva + 207, 35, // bsq -> Bass + 211, 105, // bst -> Ethi + 215, 95, // btv -> Deva + 219, 90, // bua -> Cyrl + 223, 105, // byn -> Ethi + 227, 50, // ccp -> Cakm + 231, 90, // ce -> Cyrl + 234, 90, // chm -> Cyrl + 238, 70, // chr -> Cher + 242, 10, // cja -> Arab + 246, 65, // cjm -> Cham + 250, 10, // ckb -> Arab + 254, 410, // cmg -> Soyo + 258, 80, // cop -> Copt + 262, 55, // cr -> Cans + 265, 90, // crh -> Cyrl + 269, 55, // crk -> Cans + 273, 55, // crl -> Cans + 277, 55, // csw -> Cans + 281, 340, // ctd -> Pauc + 285, 90, // cu -> Cyrl + 288, 90, // cv -> Cyrl + 291, 90, // dar -> Cyrl + 295, 10, // dcc -> Arab + 299, 10, // dgl -> Arab + 303, 265, // dmf -> Medf + 307, 95, // doi -> Deva + 311, 280, // drh -> Mong + 315, 105, // drs -> Ethi + 319, 95, // dty -> Deva + 323, 455, // dv -> Thaa + 326, 465, // dz -> Tibt + 329, 100, // egy -> Egyp + 333, 180, // eky -> Kali + 337, 130, // el -> Grek + 340, 120, // esg -> Gonm + 344, 170, // ett -> Ital + 348, 10, // fa -> Arab + 351, 10, // fia -> Arab + 355, 10, // fub -> Arab + 359, 145, // gan -> Hans + 363, 95, // gbm -> Deva + 367, 10, // gbz -> Arab + 371, 105, // gez -> Ethi + 375, 95, // ggn -> Deva + 379, 10, // gjk -> Arab + 383, 10, // gju -> Arab + 387, 10, // glk -> Arab + 391, 105, // gmv -> Ethi + 395, 105, // gof -> Ethi + 399, 95, // gom -> Deva + 403, 445, // gon -> Telu + 407, 125, // got -> Goth + 411, 85, // grc -> Cprt + 415, 40, // grt -> Beng + 419, 135, // gu -> Gujr + 422, 95, // gvr -> Deva + 426, 10, // gwc -> Arab + 430, 10, // gwt -> Arab + 434, 10, // ha_CM -> Arab + 440, 10, // ha_SD -> Arab + 446, 145, // hak -> Hans + 450, 10, // haz -> Arab + 454, 105, // hdy -> Ethi + 458, 
155, // he -> Hebr + 461, 95, // hi -> Deva + 464, 160, // hlu -> Hluw + 468, 355, // hmd -> Plrd + 472, 10, // hnd -> Arab + 476, 95, // hne -> Deva + 480, 165, // hnj -> Hmnp + 484, 10, // hno -> Arab + 488, 95, // hoc -> Deva + 492, 95, // hoj -> Deva + 496, 145, // hsn -> Hans + 500, 20, // hy -> Armn + 503, 505, // ii -> Yiii + 506, 90, // inh -> Cyrl + 510, 55, // iu -> Cans + 513, 155, // iw -> Hebr + 516, 175, // ja -> Jpan + 519, 155, // ji -> Hebr + 522, 95, // jml -> Deva + 526, 110, // ka -> Geor + 529, 90, // kaa -> Cyrl + 533, 190, // kaw -> Kawi + 537, 90, // kbd -> Cyrl + 541, 10, // kby -> Arab + 545, 460, // kdt -> Thai + 549, 95, // kfr -> Deva + 553, 95, // kfy -> Deva + 557, 425, // khb -> Talu + 561, 95, // khn -> Deva + 565, 290, // kht -> Mymr + 569, 10, // khw -> Arab + 573, 225, // kjg -> Laoo + 577, 90, // kk -> Cyrl + 580, 10, // kk_AF -> Arab + 586, 10, // kk_CN -> Arab + 592, 10, // kk_IR -> Arab + 598, 10, // kk_MN -> Arab + 604, 200, // km -> Khmr + 607, 210, // kn -> Knda + 610, 215, // ko -> Kore + 613, 90, // koi -> Cyrl + 617, 95, // kok -> Deva + 621, 105, // kqy -> Ethi + 625, 90, // krc -> Cyrl + 629, 95, // kru -> Deva + 633, 10, // ks -> Arab + 636, 105, // ktb -> Ethi + 640, 10, // ku_LB -> Arab + 646, 90, // kum -> Cyrl + 650, 90, // kv -> Cyrl + 653, 10, // kvx -> Arab + 657, 105, // kxc -> Ethi + 661, 95, // kxl -> Deva + 665, 460, // kxm -> Thai + 669, 10, // kxp -> Arab + 673, 90, // ky -> Cyrl + 676, 10, // ky_CN -> Arab + 682, 10, // kzh -> Arab + 686, 235, // lab -> Lina + 690, 155, // lad -> Hebr + 694, 10, // lah -> Arab + 698, 90, // lbe -> Cyrl + 702, 460, // lcp -> Thai + 706, 230, // lep -> Lepc + 710, 90, // lez -> Cyrl + 714, 95, // lif -> Deva + 718, 240, // lis -> Lisu + 722, 10, // lki -> Arab + 726, 445, // lmn -> Telu + 730, 225, // lo -> Laoo + 733, 10, // lrc -> Arab + 737, 10, // luz -> Arab + 741, 460, // lwl -> Thai + 745, 145, // lzh -> Hans + 749, 95, // mag -> Deva + 753, 95, // mai -> Deva + 
757, 300, // man_GN -> Nkoo + 764, 10, // mde -> Arab + 768, 90, // mdf -> Cyrl + 772, 105, // mdx -> Ethi + 776, 10, // mfa -> Arab + 780, 95, // mgp -> Deva + 784, 90, // mk -> Cyrl + 787, 10, // mki -> Arab + 791, 275, // ml -> Mlym + 794, 90, // mn -> Cyrl + 797, 280, // mn_CN -> Mong + 803, 40, // mni -> Beng + 807, 290, // mnw -> Mymr + 811, 95, // mr -> Deva + 814, 95, // mrd -> Deva + 818, 90, // mrj -> Cyrl + 822, 285, // mro -> Mroo + 826, 10, // ms_CC -> Arab + 832, 95, // mtr -> Deva + 836, 10, // mvy -> Arab + 840, 95, // mwr -> Deva + 844, 165, // mww -> Hmnp + 848, 290, // my -> Mymr + 851, 105, // mym -> Ethi + 855, 90, // myv -> Cyrl + 859, 255, // myz -> Mand + 863, 10, // mzn -> Arab + 867, 145, // nan -> Hans + 871, 95, // ne -> Deva + 874, 95, // new -> Deva + 878, 490, // nnp -> Wcho + 882, 220, // nod -> Lana + 886, 95, // noe -> Deva + 890, 370, // non -> Runr + 894, 300, // nqo -> Nkoo + 898, 55, // nsk -> Cans + 902, 470, // nst -> Tnsa + 906, 55, // oj -> Cans + 909, 55, // ojs -> Cans + 913, 325, // or -> Orya + 916, 10, // oru -> Arab + 920, 90, // os -> Cyrl + 923, 330, // osa -> Osge + 927, 10, // ota -> Arab + 931, 320, // otk -> Orkh + 935, 335, // oui -> Ougr + 939, 140, // pa -> Guru + 942, 10, // pa_PK -> Arab + 948, 345, // pal -> Phli + 952, 495, // peo -> Xpeo + 956, 10, // phl -> Arab + 960, 350, // phn -> Phnx + 964, 45, // pka -> Brah + 968, 130, // pnt -> Grek + 972, 95, // ppa -> Deva + 976, 195, // pra -> Khar + 980, 10, // prd -> Arab + 984, 10, // ps -> Arab + 987, 95, // raj -> Deva + 991, 365, // rhg -> Rohg + 995, 450, // rif -> Tfng + 999, 95, // rjs -> Deva + 1003, 40, // rkt -> Beng + 1007, 10, // rmt -> Arab + 1011, 90, // ru -> Cyrl + 1014, 90, // rue -> Cyrl + 1018, 185, // ryu -> Kana + 1022, 95, // sa -> Deva + 1025, 90, // sah -> Cyrl + 1029, 315, // sat -> Olck + 1033, 385, // saz -> Saur + 1037, 95, // sck -> Deva + 1041, 10, // scl -> Arab + 1045, 10, // sd -> Arab + 1048, 95, // sd_IN -> Deva + 1054, 
10, // sdh -> Arab + 1058, 310, // sga -> Ogam + 1062, 105, // sgw -> Ethi + 1066, 450, // shi -> Tfng + 1070, 290, // shn -> Mymr + 1074, 10, // shu -> Arab + 1078, 395, // si -> Sinh + 1081, 10, // skr -> Arab + 1085, 375, // smp -> Samr + 1089, 400, // sog -> Sogd + 1093, 460, // sou -> Thai + 1097, 90, // sr -> Cyrl + 1100, 405, // srb -> Sora + 1104, 95, // srx -> Deva + 1108, 10, // swb -> Arab + 1112, 95, // swv -> Deva + 1116, 40, // syl -> Beng + 1120, 415, // syr -> Syrc + 1124, 430, // ta -> Taml + 1127, 95, // taj -> Deva + 1131, 210, // tcy -> Knda + 1135, 420, // tdd -> Tale + 1139, 95, // tdg -> Deva + 1143, 95, // tdh -> Deva + 1147, 445, // te -> Telu + 1150, 90, // tg -> Cyrl + 1153, 10, // tg_PK -> Arab + 1159, 460, // th -> Thai + 1162, 95, // thl -> Deva + 1166, 95, // thq -> Deva + 1170, 95, // thr -> Deva + 1174, 105, // ti -> Ethi + 1177, 105, // tig -> Ethi + 1181, 95, // tkt -> Deva + 1185, 10, // trw -> Arab + 1189, 130, // tsd -> Grek + 1193, 95, // tsf -> Deva + 1197, 465, // tsj -> Tibt + 1201, 90, // tt -> Cyrl + 1204, 460, // tts -> Thai + 1208, 435, // txg -> Tang + 1212, 475, // txo -> Toto + 1216, 90, // tyv -> Cyrl + 1220, 0, // udi -> Aghb + 1224, 90, // udm -> Cyrl + 1228, 10, // ug -> Arab + 1231, 90, // ug_KZ -> Cyrl + 1237, 90, // ug_MN -> Cyrl + 1243, 480, // uga -> Ugar + 1247, 90, // uk -> Cyrl + 1250, 40, // unr -> Beng + 1254, 95, // unr_NP -> Deva + 1261, 40, // unx -> Beng + 1265, 10, // ur -> Arab + 1268, 10, // uz_AF -> Arab + 1274, 90, // uz_CN -> Cyrl + 1280, 485, // vai -> Vaii + 1284, 105, // wal -> Ethi + 1288, 445, // wbq -> Telu + 1292, 95, // wbr -> Deva + 1296, 10, // wni -> Arab + 1300, 115, // wsg -> Gong + 1304, 95, // wtm -> Deva + 1308, 145, // wuu -> Hans + 1312, 75, // xco -> Chrs + 1316, 60, // xcr -> Cari + 1320, 245, // xlc -> Lyci + 1324, 250, // xld -> Lydi + 1328, 110, // xmf -> Geor + 1332, 260, // xmn -> Mani + 1336, 270, // xmr -> Merc + 1340, 295, // xna -> Narb + 1344, 95, // xnr -> Deva + 
1348, 360, // xpr -> Prti + 1352, 380, // xsa -> Sarb + 1356, 95, // xsr -> Deva + 1360, 155, // yi -> Hebr + 1363, 150, // yue -> Hant + 1367, 145, // yue_CN -> Hans + 1374, 10, // zdj -> Arab + 1378, 450, // zgh -> Tfng + 1382, 145, // zh -> Hans + 1385, 150, // zh_AU -> Hant + 1391, 150, // zh_BN -> Hant + 1397, 150, // zh_GB -> Hant + 1403, 150, // zh_GF -> Hant + 1409, 150, // zh_HK -> Hant + 1415, 150, // zh_ID -> Hant + 1421, 150, // zh_MO -> Hant + 1427, 150, // zh_PA -> Hant + 1433, 150, // zh_PF -> Hant + 1439, 150, // zh_PH -> Hant + 1445, 150, // zh_SR -> Hant + 1451, 150, // zh_TH -> Hant + 1457, 150, // zh_TW -> Hant + 1463, 150, // zh_US -> Hant + 1469, 150, // zh_VN -> Hant + 1475, 305, // zhx -> Nshu + 1479, 205, // zkt -> Kits +}; + +//====================================================================== +// Parent locale table +const char parentLocaleChars[] = + "az_Arab\0az_Cyrl\0bal_Latn\0blt_Latn\0bm_Nkoo\0bs_Cyrl\0byn_Latn\0" + "cu_Glag\0dje_Arab\0dyo_Arab\0en_001\0en_150\0en_AG\0en_AI\0en_AT\0" + "en_AU\0en_BB\0en_BE\0en_BM\0en_BS\0en_BW\0en_BZ\0en_CC\0en_CH\0" + "en_CK\0en_CM\0en_CX\0en_CY\0en_DE\0en_DG\0en_DK\0en_DM\0en_Dsrt\0" + "en_ER\0en_FI\0en_FJ\0en_FK\0en_FM\0en_GB\0en_GD\0en_GG\0en_GH\0" + "en_GI\0en_GM\0en_GY\0en_HK\0en_IE\0en_IL\0en_IM\0en_IN\0en_IO\0" + "en_JE\0en_JM\0en_KE\0en_KI\0en_KN\0en_KY\0en_LC\0en_LR\0en_LS\0" + "en_MG\0en_MO\0en_MS\0en_MT\0en_MU\0en_MV\0en_MW\0en_MY\0en_NA\0" + "en_NF\0en_NG\0en_NL\0en_NR\0en_NU\0en_NZ\0en_PG\0en_PK\0en_PN\0" + "en_PW\0en_RW\0en_SB\0en_SC\0en_SD\0en_SE\0en_SG\0en_SH\0en_SI\0" + "en_SL\0en_SS\0en_SX\0en_SZ\0en_Shaw\0en_TC\0en_TK\0en_TO\0en_TT\0" + "en_TV\0en_TZ\0en_UG\0en_VC\0en_VG\0en_VU\0en_WS\0en_ZA\0en_ZM\0" + "en_ZW\0es_419\0es_AR\0es_BO\0es_BR\0es_BZ\0es_CL\0es_CO\0es_CR\0" + "es_CU\0es_DO\0es_EC\0es_GT\0es_HN\0es_MX\0es_NI\0es_PA\0es_PE\0" + "es_PR\0es_PY\0es_SV\0es_US\0es_UY\0es_VE\0ff_Adlm\0ff_Arab\0fr_HT\0" + "ha_Arab\0hi_Latn\0ht\0iu_Latn\0kk_Arab\0ks_Deva\0ku_Arab\0ky_Arab\0" 
+ "ky_Latn\0ml_Arab\0mn_Mong\0mni_Mtei\0ms_Arab\0nb\0nn\0no\0pa_Arab\0" + "pt_AO\0pt_CH\0pt_CV\0pt_FR\0pt_GQ\0pt_GW\0pt_LU\0pt_MO\0pt_MZ\0" + "pt_PT\0pt_ST\0pt_TL\0root\0sat_Deva\0sd_Deva\0sd_Khoj\0sd_Sind\0" + "shi_Latn\0so_Arab\0sr_Latn\0sw_Arab\0tg_Arab\0ug_Cyrl\0uz_Arab\0" + "uz_Cyrl\0vai_Latn\0wo_Arab\0yo_Arab\0yue_Hans\0zh_Hant\0zh_Hant_HK\0" + "zh_Hant_MO\0"; + +const int32_t parentLocaleTable[] = { + 0, 1017, // az_Arab -> root + 8, 1017, // az_Cyrl -> root + 16, 1017, // bal_Latn -> root + 25, 1017, // blt_Latn -> root + 34, 1017, // bm_Nkoo -> root + 42, 1017, // bs_Cyrl -> root + 50, 1017, // byn_Latn -> root + 59, 1017, // cu_Glag -> root + 67, 1017, // dje_Arab -> root + 76, 1017, // dyo_Arab -> root + 92, 85, // en_150 -> en_001 + 99, 85, // en_AG -> en_001 + 105, 85, // en_AI -> en_001 + 111, 92, // en_AT -> en_150 + 117, 85, // en_AU -> en_001 + 123, 85, // en_BB -> en_001 + 129, 92, // en_BE -> en_150 + 135, 85, // en_BM -> en_001 + 141, 85, // en_BS -> en_001 + 147, 85, // en_BW -> en_001 + 153, 85, // en_BZ -> en_001 + 159, 85, // en_CC -> en_001 + 165, 92, // en_CH -> en_150 + 171, 85, // en_CK -> en_001 + 177, 85, // en_CM -> en_001 + 183, 85, // en_CX -> en_001 + 189, 85, // en_CY -> en_001 + 195, 92, // en_DE -> en_150 + 201, 85, // en_DG -> en_001 + 207, 92, // en_DK -> en_150 + 213, 85, // en_DM -> en_001 + 219, 1017, // en_Dsrt -> root + 227, 85, // en_ER -> en_001 + 233, 92, // en_FI -> en_150 + 239, 85, // en_FJ -> en_001 + 245, 85, // en_FK -> en_001 + 251, 85, // en_FM -> en_001 + 257, 85, // en_GB -> en_001 + 263, 85, // en_GD -> en_001 + 269, 85, // en_GG -> en_001 + 275, 85, // en_GH -> en_001 + 281, 85, // en_GI -> en_001 + 287, 85, // en_GM -> en_001 + 293, 85, // en_GY -> en_001 + 299, 85, // en_HK -> en_001 + 305, 85, // en_IE -> en_001 + 311, 85, // en_IL -> en_001 + 317, 85, // en_IM -> en_001 + 323, 85, // en_IN -> en_001 + 329, 85, // en_IO -> en_001 + 335, 85, // en_JE -> en_001 + 341, 85, // en_JM -> en_001 + 347, 85, // 
en_KE -> en_001 + 353, 85, // en_KI -> en_001 + 359, 85, // en_KN -> en_001 + 365, 85, // en_KY -> en_001 + 371, 85, // en_LC -> en_001 + 377, 85, // en_LR -> en_001 + 383, 85, // en_LS -> en_001 + 389, 85, // en_MG -> en_001 + 395, 85, // en_MO -> en_001 + 401, 85, // en_MS -> en_001 + 407, 85, // en_MT -> en_001 + 413, 85, // en_MU -> en_001 + 419, 85, // en_MV -> en_001 + 425, 85, // en_MW -> en_001 + 431, 85, // en_MY -> en_001 + 437, 85, // en_NA -> en_001 + 443, 85, // en_NF -> en_001 + 449, 85, // en_NG -> en_001 + 455, 92, // en_NL -> en_150 + 461, 85, // en_NR -> en_001 + 467, 85, // en_NU -> en_001 + 473, 85, // en_NZ -> en_001 + 479, 85, // en_PG -> en_001 + 485, 85, // en_PK -> en_001 + 491, 85, // en_PN -> en_001 + 497, 85, // en_PW -> en_001 + 503, 85, // en_RW -> en_001 + 509, 85, // en_SB -> en_001 + 515, 85, // en_SC -> en_001 + 521, 85, // en_SD -> en_001 + 527, 92, // en_SE -> en_150 + 533, 85, // en_SG -> en_001 + 539, 85, // en_SH -> en_001 + 545, 92, // en_SI -> en_150 + 551, 85, // en_SL -> en_001 + 557, 85, // en_SS -> en_001 + 563, 85, // en_SX -> en_001 + 569, 85, // en_SZ -> en_001 + 575, 1017, // en_Shaw -> root + 583, 85, // en_TC -> en_001 + 589, 85, // en_TK -> en_001 + 595, 85, // en_TO -> en_001 + 601, 85, // en_TT -> en_001 + 607, 85, // en_TV -> en_001 + 613, 85, // en_TZ -> en_001 + 619, 85, // en_UG -> en_001 + 625, 85, // en_VC -> en_001 + 631, 85, // en_VG -> en_001 + 637, 85, // en_VU -> en_001 + 643, 85, // en_WS -> en_001 + 649, 85, // en_ZA -> en_001 + 655, 85, // en_ZM -> en_001 + 661, 85, // en_ZW -> en_001 + 674, 667, // es_AR -> es_419 + 680, 667, // es_BO -> es_419 + 686, 667, // es_BR -> es_419 + 692, 667, // es_BZ -> es_419 + 698, 667, // es_CL -> es_419 + 704, 667, // es_CO -> es_419 + 710, 667, // es_CR -> es_419 + 716, 667, // es_CU -> es_419 + 722, 667, // es_DO -> es_419 + 728, 667, // es_EC -> es_419 + 734, 667, // es_GT -> es_419 + 740, 667, // es_HN -> es_419 + 746, 667, // es_MX -> es_419 + 752, 667, // 
es_NI -> es_419 + 758, 667, // es_PA -> es_419 + 764, 667, // es_PE -> es_419 + 770, 667, // es_PR -> es_419 + 776, 667, // es_PY -> es_419 + 782, 667, // es_SV -> es_419 + 788, 667, // es_US -> es_419 + 794, 667, // es_UY -> es_419 + 800, 667, // es_VE -> es_419 + 806, 1017, // ff_Adlm -> root + 814, 1017, // ff_Arab -> root + 828, 1017, // ha_Arab -> root + 836, 323, // hi_Latn -> en_IN + 844, 822, // ht -> fr_HT + 847, 1017, // iu_Latn -> root + 855, 1017, // kk_Arab -> root + 863, 1017, // ks_Deva -> root + 871, 1017, // ku_Arab -> root + 879, 1017, // ky_Arab -> root + 887, 1017, // ky_Latn -> root + 895, 1017, // ml_Arab -> root + 903, 1017, // mn_Mong -> root + 911, 1017, // mni_Mtei -> root + 920, 1017, // ms_Arab -> root + 928, 934, // nb -> no + 931, 934, // nn -> no + 937, 1017, // pa_Arab -> root + 945, 999, // pt_AO -> pt_PT + 951, 999, // pt_CH -> pt_PT + 957, 999, // pt_CV -> pt_PT + 963, 999, // pt_FR -> pt_PT + 969, 999, // pt_GQ -> pt_PT + 975, 999, // pt_GW -> pt_PT + 981, 999, // pt_LU -> pt_PT + 987, 999, // pt_MO -> pt_PT + 993, 999, // pt_MZ -> pt_PT + 1005, 999, // pt_ST -> pt_PT + 1011, 999, // pt_TL -> pt_PT + 1022, 1017, // sat_Deva -> root + 1031, 1017, // sd_Deva -> root + 1039, 1017, // sd_Khoj -> root + 1047, 1017, // sd_Sind -> root + 1055, 1017, // shi_Latn -> root + 1064, 1017, // so_Arab -> root + 1072, 1017, // sr_Latn -> root + 1080, 1017, // sw_Arab -> root + 1088, 1017, // tg_Arab -> root + 1096, 1017, // ug_Cyrl -> root + 1104, 1017, // uz_Arab -> root + 1112, 1017, // uz_Cyrl -> root + 1120, 1017, // vai_Latn -> root + 1129, 1017, // wo_Arab -> root + 1137, 1017, // yo_Arab -> root + 1145, 1017, // yue_Hans -> root + 1154, 1017, // zh_Hant -> root + 1173, 1162, // zh_Hant_MO -> zh_Hant_HK +}; + + +#endif // INCLUDED_FROM_URESBUND_CPP diff --git a/icu4c/source/common/uresbund.cpp b/icu4c/source/common/uresbund.cpp index 272418679ae..17c0177a05c 100644 --- a/icu4c/source/common/uresbund.cpp +++ 
b/icu4c/source/common/uresbund.cpp
@@ -91,6 +91,202 @@ static UBool chopLocale(char *name) {
     return false;
 }
 
+static UBool hasVariant(const char* localeID) {  // true if localeID contains a variant subtag
+    UErrorCode err = U_ZERO_ERROR;
+    int32_t variantLength = uloc_getVariant(localeID, NULL, 0, &err);  // zero-capacity call: only the returned length is used, err is ignored
+    return variantLength != 0;
+}
+
+// This file contains the tables for doing locale fallback, which are generated
+// by the CLDR-to-ICU process directly from the CLDR data.  This file should only
+// ever be included from here.
+#define INCLUDED_FROM_URESBUND_CPP
+#include "localefallback_data.h"
+
+static const char* performFallbackLookup(const char* key,              // NUL-terminated ID to search for
+                                         const char* keyStrs,          // string pool indexed by each pair's first element
+                                         const char* valueStrs,        // string pool indexed by each pair's second element
+                                         const int32_t* lookupTable,   // flat (keyOffset, valueOffset) pairs, ordered by key string
+                                         int32_t lookupTableLength) {  // count of int32_t's (not pairs); returns nullptr on a miss
+    const int32_t* bottom = lookupTable;
+    const int32_t* top = lookupTable + lookupTableLength;
+
+    while (bottom < top) {
+        // Effectively, divide by 2 and round down to an even index
+        const int32_t* middle = bottom + (((top - bottom) / 4) * 2);
+        const char* entryKey = &(keyStrs[*middle]);
+        int32_t strcmpResult = uprv_strcmp(key, entryKey);
+        if (strcmpResult == 0) {
+            return &(valueStrs[middle[1]]);
+        } else if (strcmpResult < 0) {
+            top = middle;
+        } else {
+            bottom = middle + 2;
+        }
+    }
+    return nullptr;
+}
+
+static CharString getDefaultScript(const CharString& language, const CharString& region) {
+    const char* defaultScript = nullptr;  // points into scriptCodeChars once a lookup succeeds
+    UErrorCode err = U_ZERO_ERROR;
+
+    // the default script will be "Latn" if we don't find the locale ID in the tables
+    CharString result("Latn", err);
+
+    // if we were passed both language and region, make them into a locale ID and look that up in the default
+    // script table
+    if (!region.isEmpty()) {
+        CharString localeID;
+        localeID.append(language, err).append("_", err).append(region, err);
+        if (U_FAILURE(err)) {
+            return result;
+        }
+        defaultScript = performFallbackLookup(localeID.data(), dsLocaleIDChars, scriptCodeChars, defaultScriptTable, UPRV_LENGTHOF(defaultScriptTable));
+    }
+
+    // if we didn't find anything, look up just the language in the default script table
+    if (defaultScript == nullptr) {
+        defaultScript = performFallbackLookup(language.data(), dsLocaleIDChars, scriptCodeChars, defaultScriptTable, UPRV_LENGTHOF(defaultScriptTable));
+    }
+
+    // if either lookup above succeeded, copy the result from "defaultScript" into "result"; otherwise, return "Latn"
+    if (defaultScript != nullptr) {
+        result.clear();
+        result.append(defaultScript, err);
+    }
+    return result;
+}
+
+enum UResOpenType {
+    /**
+     * Open a resource bundle for the locale;
+     * if there is not even a base language bundle, then fall back to the default locale;
+     * if there is no bundle for that either, then load the root bundle.
+     *
+     * This is the default bundle loading behavior.
+     */
+    URES_OPEN_LOCALE_DEFAULT_ROOT,
+    // TODO: ICU ticket #11271 "consistent default locale across locale trees"
+    // Add an option to look at the main locale tree for whether to
+    // fall back to root directly (if the locale has main data) or
+    // fall back to the default locale first (if the locale does not even have main data).
+    /**
+     * Open a resource bundle for the locale;
+     * if there is not even a base language bundle, then load the root bundle;
+     * never fall back to the default locale.
+     *
+     * This is used for algorithms that have good pan-Unicode default behavior,
+     * such as case mappings, collation, and segmentation (BreakIterator).
+     */
+    URES_OPEN_LOCALE_ROOT,
+    /**
+     * Open a resource bundle for the exact bundle name as requested;
+     * no fallbacks, do not load parent bundles.
+     *
+     * This is used for supplemental (non-locale) data.
+     */
+    URES_OPEN_DIRECT
+};
+typedef enum UResOpenType UResOpenType;
+
+/**
+ * Internal function, determines the search path for resource bundle files.  Currently used only by findFirstExisting(), when a bundle
+ * for the specified locale doesn't exist; inheritance of resources between existing bundle files continues to use chopLocale() above.
+ * @param name In-out parameter: On input, the locale ID to get a parent locale ID for (this is a locale's base name, without keywords);
+ * on output, the requested parent locale ID.
+ * @param origName The original locale ID the caller of findFirstExisting() requested.  This is the same as `name` on the first call to this
+ * function, but as findFirstExisting() ascends the parent tree, this parameter continues to be the original locale ID requested.
+ * @param openType The open type of the bundle; the parent locale table is only consulted for URES_OPEN_LOCALE_DEFAULT_ROOT.
+ * @return true if `name` was updated to the next locale ID to try; false if there is no parent (the caller then falls back to root).
+ */
+static bool getParentLocaleID(char *name, const char *origName, UResOpenType openType) {
+    // early out if the locale ID is empty (name[len - 1] would read before the buffer), has a variant code, or ends with _
+    if (name[0] == '\0' || name[uprv_strlen(name) - 1] == '_' || hasVariant(name)) {
+        return chopLocale(name);
+    }
+
+    UErrorCode err = U_ZERO_ERROR;
+    const char* tempNamePtr = name;
+    CharString language = ulocimp_getLanguage(tempNamePtr, &tempNamePtr, err);
+    if (*tempNamePtr == '_') {
+        ++tempNamePtr;
+    }
+    CharString script = ulocimp_getScript(tempNamePtr, &tempNamePtr, err);
+    if (*tempNamePtr == '_') {
+        ++tempNamePtr;
+    }
+    CharString region = ulocimp_getCountry(tempNamePtr, &tempNamePtr, err);
+    CharString workingLocale;
+    if (U_FAILURE(err)) {
+        // hopefully this never happens...
+        return chopLocale(name);
+    }
+
+    // if the open type is URES_OPEN_LOCALE_DEFAULT_ROOT, first look the locale ID up in the parent locale table;
+    // if that table specifies a parent for it, return that  (we don't do this for the other open types-- if we're not
+    // falling back through the system default locale, we also want to do straight truncation fallback instead
+    // of looking things up in the parent locale table-- see https://www.unicode.org/reports/tr35/tr35.html#Parent_Locales:
+    // "Collation data, however, is an exception...")
+    if (openType == URES_OPEN_LOCALE_DEFAULT_ROOT) {
+        const char* parentID = performFallbackLookup(name, parentLocaleChars, parentLocaleChars, parentLocaleTable, UPRV_LENGTHOF(parentLocaleTable));
+        if (parentID != NULL) {
+            uprv_strcpy(name, parentID);
+            return true;
+        }
+    }
+
+    // if it's not in the parent locale table, figure out the fallback script algorithmically
+    // (see CLDR-15265 for an explanation of the algorithm)
+    if (!script.isEmpty() && !region.isEmpty()) {
+        // if "name" has both script and region, is the script the default script?
+        // - if so, remove it and keep the region
+        // - if not, remove the region and keep the script
+        if (getDefaultScript(language, region) == script.toStringPiece()) {
+            workingLocale.append(language, err).append("_", err).append(region, err);
+        } else {
+            workingLocale.append(language, err).append("_", err).append(script, err);
+        }
+    } else if (!region.isEmpty()) {
+        // if "name" has region but not script, did the original locale ID specify a script?
+        // - if yes, replace the region with the script from the original locale ID
+        // - if no, replace the region with the default script for that language and region
+        // (errors are recorded in the function-level "err", which is checked before "name" is overwritten)
+        tempNamePtr = origName;
+        CharString origNameLanguage = ulocimp_getLanguage(tempNamePtr, &tempNamePtr, err);  // parsed only to advance tempNamePtr
+        if (*tempNamePtr == '_') {
+            ++tempNamePtr;
+        }
+        CharString origNameScript = ulocimp_getScript(tempNamePtr, nullptr, err);  // parse after the language, not from the start of origName
+        if (!origNameScript.isEmpty()) {
+            workingLocale.append(language, err).append("_", err).append(origNameScript, err);
+        } else {
+            workingLocale.append(language, err).append("_", err).append(getDefaultScript(language, region), err);
+        }
+    } else if (!script.isEmpty()) {
+        // if "name" has script but not region (and our open type is URES_OPEN_LOCALE_DEFAULT_ROOT), is the script
+        // the default script for the language?
+        // - if so, remove it from the locale ID
+        // - if not, return false to continue up the chain
+        // (we don't do this for other open types for the same reason we don't look things up in the parent
+        // locale table for other open types-- see the reference to UTS #35 above)
+        if (openType != URES_OPEN_LOCALE_DEFAULT_ROOT || getDefaultScript(language, CharString()) == script.toStringPiece()) {
+            workingLocale.append(language, err);
+        } else {
+            return false;
+        }
+    } else {
+        // if "name" just contains a language code, return false so the calling code falls back to "root"
+        return false;
+    }
+    if (U_SUCCESS(err) && !workingLocale.isEmpty()) {
+        uprv_strcpy(name, workingLocale.data());
+        return true;
+    } else {
+        return false;
+    }
+}
+
 /**
  * Called to check whether a name without '_' needs to be checked for a parent.
  * Some code had assumed that locale IDs with '_' could not have a non-root parent.
@@ -463,13 +659,15 @@ getPoolEntry(const char *path, UErrorCode *status) {
 /* INTERNAL: */
 /* CAUTION: resbMutex must be locked when calling this function!
*/ static UResourceDataEntry * -findFirstExisting(const char* path, char* name, const char* defaultLocale, - UBool *isRoot, UBool *hasChopped, UBool *isDefault, UErrorCode* status) { +findFirstExisting(const char* path, char* name, const char* defaultLocale, UResOpenType openType, + UBool *isRoot, UBool *foundParent, UBool *isDefault, UErrorCode* status) { UResourceDataEntry *r = NULL; UBool hasRealData = false; - *hasChopped = true; /* we're starting with a fresh name */ + *foundParent = true; /* we're starting with a fresh name */ + char origName[ULOC_FULLNAME_CAPACITY]; - while(*hasChopped && !hasRealData) { + uprv_strcpy(origName, name); + while(*foundParent && !hasRealData) { r = init_entry(name, path, status); /* Null pointer test */ if (U_FAILURE(*status)) { @@ -494,8 +692,14 @@ findFirstExisting(const char* path, char* name, const char* defaultLocale, *isRoot = (UBool)(uprv_strcmp(name, kRootLocaleName) == 0); /*Fallback data stuff*/ - *hasChopped = chopLocale(name); - if (*hasChopped && *name == '\0') { + if (!hasRealData) { + *foundParent = getParentLocaleID(name, origName, openType); + } else { + // we've already found a real resource file; what we return to the caller is the parent + // locale ID for inheritance, which should come from chopLocale(), not getParentLocaleID() + *foundParent = chopLocale(name); + } + if (*foundParent && *name == '\0') { uprv_strcpy(name, "und"); } } @@ -602,38 +806,6 @@ insertRootBundle(UResourceDataEntry *&t1, UErrorCode *status) { return true; } -enum UResOpenType { - /** - * Open a resource bundle for the locale; - * if there is not even a base language bundle, then fall back to the default locale; - * if there is no bundle for that either, then load the root bundle. - * - * This is the default bundle loading behavior. 
- */ - URES_OPEN_LOCALE_DEFAULT_ROOT, - // TODO: ICU ticket #11271 "consistent default locale across locale trees" - // Add an option to look at the main locale tree for whether to - // fall back to root directly (if the locale has main data) or - // fall back to the default locale first (if the locale does not even have main data). - /** - * Open a resource bundle for the locale; - * if there is not even a base language bundle, then load the root bundle; - * never fall back to the default locale. - * - * This is used for algorithms that have good pan-Unicode default behavior, - * such as case mappings, collation, and segmentation (BreakIterator). - */ - URES_OPEN_LOCALE_ROOT, - /** - * Open a resource bundle for the exact bundle name as requested; - * no fallbacks, do not load parent bundles. - * - * This is used for supplemental (non-locale) data. - */ - URES_OPEN_DIRECT -}; -typedef enum UResOpenType UResOpenType; - static UResourceDataEntry *entryOpen(const char* path, const char* localeID, UResOpenType openType, UErrorCode* status) { U_ASSERT(openType != URES_OPEN_DIRECT); @@ -676,7 +848,7 @@ static UResourceDataEntry *entryOpen(const char* path, const char* localeID, Mutex lock(&resbMutex); // Lock resbMutex until the end of this function. /* We're going to skip all the locales that do not have any data */ - r = findFirstExisting(path, name, defaultLocale, &isRoot, &hasChopped, &isDefault, &intStatus); + r = findFirstExisting(path, name, defaultLocale, openType, &isRoot, &hasChopped, &isDefault, &intStatus); // If we failed due to out-of-memory, report the failure and exit early. 
if (intStatus == U_MEMORY_ALLOCATION_ERROR) { @@ -717,7 +889,7 @@ static UResourceDataEntry *entryOpen(const char* path, const char* localeID, if(r==NULL && openType == URES_OPEN_LOCALE_DEFAULT_ROOT && !isDefault && !isRoot) { /* insert default locale */ uprv_strcpy(name, defaultLocale); - r = findFirstExisting(path, name, defaultLocale, &isRoot, &hasChopped, &isDefault, &intStatus); + r = findFirstExisting(path, name, defaultLocale, openType, &isRoot, &hasChopped, &isDefault, &intStatus); // If we failed due to out-of-memory, report the failure and exit early. if (intStatus == U_MEMORY_ALLOCATION_ERROR) { *status = intStatus; @@ -741,7 +913,7 @@ static UResourceDataEntry *entryOpen(const char* path, const char* localeID, /* present */ if(r == NULL) { uprv_strcpy(name, kRootLocaleName); - r = findFirstExisting(path, name, defaultLocale, &isRoot, &hasChopped, &isDefault, &intStatus); + r = findFirstExisting(path, name, defaultLocale, openType, &isRoot, &hasChopped, &isDefault, &intStatus); // If we failed due to out-of-memory, report the failure and exit early. 
if (intStatus == U_MEMORY_ALLOCATION_ERROR) { *status = intStatus; diff --git a/icu4c/source/test/cintltst/crestst.c b/icu4c/source/test/cintltst/crestst.c index bc2889e55af..4622dcb8eb9 100644 --- a/icu4c/source/test/cintltst/crestst.c +++ b/icu4c/source/test/cintltst/crestst.c @@ -26,6 +26,7 @@ #include "filestrm.h" #include #include +#include <stdio.h> // for sprintf() #define RESTEST_HEAP_CHECK 0 @@ -38,6 +39,8 @@ static void TestOpenDirect(void); static void TestFallback(void); static void TestTable32(void); static void TestFileStream(void); +static void TestAlgorithmicParentFallback(void); + /*****************************************************************************/ const UChar kERROR[] = { 0x0045 /*E*/, 0x0052 /*'R'*/, 0x0052 /*'R'*/, @@ -120,6 +123,7 @@ void addResourceBundleTest(TestNode** root) #endif addTest(root, &TestFallback, "tsutil/crestst/TestFallback"); addTest(root, &TestAliasConflict, "tsutil/crestst/TestAliasConflict"); + addTest(root, &TestAlgorithmicParentFallback, "tsutil/crestst/TestAlgorithmicParentFallback"); } @@ -1050,3 +1054,60 @@ static void TestGetLocaleByType(void) { } ures_close(res); } + +static void TestAlgorithmicParentFallback(void) { + // Test for ICU-21125 and ICU-21126 -- cases where resource fallback isn't determined by lopping fields off + // the end of the locale ID (or following a %%Parent directive in a resource bundle) + // columns: input locale, expected locale with URES_OPEN_LOCALE_DEFAULT_ROOT ("root" here means the system default locale), expected locale with URES_OPEN_LOCALE_ROOT + const char* testCases[] = { + "de_Latn_LI", "de_LI", "de_LI", +// "en_VA", "en_150", "en",// TODO: put this back in after https://unicode-org.atlassian.net/browse/CLDR-15893 is fixed + "yi_Latn_DE", "root", "yi", + "yi_Hebr_DE", "yi", "yi", + "zh_Hant_SG", "zh_Hant", "zh_Hant" + // would be nice to test that sr_Latn_ME falls back to sr_Latn, or sr_ME to sr_Latn_ME, + // or sr_Latn to root, but all of these resource bundle files actually exist in the project + }; + + for (int32_t i = 0; i < UPRV_LENGTHOF(testCases); i += 3) { + const
char* testLocale = testCases[i]; + const char* regularExpected = testCases[i + 1]; + const char* noDefaultExpected = testCases[i + 2]; + + UErrorCode err = U_ZERO_ERROR; + UResourceBundle* regularRB = ures_open(NULL, testLocale, &err); + char errorMessage[200]; + + sprintf(errorMessage, "Error %s opening resource bundle for locale %s and URES_OPEN_LOCALE_DEFAULT_ROOT", u_errorName(err), testLocale); + if (assertSuccess(errorMessage, &err)) { + const char* resourceLocale = ures_getLocaleByType(regularRB, ULOC_ACTUAL_LOCALE, &err); + + sprintf(errorMessage, "Error %s getting resource locale for locale %s and URES_OPEN_LOCALE_DEFAULT_ROOT", u_errorName(err), testLocale); + if (assertSuccess(errorMessage, &err)) { + sprintf(errorMessage, "Mismatch for locale %s and URES_OPEN_LOCALE_DEFAULT_ROOT", testLocale); + if (uprv_strcmp(regularExpected, "root") == 0) { + // (the system default locale may have keywords-- just check if the resource locale (which won't) is a prefix of the system default) + assertTrue(errorMessage, uprv_strncmp(uloc_getDefault(), resourceLocale, uprv_strlen(resourceLocale)) == 0); + } else { + assertEquals(errorMessage, regularExpected, resourceLocale); + } + } + } + ures_close(regularRB); + + err = U_ZERO_ERROR; + UResourceBundle* noDefaultRB = ures_openNoDefault(NULL, testLocale, &err); + + sprintf(errorMessage, "Error %s opening resource bundle for locale %s and URES_OPEN_LOCALE_ROOT", u_errorName(err), testLocale); + if (assertSuccess(errorMessage, &err)) { + const char* resourceLocale = ures_getLocaleByType(noDefaultRB, ULOC_ACTUAL_LOCALE, &err); + + sprintf(errorMessage, "Error %s getting resource locale for locale %s and URES_OPEN_LOCALE_ROOT", u_errorName(err), testLocale); + if (assertSuccess(errorMessage, &err)) { + sprintf(errorMessage, "Mismatch for locale %s and URES_OPEN_LOCALE_ROOT", testLocale); + assertEquals(errorMessage, noDefaultExpected, resourceLocale); + } + } + ures_close(noDefaultRB); + } +} diff --git 
a/icu4j/main/classes/core/src/com/ibm/icu/impl/ICUResourceBundle.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/ICUResourceBundle.java index 268c4522c28..4014d8e3054 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/ICUResourceBundle.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/ICUResourceBundle.java @@ -15,7 +15,9 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.net.URL; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; +import java.util.Comparator; import java.util.EnumMap; import java.util.Enumeration; import java.util.HashMap; @@ -1244,10 +1246,10 @@ public class ICUResourceBundle extends UResourceBundle { localeID = ULocale.getBaseName(localeID); ICUResourceBundle b; if (openType == OpenType.LOCALE_DEFAULT_ROOT) { - b = instantiateBundle(baseName, localeID, ULocale.getDefault().getBaseName(), + b = instantiateBundle(baseName, localeID, null, ULocale.getDefault().getBaseName(), root, openType); } else { - b = instantiateBundle(baseName, localeID, null, root, openType); + b = instantiateBundle(baseName, localeID, null, null, root, openType); } if(b==null){ throw new MissingResourceException( @@ -1261,8 +1263,99 @@ public class ICUResourceBundle extends UResourceBundle { (localeID.length() == lang.length() || localeID.charAt(lang.length()) == '_'); } + private static final Comparator COMPARE_FIRST_ELEMENT = new Comparator() { + @Override + public int compare(String[] pair1, String[] pair2) { + return pair1[0].compareTo(pair2[0]); + } + }; + + private static String getExplicitParent(String localeID) { + return LocaleFallbackData.PARENT_LOCALE_TABLE.get(localeID); + } + + private static String getDefaultScript(String language, String region) { + String localeID = language + "_" + region; + String result = LocaleFallbackData.DEFAULT_SCRIPT_TABLE.get(localeID); + if (result == null) { + result = LocaleFallbackData.DEFAULT_SCRIPT_TABLE.get(language); + } + if (result == null) { + 
result = "Latn"; + } + return result; + } + + private static String getParentLocaleID(String name, String origName, OpenType openType) { + // early out if the locale ID has a variant code or ends with _ + if (name.endsWith("_") || !ULocale.getVariant(name).isEmpty()) { + int lastUnderbarPos = name.lastIndexOf('_'); + if (lastUnderbarPos >= 0) { + return name.substring(0, lastUnderbarPos); + } else { + return null; + } + } + + // TODO: Is there a better way to break the locale ID up into its constituent parts? + ULocale nameLocale = new ULocale(name); + String language = nameLocale.getLanguage(); + String script = nameLocale.getScript(); + String region = nameLocale.getCountry(); + + // if our open type is LOCALE_DEFAULT_ROOT, first look the locale ID up in the parent locale table; if that + // table specifies a parent for it, return that (we don't do this for the other open types-- if we're not + // falling back through the system default locale, we also want to do straight truncation fallback instead + // of looking things up in the parent locale table-- see https://www.unicode.org/reports/tr35/tr35.html#Parent_Locales: + // "Collation data, however, is an exception...") + if (openType == OpenType.LOCALE_DEFAULT_ROOT) { + String parentID = getExplicitParent(name); + if (parentID != null) { + return parentID.equals("root") ? null : parentID; + } + } + + // if it's not in the parent locale table, figure out the fallback script algorithmically + // (see CLDR-15265 for an explanation of the algorithm) + if (!script.isEmpty() && !region.isEmpty()) { + // if "name" has both script and region, is the script the default script?
+ // - if so, remove it and keep the region + // - if not, remove the region and keep the script + if (getDefaultScript(language, region).equals(script)) { + return language + "_" + region; + } else { + return language + "_" + script; + } + } else if (!region.isEmpty()) { + // if "name" has region but not script, did the original locale ID specify a script? + // - if yes, replace the region with the script from the original locale ID + // - if no, replace the region with the default script for that language and region + String origNameScript = ULocale.getScript(origName); + if (!origNameScript.isEmpty()) { + return language + "_" + origNameScript; + } else { + return language + "_" + getDefaultScript(language, region); + } + } else if (!script.isEmpty()) { + // if "name" has script but not region (and our open type is LOCALE_DEFAULT_ROOT), is the script the + // default script for the language? + // - if so, remove it from the locale ID + // - if not, return "root" (bypassing the system default locale ID) + // (we don't do this for other open types for the same reason we don't look things up in the parent + // locale table for other open types-- see the reference to UTS #35 above) + if (openType != OpenType.LOCALE_DEFAULT_ROOT || getDefaultScript(language, null).equals(script)) { + return language; + } else { + return /*"root"*/null; + } + } else { + // if "name" just contains a language code, return null so the calling code falls back to "root" + return null; + } + } + private static ICUResourceBundle instantiateBundle( - final String baseName, final String localeID, final String defaultID, + final String baseName, final String localeID, final String origLocaleID, final String defaultID, final ClassLoader root, final OpenType openType) { assert localeID.indexOf('@') < 0; assert defaultID == null || defaultID.indexOf('@') < 0; @@ -1304,17 +1397,15 @@ public class ICUResourceBundle extends UResourceBundle { // fallback to locale ID parent if(b == null){ - int i = 
localeName.lastIndexOf('_'); - if (i != -1) { - // Chop off the last underscore and the subtag after that. - String temp = localeName.substring(0, i); - b = instantiateBundle(baseName, temp, defaultID, root, openType); + String origLocaleName = (origLocaleID != null) ? origLocaleID : localeName; + String fallbackLocaleID = getParentLocaleID(localeName, origLocaleName, openType); + if (fallbackLocaleID != null) { + b = instantiateBundle(baseName, fallbackLocaleID, origLocaleName, defaultID, root, openType); }else{ - // No underscore, only a base language subtag. if(openType == OpenType.LOCALE_DEFAULT_ROOT && !localeIDStartsWithLangSubtag(defaultID, localeName)) { // Go to the default locale before root. - b = instantiateBundle(baseName, defaultID, defaultID, root, openType); + b = instantiateBundle(baseName, defaultID, null, defaultID, root, openType); } else if(openType != OpenType.LOCALE_ONLY && !rootLocale.isEmpty()) { // Ultimately go to root. b = ICUResourceBundle.createBundle(baseName, rootLocale, root); @@ -1328,11 +1419,11 @@ public class ICUResourceBundle extends UResourceBundle { // TODO: C++ uresbund.cpp also checks for %%ParentIsRoot. Why not Java? 
String parentLocaleName = ((ICUResourceBundleImpl.ResourceTable)b).findString("%%Parent"); if (parentLocaleName != null) { - parent = instantiateBundle(baseName, parentLocaleName, defaultID, root, openType); + parent = instantiateBundle(baseName, parentLocaleName, null, defaultID, root, openType); } else if (i != -1) { - parent = instantiateBundle(baseName, localeName.substring(0, i), defaultID, root, openType); + parent = instantiateBundle(baseName, localeName.substring(0, i), null, defaultID, root, openType); } else if (!localeName.equals(rootLocale)){ - parent = instantiateBundle(baseName, rootLocale, defaultID, root, openType); + parent = instantiateBundle(baseName, rootLocale, null, defaultID, root, openType); } if (!b.equals(parent)){ diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/LocaleFallbackData.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/LocaleFallbackData.java new file mode 100644 index 00000000000..ea42151da72 --- /dev/null +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/LocaleFallbackData.java @@ -0,0 +1,574 @@ +// © 2022 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +// +// Internal static data tables used by ICUResourceBundle.java +// WARNING: This file is mechanically generated by the CLDR-to-ICU tool +// (see tools/cldr/cldr-to-icu/src/main/java/org/unicode/tool/cldrtoicu/generator/ResourcFallbackCodeGenerator.java). +// DO NOT HAND EDIT!!! 
+ +package com.ibm.icu.impl; + +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +class LocaleFallbackData { + //====================================================================== + // Default script table + public static final Map DEFAULT_SCRIPT_TABLE = buildDefaultScriptTable(); + + private static Map buildDefaultScriptTable() { + Map t = new HashMap<>(); + t.put("ab", "Cyrl"); + t.put("abq", "Cyrl"); + t.put("adp", "Tibt"); + t.put("ady", "Cyrl"); + t.put("ae", "Avst"); + t.put("aeb", "Arab"); + t.put("aho", "Ahom"); + t.put("akk", "Xsux"); + t.put("alt", "Cyrl"); + t.put("am", "Ethi"); + t.put("apc", "Arab"); + t.put("apd", "Arab"); + t.put("ar", "Arab"); + t.put("arc", "Armi"); + t.put("arq", "Arab"); + t.put("ars", "Arab"); + t.put("ary", "Arab"); + t.put("arz", "Arab"); + t.put("as", "Beng"); + t.put("ase", "Sgnw"); + t.put("av", "Cyrl"); + t.put("avl", "Arab"); + t.put("awa", "Deva"); + t.put("az_IQ", "Arab"); + t.put("az_IR", "Arab"); + t.put("az_RU", "Cyrl"); + t.put("ba", "Cyrl"); + t.put("bal", "Arab"); + t.put("bap", "Deva"); + t.put("bax", "Bamu"); + t.put("bcq", "Ethi"); + t.put("be", "Cyrl"); + t.put("bej", "Arab"); + t.put("bfq", "Taml"); + t.put("bft", "Arab"); + t.put("bfy", "Deva"); + t.put("bg", "Cyrl"); + t.put("bgc", "Deva"); + t.put("bgn", "Arab"); + t.put("bgx", "Grek"); + t.put("bhb", "Deva"); + t.put("bhi", "Deva"); + t.put("bho", "Deva"); + t.put("bji", "Ethi"); + t.put("bjj", "Deva"); + t.put("blt", "Tavt"); + t.put("bn", "Beng"); + t.put("bo", "Tibt"); + t.put("bpy", "Beng"); + t.put("bqi", "Arab"); + t.put("bra", "Deva"); + t.put("brh", "Arab"); + t.put("brx", "Deva"); + t.put("bsq", "Bass"); + t.put("bst", "Ethi"); + t.put("btv", "Deva"); + t.put("bua", "Cyrl"); + t.put("byn", "Ethi"); + t.put("ccp", "Cakm"); + t.put("ce", "Cyrl"); + t.put("chm", "Cyrl"); + t.put("chr", "Cher"); + t.put("cja", "Arab"); + t.put("cjm", "Cham"); + t.put("ckb", "Arab"); + t.put("cmg", "Soyo"); + t.put("cop", "Copt"); 
+ t.put("cr", "Cans"); + t.put("crh", "Cyrl"); + t.put("crk", "Cans"); + t.put("crl", "Cans"); + t.put("csw", "Cans"); + t.put("ctd", "Pauc"); + t.put("cu", "Cyrl"); + t.put("cv", "Cyrl"); + t.put("dar", "Cyrl"); + t.put("dcc", "Arab"); + t.put("dgl", "Arab"); + t.put("dmf", "Medf"); + t.put("doi", "Deva"); + t.put("drh", "Mong"); + t.put("drs", "Ethi"); + t.put("dty", "Deva"); + t.put("dv", "Thaa"); + t.put("dz", "Tibt"); + t.put("egy", "Egyp"); + t.put("eky", "Kali"); + t.put("el", "Grek"); + t.put("esg", "Gonm"); + t.put("ett", "Ital"); + t.put("fa", "Arab"); + t.put("fia", "Arab"); + t.put("fub", "Arab"); + t.put("gan", "Hans"); + t.put("gbm", "Deva"); + t.put("gbz", "Arab"); + t.put("gez", "Ethi"); + t.put("ggn", "Deva"); + t.put("gjk", "Arab"); + t.put("gju", "Arab"); + t.put("glk", "Arab"); + t.put("gmv", "Ethi"); + t.put("gof", "Ethi"); + t.put("gom", "Deva"); + t.put("gon", "Telu"); + t.put("got", "Goth"); + t.put("grc", "Cprt"); + t.put("grt", "Beng"); + t.put("gu", "Gujr"); + t.put("gvr", "Deva"); + t.put("gwc", "Arab"); + t.put("gwt", "Arab"); + t.put("ha_CM", "Arab"); + t.put("ha_SD", "Arab"); + t.put("hak", "Hans"); + t.put("haz", "Arab"); + t.put("hdy", "Ethi"); + t.put("he", "Hebr"); + t.put("hi", "Deva"); + t.put("hlu", "Hluw"); + t.put("hmd", "Plrd"); + t.put("hnd", "Arab"); + t.put("hne", "Deva"); + t.put("hnj", "Hmnp"); + t.put("hno", "Arab"); + t.put("hoc", "Deva"); + t.put("hoj", "Deva"); + t.put("hsn", "Hans"); + t.put("hy", "Armn"); + t.put("ii", "Yiii"); + t.put("inh", "Cyrl"); + t.put("iu", "Cans"); + t.put("iw", "Hebr"); + t.put("ja", "Jpan"); + t.put("ji", "Hebr"); + t.put("jml", "Deva"); + t.put("ka", "Geor"); + t.put("kaa", "Cyrl"); + t.put("kaw", "Kawi"); + t.put("kbd", "Cyrl"); + t.put("kby", "Arab"); + t.put("kdt", "Thai"); + t.put("kfr", "Deva"); + t.put("kfy", "Deva"); + t.put("khb", "Talu"); + t.put("khn", "Deva"); + t.put("kht", "Mymr"); + t.put("khw", "Arab"); + t.put("kjg", "Laoo"); + t.put("kk", "Cyrl"); + t.put("kk_AF", 
"Arab"); + t.put("kk_CN", "Arab"); + t.put("kk_IR", "Arab"); + t.put("kk_MN", "Arab"); + t.put("km", "Khmr"); + t.put("kn", "Knda"); + t.put("ko", "Kore"); + t.put("koi", "Cyrl"); + t.put("kok", "Deva"); + t.put("kqy", "Ethi"); + t.put("krc", "Cyrl"); + t.put("kru", "Deva"); + t.put("ks", "Arab"); + t.put("ktb", "Ethi"); + t.put("ku_LB", "Arab"); + t.put("kum", "Cyrl"); + t.put("kv", "Cyrl"); + t.put("kvx", "Arab"); + t.put("kxc", "Ethi"); + t.put("kxl", "Deva"); + t.put("kxm", "Thai"); + t.put("kxp", "Arab"); + t.put("ky", "Cyrl"); + t.put("ky_CN", "Arab"); + t.put("kzh", "Arab"); + t.put("lab", "Lina"); + t.put("lad", "Hebr"); + t.put("lah", "Arab"); + t.put("lbe", "Cyrl"); + t.put("lcp", "Thai"); + t.put("lep", "Lepc"); + t.put("lez", "Cyrl"); + t.put("lif", "Deva"); + t.put("lis", "Lisu"); + t.put("lki", "Arab"); + t.put("lmn", "Telu"); + t.put("lo", "Laoo"); + t.put("lrc", "Arab"); + t.put("luz", "Arab"); + t.put("lwl", "Thai"); + t.put("lzh", "Hans"); + t.put("mag", "Deva"); + t.put("mai", "Deva"); + t.put("man_GN", "Nkoo"); + t.put("mde", "Arab"); + t.put("mdf", "Cyrl"); + t.put("mdx", "Ethi"); + t.put("mfa", "Arab"); + t.put("mgp", "Deva"); + t.put("mk", "Cyrl"); + t.put("mki", "Arab"); + t.put("ml", "Mlym"); + t.put("mn", "Cyrl"); + t.put("mn_CN", "Mong"); + t.put("mni", "Beng"); + t.put("mnw", "Mymr"); + t.put("mr", "Deva"); + t.put("mrd", "Deva"); + t.put("mrj", "Cyrl"); + t.put("mro", "Mroo"); + t.put("ms_CC", "Arab"); + t.put("mtr", "Deva"); + t.put("mvy", "Arab"); + t.put("mwr", "Deva"); + t.put("mww", "Hmnp"); + t.put("my", "Mymr"); + t.put("mym", "Ethi"); + t.put("myv", "Cyrl"); + t.put("myz", "Mand"); + t.put("mzn", "Arab"); + t.put("nan", "Hans"); + t.put("ne", "Deva"); + t.put("new", "Deva"); + t.put("nnp", "Wcho"); + t.put("nod", "Lana"); + t.put("noe", "Deva"); + t.put("non", "Runr"); + t.put("nqo", "Nkoo"); + t.put("nsk", "Cans"); + t.put("nst", "Tnsa"); + t.put("oj", "Cans"); + t.put("ojs", "Cans"); + t.put("or", "Orya"); + t.put("oru", 
"Arab"); + t.put("os", "Cyrl"); + t.put("osa", "Osge"); + t.put("ota", "Arab"); + t.put("otk", "Orkh"); + t.put("oui", "Ougr"); + t.put("pa", "Guru"); + t.put("pa_PK", "Arab"); + t.put("pal", "Phli"); + t.put("peo", "Xpeo"); + t.put("phl", "Arab"); + t.put("phn", "Phnx"); + t.put("pka", "Brah"); + t.put("pnt", "Grek"); + t.put("ppa", "Deva"); + t.put("pra", "Khar"); + t.put("prd", "Arab"); + t.put("ps", "Arab"); + t.put("raj", "Deva"); + t.put("rhg", "Rohg"); + t.put("rif", "Tfng"); + t.put("rjs", "Deva"); + t.put("rkt", "Beng"); + t.put("rmt", "Arab"); + t.put("ru", "Cyrl"); + t.put("rue", "Cyrl"); + t.put("ryu", "Kana"); + t.put("sa", "Deva"); + t.put("sah", "Cyrl"); + t.put("sat", "Olck"); + t.put("saz", "Saur"); + t.put("sck", "Deva"); + t.put("scl", "Arab"); + t.put("sd", "Arab"); + t.put("sd_IN", "Deva"); + t.put("sdh", "Arab"); + t.put("sga", "Ogam"); + t.put("sgw", "Ethi"); + t.put("shi", "Tfng"); + t.put("shn", "Mymr"); + t.put("shu", "Arab"); + t.put("si", "Sinh"); + t.put("skr", "Arab"); + t.put("smp", "Samr"); + t.put("sog", "Sogd"); + t.put("sou", "Thai"); + t.put("sr", "Cyrl"); + t.put("srb", "Sora"); + t.put("srx", "Deva"); + t.put("swb", "Arab"); + t.put("swv", "Deva"); + t.put("syl", "Beng"); + t.put("syr", "Syrc"); + t.put("ta", "Taml"); + t.put("taj", "Deva"); + t.put("tcy", "Knda"); + t.put("tdd", "Tale"); + t.put("tdg", "Deva"); + t.put("tdh", "Deva"); + t.put("te", "Telu"); + t.put("tg", "Cyrl"); + t.put("tg_PK", "Arab"); + t.put("th", "Thai"); + t.put("thl", "Deva"); + t.put("thq", "Deva"); + t.put("thr", "Deva"); + t.put("ti", "Ethi"); + t.put("tig", "Ethi"); + t.put("tkt", "Deva"); + t.put("trw", "Arab"); + t.put("tsd", "Grek"); + t.put("tsf", "Deva"); + t.put("tsj", "Tibt"); + t.put("tt", "Cyrl"); + t.put("tts", "Thai"); + t.put("txg", "Tang"); + t.put("txo", "Toto"); + t.put("tyv", "Cyrl"); + t.put("udi", "Aghb"); + t.put("udm", "Cyrl"); + t.put("ug", "Arab"); + t.put("ug_KZ", "Cyrl"); + t.put("ug_MN", "Cyrl"); + t.put("uga", "Ugar"); + 
t.put("uk", "Cyrl"); + t.put("unr", "Beng"); + t.put("unr_NP", "Deva"); + t.put("unx", "Beng"); + t.put("ur", "Arab"); + t.put("uz_AF", "Arab"); + t.put("uz_CN", "Cyrl"); + t.put("vai", "Vaii"); + t.put("wal", "Ethi"); + t.put("wbq", "Telu"); + t.put("wbr", "Deva"); + t.put("wni", "Arab"); + t.put("wsg", "Gong"); + t.put("wtm", "Deva"); + t.put("wuu", "Hans"); + t.put("xco", "Chrs"); + t.put("xcr", "Cari"); + t.put("xlc", "Lyci"); + t.put("xld", "Lydi"); + t.put("xmf", "Geor"); + t.put("xmn", "Mani"); + t.put("xmr", "Merc"); + t.put("xna", "Narb"); + t.put("xnr", "Deva"); + t.put("xpr", "Prti"); + t.put("xsa", "Sarb"); + t.put("xsr", "Deva"); + t.put("yi", "Hebr"); + t.put("yue", "Hant"); + t.put("yue_CN", "Hans"); + t.put("zdj", "Arab"); + t.put("zgh", "Tfng"); + t.put("zh", "Hans"); + t.put("zh_AU", "Hant"); + t.put("zh_BN", "Hant"); + t.put("zh_GB", "Hant"); + t.put("zh_GF", "Hant"); + t.put("zh_HK", "Hant"); + t.put("zh_ID", "Hant"); + t.put("zh_MO", "Hant"); + t.put("zh_PA", "Hant"); + t.put("zh_PF", "Hant"); + t.put("zh_PH", "Hant"); + t.put("zh_SR", "Hant"); + t.put("zh_TH", "Hant"); + t.put("zh_TW", "Hant"); + t.put("zh_US", "Hant"); + t.put("zh_VN", "Hant"); + t.put("zhx", "Nshu"); + t.put("zkt", "Kits"); + return Collections.unmodifiableMap(t); + } + + //====================================================================== + // Parent locale table + public static final Map PARENT_LOCALE_TABLE = buildParentLocaleTable(); + + private static Map buildParentLocaleTable() { + Map t = new HashMap<>(); + t.put("az_Arab", "root"); + t.put("az_Cyrl", "root"); + t.put("bal_Latn", "root"); + t.put("blt_Latn", "root"); + t.put("bm_Nkoo", "root"); + t.put("bs_Cyrl", "root"); + t.put("byn_Latn", "root"); + t.put("cu_Glag", "root"); + t.put("dje_Arab", "root"); + t.put("dyo_Arab", "root"); + t.put("en_150", "en_001"); + t.put("en_AG", "en_001"); + t.put("en_AI", "en_001"); + t.put("en_AT", "en_150"); + t.put("en_AU", "en_001"); + t.put("en_BB", "en_001"); + 
t.put("en_BE", "en_150"); + t.put("en_BM", "en_001"); + t.put("en_BS", "en_001"); + t.put("en_BW", "en_001"); + t.put("en_BZ", "en_001"); + t.put("en_CC", "en_001"); + t.put("en_CH", "en_150"); + t.put("en_CK", "en_001"); + t.put("en_CM", "en_001"); + t.put("en_CX", "en_001"); + t.put("en_CY", "en_001"); + t.put("en_DE", "en_150"); + t.put("en_DG", "en_001"); + t.put("en_DK", "en_150"); + t.put("en_DM", "en_001"); + t.put("en_Dsrt", "root"); + t.put("en_ER", "en_001"); + t.put("en_FI", "en_150"); + t.put("en_FJ", "en_001"); + t.put("en_FK", "en_001"); + t.put("en_FM", "en_001"); + t.put("en_GB", "en_001"); + t.put("en_GD", "en_001"); + t.put("en_GG", "en_001"); + t.put("en_GH", "en_001"); + t.put("en_GI", "en_001"); + t.put("en_GM", "en_001"); + t.put("en_GY", "en_001"); + t.put("en_HK", "en_001"); + t.put("en_IE", "en_001"); + t.put("en_IL", "en_001"); + t.put("en_IM", "en_001"); + t.put("en_IN", "en_001"); + t.put("en_IO", "en_001"); + t.put("en_JE", "en_001"); + t.put("en_JM", "en_001"); + t.put("en_KE", "en_001"); + t.put("en_KI", "en_001"); + t.put("en_KN", "en_001"); + t.put("en_KY", "en_001"); + t.put("en_LC", "en_001"); + t.put("en_LR", "en_001"); + t.put("en_LS", "en_001"); + t.put("en_MG", "en_001"); + t.put("en_MO", "en_001"); + t.put("en_MS", "en_001"); + t.put("en_MT", "en_001"); + t.put("en_MU", "en_001"); + t.put("en_MV", "en_001"); + t.put("en_MW", "en_001"); + t.put("en_MY", "en_001"); + t.put("en_NA", "en_001"); + t.put("en_NF", "en_001"); + t.put("en_NG", "en_001"); + t.put("en_NL", "en_150"); + t.put("en_NR", "en_001"); + t.put("en_NU", "en_001"); + t.put("en_NZ", "en_001"); + t.put("en_PG", "en_001"); + t.put("en_PK", "en_001"); + t.put("en_PN", "en_001"); + t.put("en_PW", "en_001"); + t.put("en_RW", "en_001"); + t.put("en_SB", "en_001"); + t.put("en_SC", "en_001"); + t.put("en_SD", "en_001"); + t.put("en_SE", "en_150"); + t.put("en_SG", "en_001"); + t.put("en_SH", "en_001"); + t.put("en_SI", "en_150"); + t.put("en_SL", "en_001"); + 
t.put("en_SS", "en_001"); + t.put("en_SX", "en_001"); + t.put("en_SZ", "en_001"); + t.put("en_Shaw", "root"); + t.put("en_TC", "en_001"); + t.put("en_TK", "en_001"); + t.put("en_TO", "en_001"); + t.put("en_TT", "en_001"); + t.put("en_TV", "en_001"); + t.put("en_TZ", "en_001"); + t.put("en_UG", "en_001"); + t.put("en_VC", "en_001"); + t.put("en_VG", "en_001"); + t.put("en_VU", "en_001"); + t.put("en_WS", "en_001"); + t.put("en_ZA", "en_001"); + t.put("en_ZM", "en_001"); + t.put("en_ZW", "en_001"); + t.put("es_AR", "es_419"); + t.put("es_BO", "es_419"); + t.put("es_BR", "es_419"); + t.put("es_BZ", "es_419"); + t.put("es_CL", "es_419"); + t.put("es_CO", "es_419"); + t.put("es_CR", "es_419"); + t.put("es_CU", "es_419"); + t.put("es_DO", "es_419"); + t.put("es_EC", "es_419"); + t.put("es_GT", "es_419"); + t.put("es_HN", "es_419"); + t.put("es_MX", "es_419"); + t.put("es_NI", "es_419"); + t.put("es_PA", "es_419"); + t.put("es_PE", "es_419"); + t.put("es_PR", "es_419"); + t.put("es_PY", "es_419"); + t.put("es_SV", "es_419"); + t.put("es_US", "es_419"); + t.put("es_UY", "es_419"); + t.put("es_VE", "es_419"); + t.put("ff_Adlm", "root"); + t.put("ff_Arab", "root"); + t.put("ha_Arab", "root"); + t.put("hi_Latn", "en_IN"); + t.put("ht", "fr_HT"); + t.put("iu_Latn", "root"); + t.put("kk_Arab", "root"); + t.put("ks_Deva", "root"); + t.put("ku_Arab", "root"); + t.put("ky_Arab", "root"); + t.put("ky_Latn", "root"); + t.put("ml_Arab", "root"); + t.put("mn_Mong", "root"); + t.put("mni_Mtei", "root"); + t.put("ms_Arab", "root"); + t.put("nb", "no"); + t.put("nn", "no"); + t.put("pa_Arab", "root"); + t.put("pt_AO", "pt_PT"); + t.put("pt_CH", "pt_PT"); + t.put("pt_CV", "pt_PT"); + t.put("pt_FR", "pt_PT"); + t.put("pt_GQ", "pt_PT"); + t.put("pt_GW", "pt_PT"); + t.put("pt_LU", "pt_PT"); + t.put("pt_MO", "pt_PT"); + t.put("pt_MZ", "pt_PT"); + t.put("pt_ST", "pt_PT"); + t.put("pt_TL", "pt_PT"); + t.put("sat_Deva", "root"); + t.put("sd_Deva", "root"); + t.put("sd_Khoj", "root"); + 
t.put("sd_Sind", "root"); + t.put("shi_Latn", "root"); + t.put("so_Arab", "root"); + t.put("sr_Latn", "root"); + t.put("sw_Arab", "root"); + t.put("tg_Arab", "root"); + t.put("ug_Cyrl", "root"); + t.put("uz_Arab", "root"); + t.put("uz_Cyrl", "root"); + t.put("vai_Latn", "root"); + t.put("wo_Arab", "root"); + t.put("yo_Arab", "root"); + t.put("yue_Hans", "root"); + t.put("zh_Hant", "root"); + t.put("zh_Hant_MO", "zh_Hant_HK"); + return Collections.unmodifiableMap(t); + } +} diff --git a/icu4j/main/tests/collate/src/com/ibm/icu/dev/test/util/ULocaleCollationTest.java b/icu4j/main/tests/collate/src/com/ibm/icu/dev/test/util/ULocaleCollationTest.java index d92e8b8c6e1..170e51c12dc 100644 --- a/icu4j/main/tests/collate/src/com/ibm/icu/dev/test/util/ULocaleCollationTest.java +++ b/icu4j/main/tests/collate/src/com/ibm/icu/dev/test/util/ULocaleCollationTest.java @@ -242,10 +242,10 @@ public class ULocaleCollationTest extends TestFmwk { public void TestNameList() { String[][][] tests = { /* name in French, name in self, minimized, modified */ - {{"fr-Cyrl-BE", "fr-Cyrl-CA"}, - {"Français (cyrillique, Belgique)", "Français (cyrillique, Belgique)", "fr_Cyrl_BE", "fr_Cyrl_BE"}, - {"Français (cyrillique, Canada)", "Français (cyrillique, Canada)", "fr_Cyrl_CA", "fr_Cyrl_CA"}, - }, +// {{"fr-Cyrl-BE", "fr-Cyrl-CA"}, +// {"Français (cyrillique, Belgique)", "Français (cyrillique, Belgique)", "fr_Cyrl_BE", "fr_Cyrl_BE"}, +// {"Français (cyrillique, Canada)", "Français (cyrillique, Canada)", "fr_Cyrl_CA", "fr_Cyrl_CA"}, +// }, {{"en", "de", "fr", "zh"}, {"Allemand", "Deutsch", "de", "de"}, {"Anglais", "English", "en", "en"}, @@ -253,14 +253,14 @@ public class ULocaleCollationTest extends TestFmwk { {"Français", "Français", "fr", "fr"}, }, // some non-canonical names - {{"iw", "iw-US", "no", "no-Cyrl", "in", "in-YU"}, - {"Hébreu (États-Unis)", "עברית (ארצות הברית)", "iw_US", "iw_US"}, - {"Hébreu (Israël)", "עברית (ישראל)", "iw", "iw_IL"}, - {"Indonésien (Indonésie)", "Indonesia 
(Indonesia)", "in", "in_ID"}, - {"Indonésien (Serbie)", "Indonesia (Serbia)", "in_YU", "in_YU"}, - {"Norvégien (cyrillique)", "Norsk (kyrillisk)", "no_Cyrl", "no_Cyrl"}, - {"Norvégien (latin)", "Norsk (latinsk)", "no", "no_Latn"}, - }, +// {{"iw", "iw-US", "no", "no-Cyrl", "in", "in-YU"}, +// {"Hébreu (États-Unis)", "עברית (ארצות הברית)", "iw_US", "iw_US"}, +// {"Hébreu (Israël)", "עברית (ישראל)", "iw", "iw_IL"}, +// {"Indonésien (Indonésie)", "Indonesia (Indonesia)", "in", "in_ID"}, +// {"Indonésien (Serbie)", "Indonesia (Serbia)", "in_YU", "in_YU"}, +// {"Norvégien (cyrillique)", "Norsk (kyrillisk)", "no_Cyrl", "no_Cyrl"}, +// {"Norvégien (latin)", "Norsk (latinsk)", "no", "no_Latn"}, +// }, {{"zh-Hant-TW", "en", "en-gb", "fr", "zh-Hant", "de", "de-CH", "zh-TW"}, {"Allemand (Allemagne)", "Deutsch (Deutschland)", "de", "de_DE"}, {"Allemand (Suisse)", "Deutsch (Schweiz)", "de_CH", "de_CH"}, @@ -283,14 +283,14 @@ public class ULocaleCollationTest extends TestFmwk { {"Serbe (cyrillique)", "Српски (ћирилица)", "sr", "sr_Cyrl"}, {"Serbe (latin)", "Srpski (latinica)", "sr_Latn", "sr_Latn"}, }, - {{"fr-Cyrl", "fr-Arab"}, - {"Français (arabe)", "Français (arabe)", "fr_Arab", "fr_Arab"}, - {"Français (cyrillique)", "Français (cyrillique)", "fr_Cyrl", "fr_Cyrl"}, - }, - {{"fr-Cyrl-BE", "fr-Arab-CA"}, - {"Français (arabe, Canada)", "Français (arabe, Canada)", "fr_Arab_CA", "fr_Arab_CA"}, - {"Français (cyrillique, Belgique)", "Français (cyrillique, Belgique)", "fr_Cyrl_BE", "fr_Cyrl_BE"}, - } +// {{"fr-Cyrl", "fr-Arab"}, +// {"Français (arabe)", "Français (arabe)", "fr_Arab", "fr_Arab"}, +// {"Français (cyrillique)", "Français (cyrillique)", "fr_Cyrl", "fr_Cyrl"}, +// }, +// {{"fr-Cyrl-BE", "fr-Arab-CA"}, +// {"Français (arabe, Canada)", "Français (arabe, Canada)", "fr_Arab_CA", "fr_Arab_CA"}, +// {"Français (cyrillique, Belgique)", "Français (cyrillique, Belgique)", "fr_Cyrl_BE", "fr_Cyrl_BE"}, +// } }; ULocale french = ULocale.FRENCH; LocaleDisplayNames names = 
LocaleDisplayNames.getInstance(french, diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ICUResourceBundleTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ICUResourceBundleTest.java index fc99e716215..f60869b7cf1 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ICUResourceBundleTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ICUResourceBundleTest.java @@ -1144,4 +1144,39 @@ public final class ICUResourceBundleTest extends TestFmwk { } catch (NoSuchElementException ex) { } } + + @Test + public void TestAlgorithmicParentFallback() { + // Test for ICU-21125 and ICU-21126 -- cases where resource fallback isn't determined by lopping fields off + // the end of the locale ID (or following a %%Parent directive in a resource bundle) + // first column is the input locale, second column is the expected locale when opened with LOCALE_DEFAULT_ROOT, third column is the expected locale when opened with LOCALE_ROOT + String[][] testCases = { + { "de_Latn_LI", "de_LI", "de_LI" }, +// { "en_VA", "en_150", "en" }, // TODO: put this back in after https://unicode-org.atlassian.net/browse/CLDR-15893 is fixed + { "yi_Latn_DE", "", "yi" }, // "" in the second column means the system default locale (ULocale.getDefault()) is expected + { "yi_Hebr_DE", "yi", "yi" }, + { "zh_Hant_SG", "zh_Hant", "zh_Hant" }, + // would be nice to test that sr_Latn_ME falls back to sr_Latn, or sr_ME to sr_Latn_ME, + // or sr_Latn to root, but all of these resource bundle files actually exist in the project + }; + + for (String[] testCase : testCases) { + String localeID = testCase[0]; + String localeDefaultRootExpected = testCase[1]; + String localeRootExpected = testCase[2]; + + ULocale locale = new ULocale(localeID); + ICUResourceBundle localeDefaultRootRB = ICUResourceBundle.getBundleInstance(ICUData.ICU_BASE_NAME, locale, ICUResourceBundle.OpenType.LOCALE_DEFAULT_ROOT); + ICUResourceBundle localeRootRB = ICUResourceBundle.getBundleInstance(ICUData.ICU_BASE_NAME, locale, ICUResourceBundle.OpenType.LOCALE_ROOT); + String localeDefaultRootActual =
localeDefaultRootRB.getULocale().toString(); + String localeRootActual = localeRootRB.getULocale().toString(); + + if (localeDefaultRootExpected.isEmpty()) { + assertEquals("Got wrong locale with LOCALE_DEFAULT_ROOT", ULocale.getDefault().toString(), localeDefaultRootActual); + } else { + assertEquals("Got wrong locale with LOCALE_DEFAULT_ROOT", localeDefaultRootExpected, localeDefaultRootActual); + } + assertEquals("Got wrong locale with LOCALE_ROOT", localeRootExpected, localeRootActual); + } + } } diff --git a/tools/cldr/cldr-to-icu/README.txt b/tools/cldr/cldr-to-icu/README.txt index 4b32b679ba6..689f2aadd00 100644 --- a/tools/cldr/cldr-to-icu/README.txt +++ b/tools/cldr/cldr-to-icu/README.txt @@ -58,8 +58,8 @@ $ cd "$TOOLS_ROOT/cldr/lib" $ ./install-cldr-jars.sh "$CLDR_DIR" -Generating all ICU data ------------------------ +Generating all ICU data and source code +--------------------------------------- $ cd "$TOOLS_ROOT/cldr/cldr-to-icu" $ ant -f build-icu-data.xml @@ -70,7 +70,7 @@ Other Examples * Outputting a subset of the supplemental data into a specified directory: - $ ant -f build-icu-data.xml -DoutDir=/tmp/cldr -DoutputTypes=plurals,dayPeriods + $ ant -f build-icu-data.xml -DoutDir=/tmp/cldr -DoutputTypes=plurals,dayPeriods -DdontGenCode=true Note: Output types can be listed with mixedCase, lower_underscore or UPPER_UNDERSCORE. Pass '-DoutputTypes=help' to see the full list.
@@ -78,7 +78,7 @@ Other Examples * Outputting only a subset of locale IDs (and all the supplemental data): - $ ant -f build-icu-data.xml -DoutDir=/tmp/cldr -DlocaleIdFilter='(zh|yue).*' + $ ant -f build-icu-data.xml -DoutDir=/tmp/cldr -DlocaleIdFilter='(zh|yue).*' -DdontGenCode=true * Overriding the default CLDR version string (which normally matches the CLDR library code): diff --git a/tools/cldr/cldr-to-icu/build-icu-data.xml b/tools/cldr/cldr-to-icu/build-icu-data.xml index 4fc8c0001e3..0f9242ad892 100644 --- a/tools/cldr/cldr-to-icu/build-icu-data.xml +++ b/tools/cldr/cldr-to-icu/build-icu-data.xml @@ -13,7 +13,7 @@ - + @@ -43,6 +43,20 @@ so it is recommended that for testing, it be set to another value. --> + + + + + + + + + @@ -115,6 +129,9 @@ + + + @@ -135,6 +152,11 @@ + + + + + --> + + @@ -383,6 +407,11 @@ + + + + +