From: jungshik Date: Tue, 18 Sep 2018 17:23:12 +0000 (-0700) Subject: ICU-20140 Allow duplicated keys in U-extension per RFC 6067 (#136) X-Git-Tag: release-63-rc~53 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=4ed35d7bb53252b5daf34ecde285ff8b47e11ec8;p=icu ICU-20140 Allow duplicated keys in U-extension per RFC 6067 (#136) * ICU-20140 Allow duplicated keys in U-extension per RFC 6067 RFC 6067 [1] does allow duplicate keywords, but ICU4C's uloc_forLanguageTag rejects them as invalid. Change it to accept duplicate keywords and honor only the 1st one while ignoring subsequent ones per RFC 6067. [1] Unicode extension to BCP 47: https://tools.ietf.org/html/rfc6067 * ICU-20140 Add ICU4J test and tweak ICU4C test ICU4J test diverges from ICU4C tests: 1. Handling of duplicate variants in ICU4J seems to be wrong: https://unicode-org.atlassian.net/browse/ICU-20148 2. ULocale.forLanguageTag only throws NullPointerException so that ICU4C's test for duplicate attributes cannot be ported. --- diff --git a/icu4c/source/common/uloc_tag.cpp b/icu4c/source/common/uloc_tag.cpp index 27a34057b21..84e06d40f54 100644 --- a/icu4c/source/common/uloc_tag.cpp +++ b/icu4c/source/common/uloc_tag.cpp @@ -1460,9 +1460,9 @@ _appendLDMLExtensionAsKeywords(const char* ldmlext, ExtensionListEntry** appendT kwd->value = pType; if (!_addExtensionToList(&kwdFirst, kwd, FALSE)) { - *status = U_ILLEGAL_ARGUMENT_ERROR; + // duplicate keyword is allowed, Only the first + // is honored. 
uprv_free(kwd); - goto cleanup; } } diff --git a/icu4c/source/test/cintltst/cloctst.c b/icu4c/source/test/cintltst/cloctst.c index 43327ce5df5..4454c67274d 100644 --- a/icu4c/source/test/cintltst/cloctst.c +++ b/icu4c/source/test/cintltst/cloctst.c @@ -251,6 +251,7 @@ void addLocaleTest(TestNode** root) TESTCASE(TestLikelySubtags); TESTCASE(TestToLanguageTag); TESTCASE(TestForLanguageTag); + TESTCASE(TestInvalidLanguageTag); TESTCASE(TestTrailingNull); TESTCASE(TestUnicodeDefines); TESTCASE(TestEnglishExemplarCharacters); @@ -6030,6 +6031,9 @@ static const struct { {"ja-u-ijkl-efgh-abcd-ca-japanese-xx-yyy-zzz-kn", "ja@attribute=abcd-efgh-ijkl;calendar=japanese;colnumeric=yes;xx=yyy-zzz", FULL_LENGTH}, {"de-u-xc-xphonebk-co-phonebk-ca-buddhist-mo-very-lo-extensi-xd-that-de-should-vc-probably-xz-killthebuffer", "de@calendar=buddhist;collation=phonebook;de=should;lo=extensi;mo=very;vc=probably;xc=xphonebk;xd=that;xz=yes", 91}, + {"de-1901-1901", "de__1901", 7}, + {"de-DE-1901-1901", "de_DE_1901", 10}, + {"en-a-bbb-a-ccc", "en@a=bbb", 8}, /* #12761 */ {"en-a-bar-u-baz", "en@a=bar;attribute=baz", FULL_LENGTH}, {"en-a-bar-u-baz-x-u-foo", "en@a=bar;attribute=baz;x=u-foo", FULL_LENGTH}, @@ -6047,6 +6051,11 @@ static const struct { {"zh-cmn-TW", "cmn_TW", FULL_LENGTH}, {"zh-x_t-ab", "zh", 2}, {"zh-hans-cn-u-ca-x_t-u", "zh_Hans_CN@calendar=yes", 15}, + /* #20140 dupe keys in U-extension */ + {"zh-u-ca-chinese-ca-gregory", "zh@calendar=chinese", FULL_LENGTH}, + {"zh-u-ca-gregory-co-pinyin-ca-chinese", "zh@calendar=gregorian;collation=pinyin", FULL_LENGTH}, + {"de-latn-DE-1901-u-co-phonebk-co-pinyin-ca-gregory", "de_Latn_DE_1901@calendar=gregorian;collation=phonebook", FULL_LENGTH}, + {"th-u-kf-nu-thai-kf-false", "th@colcasefirst=yes;numbers=thai", FULL_LENGTH}, {NULL, NULL, 0} }; @@ -6081,6 +6090,35 @@ static void TestForLanguageTag(void) { } } +/* See https://unicode-org.atlassian.net/browse/ICU-20149 . 
+ * Depending on the resolution of that bug, this test may have + * to be revised. + */ +static void TestInvalidLanguageTag(void) { + static const char* invalid_lang_tags[] = { + "zh-u-foo-foo-co-pinyin", /* duplicate attribute in U extension */ + "zh-cmn-hans-u-foo-foo-co-pinyin", /* duplicate attribute in U extension */ +#if 0 + /* + * These do not lead to an error. Instead, parsing stops at the 1st + * invalid subtag. + */ + "de-DE-1901-1901", /* duplicate variant */ + "en-a-bbb-a-ccc", /* duplicate extension */ +#endif + NULL + }; + char locale[256]; + for (const char** tag = invalid_lang_tags; *tag != NULL; tag++) { + UErrorCode status = U_ZERO_ERROR; + uloc_forLanguageTag(*tag, locale, sizeof(locale), NULL, &status); + if (status != U_ILLEGAL_ARGUMENT_ERROR) { + log_err("Error returned by uloc_forLanguageTag for input language tag [%s] : %s - expected error: %s\n", + *tag, u_errorName(status), u_errorName(U_ILLEGAL_ARGUMENT_ERROR)); + } + } +} + static void TestToUnicodeLocaleKey(void) { /* $IN specifies the result should be the input pointer itself */ diff --git a/icu4c/source/test/cintltst/cloctst.h b/icu4c/source/test/cintltst/cloctst.h index b757328f5c0..be1896a0c3f 100644 --- a/icu4c/source/test/cintltst/cloctst.h +++ b/icu4c/source/test/cintltst/cloctst.h @@ -123,6 +123,7 @@ static void TestLikelySubtags(void); * lanuage tag */ static void TestForLanguageTag(void); +static void TestInvalidLanguageTag(void); static void TestToLanguageTag(void); static void TestToUnicodeLocaleKey(void); diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java index 7829a93ae6b..f651290ec38 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java @@ -4156,6 +4156,11 @@ public class ULocaleTest extends TestFmwk { {"zh-cmn-TW", "cmn_TW", NOERROR}, {"zh-x_t-ab", "zh", 
Integer.valueOf(3)}, {"zh-hans-cn-u-ca-x_t-u", "zh_Hans_CN@calendar=yes", Integer.valueOf(16)}, + /* #20140 dupe keys in U-extension */ + {"zh-u-ca-chinese-ca-gregory", "zh@calendar=chinese", NOERROR}, + {"zh-u-ca-gregory-co-pinyin-ca-chinese", "zh@calendar=gregorian;collation=pinyin", NOERROR}, + {"de-latn-DE-1901-u-co-phonebk-co-pinyin-ca-gregory", "de_Latn_DE_1901@calendar=gregorian;collation=phonebook", NOERROR}, + {"th-u-kf-nu-thai-kf-false", "th@colcasefirst=yes;numbers=thai", NOERROR}, }; for (int i = 0; i < langtag_to_locale.length; i++) {