granicus.if.org Git - icu/commitdiff
ICU-20140 Allow duplicated keys in U-extension per RFC 6067 (#136)
author jungshik <jungshik@users.noreply.github.com>
Tue, 18 Sep 2018 17:23:12 +0000 (10:23 -0700)
committer Shane Carr <shane@unicode.org>
Thu, 27 Sep 2018 21:27:40 +0000 (14:27 -0700)
* ICU-20140 Allow duplicated keys in U-extension per RFC 6067

RFC 6067 [1] does allow duplicate keywords, but ICU4C's
uloc_forLanguageTag rejects them as invalid.

Change it to accept duplicate keywords and honor only the
1st one while ignoring subsequent ones per RFC 6067.

[1] Unicode extension to BCP 47:
    https://tools.ietf.org/html/rfc6067

* ICU-20140 Add ICU4J test and tweak ICU4C test

ICU4J test diverges from ICU4C tests:

1. Handling of duplicate variants in ICU4J seems to be wrong:
   https://unicode-org.atlassian.net/browse/ICU-20148
2. ULocale.forLanguageTag only throws NullPointerException, so
   ICU4C's test for duplicate attributes cannot be ported.

icu4c/source/common/uloc_tag.cpp
icu4c/source/test/cintltst/cloctst.c
icu4c/source/test/cintltst/cloctst.h
icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java

index 27a34057b2175691b7a824bf502fa310637e4ce0..84e06d40f5484e874bbff45365135ca67555f703 100644 (file)
@@ -1460,9 +1460,9 @@ _appendLDMLExtensionAsKeywords(const char* ldmlext, ExtensionListEntry** appendT
                     kwd->value = pType;
 
                     if (!_addExtensionToList(&kwdFirst, kwd, FALSE)) {
-                        *status = U_ILLEGAL_ARGUMENT_ERROR;
+                        // duplicate keyword is allowed, Only the first
+                        // is honored.
                         uprv_free(kwd);
-                        goto cleanup;
                     }
                 }
 
index 43327ce5df5fc7537312aeb17416708d82ba2fd0..4454c67274d6d579ef9c26ad83bcf53fab7be082 100644 (file)
@@ -251,6 +251,7 @@ void addLocaleTest(TestNode** root)
     TESTCASE(TestLikelySubtags);
     TESTCASE(TestToLanguageTag);
     TESTCASE(TestForLanguageTag);
+    TESTCASE(TestInvalidLanguageTag);
     TESTCASE(TestTrailingNull);
     TESTCASE(TestUnicodeDefines);
     TESTCASE(TestEnglishExemplarCharacters);
@@ -6030,6 +6031,9 @@ static const struct {
     {"ja-u-ijkl-efgh-abcd-ca-japanese-xx-yyy-zzz-kn",   "ja@attribute=abcd-efgh-ijkl;calendar=japanese;colnumeric=yes;xx=yyy-zzz",  FULL_LENGTH},
     {"de-u-xc-xphonebk-co-phonebk-ca-buddhist-mo-very-lo-extensi-xd-that-de-should-vc-probably-xz-killthebuffer",
      "de@calendar=buddhist;collation=phonebook;de=should;lo=extensi;mo=very;vc=probably;xc=xphonebk;xd=that;xz=yes", 91},
+    {"de-1901-1901", "de__1901", 7},
+    {"de-DE-1901-1901", "de_DE_1901", 10},
+    {"en-a-bbb-a-ccc", "en@a=bbb", 8},
     /* #12761 */
     {"en-a-bar-u-baz",      "en@a=bar;attribute=baz",   FULL_LENGTH},
     {"en-a-bar-u-baz-x-u-foo",  "en@a=bar;attribute=baz;x=u-foo",   FULL_LENGTH},
@@ -6047,6 +6051,11 @@ static const struct {
     {"zh-cmn-TW", "cmn_TW", FULL_LENGTH},
     {"zh-x_t-ab", "zh", 2},
     {"zh-hans-cn-u-ca-x_t-u", "zh_Hans_CN@calendar=yes",  15},
+    /* #20140 dupe keys in U-extension */
+    {"zh-u-ca-chinese-ca-gregory", "zh@calendar=chinese", FULL_LENGTH},
+    {"zh-u-ca-gregory-co-pinyin-ca-chinese", "zh@calendar=gregorian;collation=pinyin", FULL_LENGTH},
+    {"de-latn-DE-1901-u-co-phonebk-co-pinyin-ca-gregory", "de_Latn_DE_1901@calendar=gregorian;collation=phonebook", FULL_LENGTH},
+    {"th-u-kf-nu-thai-kf-false", "th@colcasefirst=yes;numbers=thai", FULL_LENGTH},
     {NULL,          NULL,           0}
 };
 
@@ -6081,6 +6090,35 @@ static void TestForLanguageTag(void) {
     }
 }
 
+/* Verifies that uloc_forLanguageTag() reports U_ILLEGAL_ARGUMENT_ERROR for
+ * every tag listed in invalid_lang_tags below.
+ * See https://unicode-org.atlassian.net/browse/ICU-20149 . Depending on the
+ * resolution of that bug, this test may have to be revised. */
+static void TestInvalidLanguageTag(void) {
+    static const char* invalid_lang_tags[] = {
+        "zh-u-foo-foo-co-pinyin", /* duplicate attribute in U extension */
+        "zh-cmn-hans-u-foo-foo-co-pinyin", /* duplicate attribute in U extension */
+#if 0
+        /*
+         * Disabled: these do not lead to an error. Instead, parsing stops at
+         * the 1st invalid subtag.
+         */
+        "de-DE-1901-1901", /* duplicate variant */
+        "en-a-bbb-a-ccc", /* duplicate extension */
+#endif
+        NULL /* sentinel that ends the loop below */
+    };
+    char locale[256]; /* buffer handed to uloc_forLanguageTag; its content is not inspected */
+    for (const char** tag = invalid_lang_tags; *tag != NULL; tag++) {
+        UErrorCode status = U_ZERO_ERROR; /* reset for every tag */
+        uloc_forLanguageTag(*tag, locale, sizeof(locale), NULL, &status);
+        if (status != U_ILLEGAL_ARGUMENT_ERROR) { /* any other status, including success, fails the test */
+            log_err("Error returned by uloc_forLanguageTag for input language tag [%s] : %s - expected error:  %s\n",
+                    *tag, u_errorName(status), u_errorName(U_ILLEGAL_ARGUMENT_ERROR));
+        }
+    }
+}
+
 static void TestToUnicodeLocaleKey(void)
 {
     /* $IN specifies the result should be the input pointer itself */
index b757328f5c03c1b3b6817db9349bfec34813f5e9..be1896a0c3ffcde0f881257a59c6c4857bb3b18b 100644 (file)
@@ -123,6 +123,7 @@ static void TestLikelySubtags(void);
  * lanuage tag
  */
 static void TestForLanguageTag(void);
+static void TestInvalidLanguageTag(void);
 static void TestToLanguageTag(void);
 
 static void TestToUnicodeLocaleKey(void);
index 7829a93ae6bd5e362103822078fcc34f0ac843d4..f651290ec38409abf9582140164bdcc68a99c2c7 100644 (file)
@@ -4156,6 +4156,11 @@ public class ULocaleTest extends TestFmwk {
                 {"zh-cmn-TW", "cmn_TW", NOERROR},
                 {"zh-x_t-ab", "zh", Integer.valueOf(3)},
                 {"zh-hans-cn-u-ca-x_t-u", "zh_Hans_CN@calendar=yes",  Integer.valueOf(16)},
+                /* #20140 dupe keys in U-extension */
+                {"zh-u-ca-chinese-ca-gregory", "zh@calendar=chinese", NOERROR},
+                {"zh-u-ca-gregory-co-pinyin-ca-chinese", "zh@calendar=gregorian;collation=pinyin", NOERROR},
+                {"de-latn-DE-1901-u-co-phonebk-co-pinyin-ca-gregory", "de_Latn_DE_1901@calendar=gregorian;collation=phonebook", NOERROR},
+                {"th-u-kf-nu-thai-kf-false", "th@colcasefirst=yes;numbers=thai", NOERROR},
         };
 
         for (int i = 0; i < langtag_to_locale.length; i++) {