granicus.if.org Git - icu/commitdiff
ICU-20098 Fix BCP47 validity check for extlang and privateuse singleton (#102)
author jungshik <jungshik@users.noreply.github.com>
Tue, 11 Sep 2018 06:45:14 +0000 (23:45 -0700)
committer Shane Carr <shane@unicode.org>
Thu, 27 Sep 2018 21:27:39 +0000 (14:27 -0700)
* ICU-20098 Fix the validity check for extlang in uloc_forLanguageTag

BCP 47 defines the grammar below for the language subtag. An extlang
subtag can only be preceded by a 2*3ALPHA (two- or three-letter) language
subtag, so add a check for the length of the language subtag before
accepting an extlang subtag.

language      = 2*3ALPHA            ; shortest ISO 639 code
                 ["-" extlang]       ; sometimes followed by
                                     ; extended language subtags
               / 4ALPHA              ; or reserved for future use
               / 5*8ALPHA            ; or registered language subtag

 extlang       = 3ALPHA              ; selected ISO 639 codes
                 *2("-" 3ALPHA)      ; permanently reserved

With this change, 'hant-cmn-CN' would drop '-cmn-CN' keeping only
'hant'.

* ICU-20098 Fix the validity check for extlang for ICU4J

* ICU-20098 Fix the compiler failure for ICU4J

* ICU-20098 Fix a compile error and test.

* ICU-20098 Add a test for invalid private use singleton

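ICU4C's check for the private use singleton subtag ('x') is wrong: it
tests only the first character of the subtag and not its length, so a
subtag that merely starts with 'x' (e.g. 'x_t' in 'zh-x_t-ab') is
accepted and invalid language tags are treated as valid.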

ICU4J's check is correct and does not require any change.

Fix that and add tests to both ICU4C and ICU4J.

icu4c/source/common/uloc_tag.cpp
icu4c/source/test/cintltst/cloctst.c
icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LanguageTag.java
icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java

index 87b9f63f279a826f5b275236f68d419598c5dd12..27a34057b2175691b7a824bf502fa310637e4ce0 100644 (file)
@@ -1901,7 +1901,9 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta
                 t->language = T_CString_toLowerCase(pSubtag);
 
                 pLastGoodPosition = pSep;
-                next = EXTL | SCRT | REGN | VART | EXTS | PRIV;
+                next = SCRT | REGN | VART | EXTS | PRIV;
+                if (subtagLen <= 3)
+                  next |= EXTL;
                 continue;
             }
         }
@@ -2035,7 +2037,7 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta
             }
         }
         if (next & PRIV) {
-            if (uprv_tolower(*pSubtag) == PRIVATEUSE) {
+            if (uprv_tolower(*pSubtag) == PRIVATEUSE && subtagLen == 1) {
                 char *pPrivuseVal;
 
                 if (pExtension != NULL) {
index 0c9ce42f5d0d549b739c16717da0cab1fd48865a..43327ce5df5fc7537312aeb17416708d82ba2fd0 100644 (file)
@@ -6042,6 +6042,11 @@ static const struct {
     {"und-Latn-DE-u-em-emoji", "_Latn_DE@em=emoji", FULL_LENGTH},
     {"und-Zzzz-DE-u-em-emoji", "_Zzzz_DE@em=emoji", FULL_LENGTH},
     {"und-DE-u-em-emoji", "_DE@em=emoji", FULL_LENGTH},
+    // #20098
+    {"hant-cmn-cn", "hant", 4},
+    {"zh-cmn-TW", "cmn_TW", FULL_LENGTH},
+    {"zh-x_t-ab", "zh", 2},
+    {"zh-hans-cn-u-ca-x_t-u", "zh_Hans_CN@calendar=yes",  15},
     {NULL,          NULL,           0}
 };
 
index 786b4a00db97b4b66b3462c807b949aefc7172f0..0b3d532c0e5de12aeefc588054341f6d8dd7cb13 100644 (file)
@@ -181,7 +181,9 @@ public class LanguageTag {
 
         // langtag must start with either language or privateuse
         if (tag.parseLanguage(itr, sts)) {
-            tag.parseExtlangs(itr, sts);
+            // ExtLang can only be preceded by 2-3 letter language subtag.
+            if (tag._language.length() <= 3)
+                tag.parseExtlangs(itr, sts);
             tag.parseScript(itr, sts);
             tag.parseRegion(itr, sts);
             tag.parseVariants(itr, sts);
index bd20ce51006edfeea8fcede6fbadf4f500c3dc71..7829a93ae6bd5e362103822078fcc34f0ac843d4 100644 (file)
@@ -4151,7 +4151,11 @@ public class ULocaleTest extends TestFmwk {
                 {"en-u-baz-ca-islamic-civil",   "en@attribute=baz;calendar=islamic-civil",  NOERROR},
                 {"en-a-bar-u-ca-islamic-civil-x-u-foo", "en@a=bar;calendar=islamic-civil;x=u-foo",  NOERROR},
                 {"en-a-bar-u-baz-ca-islamic-civil-x-u-foo", "en@a=bar;attribute=baz;calendar=islamic-civil;x=u-foo",    NOERROR},
-
+                /* #20098 */
+                {"hant-cmn-cn", "hant", Integer.valueOf(5)},
+                {"zh-cmn-TW", "cmn_TW", NOERROR},
+                {"zh-x_t-ab", "zh", Integer.valueOf(3)},
+                {"zh-hans-cn-u-ca-x_t-u", "zh_Hans_CN@calendar=yes",  Integer.valueOf(16)},
         };
 
         for (int i = 0; i < langtag_to_locale.length; i++) {