From: jungshik Date: Tue, 11 Sep 2018 06:45:14 +0000 (-0700) Subject: ICU-20098 Fix BCP47 validity check for extlang and privateuse singleton (#102) X-Git-Tag: release-63-rc~79 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=5cfdf4a519fd6aba1833eb7eccaba1939220669b;p=icu ICU-20098 Fix BCP47 validity check for extlang and privateuse singleton (#102) * ICU-20098 Fix the validity check for extlang in uloc_forLanguageTag BCP 47 has the following for language. extlang subtag can only be preceded by 2*3ALPHA. Add a check for the length of language subtag before extlang subtag. language = 2*3ALPHA ; shortest ISO 639 code ["-" extlang] ; sometimes followed by ; extended language subtags / 4ALPHA ; or reserved for future use / 5*8ALPHA ; or registered language subtag extlang = 3ALPHA ; selected ISO 639 codes *2("-" 3ALPHA) ; permanently reserved With this change, 'hant-cmn-CN' would drop '-cmn-CN' keeping only 'hant'. * ICU-20098 Fix the validity check for extlang for ICU4J * ICU-20098 Fix the compiler failure for ICU4J * ICU-20098 Fix a compile error and test. * ICU-20098 Add a test for invalid private use singleton ICU4C's check for private use singleton subtag ('x') is wrong and treats invalid language tags as valid. ICU4J's check is correct and does not require any change. Fix that and add tests to both ICU4C and ICU4J. 
--- diff --git a/icu4c/source/common/uloc_tag.cpp b/icu4c/source/common/uloc_tag.cpp index 87b9f63f279..27a34057b21 100644 --- a/icu4c/source/common/uloc_tag.cpp +++ b/icu4c/source/common/uloc_tag.cpp @@ -1901,7 +1901,9 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta t->language = T_CString_toLowerCase(pSubtag); pLastGoodPosition = pSep; - next = EXTL | SCRT | REGN | VART | EXTS | PRIV; + next = SCRT | REGN | VART | EXTS | PRIV; + if (subtagLen <= 3) + next |= EXTL; continue; } } @@ -2035,7 +2037,7 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta } } if (next & PRIV) { - if (uprv_tolower(*pSubtag) == PRIVATEUSE) { + if (uprv_tolower(*pSubtag) == PRIVATEUSE && subtagLen == 1) { char *pPrivuseVal; if (pExtension != NULL) { diff --git a/icu4c/source/test/cintltst/cloctst.c b/icu4c/source/test/cintltst/cloctst.c index 0c9ce42f5d0..43327ce5df5 100644 --- a/icu4c/source/test/cintltst/cloctst.c +++ b/icu4c/source/test/cintltst/cloctst.c @@ -6042,6 +6042,11 @@ static const struct { {"und-Latn-DE-u-em-emoji", "_Latn_DE@em=emoji", FULL_LENGTH}, {"und-Zzzz-DE-u-em-emoji", "_Zzzz_DE@em=emoji", FULL_LENGTH}, {"und-DE-u-em-emoji", "_DE@em=emoji", FULL_LENGTH}, + // #20098 + {"hant-cmn-cn", "hant", 4}, + {"zh-cmn-TW", "cmn_TW", FULL_LENGTH}, + {"zh-x_t-ab", "zh", 2}, + {"zh-hans-cn-u-ca-x_t-u", "zh_Hans_CN@calendar=yes", 15}, {NULL, NULL, 0} }; diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LanguageTag.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LanguageTag.java index 786b4a00db9..0b3d532c0e5 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LanguageTag.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LanguageTag.java @@ -181,7 +181,9 @@ public class LanguageTag { // langtag must start with either language or privateuse if (tag.parseLanguage(itr, sts)) { - tag.parseExtlangs(itr, sts); + // ExtLang can only be preceded by 2-3 letter language subtag. 
+ if (tag._language.length() <= 3) + tag.parseExtlangs(itr, sts); tag.parseScript(itr, sts); tag.parseRegion(itr, sts); tag.parseVariants(itr, sts); diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java index bd20ce51006..7829a93ae6b 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java @@ -4151,7 +4151,11 @@ public class ULocaleTest extends TestFmwk { {"en-u-baz-ca-islamic-civil", "en@attribute=baz;calendar=islamic-civil", NOERROR}, {"en-a-bar-u-ca-islamic-civil-x-u-foo", "en@a=bar;calendar=islamic-civil;x=u-foo", NOERROR}, {"en-a-bar-u-baz-ca-islamic-civil-x-u-foo", "en@a=bar;attribute=baz;calendar=islamic-civil;x=u-foo", NOERROR}, - + /* #20098 */ + {"hant-cmn-cn", "hant", Integer.valueOf(5)}, + {"zh-cmn-TW", "cmn_TW", NOERROR}, + {"zh-x_t-ab", "zh", Integer.valueOf(3)}, + {"zh-hans-cn-u-ca-x_t-u", "zh_Hans_CN@calendar=yes", Integer.valueOf(16)}, }; for (int i = 0; i < langtag_to_locale.length; i++) {