From 94ff6b10248f4d31a4e77b9cbe3194600150b844 Mon Sep 17 00:00:00 2001 From: Frank Tang Date: Mon, 11 Feb 2019 13:47:07 -0800 Subject: [PATCH] ICU-20410 Fix grandfathered tag w/ extensions --- icu4c/source/common/uloc_tag.cpp | 27 ++++++++++++++----- icu4c/source/test/cintltst/cloctst.c | 4 +-- icu4c/source/test/cintltst/cstrcase.c | 7 +++-- icu4c/source/test/intltest/loctest.cpp | 27 +++++++++++++++++++ icu4c/source/test/intltest/loctest.h | 1 + .../com/ibm/icu/impl/locale/LanguageTag.java | 15 +++++++++-- .../ibm/icu/dev/test/util/ULocaleTest.java | 3 +++ 7 files changed, 72 insertions(+), 12 deletions(-) diff --git a/icu4c/source/common/uloc_tag.cpp b/icu4c/source/common/uloc_tag.cpp index 063efd45578..0e1743699ce 100644 --- a/icu4c/source/common/uloc_tag.cpp +++ b/icu4c/source/common/uloc_tag.cpp @@ -2063,13 +2063,26 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta return t.orphan(); } + size_t parsedLenDelta = 0; + // Grandfathered tag will be considered together. Grandfathered tag with intervening + // script and region such as art-DE-lojban or art-Latn-lojban won't be + // matched. /* check if the tag is grandfathered */ for (i = 0; i < UPRV_LENGTHOF(GRANDFATHERED); i += 2) { - if (uprv_stricmp(GRANDFATHERED[i], tagBuf) == 0) { + int32_t checkGrandfatheredLen = static_cast<int32_t>(uprv_strlen(GRANDFATHERED[i])); + if (tagLen < checkGrandfatheredLen) { + continue; + } + if (tagLen > checkGrandfatheredLen && tagBuf[checkGrandfatheredLen] != '-') { + // make sure next char is '-'. 
+ continue; + } + if (uprv_strnicmp(GRANDFATHERED[i], tagBuf, checkGrandfatheredLen) == 0) { int32_t newTagLength; - grandfatheredLen = tagLen; /* back up for output parsedLen */ - newTagLength = static_cast<int32_t>(uprv_strlen(GRANDFATHERED[i+1])); + grandfatheredLen = checkGrandfatheredLen; /* back up for output parsedLen */ + int32_t replacementLen = static_cast<int32_t>(uprv_strlen(GRANDFATHERED[i+1])); + newTagLength = replacementLen + tagLen - checkGrandfatheredLen; if (tagLen < newTagLength) { uprv_free(tagBuf); tagBuf = (char*)uprv_malloc(newTagLength + 1); @@ -2080,12 +2093,15 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta t->buf = tagBuf; tagLen = newTagLength; } + parsedLenDelta = checkGrandfatheredLen - replacementLen; uprv_strcpy(t->buf, GRANDFATHERED[i + 1]); + if (checkGrandfatheredLen != tagLen) { + uprv_strcpy(t->buf + replacementLen, tag + checkGrandfatheredLen); + } break; } } - size_t parsedLenDelta = 0; if (grandfatheredLen == 0) { for (i = 0; i < UPRV_LENGTHOF(REDUNDANT); i += 2) { const char* redundantTag = REDUNDANT[i]; @@ -2400,8 +2416,7 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta } if (parsedLen != NULL) { - *parsedLen = (grandfatheredLen > 0) ? 
grandfatheredLen : - (int32_t)(pLastGoodPosition - t->buf + parsedLenDelta); + *parsedLen = (int32_t)(pLastGoodPosition - t->buf + parsedLenDelta); } return t.orphan(); diff --git a/icu4c/source/test/cintltst/cloctst.c b/icu4c/source/test/cintltst/cloctst.c index 09de64c874e..5878fcc0f79 100644 --- a/icu4c/source/test/cintltst/cloctst.c +++ b/icu4c/source/test/cintltst/cloctst.c @@ -6160,8 +6160,8 @@ static const struct { /* #9562 IANA language tag data update */ {"en-gb-oed", "en_GB_OXENDICT", FULL_LENGTH}, {"i-navajo", "nv", FULL_LENGTH}, - {"i-navajo-a-foo", "", 0}, - {"i-navajo-latn-us", "", 0}, + {"i-navajo-a-foo", "nv@a=foo", FULL_LENGTH}, + {"i-navajo-latn-us", "nv_Latn_US", FULL_LENGTH}, {"sgn-br", "bzs", FULL_LENGTH}, {"sgn-br-u-co-phonebk", "bzs@collation=phonebook", FULL_LENGTH}, {"ja-latn-hepburn-heploc", "ja_Latn__ALALC97", FULL_LENGTH}, diff --git a/icu4c/source/test/cintltst/cstrcase.c b/icu4c/source/test/cintltst/cstrcase.c index 6fb2cfccffe..e526b54f4ab 100644 --- a/icu4c/source/test/cintltst/cstrcase.c +++ b/icu4c/source/test/cintltst/cstrcase.c @@ -748,9 +748,12 @@ TestUCaseMap(void) { /* overly long locale IDs may get truncated to their language code to avoid unnecessary allocation */ ucasemap_setLocale(csm, "I-kLInGOn-the-quick-brown-fox-jumps-over-the-lazy-dog", &errorCode); locale=ucasemap_getLocale(csm); - if(0!=strncmp(locale, "i-klingon", 9)) { + // "I-kLInGOn-the-quick-brown-fox-jumps-over-the-lazy-dog" is canonicalized + // into "tlh-the-quick-brown-fox-jumps-over-the-lazy-dog" + // and "the" will be treated as an extlang which replaces "tlh". 
+ if(0!=strncmp(locale, "the", 3)) { log_err("ucasemap_getLocale(ucasemap_setLocale(\"I-kLInGOn-the-quick-br...\"))==%s\n" - " does not start with \"i-klingon\"\n", locale); + " does not start with \"the\"\n", locale); } errorCode=U_ZERO_ERROR; diff --git a/icu4c/source/test/intltest/loctest.cpp b/icu4c/source/test/intltest/loctest.cpp index e3de596b2ec..e9ce47fbaa5 100644 --- a/icu4c/source/test/intltest/loctest.cpp +++ b/icu4c/source/test/intltest/loctest.cpp @@ -248,6 +248,7 @@ void LocaleTest::runIndexedTest( int32_t index, UBool exec, const char* &name, c TESTCASE_AUTO(TestIsRightToLeft); TESTCASE_AUTO(TestBug13277); TESTCASE_AUTO(TestBug13554); + TESTCASE_AUTO(TestBug20410); TESTCASE_AUTO(TestForLanguageTag); TESTCASE_AUTO(TestToLanguageTag); TESTCASE_AUTO(TestMoveAssign); @@ -2965,6 +2966,32 @@ void LocaleTest::TestBug13554() { } } +void LocaleTest::TestBug20410() { + IcuTestErrorCode status(*this, "TestBug20410()"); + + static const char tag1[] = "art-lojban-x-0"; + static const Locale expected1("jbo@x=0"); + Locale result1 = Locale::forLanguageTag(tag1, status); + status.errIfFailureAndReset("\"%s\"", tag1); + assertEquals(tag1, expected1.getName(), result1.getName()); + + static const char tag2[] = "zh-xiang-u-nu-thai-x-0"; + static const Locale expected2("hsn@numbers=thai;x=0"); + Locale result2 = Locale::forLanguageTag(tag2, status); + status.errIfFailureAndReset("\"%s\"", tag2); + assertEquals(tag2, expected2.getName(), result2.getName()); + + static const char locid3[] = "art__lojban@x=0"; + Locale result3 = Locale::createCanonical(locid3); + static const Locale expected3("art__LOJBAN@x=0"); + assertEquals(locid3, expected3.getName(), result3.getName()); + + static const char locid4[] = "art-lojban-x-0"; + Locale result4 = Locale::createCanonical(locid4); + static const Locale expected4("jbo@x=0"); + assertEquals(locid4, expected4.getName(), result4.getName()); +} + void LocaleTest::TestForLanguageTag() { IcuTestErrorCode status(*this, 
"TestForLanguageTag()"); diff --git a/icu4c/source/test/intltest/loctest.h b/icu4c/source/test/intltest/loctest.h index bebb26cebca..daf3baddc6b 100644 --- a/icu4c/source/test/intltest/loctest.h +++ b/icu4c/source/test/intltest/loctest.h @@ -114,6 +114,7 @@ public: void TestBug11421(); void TestBug13277(); void TestBug13554(); + void TestBug20410(); void TestAddLikelySubtags(); void TestMinimizeSubtags(); diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LanguageTag.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LanguageTag.java index 2618b0ee7a4..d812ae2c6de 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LanguageTag.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LanguageTag.java @@ -169,9 +169,20 @@ public class LanguageTag { // Check if the tag is grandfathered String[] gfmap = GRANDFATHERED.get(new AsciiUtil.CaseInsensitiveKey(languageTag)); + // Language tag is at least 2 alpha so we can skip searching the first 2 chars. + int dash = 2; + while (gfmap == null && (dash = languageTag.indexOf('-', dash + 1)) != -1) { + gfmap = GRANDFATHERED.get(new AsciiUtil.CaseInsensitiveKey(languageTag.substring(0, dash))); + } + if (gfmap != null) { - // use preferred mapping - itr = new StringTokenIterator(gfmap[1], SEP); + if (gfmap[0].length() == languageTag.length()) { + // use preferred mapping + itr = new StringTokenIterator(gfmap[1], SEP); + } else { + // append the rest of the tag. 
+ itr = new StringTokenIterator(gfmap[1] + languageTag.substring(dash), SEP); + } isGrandfathered = true; } else { itr = new StringTokenIterator(languageTag, SEP); diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java index 0d5365727a6..4e42a6d4bb9 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java @@ -4187,6 +4187,9 @@ public class ULocaleTest extends TestFmwk { {"zh-u-ca-gregory-co-pinyin-ca-chinese", "zh@calendar=gregorian;collation=pinyin", NOERROR}, {"de-latn-DE-1901-u-co-phonebk-co-pinyin-ca-gregory", "de_Latn_DE_1901@calendar=gregorian;collation=phonebook", NOERROR}, {"th-u-kf-nu-thai-kf-false", "th@colcasefirst=yes;numbers=thai", NOERROR}, + /* #20410 */ + {"art-lojban-x-0", "jbo@x=0", NOERROR}, + {"zh-xiang-u-nu-thai-x-0", "hsn@numbers=thai;x=0", NOERROR}, }; for (int i = 0; i < langtag_to_locale.length; i++) { -- 2.40.0