ICU-20410 Fix grandfathered tag w/ extensions

author Frank Tang <ftang@chromium.org>

Mon, 11 Feb 2019 21:47:07 +0000 (13:47 -0800)

committer Frank Yung-Fong Tang <41213225+FrankYFTang@users.noreply.github.com>

Sat, 16 Feb 2019 00:51:31 +0000 (16:51 -0800)
author Frank Tang <ftang@chromium.org>
Mon, 11 Feb 2019 21:47:07 +0000 (13:47 -0800)
committer Frank Yung-Fong Tang <41213225+FrankYFTang@users.noreply.github.com>
Sat, 16 Feb 2019 00:51:31 +0000 (16:51 -0800)
diff --git a/icu4c/source/common/uloc_tag.cpp b/icu4c/source/common/uloc_tag.cpp

index 063efd45578fae5a9c223f428ed71ccaca97db85..0e1743699ce438c6f76e863ca0d999ed71cbd587 100644 (file)
--- a/icu4c/source/common/uloc_tag.cpp
+++ b/icu4c/source/common/uloc_tag.cpp
@@ -2063,13 +2063,26 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta
          return t.orphan();
      }
  
+    size_t parsedLenDelta = 0;
+    // A grandfathered tag is matched as a whole at the start of the tag. A
+    // grandfathered tag with an intervening script or region, such as
+    // art-DE-lojban or art-Latn-lojban, won't be matched.
      /* check if the tag is grandfathered */
      for (i = 0; i < UPRV_LENGTHOF(GRANDFATHERED); i += 2) {
-        if (uprv_stricmp(GRANDFATHERED[i], tagBuf) == 0) {
+        int32_t checkGrandfatheredLen = static_cast<int32_t>(uprv_strlen(GRANDFATHERED[i]));
+        if (tagLen < checkGrandfatheredLen) {
+            continue;
+        }
+        if (tagLen > checkGrandfatheredLen && tagBuf[checkGrandfatheredLen] != '-') {
+            // make sure next char is '-'.
+            continue;
+        }
+        if (uprv_strnicmp(GRANDFATHERED[i], tagBuf, checkGrandfatheredLen) == 0) {
              int32_t newTagLength;
  
-            grandfatheredLen = tagLen;  /* back up for output parsedLen */
-            newTagLength = static_cast<int32_t>(uprv_strlen(GRANDFATHERED[i+1]));
+            grandfatheredLen = checkGrandfatheredLen;  /* back up for output parsedLen */
+            int32_t replacementLen = static_cast<int32_t>(uprv_strlen(GRANDFATHERED[i+1]));
+            newTagLength = replacementLen + tagLen - checkGrandfatheredLen;
              if (tagLen < newTagLength) {
                  uprv_free(tagBuf);
                  tagBuf = (char*)uprv_malloc(newTagLength + 1);
@@ -2080,12 +2093,15 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta
                  t->buf = tagBuf;
                  tagLen = newTagLength;
              }
+            parsedLenDelta = checkGrandfatheredLen - replacementLen;
              uprv_strcpy(t->buf, GRANDFATHERED[i + 1]);
+            if (checkGrandfatheredLen != tagLen) {
+                uprv_strcpy(t->buf + replacementLen, tag + checkGrandfatheredLen);
+            }
              break;
          }
      }
  
-    size_t parsedLenDelta = 0;
      if (grandfatheredLen == 0) {
          for (i = 0; i < UPRV_LENGTHOF(REDUNDANT); i += 2) {
              const char* redundantTag = REDUNDANT[i];
@@ -2400,8 +2416,7 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta
      }
  
      if (parsedLen != NULL) {
-        *parsedLen = (grandfatheredLen > 0) ? grandfatheredLen :
-            (int32_t)(pLastGoodPosition - t->buf + parsedLenDelta);
+        *parsedLen = (int32_t)(pLastGoodPosition - t->buf + parsedLenDelta);
      }
  
      return t.orphan();
diff --git a/icu4c/source/test/cintltst/cloctst.c b/icu4c/source/test/cintltst/cloctst.c

index 09de64c874eb682839a3843724e6ff2588e7168e..5878fcc0f79e0455612c44c6814a7634b1dfeeba 100644 (file)
--- a/icu4c/source/test/cintltst/cloctst.c
+++ b/icu4c/source/test/cintltst/cloctst.c
@@ -6160,8 +6160,8 @@ static const struct {
      /* #9562 IANA language tag data update */
      {"en-gb-oed", "en_GB_OXENDICT", FULL_LENGTH},
      {"i-navajo", "nv", FULL_LENGTH},
-    {"i-navajo-a-foo", "", 0},
-    {"i-navajo-latn-us", "", 0},
+    {"i-navajo-a-foo", "nv@a=foo", FULL_LENGTH},
+    {"i-navajo-latn-us", "nv_Latn_US", FULL_LENGTH},
      {"sgn-br", "bzs", FULL_LENGTH},
      {"sgn-br-u-co-phonebk", "bzs@collation=phonebook", FULL_LENGTH},
      {"ja-latn-hepburn-heploc", "ja_Latn__ALALC97", FULL_LENGTH},
diff --git a/icu4c/source/test/cintltst/cstrcase.c b/icu4c/source/test/cintltst/cstrcase.c

index 6fb2cfccffe419a27867fa3cbfe3752f6f463418..e526b54f4ab31f0094ec8568beb2ed6677a0ec1d 100644 (file)
--- a/icu4c/source/test/cintltst/cstrcase.c
+++ b/icu4c/source/test/cintltst/cstrcase.c
@@ -748,9 +748,12 @@ TestUCaseMap(void) {
      /* overly long locale IDs may get truncated to their language code to avoid unnecessary allocation */
      ucasemap_setLocale(csm, "I-kLInGOn-the-quick-brown-fox-jumps-over-the-lazy-dog", &errorCode);
      locale=ucasemap_getLocale(csm);
-    if(0!=strncmp(locale, "i-klingon", 9)) {
+    // "I-kLInGOn-the-quick-brown-fox-jumps-over-the-lazy-dog" is canonicalized
+    // into "tlh-the-quick-brown-fox-jumps-over-the-lazy-dog"
+    // and "the" will be treated as an extlang which replaces "tlh".
+    if(0!=strncmp(locale, "the", 3)) {
          log_err("ucasemap_getLocale(ucasemap_setLocale(\"I-kLInGOn-the-quick-br...\"))==%s\n"
-                "    does not start with \"i-klingon\"\n", locale);
+                "    does not start with \"the\"\n", locale);
      }
  
      errorCode=U_ZERO_ERROR;
diff --git a/icu4c/source/test/intltest/loctest.cpp b/icu4c/source/test/intltest/loctest.cpp

index e3de596b2eca36554b9ace2d5bb5d92113d3a5ce..e9ce47fbaa584bfb294bcbbb0e709bd1dfe67b75 100644 (file)
--- a/icu4c/source/test/intltest/loctest.cpp
+++ b/icu4c/source/test/intltest/loctest.cpp
@@ -248,6 +248,7 @@ void LocaleTest::runIndexedTest( int32_t index, UBool exec, const char* &name, c
      TESTCASE_AUTO(TestIsRightToLeft);
      TESTCASE_AUTO(TestBug13277);
      TESTCASE_AUTO(TestBug13554);
+    TESTCASE_AUTO(TestBug20410);
      TESTCASE_AUTO(TestForLanguageTag);
      TESTCASE_AUTO(TestToLanguageTag);
      TESTCASE_AUTO(TestMoveAssign);
@@ -2965,6 +2966,32 @@ void LocaleTest::TestBug13554() {
      }
  }
  
+void LocaleTest::TestBug20410() {
+    IcuTestErrorCode status(*this, "TestBug20410()");
+
+    static const char tag1[] = "art-lojban-x-0";
+    static const Locale expected1("jbo@x=0");
+    Locale result1 = Locale::forLanguageTag(tag1, status);
+    status.errIfFailureAndReset("\"%s\"", tag1);
+    assertEquals(tag1, expected1.getName(), result1.getName());
+
+    static const char tag2[] = "zh-xiang-u-nu-thai-x-0";
+    static const Locale expected2("hsn@numbers=thai;x=0");
+    Locale result2 = Locale::forLanguageTag(tag2, status);
+    status.errIfFailureAndReset("\"%s\"", tag2);
+    assertEquals(tag2, expected2.getName(), result2.getName());
+
+    static const char locid3[] = "art__lojban@x=0";
+    Locale result3 = Locale::createCanonical(locid3);
+    static const Locale expected3("art__LOJBAN@x=0");
+    assertEquals(locid3, expected3.getName(), result3.getName());
+
+    static const char locid4[] = "art-lojban-x-0";
+    Locale result4 = Locale::createCanonical(locid4);
+    static const Locale expected4("jbo@x=0");
+    assertEquals(locid4, expected4.getName(), result4.getName());
+}
+
  void LocaleTest::TestForLanguageTag() {
      IcuTestErrorCode status(*this, "TestForLanguageTag()");
  
diff --git a/icu4c/source/test/intltest/loctest.h b/icu4c/source/test/intltest/loctest.h

index bebb26cebca4d1e7f307a015351a86acad5c3faf..daf3baddc6b388110b289fa289d27394f9ad0844 100644 (file)
--- a/icu4c/source/test/intltest/loctest.h
+++ b/icu4c/source/test/intltest/loctest.h
@@ -114,6 +114,7 @@ public:
      void TestBug11421();
      void TestBug13277();
      void TestBug13554();
+    void TestBug20410();
  
      void TestAddLikelySubtags();
      void TestMinimizeSubtags();
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LanguageTag.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LanguageTag.java

index 2618b0ee7a4927c175b7349e0cd62838da63d410..d812ae2c6de59a59a2c7c6b3eaf0e54e67fba864 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LanguageTag.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LanguageTag.java
@@ -169,9 +169,20 @@ public class LanguageTag {
  
          // Check if the tag is grandfathered
          String[] gfmap = GRANDFATHERED.get(new AsciiUtil.CaseInsensitiveKey(languageTag));
+        // A language tag has at least 2 alpha characters, so we can skip searching the first 2 chars.
+        int dash = 2;
+        while (gfmap == null && (dash = languageTag.indexOf('-', dash + 1)) != -1) {
+            gfmap = GRANDFATHERED.get(new AsciiUtil.CaseInsensitiveKey(languageTag.substring(0, dash)));
+        }
+
          if (gfmap != null) {
-            // use preferred mapping
-            itr = new StringTokenIterator(gfmap[1], SEP);
+            if (gfmap[0].length() == languageTag.length()) {
+                // use preferred mapping
+                itr = new StringTokenIterator(gfmap[1], SEP);
+            } else {
+                // append the rest of the tag.
+                itr = new StringTokenIterator(gfmap[1] + languageTag.substring(dash), SEP);
+            }
              isGrandfathered = true;
          } else {
              itr = new StringTokenIterator(languageTag, SEP);
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java

index 0d5365727a6972d0fdaf0cdb38a2dcafe5d0d55b..4e42a6d4bb92966d7d751c3542d72cb736466726 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java
@@ -4187,6 +4187,9 @@ public class ULocaleTest extends TestFmwk {
                  {"zh-u-ca-gregory-co-pinyin-ca-chinese", "zh@calendar=gregorian;collation=pinyin", NOERROR},
                  {"de-latn-DE-1901-u-co-phonebk-co-pinyin-ca-gregory", "de_Latn_DE_1901@calendar=gregorian;collation=phonebook", NOERROR},
                  {"th-u-kf-nu-thai-kf-false", "th@colcasefirst=yes;numbers=thai", NOERROR},
+                /* #20410 */
+                {"art-lojban-x-0", "jbo@x=0", NOERROR},
+                {"zh-xiang-u-nu-thai-x-0", "hsn@numbers=thai;x=0", NOERROR},
          };
  
          for (int i = 0; i < langtag_to_locale.length; i++) {
author	Frank Tang <ftang@chromium.org>
	Mon, 11 Feb 2019 21:47:07 +0000 (13:47 -0800)
committer	Frank Yung-Fong Tang <41213225+FrankYFTang@users.noreply.github.com>
	Sat, 16 Feb 2019 00:51:31 +0000 (16:51 -0800)
icu4c/source/common/uloc_tag.cpp		patch \| blob \| history
icu4c/source/test/cintltst/cloctst.c		patch \| blob \| history
icu4c/source/test/cintltst/cstrcase.c		patch \| blob \| history
icu4c/source/test/intltest/loctest.cpp		patch \| blob \| history
icu4c/source/test/intltest/loctest.h		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LanguageTag.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java		patch \| blob \| history