From: Fredrik Roubert Date: Thu, 21 Feb 2019 23:06:15 +0000 (+0100) Subject: ICU-20447 fix uloc_getName(x) same as Locale(x).getName() etc. for ""/"und"/"root" X-Git-Tag: release-64-rc~23 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=c3abe48e1ce31f5bf073c5dd853ac6a826b83927;p=icu ICU-20447 fix uloc_getName(x) same as Locale(x).getName() etc. for ""/"und"/"root" --- diff --git a/icu4c/source/common/locdispnames.cpp b/icu4c/source/common/locdispnames.cpp index 8fbee32eea9..d92348e31c8 100644 --- a/icu4c/source/common/locdispnames.cpp +++ b/icu4c/source/common/locdispnames.cpp @@ -26,7 +26,6 @@ #include "unicode/uloc.h" #include "unicode/ures.h" #include "unicode/ustring.h" -#include "charstr.h" #include "cmemory.h" #include "cstring.h" #include "putilimp.h" @@ -375,7 +374,12 @@ _getDisplayNameForComponent(const char *locale, return 0; } if(length==0) { - return u_terminateUChars(dest, destCapacity, 0, pErrorCode); + // For the display name, we treat this as unknown language (ICU-20273). + if (getter == uloc_getLanguage) { + uprv_strcpy(localeBuffer, "und"); + } else { + return u_terminateUChars(dest, destCapacity, 0, pErrorCode); + } } root = tag == _kCountries ? U_ICUDATA_REGION : U_ICUDATA_LANG; @@ -507,22 +511,6 @@ uloc_getDisplayName(const char *locale, return 0; } - // For the display name, we treat this as unknown language (ICU-20273). - static const char UND[] = "und"; - CharString und; - if (locale != NULL) { - if (*locale == '\0') { - locale = UND; - } else if (*locale == '_') { - und.append(UND, *pErrorCode); - und.append(locale, *pErrorCode); - if (U_FAILURE(*pErrorCode)) { - return 0; - } - locale = und.data(); - } - } - { UErrorCode status = U_ZERO_ERROR; diff --git a/icu4c/source/common/locid.cpp b/icu4c/source/common/locid.cpp index 97b6e5aa586..a6a518201c2 100644 --- a/icu4c/source/common/locid.cpp +++ b/icu4c/source/common/locid.cpp @@ -626,19 +626,6 @@ Locale& Locale::init(const char* localeID, UBool canonicalize) variantBegin = (int32_t)(field[variantField] - fullName); } - if (length == 4 && uprv_stricmp(fullName, "root") == 0) { - length = 0; - variantBegin = 0; - language[0] = '\0'; - fullName[0] = '\0'; - } else if (length >= 3 && uprv_strnicmp(fullName, "und", 3) == 0 && - (length == 3 || fullName[3] == '_' || fullName[3] == '@')) { - length -= 3; - variantBegin -= 3; - language[0] = '\0'; - uprv_memmove(fullName, fullName + 3, length + 1); - } - err = U_ZERO_ERROR; initBaseName(err); if (U_FAILURE(err)) { diff --git a/icu4c/source/common/loclikely.cpp b/icu4c/source/common/loclikely.cpp index 107316def17..843cd8f391b 100644 --- a/icu4c/source/common/loclikely.cpp +++ b/icu4c/source/common/loclikely.cpp @@ -34,6 +34,13 @@ #include "ulocimp.h" #include "ustr_imp.h" +/** + * These are the canonical strings for unknown languages, scripts and regions. + **/ +static const char* const unknownLanguage = "und"; +static const char* const unknownScript = "Zzzz"; +static const char* const unknownRegion = "ZZ"; + /** * This function looks for the localeID in the likelySubtags resource. * @@ -55,6 +62,19 @@ findLikelySubtags(const char* localeID, UErrorCode tmpErr = U_ZERO_ERROR; icu::LocalUResourceBundlePointer subtags(ures_openDirect(NULL, "likelySubtags", &tmpErr)); if (U_SUCCESS(tmpErr)) { + icu::CharString und; + if (localeID != NULL) { + if (*localeID == '\0') { + localeID = unknownLanguage; + } else if (*localeID == '_') { + und.append(unknownLanguage, *err); + und.append(localeID, *err); + if (U_FAILURE(*err)) { + return NULL; + } + localeID = und.data(); + } + } s = ures_getStringByKey(subtags.getAlias(), localeID, &resLen, &tmpErr); if (U_FAILURE(tmpErr)) { @@ -72,6 +92,11 @@ findLikelySubtags(const char* localeID, } else { u_UCharsToChars(s, buffer, resLen + 1); + if (resLen >= 3 && + uprv_strnicmp(buffer, unknownLanguage, 3) == 0 && + (resLen == 3 || buffer[3] == '_')) { + uprv_memmove(buffer, buffer + 3, resLen - 3 + 1); + } result = buffer; } } else { @@ -97,9 +122,10 @@ appendTag( const char* tag, int32_t tagLength, char* buffer, - int32_t* bufferLength) { + int32_t* bufferLength, + UBool withSeparator) { - if (*bufferLength > 0) { + if (withSeparator) { buffer[*bufferLength] = '_'; ++(*bufferLength); } @@ -112,13 +138,6 @@ appendTag( *bufferLength += tagLength; } -/** - * These are the canonical strings for unknown languages, scripts and regions. - **/ -static const char* const unknownLanguage = "und"; -static const char* const unknownScript = "Zzzz"; -static const char* const unknownRegion = "ZZ"; - /** * Create a tag string from the supplied parameters. The lang, script and region * parameters may be NULL pointers. If they are, their corresponding length parameters @@ -189,18 +208,14 @@ createTagStringWithAlternates( lang, langLength, tagBuffer, - &tagLength); + &tagLength, + /*withSeparator=*/FALSE); } else if (alternateTags == NULL) { /* - * Append the value for an unknown language, if + * Use the empty string for an unknown language, if * we found no language. */ - appendTag( - unknownLanguage, - (int32_t)uprv_strlen(unknownLanguage), - tagBuffer, - &tagLength); } else { /* @@ -221,21 +236,17 @@ createTagStringWithAlternates( } else if (alternateLangLength == 0) { /* - * Append the value for an unknown language, if + * Use the empty string for an unknown language, if * we found no language. */ - appendTag( - unknownLanguage, - (int32_t)uprv_strlen(unknownLanguage), - tagBuffer, - &tagLength); } else { appendTag( alternateLang, alternateLangLength, tagBuffer, - &tagLength); + &tagLength, + /*withSeparator=*/FALSE); } } @@ -244,7 +255,8 @@ createTagStringWithAlternates( script, scriptLength, tagBuffer, - &tagLength); + &tagLength, + /*withSeparator=*/TRUE); } else if (alternateTags != NULL) { /* @@ -268,7 +280,8 @@ createTagStringWithAlternates( alternateScript, alternateScriptLength, tagBuffer, - &tagLength); + &tagLength, + /*withSeparator=*/TRUE); } } @@ -277,7 +290,8 @@ createTagStringWithAlternates( region, regionLength, tagBuffer, - &tagLength); + &tagLength, + /*withSeparator=*/TRUE); regionAppended = TRUE; } @@ -302,7 +316,8 @@ createTagStringWithAlternates( alternateRegion, alternateRegionLength, tagBuffer, - &tagLength); + &tagLength, + /*withSeparator=*/TRUE); regionAppended = TRUE; } @@ -464,15 +479,9 @@ parseTagString( *langLength = subtagLength; /* - * If no language was present, use the value of unknownLanguage - * instead. Otherwise, move past any separator. + * If no language was present, use the empty string instead. + * Otherwise, move past any separator. */ - if (*langLength == 0) { - uprv_strcpy( - lang, - unknownLanguage); - *langLength = (int32_t)uprv_strlen(lang); - } if (_isIDSeparator(*position)) { ++position; } @@ -1003,7 +1012,7 @@ _uloc_minimizeSubtags(const char* localeID, if(U_FAILURE(*err)) { goto error; } - else if (uprv_strnicmp( + else if (!tagBuffer.isEmpty() && uprv_strnicmp( maximizedTagBuffer.data(), tagBuffer.data(), tagBuffer.length()) == 0) { diff --git a/icu4c/source/common/uloc.cpp b/icu4c/source/common/uloc.cpp index a057491742e..73b43204b81 100644 --- a/icu4c/source/common/uloc.cpp +++ b/icu4c/source/common/uloc.cpp @@ -1180,6 +1180,16 @@ ulocimp_getLanguage(const char *localeID, int32_t offset; char lang[4]={ 0, 0, 0, 0 }; /* temporary buffer to hold language code for searching */ + if (uprv_stricmp(localeID, "root") == 0) { + localeID += 4; + } else if (uprv_strnicmp(localeID, "und", 3) == 0 && + (localeID[3] == '\0' || + localeID[3] == '-' || + localeID[3] == '_' || + localeID[3] == '@')) { + localeID += 3; + } + /* if it starts with i- or x- then copy that prefix */ if(_isIDPrefix(localeID)) { if(i0 && parent != localeID) { - uprv_memcpy(parent, localeID, uprv_min(i, parentCapacity)); + if (i > 0) { + if (uprv_strnicmp(localeID, "und_", 4) == 0) { + localeID += 3; + i -= 3; + uprv_memmove(parent, localeID, uprv_min(i, parentCapacity)); + } else if (parent != localeID) { + uprv_memcpy(parent, localeID, uprv_min(i, parentCapacity)); + } } + return u_terminateChars(parent, parentCapacity, i, err); } diff --git a/icu4c/source/i18n/collationruleparser.cpp b/icu4c/source/i18n/collationruleparser.cpp index 96dcc0d940b..ade6ecb552a 100644 --- a/icu4c/source/i18n/collationruleparser.cpp +++ b/icu4c/source/i18n/collationruleparser.cpp @@ -622,8 +622,11 @@ CollationRuleParser::parseSetting(UErrorCode &errorCode) { setParseError("expected language tag in [import langTag]", errorCode); return; } - if(length == 3 && uprv_memcmp(baseID, "und", 3) == 0) { + if(length == 0) { uprv_strcpy(baseID, "root"); + } else if(*baseID == '_') { + uprv_memmove(baseID + 3, baseID, length + 1); + uprv_memcpy(baseID, "und", 3); } // @collation=type, or length=0 if not specified char collationType[ULOC_KEYWORDS_CAPACITY]; diff --git a/icu4c/source/i18n/rulebasedcollator.cpp b/icu4c/source/i18n/rulebasedcollator.cpp index b057b6bbd5a..92fa5385971 100644 --- a/icu4c/source/i18n/rulebasedcollator.cpp +++ b/icu4c/source/i18n/rulebasedcollator.cpp @@ -1554,11 +1554,7 @@ RuleBasedCollator::internalGetShortDefinitionString(const char *locale, "collation", locale, NULL, &errorCode); if(U_FAILURE(errorCode)) { return 0; } - if(length == 0) { - uprv_strcpy(resultLocale, "root"); - } else { - resultLocale[length] = 0; - } + resultLocale[length] = 0; // Append items in alphabetic order of their short definition letters. CharString result; @@ -1585,7 +1581,11 @@ RuleBasedCollator::internalGetShortDefinitionString(const char *locale, length = uloc_getKeywordValue(resultLocale, "collation", subtag, UPRV_LENGTHOF(subtag), &errorCode); appendSubtag(result, 'K', subtag, length, errorCode); length = uloc_getLanguage(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode); - appendSubtag(result, 'L', subtag, length, errorCode); + if (length == 0) { + appendSubtag(result, 'L', "root", 4, errorCode); + } else { + appendSubtag(result, 'L', subtag, length, errorCode); + } if(attributeHasBeenSetExplicitly(UCOL_NORMALIZATION_MODE)) { appendAttribute(result, 'N', getAttribute(UCOL_NORMALIZATION_MODE, errorCode), errorCode); } diff --git a/icu4c/source/test/cintltst/cloctst.c b/icu4c/source/test/cintltst/cloctst.c index 5878fcc0f79..4809f5cb8e6 100644 --- a/icu4c/source/test/cintltst/cloctst.c +++ b/icu4c/source/test/cintltst/cloctst.c @@ -3563,7 +3563,7 @@ const char* const basic_minimize_data[][2] = { "de_Latn_DE_POSIX_1901", "de__POSIX_1901" }, { - "und", + "", "" }, { "en_Latn_US@calendar=gregorian", @@ -5098,8 +5098,8 @@ const char* const full_data[][3] = { "zh_HK" }, { "und_AQ", - "und_Latn_AQ", - "und_AQ" + "_Latn_AQ", + "_AQ" }, { "und_Zzzz", "en_Latn_US", @@ -5122,8 +5122,8 @@ const char* const full_data[][3] = { "zh_HK" }, { "und_Zzzz_AQ", - "und_Latn_AQ", - "und_AQ" + "_Latn_AQ", + "_AQ" }, { "und_Latn", "en_Latn_US", @@ -5146,8 +5146,8 @@ const char* const full_data[][3] = { "zh_Latn_HK" }, { "und_Latn_AQ", - "und_Latn_AQ", - "und_AQ" + "_Latn_AQ", + "_AQ" }, { "und_Hans", "zh_Hans_CN", @@ -5218,8 +5218,8 @@ const char* const full_data[][3] = { "zh_Moon_HK" }, { "und_Moon_AQ", - "und_Moon_AQ", - "und_Moon_AQ" + "_Moon_AQ", + "_Moon_AQ" }, { "es", "es_Latn_ES", @@ -6520,7 +6520,7 @@ typedef struct { } BadLocaleItem; static const BadLocaleItem badLocaleItems[] = { - { "-9223372036854775808", "en", "9223372036854775808", U_USING_DEFAULT_WARNING }, + { "-9223372036854775808", "en", "Unknown language (9223372036854775808)", U_USING_DEFAULT_WARNING }, /* add more in the future */ { NULL, NULL, NULL, U_ZERO_ERROR } /* terminator */ }; diff --git a/icu4c/source/test/intltest/loctest.cpp b/icu4c/source/test/intltest/loctest.cpp index 5927f7202b8..8db52c6b48d 100644 --- a/icu4c/source/test/intltest/loctest.cpp +++ b/icu4c/source/test/intltest/loctest.cpp @@ -261,6 +261,7 @@ void LocaleTest::runIndexedTest( int32_t index, UBool exec, const char* &name, c TESTCASE_AUTO(TestUnd); TESTCASE_AUTO(TestUndScript); TESTCASE_AUTO(TestUndRegion); + TESTCASE_AUTO(TestUndCAPI); TESTCASE_AUTO_END; } @@ -3544,3 +3545,196 @@ void LocaleTest::TestUndRegion() { assertEquals("getDisplayName()", displayName, locale_tag.getDisplayName(displayLocale, tmp)); assertEquals("getDisplayName()", displayName, locale_build.getDisplayName(displayLocale, tmp)); } + +void LocaleTest::TestUndCAPI() { + IcuTestErrorCode status(*this, "TestUndCAPI()"); + + static const char empty[] = ""; + static const char root[] = "root"; + static const char und[] = "und"; + + static const char empty_script[] = "_Cyrl"; + static const char empty_region[] = "_AQ"; + + static const char und_script[] = "und_Cyrl"; + static const char und_region[] = "und_AQ"; + + char tmp[ULOC_FULLNAME_CAPACITY]; + int32_t reslen; + + // uloc_getName() + + uprv_memset(tmp, '!', sizeof tmp); + reslen = uloc_getName(empty, tmp, sizeof tmp, status); + status.errIfFailureAndReset("\"%s\"", empty); + assertTrue("reslen >= 0", reslen >= 0); + assertEquals("uloc_getName()", empty, tmp); + + uprv_memset(tmp, '!', sizeof tmp); + reslen = uloc_getName(root, tmp, sizeof tmp, status); + status.errIfFailureAndReset("\"%s\"", root); + assertTrue("reslen >= 0", reslen >= 0); + assertEquals("uloc_getName()", empty, tmp); + + uprv_memset(tmp, '!', sizeof tmp); + reslen = uloc_getName(und, tmp, sizeof tmp, status); + status.errIfFailureAndReset("\"%s\"", und); + assertTrue("reslen >= 0", reslen >= 0); + assertEquals("uloc_getName()", empty, tmp); + + uprv_memset(tmp, '!', sizeof tmp); + reslen = uloc_getName(empty_script, tmp, sizeof tmp, status); + status.errIfFailureAndReset("\"%s\"", empty_script); + assertTrue("reslen >= 0", reslen >= 0); + assertEquals("uloc_getName()", empty_script, tmp); + + uprv_memset(tmp, '!', sizeof tmp); + reslen = uloc_getName(empty_region, tmp, sizeof tmp, status); + status.errIfFailureAndReset("\"%s\"", empty_region); + assertTrue("reslen >= 0", reslen >= 0); + assertEquals("uloc_getName()", empty_region, tmp); + + uprv_memset(tmp, '!', sizeof tmp); + reslen = uloc_getName(und_script, tmp, sizeof tmp, status); + status.errIfFailureAndReset("\"%s\"", und_script); + assertTrue("reslen >= 0", reslen >= 0); + assertEquals("uloc_getName()", empty_script, tmp); + + uprv_memset(tmp, '!', sizeof tmp); + reslen = uloc_getName(und_region, tmp, sizeof tmp, status); + status.errIfFailureAndReset("\"%s\"", und_region); + assertTrue("reslen >= 0", reslen >= 0); + assertEquals("uloc_getName()", empty_region, tmp); + + // uloc_getBaseName() + + uprv_memset(tmp, '!', sizeof tmp); + reslen = uloc_getBaseName(empty, tmp, sizeof tmp, status); + status.errIfFailureAndReset("\"%s\"", empty); + assertTrue("reslen >= 0", reslen >= 0); + assertEquals("uloc_getBaseName()", empty, tmp); + + uprv_memset(tmp, '!', sizeof tmp); + reslen = uloc_getBaseName(root, tmp, sizeof tmp, status); + status.errIfFailureAndReset("\"%s\"", root); + assertTrue("reslen >= 0", reslen >= 0); + assertEquals("uloc_getBaseName()", empty, tmp); + + uprv_memset(tmp, '!', sizeof tmp); + reslen = uloc_getBaseName(und, tmp, sizeof tmp, status); + status.errIfFailureAndReset("\"%s\"", und); + assertTrue("reslen >= 0", reslen >= 0); + assertEquals("uloc_getBaseName()", empty, tmp); + + uprv_memset(tmp, '!', sizeof tmp); + reslen = uloc_getBaseName(empty_script, tmp, sizeof tmp, status); + status.errIfFailureAndReset("\"%s\"", empty_script); + assertTrue("reslen >= 0", reslen >= 0); + assertEquals("uloc_getBaseName()", empty_script, tmp); + + uprv_memset(tmp, '!', sizeof tmp); + reslen = uloc_getBaseName(empty_region, tmp, sizeof tmp, status); + status.errIfFailureAndReset("\"%s\"", empty_region); + assertTrue("reslen >= 0", reslen >= 0); + assertEquals("uloc_getBaseName()", empty_region, tmp); + + uprv_memset(tmp, '!', sizeof tmp); + reslen = uloc_getBaseName(und_script, tmp, sizeof tmp, status); + status.errIfFailureAndReset("\"%s\"", und_script); + assertTrue("reslen >= 0", reslen >= 0); + assertEquals("uloc_getBaseName()", empty_script, tmp); + + uprv_memset(tmp, '!', sizeof tmp); + reslen = uloc_getBaseName(und_region, tmp, sizeof tmp, status); + status.errIfFailureAndReset("\"%s\"", und_region); + assertTrue("reslen >= 0", reslen >= 0); + assertEquals("uloc_getBaseName()", empty_region, tmp); + + // uloc_getParent() + + uprv_memset(tmp, '!', sizeof tmp); + reslen = uloc_getParent(empty, tmp, sizeof tmp, status); + status.errIfFailureAndReset("\"%s\"", empty); + assertTrue("reslen >= 0", reslen >= 0); + assertEquals("uloc_getParent()", empty, tmp); + + uprv_memset(tmp, '!', sizeof tmp); + reslen = uloc_getParent(root, tmp, sizeof tmp, status); + status.errIfFailureAndReset("\"%s\"", root); + assertTrue("reslen >= 0", reslen >= 0); + assertEquals("uloc_getParent()", empty, tmp); + + uprv_memset(tmp, '!', sizeof tmp); + reslen = uloc_getParent(und, tmp, sizeof tmp, status); + status.errIfFailureAndReset("\"%s\"", und); + assertTrue("reslen >= 0", reslen >= 0); + assertEquals("uloc_getParent()", empty, tmp); + + uprv_memset(tmp, '!', sizeof tmp); + reslen = uloc_getParent(empty_script, tmp, sizeof tmp, status); + status.errIfFailureAndReset("\"%s\"", empty_script); + assertTrue("reslen >= 0", reslen >= 0); + assertEquals("uloc_getParent()", empty, tmp); + + uprv_memset(tmp, '!', sizeof tmp); + reslen = uloc_getParent(empty_region, tmp, sizeof tmp, status); + status.errIfFailureAndReset("\"%s\"", empty_region); + assertTrue("reslen >= 0", reslen >= 0); + assertEquals("uloc_getParent()", empty, tmp); + + uprv_memset(tmp, '!', sizeof tmp); + reslen = uloc_getParent(und_script, tmp, sizeof tmp, status); + status.errIfFailureAndReset("\"%s\"", und_script); + assertTrue("reslen >= 0", reslen >= 0); + assertEquals("uloc_getParent()", empty, tmp); + + uprv_memset(tmp, '!', sizeof tmp); + reslen = uloc_getParent(und_region, tmp, sizeof tmp, status); + status.errIfFailureAndReset("\"%s\"", und_region); + assertTrue("reslen >= 0", reslen >= 0); + assertEquals("uloc_getParent()", empty, tmp); + + // uloc_getLanguage() + + uprv_memset(tmp, '!', sizeof tmp); + reslen = uloc_getLanguage(empty, tmp, sizeof tmp, status); + status.errIfFailureAndReset("\"%s\"", empty); + assertTrue("reslen >= 0", reslen >= 0); + assertEquals("uloc_getLanguage()", empty, tmp); + + uprv_memset(tmp, '!', sizeof tmp); + reslen = uloc_getLanguage(root, tmp, sizeof tmp, status); + status.errIfFailureAndReset("\"%s\"", root); + assertTrue("reslen >= 0", reslen >= 0); + assertEquals("uloc_getLanguage()", empty, tmp); + + uprv_memset(tmp, '!', sizeof tmp); + reslen = uloc_getLanguage(und, tmp, sizeof tmp, status); + status.errIfFailureAndReset("\"%s\"", und); + assertTrue("reslen >= 0", reslen >= 0); + assertEquals("uloc_getLanguage()", empty, tmp); + + uprv_memset(tmp, '!', sizeof tmp); + reslen = uloc_getLanguage(empty_script, tmp, sizeof tmp, status); + status.errIfFailureAndReset("\"%s\"", empty_script); + assertTrue("reslen >= 0", reslen >= 0); + assertEquals("uloc_getLanguage()", empty, tmp); + + uprv_memset(tmp, '!', sizeof tmp); + reslen = uloc_getLanguage(empty_region, tmp, sizeof tmp, status); + status.errIfFailureAndReset("\"%s\"", empty_region); + assertTrue("reslen >= 0", reslen >= 0); + assertEquals("uloc_getLanguage()", empty, tmp); + + uprv_memset(tmp, '!', sizeof tmp); + reslen = uloc_getLanguage(und_script, tmp, sizeof tmp, status); + status.errIfFailureAndReset("\"%s\"", und_script); + assertTrue("reslen >= 0", reslen >= 0); + assertEquals("uloc_getLanguage()", empty, tmp); + + uprv_memset(tmp, '!', sizeof tmp); + reslen = uloc_getLanguage(und_region, tmp, sizeof tmp, status); + status.errIfFailureAndReset("\"%s\"", und_region); + assertTrue("reslen >= 0", reslen >= 0); + assertEquals("uloc_getLanguage()", empty, tmp); +} diff --git a/icu4c/source/test/intltest/loctest.h b/icu4c/source/test/intltest/loctest.h index 5ccdea641df..85da54fee5b 100644 --- a/icu4c/source/test/intltest/loctest.h +++ b/icu4c/source/test/intltest/loctest.h @@ -135,6 +135,7 @@ public: void TestUnd(); void TestUndScript(); void TestUndRegion(); + void TestUndCAPI(); private: void _checklocs(const char* label,