From: Fredrik Roubert Date: Thu, 13 Sep 2018 21:14:26 +0000 (-0700) Subject: ICU-13417 Add the Locale::(addLikely|minimize)Subtags() functions. X-Git-Tag: release-63-rc~67 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=7c2a8d1fcc28b89cd5c5b58a67901b519d7a43ed;p=icu ICU-13417 Add the Locale::(addLikely|minimize)Subtags() functions. They are C++ wrappers of uloc_addLikelySubtags() and uloc_minimizeSubtags() respectively, that take care of dynamic memory management. --- diff --git a/icu4c/source/common/locid.cpp b/icu4c/source/common/locid.cpp index 19e4ac7f21b..d47f12edd51 100644 --- a/icu4c/source/common/locid.cpp +++ b/icu4c/source/common/locid.cpp @@ -714,6 +714,126 @@ Locale::setDefault( const Locale& newLocale, locale_set_default_internal(localeID, status); } +void +Locale::addLikelySubtags(UErrorCode& status) { + if (U_FAILURE(status)) { + return; + } + + // The maximized locale ID string is often longer, but there is no good + // heuristic to estimate just how much longer. Leave that to CharString. + CharString maximizedLocaleID; + int32_t maximizedLocaleIDCapacity = uprv_strlen(fullName); + + char* buffer; + int32_t reslen; + + for (;;) { + buffer = maximizedLocaleID.getAppendBuffer( + /*minCapacity=*/maximizedLocaleIDCapacity, + /*desiredCapacityHint=*/maximizedLocaleIDCapacity, + maximizedLocaleIDCapacity, + status); + + if (U_FAILURE(status)) { + return; + } + + reslen = uloc_addLikelySubtags( + fullName, + buffer, + maximizedLocaleIDCapacity, + &status); + + if (status != U_BUFFER_OVERFLOW_ERROR) { + break; + } + + maximizedLocaleIDCapacity = reslen; + status = U_ZERO_ERROR; + } + + if (U_FAILURE(status)) { + return; + } + + maximizedLocaleID.append(buffer, reslen, status); + if (status == U_STRING_NOT_TERMINATED_WARNING) { + status = U_ZERO_ERROR; // Terminators provided by CharString. 
+ } + + if (U_FAILURE(status)) { + return; + } + + init(maximizedLocaleID.data(), /*canonicalize=*/FALSE); + if (isBogus()) { + status = U_ILLEGAL_ARGUMENT_ERROR; + } +} + +void +Locale::minimizeSubtags(UErrorCode& status) { + if (U_FAILURE(status)) { + return; + } + + // Except for a few edge cases (like the empty string, that is minimized to + // "en__POSIX"), minimized locale ID strings will be either the same length + // or shorter than their input. + CharString minimizedLocaleID; + int32_t minimizedLocaleIDCapacity = uprv_strlen(fullName); + + char* buffer; + int32_t reslen; + + for (;;) { + buffer = minimizedLocaleID.getAppendBuffer( + /*minCapacity=*/minimizedLocaleIDCapacity, + /*desiredCapacityHint=*/minimizedLocaleIDCapacity, + minimizedLocaleIDCapacity, + status); + + if (U_FAILURE(status)) { + return; + } + + reslen = uloc_minimizeSubtags( + fullName, + buffer, + minimizedLocaleIDCapacity, + &status); + + if (status != U_BUFFER_OVERFLOW_ERROR) { + break; + } + + // Because of the internal minimal buffer size of CharString, I can't + // think of any input data for which this could possibly ever happen. + // Maybe it would be better replaced with an assertion instead? + minimizedLocaleIDCapacity = reslen; + status = U_ZERO_ERROR; + } + + if (U_FAILURE(status)) { + return; + } + + minimizedLocaleID.append(buffer, reslen, status); + if (status == U_STRING_NOT_TERMINATED_WARNING) { + status = U_ZERO_ERROR; // Terminators provided by CharString. 
+ } + + if (U_FAILURE(status)) { + return; + } + + init(minimizedLocaleID.data(), /*canonicalize=*/FALSE); + if (isBogus()) { + status = U_ILLEGAL_ARGUMENT_ERROR; + } +} + Locale U_EXPORT2 Locale::forLanguageTag(StringPiece tag, UErrorCode& status) { diff --git a/icu4c/source/common/unicode/locid.h b/icu4c/source/common/unicode/locid.h index ea48ed91a17..f570391b1bc 100644 --- a/icu4c/source/common/unicode/locid.h +++ b/icu4c/source/common/unicode/locid.h @@ -483,6 +483,69 @@ public: */ const char * getBaseName() const; +#ifndef U_HIDE_DRAFT_API + /** + * Add the likely subtags for this Locale, per the algorithm described + * in the following CLDR technical report: + * + * http://www.unicode.org/reports/tr35/#Likely_Subtags + * + * If this Locale is already in the maximal form, or not valid, or there is + * no data available for maximization, the Locale will be unchanged. + * + * For example, "und-Zzzz" cannot be maximized, since there is no + * reasonable maximization. + * + * Examples: + * + * "en" maximizes to "en_Latn_US" + * + * "de" maximizes to "de_Latn_DE" + * + * "sr" maximizes to "sr_Cyrl_RS" + * + * "sh" maximizes to "sr_Latn_RS" (Note this will not reverse.) + * + * "zh_Hani" maximizes to "zh_Hans_CN" (Note this will not reverse.) + * + * @param status error information if maximizing this Locale failed. + * If this Locale is not well-formed, the error code is + * U_ILLEGAL_ARGUMENT_ERROR. + * @draft ICU 63 + */ + void addLikelySubtags(UErrorCode& status); + + /** + * Minimize the subtags for this Locale, per the algorithm described + * in the following CLDR technical report: + * + * http://www.unicode.org/reports/tr35/#Likely_Subtags + * + * If this Locale is already in the minimal form, or not valid, or there is + * no data available for minimization, the Locale will be unchanged. + * + * Since the minimization algorithm relies on proper maximization, see the + * comments for addLikelySubtags for reasons why there might not be any + * data. 
+ * + * Examples: + * + * "en_Latn_US" minimizes to "en" + * + * "de_Latn_DE" minimizes to "de" + * + * "sr_Cyrl_RS" minimizes to "sr" + * + * "zh_Hant_TW" minimizes to "zh_TW" (The region is preferred to the + * script, and minimizing to "zh" would imply "zh_Hans_CN".) + * + * @param status error information if minimizing this Locale failed. + * If this Locale is not well-formed, the error code is + * U_ILLEGAL_ARGUMENT_ERROR. + * @draft ICU 63 + */ + void minimizeSubtags(UErrorCode& status); +#endif // U_HIDE_DRAFT_API /** * Gets the list of keywords for the specified locale. diff --git a/icu4c/source/test/depstest/dependencies.txt b/icu4c/source/test/depstest/dependencies.txt index 55783ee04ed..441934da5ee 100644 --- a/icu4c/source/test/depstest/dependencies.txt +++ b/icu4c/source/test/depstest/dependencies.txt @@ -164,7 +164,6 @@ library: common ubidi ushape ubiditransform listformatter resourcebundle service_registration resbund_cnv ures_cnv icudataver ucat - loclikely currency locale_display_names2 conversion converter_selector ucnv_set ucnvdisp @@ -383,7 +382,7 @@ group: cstr group: uscript uscript.o # uscript_getCode() accepts a locale ID and loads its script code data deps - propname loclikely + propname resourcebundle group: uscript_props # script metadata properties uscript_props.o @@ -583,7 +582,7 @@ group: locale_display_names2 group: currency ucurr.o deps - loclikely resourcebundle ulist ustring_case_locale + resourcebundle ulist ustring_case_locale stdlib_qsort # for ucurr.o (which does not use ICU's uarrsort.o) static_unicode_sets usetiter @@ -592,11 +591,6 @@ group: icudataver # u_getDataVersion() deps resourcebundle -group: loclikely - loclikely.o - deps - resourcebundle uscript_props propname - group: locresdata # This was intended to collect locale functions that load resource bundle data. # See the resourcebundle group about what else loads data. @@ -631,9 +625,12 @@ group: resourcebundle locid.o locmap.o wintz.o # Do we need class LocaleBased? 
http://bugs.icu-project.org/trac/ticket/8608 locbased.o + loclikely.o deps udata ucol_swp sort stringenumeration uhash uvector + uscript_props propname + bytesinkutil group: udata udata.o ucmndata.o udatamem.o @@ -832,7 +829,6 @@ group: localedata deps uniset_props resourcebundle uset_props # TODO: change to using C++ UnicodeSet, remove this dependency - loclikely group: genderinfo gender.o diff --git a/icu4c/source/test/intltest/loctest.cpp b/icu4c/source/test/intltest/loctest.cpp index 3f5756e731a..c86db1bed33 100644 --- a/icu4c/source/test/intltest/loctest.cpp +++ b/icu4c/source/test/intltest/loctest.cpp @@ -221,6 +221,8 @@ void LocaleTest::runIndexedTest( int32_t index, UBool exec, const char* &name, c #endif TESTCASE_AUTO(TestSetIsBogus); TESTCASE_AUTO(TestParallelAPIValues); + TESTCASE_AUTO(TestAddLikelySubtags); + TESTCASE_AUTO(TestMinimizeSubtags); TESTCASE_AUTO(TestKeywordVariants); TESTCASE_AUTO(TestCreateUnicodeKeywords); TESTCASE_AUTO(TestKeywordVariantParsing); @@ -1607,6 +1609,34 @@ LocaleTest::TestSetIsBogus() { } +void +LocaleTest::TestAddLikelySubtags() { + IcuTestErrorCode status(*this, "TestAddLikelySubtags()"); + + static const Locale min("sv"); + static const Locale max("sv_Latn_SE"); + + Locale result(min); + result.addLikelySubtags(status); + status.errIfFailureAndReset("\"%s\"", min.getName()); + assertEquals("addLikelySubtags", max.getName(), result.getName()); +} + + +void +LocaleTest::TestMinimizeSubtags() { + IcuTestErrorCode status(*this, "TestMinimizeSubtags()"); + + static const Locale max("zh_Hant_TW"); + static const Locale min("zh_TW"); + + Locale result(max); + result.minimizeSubtags(status); + status.errIfFailureAndReset("\"%s\"", max.getName()); + assertEquals("minimizeSubtags", min.getName(), result.getName()); +} + + void LocaleTest::TestKeywordVariants(void) { static const struct { diff --git a/icu4c/source/test/intltest/loctest.h b/icu4c/source/test/intltest/loctest.h index 06589161cff..e93b17df84a 100644 --- 
a/icu4c/source/test/intltest/loctest.h +++ b/icu4c/source/test/intltest/loctest.h @@ -113,6 +113,9 @@ public: void TestBug13277(); void TestBug13554(); + void TestAddLikelySubtags(); + void TestMinimizeSubtags(); + void TestForLanguageTag(); void TestToLanguageTag();