From: Fredrik Roubert
Date: Thu, 13 Sep 2018 03:41:53 +0000 (-0700)
Subject: ICU-13417 Add the Locale::(for|to)LanguageTag() functions.
X-Git-Tag: release-63-rc~73
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=56634121726e58f358ba9e0e1f0fcc33ccb9ca3d;p=icu

ICU-13417 Add the Locale::(for|to)LanguageTag() functions.

They are C++ wrappers of uloc_forLanguageTag() and uloc_toLanguageTag()
respectively, that take care of dynamic memory management.
---

diff --git a/icu4c/source/common/locid.cpp b/icu4c/source/common/locid.cpp
index d799f5dc9cc..fe52afc9f31 100644
--- a/icu4c/source/common/locid.cpp
+++ b/icu4c/source/common/locid.cpp
@@ -32,8 +32,10 @@
 */
 
 
+#include "unicode/bytestream.h"
 #include "unicode/locid.h"
 #include "unicode/strenum.h"
+#include "unicode/stringpiece.h"
 #include "unicode/uloc.h"
 #include "putilimp.h"
 #include "mutex.h"
@@ -711,6 +713,161 @@ Locale::setDefault( const Locale& newLocale,
     locale_set_default_internal(localeID, status);
 }
 
+Locale U_EXPORT2
+Locale::forLanguageTag(StringPiece tag, UErrorCode& status)
+{
+    Locale result(Locale::eBOGUS);
+
+    if (U_FAILURE(status)) {
+        return result;
+    }
+
+    // TODO: Remove the need for a const char* to a NUL terminated buffer.
+    const CharString tag_nul(tag, status);
+    if (U_FAILURE(status)) {
+        return result;
+    }
+
+    // If a BCP-47 language tag is passed as the language parameter to the
+    // normal Locale constructor, it will actually fall back to invoking
+    // uloc_forLanguageTag() to parse it if it somehow is able to detect that
+    // the string actually is BCP-47. This works well for things like strings
+    // using BCP-47 extensions, but it does not at all work for things like
+    // BCP-47 grandfathered tags (eg. "en-GB-oed") which are possible to also
+    // interpret as ICU locale IDs and because of that won't trigger the BCP-47
+    // parsing. Therefore the code here explicitly calls uloc_forLanguageTag()
+    // and then Locale::init(), instead of just calling the normal constructor.
+
+    // All simple language tags will have the exact same length as ICU locale
+    // ID strings as they have as BCP-47 strings (like "en_US" for "en-US").
+    CharString localeID;
+    int32_t resultCapacity = tag.size();
+
+    char* buffer;
+    int32_t parsedLength, reslen;
+
+    for (;;) {
+        buffer = localeID.getAppendBuffer(
+                /*minCapacity=*/resultCapacity,
+                /*desiredCapacityHint=*/resultCapacity,
+                resultCapacity,
+                status);
+
+        if (U_FAILURE(status)) {
+            return result;
+        }
+
+        reslen = uloc_forLanguageTag(
+                tag_nul.data(),
+                buffer,
+                resultCapacity,
+                &parsedLength,
+                &status);
+
+        if (status != U_BUFFER_OVERFLOW_ERROR) {
+            break;
+        }
+
+        // For all BCP-47 language tags that use extensions, the corresponding
+        // ICU locale ID will be longer but uloc_forLanguageTag() does compute
+        // the exact length needed so this memory reallocation will be done at
+        // most once.
+        resultCapacity = reslen;
+        status = U_ZERO_ERROR;
+    }
+
+    if (U_FAILURE(status)) {
+        return result;
+    }
+
+    if (parsedLength != tag.size()) {
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return result;
+    }
+
+    localeID.append(buffer, reslen, status);
+    if (status == U_STRING_NOT_TERMINATED_WARNING) {
+        status = U_ZERO_ERROR;  // Terminators provided by CharString.
+    }
+
+    if (U_FAILURE(status)) {
+        return result;
+    }
+
+    result.init(localeID.data(), /*canonicalize=*/FALSE);
+    if (result.isBogus()) {
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+    }
+    return result;
+}
+
+void
+Locale::toLanguageTag(ByteSink& sink, UErrorCode& status) const
+{
+    if (U_FAILURE(status)) {
+        return;
+    }
+
+    if (fIsBogus) {
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+
+    // All simple language tags will have the exact same length as BCP-47
+    // strings as they have as ICU locale IDs (like "en-US" for "en_US").
+    LocalMemory<char> scratch;
+    int32_t scratch_capacity = uprv_strlen(fullName);
+
+    if (scratch_capacity == 0) {
+        scratch_capacity = 3;  // "und"
+    }
+
+    char* buffer;
+    int32_t result_capacity, reslen;
+
+    for (;;) {
+        if (scratch.allocateInsteadAndReset(scratch_capacity) == nullptr) {
+            status = U_MEMORY_ALLOCATION_ERROR;
+            return;
+        }
+
+        buffer = sink.GetAppendBuffer(
+                /*min_capacity=*/scratch_capacity,
+                /*desired_capacity_hint=*/scratch_capacity,
+                scratch.getAlias(),
+                scratch_capacity,
+                &result_capacity);
+
+        reslen = uloc_toLanguageTag(
+                fullName,
+                buffer,
+                result_capacity,
+                /*strict=*/FALSE,
+                &status);
+
+        if (status != U_BUFFER_OVERFLOW_ERROR) {
+            break;
+        }
+
+        // For some very few edge cases a language tag will be longer as a
+        // BCP-47 string than it is as an ICU locale ID. Most notoriously "C"
+        // expands to the BCP-47 tag "en-US-u-va-posix", 16 times longer, and
+        // it'll take several calls to uloc_toLanguageTag() to figure that out.
+        // https://unicode-org.atlassian.net/browse/ICU-20132
+        scratch_capacity = reslen;
+        status = U_ZERO_ERROR;
+    }
+
+    if (U_FAILURE(status)) {
+        return;
+    }
+
+    sink.Append(buffer, reslen);
+    if (status == U_STRING_NOT_TERMINATED_WARNING) {
+        status = U_ZERO_ERROR;  // Terminators not used.
+    }
+}
+
 Locale U_EXPORT2
 Locale::createFromName (const char *name)
 {
diff --git a/icu4c/source/common/unicode/locid.h b/icu4c/source/common/unicode/locid.h
index 9ccf4715aff..ef8df739a00 100644
--- a/icu4c/source/common/unicode/locid.h
+++ b/icu4c/source/common/unicode/locid.h
@@ -31,6 +31,8 @@
 #ifndef LOCID_H
 #define LOCID_H
 
+#include "unicode/bytestream.h"
+#include "unicode/stringpiece.h"
 #include "unicode/utypes.h"
 #include "unicode/uobject.h"
 #include "unicode/putil.h"
@@ -362,6 +364,55 @@ public:
                                      UErrorCode&   success);
 #endif  /* U_HIDE_SYSTEM_API */
 
+#ifndef U_HIDE_DRAFT_API
+    /**
+     * Returns a Locale for the specified BCP47 language tag string.
+     * If the specified language tag contains any ill-formed subtags,
+     * the first such subtag and all following subtags are ignored.
+     * <p>
+     * This implements the 'Language-Tag' production of BCP47, and so
+     * supports grandfathered (regular and irregular) as well as private
+     * use language tags. Private use tags are represented as 'x-whatever',
+     * and grandfathered tags are converted to their canonical replacements
+     * where they exist. Note that a few grandfathered tags have no modern
+     * replacement, these will be converted using the fallback described in
+     * the first paragraph, so some information might be lost.
+     * @param tag     the input BCP47 language tag.
+     * @param status  error information if creating the Locale failed.
+     * @return        the Locale for the specified BCP47 language tag.
+     * @draft ICU 63
+     */
+    static Locale U_EXPORT2 forLanguageTag(StringPiece tag, UErrorCode& status);
+
+    /**
+     * Returns a well-formed language tag for this Locale.
+     * <p>
+     * Note: Any locale fields which do not satisfy the BCP47 syntax
+     * requirement will be silently omitted from the result.
+     *
+     * If this function fails, partial output may have been written to the sink.
+     *
+     * @param sink    the output sink receiving the BCP47 language
+     *                tag for this Locale.
+     * @param status  error information if creating the language tag failed.
+     * @draft ICU 63
+     */
+    void toLanguageTag(ByteSink& sink, UErrorCode& status) const;
+
+    /**
+     * Returns a well-formed language tag for this Locale.
+     * <p>
+     * Note: Any locale fields which do not satisfy the BCP47 syntax
+     * requirement will be silently omitted from the result.
+     *
+     * @param status  error information if creating the language tag failed.
+     * @return        the BCP47 language tag for this Locale.
+     * @draft ICU 63
+     */
+    template<typename StringClass>
+    inline StringClass toLanguageTag(UErrorCode& status) const;
+#endif  // U_HIDE_DRAFT_API
+
     /**
      * Creates a locale which has had minimal canonicalization
      * as per uloc_getName().
@@ -775,6 +826,17 @@ Locale::operator!=(const Locale& other) const
     return !operator==(other);
 }
 
+#ifndef U_HIDE_DRAFT_API
+template<typename StringClass> inline StringClass
+Locale::toLanguageTag(UErrorCode& status) const
+{
+    StringClass result;
+    StringByteSink<StringClass> sink(&result);
+    toLanguageTag(sink, status);
+    return result;
+}
+#endif  // U_HIDE_DRAFT_API
+
 inline const char *
 Locale::getCountry() const
 {
diff --git a/icu4c/source/test/intltest/loctest.cpp b/icu4c/source/test/intltest/loctest.cpp
index 9994e81e3ed..cd75670cff6 100644
--- a/icu4c/source/test/intltest/loctest.cpp
+++ b/icu4c/source/test/intltest/loctest.cpp
@@ -15,6 +15,7 @@
 #include "unicode/brkiter.h"
 #include "unicode/coll.h"
 #include "unicode/ustring.h"
+#include "unicode/std_string.h"
 #include "charstr.h"
 #include "cmemory.h"
 #include "cstring.h"
@@ -233,6 +234,8 @@ void LocaleTest::runIndexedTest( int32_t index, UBool exec, const char* &name, c
     TESTCASE_AUTO(TestIsRightToLeft);
     TESTCASE_AUTO(TestBug13277);
     TESTCASE_AUTO(TestBug13554);
+    TESTCASE_AUTO(TestForLanguageTag);
+    TESTCASE_AUTO(TestToLanguageTag);
     TESTCASE_AUTO_END;
 }
 
@@ -2748,4 +2751,97 @@ void LocaleTest::TestBug13554() {
     }
 }
 
+void LocaleTest::TestForLanguageTag() {
+    IcuTestErrorCode status(*this, "TestForLanguageTag()");
+
+    static const char tag_en[] = "en-US";
+    static const char tag_oed[] = "en-GB-oed";
+    static const char tag_af[] = "af-t-ar-i0-handwrit-u-ca-coptic-x-foo";
+    static const char tag_ill[] = "!";
+    static const char tag_no_nul[] = { 'e', 'n', '-', 'G', 'B' };
+
+    static const Locale loc_en("en_US");
+    static const Locale loc_oed("en_GB@x=oed");
+    static const Locale loc_af("af@calendar=coptic;t=ar-i0-handwrit;x=foo");
+    static const Locale loc_null("");
+    static const Locale loc_gb("en_GB");
+
+    Locale result_en = Locale::forLanguageTag(tag_en, status);
+    status.errIfFailureAndReset("\"%s\"", tag_en);
+    assertEquals(tag_en, loc_en.getName(), result_en.getName());
+
+    Locale result_oed = Locale::forLanguageTag(tag_oed, status);
+    status.errIfFailureAndReset("\"%s\"", tag_oed);
+    assertEquals(tag_oed, loc_oed.getName(), result_oed.getName());
+
+    Locale result_af = Locale::forLanguageTag(tag_af, status);
+    status.errIfFailureAndReset("\"%s\"", tag_af);
+    assertEquals(tag_af, loc_af.getName(), result_af.getName());
+
+    Locale result_ill = Locale::forLanguageTag(tag_ill, status);
+    assertEquals(tag_ill, U_ILLEGAL_ARGUMENT_ERROR, status.reset());
+    assertTrue(result_ill.getName(), result_ill.isBogus());
+
+    Locale result_null = Locale::forLanguageTag(nullptr, status);
+    status.errIfFailureAndReset("nullptr");
+    assertEquals("nullptr", loc_null.getName(), result_null.getName());
+
+    StringPiece sp_substr(tag_oed, 5);  // "en-GB", no NUL.
+    Locale result_substr = Locale::forLanguageTag(sp_substr, status);
+    status.errIfFailureAndReset("\"%.*s\"", sp_substr.size(), sp_substr.data());
+    assertEquals(CharString(sp_substr, status).data(),
+            loc_gb.getName(), result_substr.getName());
+
+    StringPiece sp_no_nul(tag_no_nul, sizeof tag_no_nul);  // "en-GB", no NUL.
+    Locale result_no_nul = Locale::forLanguageTag(sp_no_nul, status);
+    status.errIfFailureAndReset("\"%.*s\"", sp_no_nul.size(), sp_no_nul.data());
+    assertEquals(CharString(sp_no_nul, status).data(),
+            loc_gb.getName(), result_no_nul.getName());
+}
 
+void LocaleTest::TestToLanguageTag() {
+    IcuTestErrorCode status(*this, "TestToLanguageTag()");
+
+    static const Locale loc_c("C");
+    static const Locale loc_en("en_US");
+    static const Locale loc_af("af@calendar=coptic;t=ar-i0-handwrit;x=foo");
+    static const Locale loc_empty("");
+    static const Locale loc_ill("!");
+
+    static const char tag_c[] = "en-US-u-va-posix";
+    static const char tag_en[] = "en-US";
+    static const char tag_af[] = "af-t-ar-i0-handwrit-u-ca-coptic-x-foo";
+    static const char tag_und[] = "und";
+
+    std::string result;
+    StringByteSink<std::string> sink(&result);
+    loc_c.toLanguageTag(sink, status);
+    status.errIfFailureAndReset("\"%s\"", loc_c.getName());
+    assertEquals(loc_c.getName(), tag_c, result.c_str());
+
+    std::string result_c = loc_c.toLanguageTag<std::string>(status);
+    status.errIfFailureAndReset("\"%s\"", loc_c.getName());
+    assertEquals(loc_c.getName(), tag_c, result_c.c_str());
+
+    std::string result_en = loc_en.toLanguageTag<std::string>(status);
+    status.errIfFailureAndReset("\"%s\"", loc_en.getName());
+    assertEquals(loc_en.getName(), tag_en, result_en.c_str());
+
+    std::string result_af = loc_af.toLanguageTag<std::string>(status);
+    status.errIfFailureAndReset("\"%s\"", loc_af.getName());
+    assertEquals(loc_af.getName(), tag_af, result_af.c_str());
+
+    std::string result_empty = loc_empty.toLanguageTag<std::string>(status);
+    status.errIfFailureAndReset("\"%s\"", loc_empty.getName());
+    assertEquals(loc_empty.getName(), tag_und, result_empty.c_str());
+
+    std::string result_ill = loc_ill.toLanguageTag<std::string>(status);
+    status.errIfFailureAndReset("\"%s\"", loc_ill.getName());
+    assertEquals(loc_ill.getName(), tag_und, result_ill.c_str());
+
+    Locale loc_bogus;
+    loc_bogus.setToBogus();
+    std::string result_bogus = loc_bogus.toLanguageTag<std::string>(status);
+    assertEquals("bogus", U_ILLEGAL_ARGUMENT_ERROR, status.reset());
+    assertTrue(result_bogus.c_str(), result_bogus.empty());
+}
diff --git a/icu4c/source/test/intltest/loctest.h b/icu4c/source/test/intltest/loctest.h
index 344e8816cc9..529df214df8 100644
--- a/icu4c/source/test/intltest/loctest.h
+++ b/icu4c/source/test/intltest/loctest.h
@@ -108,6 +108,9 @@ public:
     void TestBug13277();
     void TestBug13554();
 
+    void TestForLanguageTag();
+    void TestToLanguageTag();
+
 private:
     void _checklocs(const char* label,
                     const char* req,