--- /dev/null
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include <utility>
+
+#include "bytesinkutil.h" // CharStringByteSink
+#include "charstr.h"
+#include "cstring.h"
+#include "ulocimp.h"
+#include "unicode/localebuilder.h"
+#include "unicode/locid.h"
+
+U_NAMESPACE_BEGIN
+
+#define UPRV_ISDIGIT(c) (((c) >= '0') && ((c) <= '9'))
+#define UPRV_ISALPHANUM(c) (uprv_isASCIILetter(c) || UPRV_ISDIGIT(c) )
+
+// Keyword name under which Unicode locale attributes are stored on a Locale.
+// Only used inside this file, so it gets internal linkage, and the pointer
+// itself is const so it can never be re-seated at run time.
+static const char* const kAttributeKey = "attribute";
+
+/**
+ * Checks the value of a single-character extension against the rule that
+ * belongs to its singleton. The singleton is compared case-insensitively:
+ * 'u', 't' and 'x' each have a dedicated checker, every other character
+ * falls back to the generic extension-subtags check.
+ */
+static bool _isExtensionSubtags(char key, const char* s, int32_t len) {
+    const char singleton = uprv_tolower(key);
+    if (singleton == 'u') {
+        return ultag_isUnicodeExtensionSubtags(s, len);
+    }
+    if (singleton == 't') {
+        return ultag_isTransformedExtensionSubtags(s, len);
+    }
+    if (singleton == 'x') {
+        return ultag_isPrivateuseValueSubtags(s, len);
+    }
+    return ultag_isExtensionSubtags(s, len);
+}
+
+// Creates an empty builder: no pending error, empty language/script/region
+// buffers, and neither a variant nor an extensions locale allocated.
+LocaleBuilder::LocaleBuilder()
+    : UObject(),
+      status_(U_ZERO_ERROR),
+      language_(),
+      script_(),
+      region_(),
+      variant_(nullptr),
+      extensions_(nullptr)
+{
+    // Explicitly terminate each fixed-size subtag buffer.
+    language_[0] = '\0';
+    script_[0] = '\0';
+    region_[0] = '\0';
+}
+
+// Frees the two heap-owned members; either pointer may be null.
+LocaleBuilder::~LocaleBuilder()
+{
+    delete extensions_;
+    delete variant_;
+}
+
+/**
+ * Replaces the complete builder state with the fields of `locale`.
+ * clear() runs first, so any previously recorded error is dropped. The
+ * language, script, region and variant then go through the normal setters
+ * (which validate them), and a clone of `locale` is kept in extensions_.
+ * A failed clone records U_MEMORY_ALLOCATION_ERROR in status_.
+ */
+LocaleBuilder& LocaleBuilder::setLocale(const Locale& locale)
+{
+    clear();
+    setLanguage(locale.getLanguage());
+    setScript(locale.getScript());
+    setRegion(locale.getCountry());
+    setVariant(locale.getVariant());
+
+    Locale* copy = locale.clone();
+    if (copy == nullptr) {
+        status_ = U_MEMORY_ALLOCATION_ERROR;
+    }
+    extensions_ = copy;
+    return *this;
+}
+
+/**
+ * Parses `tag` with Locale::forLanguageTag() and, on success, loads the
+ * resulting locale into the builder. A parse failure is left in status_.
+ */
+LocaleBuilder& LocaleBuilder::setLanguageTag(StringPiece tag)
+{
+    Locale parsed = Locale::forLanguageTag(tag, status_);
+    // setLocale() begins with clear(), which resets status_. It must
+    // therefore only run when parsing succeeded, otherwise the parse
+    // error would be silently lost.
+    if (U_SUCCESS(status_)) {
+        setLocale(parsed);
+    }
+    return *this;
+}
+
+/**
+ * Shared implementation of the language/script/region setters.
+ *
+ * @param input     Requested subtag; empty means "clear the field".
+ * @param dest      Destination buffer; receives a NUL-terminated copy.
+ * @param errorCode In/out status. Nothing happens if it already holds a
+ *                  failure; set to U_ILLEGAL_ARGUMENT_ERROR when `test`
+ *                  rejects the input (dest is then left untouched).
+ * @param test      Well-formedness check for this kind of subtag. The copy
+ *                  has no separate bounds check, so it relies on `test`
+ *                  to reject input that would not fit into `dest`.
+ */
+static void setField(StringPiece input, char* dest, UErrorCode& errorCode,
+                     UBool (*test)(const char*, int32_t)) {
+    if (U_FAILURE(errorCode)) { return; }
+    if (input.empty()) {
+        dest[0] = '\0';
+        return;
+    }
+    if (!test(input.data(), input.length())) {
+        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+    uprv_memcpy(dest, input.data(), input.length());
+    dest[input.length()] = '\0';
+}
+
+// Stores the language subtag. An empty piece clears it; a value rejected by
+// ultag_isLanguageSubtag records U_ILLEGAL_ARGUMENT_ERROR in status_.
+LocaleBuilder& LocaleBuilder::setLanguage(StringPiece language)
+{
+    setField(language, language_, status_, ultag_isLanguageSubtag);
+    return *this;
+}
+
+// Stores the script subtag. An empty piece clears it; a value rejected by
+// ultag_isScriptSubtag records U_ILLEGAL_ARGUMENT_ERROR in status_.
+LocaleBuilder& LocaleBuilder::setScript(StringPiece script)
+{
+    setField(script, script_, status_, ultag_isScriptSubtag);
+    return *this;
+}
+
+// Stores the region subtag. An empty piece clears it; a value rejected by
+// ultag_isRegionSubtag records U_ILLEGAL_ARGUMENT_ERROR in status_.
+LocaleBuilder& LocaleBuilder::setRegion(StringPiece region)
+{
+    setField(region, region_, status_, ultag_isRegionSubtag);
+    return *this;
+}
+
+// Normalizes `len` characters of `data` in place: every '_' becomes '-',
+// every other character is passed through uprv_tolower().
+static void transform(char* data, int32_t len) {
+    for (int32_t i = 0; i < len; ++i) {
+        const char c = data[i];
+        data[i] = (c == '_') ? '-' : uprv_tolower(c);
+    }
+}
+
+/**
+ * Sets the variant subtag(s).
+ *
+ * An empty piece removes the current variant. Otherwise the input is copied,
+ * normalized with transform() ('_' -> '-', lower-cased) and validated with
+ * ultag_isVariantSubtags(). On any failure the previously stored variant is
+ * kept and status_ holds the error (U_MEMORY_ALLOCATION_ERROR or
+ * U_ILLEGAL_ARGUMENT_ERROR, or whatever the CharString copy reported).
+ * Does nothing if status_ already holds a failure.
+ */
+LocaleBuilder& LocaleBuilder::setVariant(StringPiece variant)
+{
+    if (U_FAILURE(status_)) { return *this; }
+    if (variant.empty()) {
+        delete variant_;
+        variant_ = nullptr;
+        return *this;
+    }
+    CharString* new_variant = new CharString(variant, status_);
+    if (new_variant == nullptr) {
+        status_ = U_MEMORY_ALLOCATION_ERROR;
+        return *this;
+    }
+    if (U_FAILURE(status_)) {
+        // The object was allocated but copying the text into it failed;
+        // release it here so the early return does not leak it.
+        delete new_variant;
+        return *this;
+    }
+    transform(new_variant->data(), new_variant->length());
+    if (!ultag_isVariantSubtags(new_variant->data(), new_variant->length())) {
+        delete new_variant;
+        status_ = U_ILLEGAL_ARGUMENT_ERROR;
+        return *this;
+    }
+    // Only replace the old variant once the new one is known to be valid.
+    delete variant_;
+    variant_ = new_variant;
+    return *this;
+}
+
+/**
+ * Decides whether `value` is acceptable for the locale keyword `key`.
+ *
+ *  - one-character key: the key must be alphanumeric and the value must
+ *    satisfy the rule for that extension singleton;
+ *  - the attribute key: the value must be a list of Unicode locale
+ *    attributes;
+ *  - anything else: key and value are converted from the legacy form to
+ *    the Unicode form, and both conversions must succeed and be well-formed.
+ */
+static bool
+_isKeywordValue(const char* key, const char* value, int32_t value_len)
+{
+    if (key[1] == '\0') {
+        // one char key
+        return (UPRV_ISALPHANUM(uprv_tolower(key[0])) &&
+                _isExtensionSubtags(key[0], value, value_len));
+    }
+    if (uprv_strcmp(key, kAttributeKey) == 0) {
+        // unicode attributes
+        return ultag_isUnicodeLocaleAttributes(value, value_len);
+    }
+    // otherwise: unicode extension value.
+    // Convert from legacy key/value to unicode key/value before checking.
+    const char* unicode_locale_key = uloc_toUnicodeLocaleKey(key);
+    const char* unicode_locale_type = uloc_toUnicodeLocaleType(key, value);
+    if (unicode_locale_key == nullptr || unicode_locale_type == nullptr) {
+        return false;
+    }
+    return ultag_isUnicodeLocaleKey(unicode_locale_key, -1) &&
+           ultag_isUnicodeLocaleType(unicode_locale_type, -1);
+}
+
+/**
+ * Copies every keyword of `from` onto `to`.
+ *
+ * The value of the attribute keyword is normalized with transform() before
+ * it is copied. When `validate` is true each key/value pair must pass
+ * _isKeywordValue(); the first one that does not stops the copy with
+ * U_ILLEGAL_ARGUMENT_ERROR. Any other failure also stops the copy and is
+ * left in `errorCode`. Keywords copied before the failure remain on `to`.
+ */
+static void
+_copyExtensions(const Locale& from, Locale* to, bool validate, UErrorCode& errorCode)
+{
+    if (U_FAILURE(errorCode)) { return; }
+    LocalPointer<icu::StringEnumeration> keywords(from.createKeywords(errorCode));
+    if (U_FAILURE(errorCode) || keywords.isNull()) { return; }
+    for (const char* key = keywords->next(nullptr, errorCode);
+         key != nullptr;
+         key = keywords->next(nullptr, errorCode)) {
+        CharString value;
+        CharStringByteSink sink(&value);
+        from.getKeywordValue(key, sink, errorCode);
+        if (U_FAILURE(errorCode)) { return; }
+
+        const bool isAttributeKey = (uprv_strcmp(key, kAttributeKey) == 0);
+        if (isAttributeKey) {
+            transform(value.data(), value.length());
+        }
+        if (validate &&
+            !_isKeywordValue(key, value.data(), value.length())) {
+            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+            return;
+        }
+        to->setKeywordValue(key, value.data(), errorCode);
+        if (U_FAILURE(errorCode)) { return; }
+    }
+}
+
+/**
+ * Removes everything that belongs to the 'u' extension from `locale`:
+ * first the attribute keyword, then every Unicode keyword it enumerates.
+ */
+static void
+_clearUAttributesAndKeyType(Locale* locale, UErrorCode& errorCode)
+{
+    // Clear Unicode attributes
+    locale->setKeywordValue(kAttributeKey, "", errorCode);
+
+    // Clear all Unicode keyword values
+    LocalPointer<icu::StringEnumeration> unicodeKeys(locale->createUnicodeKeywords(errorCode));
+    if (U_FAILURE(errorCode) || unicodeKeys.isNull()) { return; }
+    for (const char* key = unicodeKeys->next(nullptr, errorCode);
+         key != nullptr;
+         key = unicodeKeys->next(nullptr, errorCode)) {
+        locale->setUnicodeKeywordValue(key, nullptr, errorCode);
+    }
+}
+
+/**
+ * Adds the Unicode extension subtags in `value` to `locale`.
+ * The subtags are appended to "und-u-", the resulting language tag is
+ * parsed, and the keywords of the parsed locale are copied over without
+ * further validation.
+ */
+static void
+_setUnicodeExtensions(Locale* locale, const CharString& value, UErrorCode& errorCode)
+{
+    CharString tag("und-u-", errorCode);
+    tag.append(value, errorCode);
+    const Locale parsed = Locale::forLanguageTag(tag.data(), errorCode);
+    _copyExtensions(parsed, locale, false, errorCode);
+}
+
+/**
+ * Sets the extension identified by the single character `key`.
+ *
+ * `key` must be an ASCII letter or digit. The value is copied, normalized
+ * with transform() and, unless empty, validated against the rule for that
+ * singleton; violations record U_ILLEGAL_ARGUMENT_ERROR in status_.
+ *
+ * For the 'u' singleton (either case) all existing Unicode attributes and
+ * keywords are cleared first and then replaced by the ones in the value,
+ * if any. For every other singleton the value is stored as the keyword
+ * value of the one-character key.
+ * Does nothing if status_ already holds a failure.
+ */
+LocaleBuilder& LocaleBuilder::setExtension(char key, StringPiece value)
+{
+    if (U_FAILURE(status_)) { return *this; }
+    if (!UPRV_ISALPHANUM(key)) {
+        status_ = U_ILLEGAL_ARGUMENT_ERROR;
+        return *this;
+    }
+    CharString value_str(value, status_);
+    if (U_FAILURE(status_)) { return *this; }
+    transform(value_str.data(), value_str.length());
+    if (!value_str.isEmpty() &&
+        !_isExtensionSubtags(key, value_str.data(), value_str.length())) {
+        status_ = U_ILLEGAL_ARGUMENT_ERROR;
+        return *this;
+    }
+    // Lazily create the locale that carries the extensions.
+    if (extensions_ == nullptr) {
+        extensions_ = new Locale();
+        if (extensions_ == nullptr) {
+            status_ = U_MEMORY_ALLOCATION_ERROR;
+            return *this;
+        }
+    }
+    if (uprv_tolower(key) == 'u') {
+        _clearUAttributesAndKeyType(extensions_, status_);
+        if (U_SUCCESS(status_) && !value.empty()) {
+            _setUnicodeExtensions(extensions_, value_str, status_);
+        }
+    } else {
+        // for t, x and others extension.
+        extensions_->setKeywordValue(StringPiece(&key, 1), value_str.data(),
+                                     status_);
+    }
+    return *this;
+}
+
+/**
+ * Sets one Unicode locale keyword on the extensions locale.
+ * `key` must be a well-formed Unicode locale key, and `type` must either
+ * be empty or a well-formed Unicode locale type; otherwise
+ * U_ILLEGAL_ARGUMENT_ERROR is recorded in status_.
+ * Does nothing if status_ already holds a failure.
+ */
+LocaleBuilder& LocaleBuilder::setUnicodeLocaleKeyword(
+    StringPiece key, StringPiece type)
+{
+    if (U_FAILURE(status_)) { return *this; }
+    const bool wellFormed =
+        ultag_isUnicodeLocaleKey(key.data(), key.length()) &&
+        (type.empty() ||
+         ultag_isUnicodeLocaleType(type.data(), type.length()));
+    if (!wellFormed) {
+        status_ = U_ILLEGAL_ARGUMENT_ERROR;
+        return *this;
+    }
+    // Lazily create the locale that carries the extensions.
+    if (extensions_ == nullptr) {
+        extensions_ = new Locale();
+        if (extensions_ == nullptr) {
+            status_ = U_MEMORY_ALLOCATION_ERROR;
+            return *this;
+        }
+    }
+    extensions_->setUnicodeKeywordValue(key, type, status_);
+    return *this;
+}
+
+/**
+ * Adds one Unicode locale attribute to the extensions.
+ *
+ * The value is normalized with transform() and must be a single well-formed
+ * attribute, otherwise U_ILLEGAL_ARGUMENT_ERROR is recorded in status_.
+ * If the attribute is already present nothing changes. Otherwise it is
+ * inserted in front of the first existing attribute that compares greater
+ * (so an ordered list stays ordered), and the list is written back with
+ * '_' between attributes.
+ */
+LocaleBuilder& LocaleBuilder::addUnicodeLocaleAttribute(
+    StringPiece value)
+{
+    CharString value_str(value, status_);
+    if (U_FAILURE(status_)) { return *this; }
+    transform(value_str.data(), value_str.length());
+    if (!ultag_isUnicodeLocaleAttribute(value_str.data(), value_str.length())) {
+        status_ = U_ILLEGAL_ARGUMENT_ERROR;
+        return *this;
+    }
+    if (extensions_ == nullptr) {
+        extensions_ = new Locale();
+        if (extensions_ == nullptr) {
+            status_ = U_MEMORY_ALLOCATION_ERROR;
+            return *this;
+        }
+        // Fresh extensions locale: the new attribute is the whole list.
+        extensions_->setKeywordValue(kAttributeKey, value_str.data(), status_);
+        return *this;
+    }
+
+    CharString attributes;
+    CharStringByteSink sink(&attributes);
+    UErrorCode localErrorCode = U_ZERO_ERROR;
+    extensions_->getKeywordValue(kAttributeKey, sink, localErrorCode);
+    if (U_FAILURE(localErrorCode)) {
+        CharString new_attributes(value_str.data(), status_);
+        // No attributes, set the attribute.
+        extensions_->setKeywordValue(kAttributeKey, new_attributes.data(), status_);
+        return *this;
+    }
+
+    // Split the existing list in place: lower-case every character and
+    // overwrite each '_' or '-' separator with a NUL. The walk below uses
+    // uprv_strcmp()/uprv_strlen() on one attribute at a time, which only
+    // works on NUL-terminated pieces. Merely normalizing the separators
+    // would make it compare the new attribute against the whole list, so
+    // duplicates and the insertion point would be missed as soon as the
+    // list holds more than one attribute.
+    char* p = attributes.data();
+    for (int32_t i = 0; i < attributes.length(); i++, p++) {
+        *p = (*p == '_' || *p == '-') ? '\0' : uprv_tolower(*p);
+    }
+
+    const char* start = attributes.data();
+    const char* limit = attributes.data() + attributes.length();
+    CharString new_attributes;
+    bool inserted = false;
+    while (start < limit) {
+        if (!inserted) {
+            int cmp = uprv_strcmp(start, value_str.data());
+            if (cmp == 0) { return *this; }  // Found it in attributes: Just return
+            if (cmp > 0) {
+                // First existing attribute greater than the new one:
+                // the new attribute goes right in front of it.
+                if (!new_attributes.isEmpty()) {
+                    new_attributes.append('_', status_);
+                }
+                new_attributes.append(value_str.data(), status_);
+                inserted = true;
+            }
+        }
+        if (!new_attributes.isEmpty()) {
+            new_attributes.append('_', status_);
+        }
+        new_attributes.append(start, status_);
+        // Step over this attribute and the NUL that terminates it.
+        start += uprv_strlen(start) + 1;
+    }
+    if (!inserted) {
+        // Greater than everything already present (or the list was empty).
+        if (!new_attributes.isEmpty()) {
+            new_attributes.append('_', status_);
+        }
+        new_attributes.append(value_str.data(), status_);
+    }
+    // Not yet in the attributes, set the attribute.
+    extensions_->setKeywordValue(kAttributeKey, new_attributes.data(), status_);
+    return *this;
+}
+
+/**
+ * Removes one Unicode locale attribute from the extensions.
+ *
+ * The value is normalized with transform() and must be a single well-formed
+ * attribute, otherwise U_ILLEGAL_ARGUMENT_ERROR is recorded in status_.
+ * Nothing happens when there are no extensions, when the attribute list
+ * cannot be read or is empty, or when the attribute is not in the list.
+ * Otherwise the list is written back without it, joined with '_'.
+ */
+LocaleBuilder& LocaleBuilder::removeUnicodeLocaleAttribute(
+    StringPiece value)
+{
+    CharString value_str(value, status_);
+    if (U_FAILURE(status_)) { return *this; }
+    transform(value_str.data(), value_str.length());
+    if (!ultag_isUnicodeLocaleAttribute(value_str.data(), value_str.length())) {
+        status_ = U_ILLEGAL_ARGUMENT_ERROR;
+        return *this;
+    }
+    if (extensions_ == nullptr) { return *this; }
+
+    UErrorCode localErrorCode = U_ZERO_ERROR;
+    CharString attributes;
+    CharStringByteSink sink(&attributes);
+    extensions_->getKeywordValue(kAttributeKey, sink, localErrorCode);
+    // Reading failed or there are no attributes at all: nothing to remove.
+    if (U_FAILURE(localErrorCode) || attributes.isEmpty()) { return *this; }
+
+    // Overwrite each '_' or '-' separator with a null terminator (and
+    // lower-case everything else) in place, so that each attribute can be
+    // compared with uprv_strcmp below.
+    char* chars = attributes.data();
+    const int32_t count = attributes.length();
+    for (int32_t i = 0; i < count; i++) {
+        const char c = chars[i];
+        chars[i] = (c == '_' || c == '-') ? '\0' : uprv_tolower(c);
+    }
+
+    const char* const limit = attributes.data() + attributes.length();
+    CharString new_attributes;
+    bool found = false;
+    for (const char* token = attributes.data();
+         token < limit;
+         token += uprv_strlen(token) + 1) {
+        if (uprv_strcmp(token, value_str.data()) == 0) {
+            // This is the attribute to drop: do not copy it over.
+            found = true;
+            continue;
+        }
+        if (!new_attributes.isEmpty()) {
+            new_attributes.append('_', status_);
+        }
+        new_attributes.append(token, status_);
+    }
+    // Only touch the locale when the value was actually in the list.
+    if (found) {
+        extensions_->setKeywordValue(kAttributeKey, new_attributes.data(), status_);
+    }
+    return *this;
+}
+
+// Resets the builder to its freshly constructed state: the recorded error
+// is dropped, language/script/region are emptied, and the variant and the
+// extensions are released.
+LocaleBuilder& LocaleBuilder::clear()
+{
+    status_ = U_ZERO_ERROR;
+    language_[0] = script_[0] = region_[0] = '\0';
+    delete variant_;
+    variant_ = nullptr;
+    clearExtensions();
+    return *this;
+}
+
+// Drops all extensions by releasing the locale that carries them.
+// status_ and the other fields are left as they are.
+LocaleBuilder& LocaleBuilder::clearExtensions()
+{
+    delete extensions_;
+    extensions_ = nullptr;
+    return *this;
+}
+
+// Returns a default-constructed Locale that has been switched to the bogus
+// state; build() returns it on every failure path. Like the other helpers
+// in this file it is only used here, so it is given internal linkage
+// instead of adding a symbol to the icu namespace.
+static Locale makeBogusLocale() {
+    Locale bogus;
+    bogus.setToBogus();
+    return bogus;
+}
+
+/**
+ * Builds a Locale from the current state.
+ *
+ * Returns a bogus Locale when `errorCode` already holds a failure, when an
+ * earlier setter recorded an error (which is then copied to `errorCode`),
+ * or when assembling the locale fails. Otherwise the non-empty language,
+ * script, region and variant fields are joined with '-', a Locale is
+ * constructed from that string, and the stored extensions are copied onto
+ * it with validation.
+ */
+Locale LocaleBuilder::build(UErrorCode& errorCode)
+{
+    if (U_FAILURE(errorCode)) {
+        return makeBogusLocale();
+    }
+    if (U_FAILURE(status_)) {
+        errorCode = status_;
+        return makeBogusLocale();
+    }
+
+    CharString locale_str(language_, errorCode);
+    // Appends "-<subtag>" to the identifier under construction.
+    auto appendSubtag = [&locale_str, &errorCode](const char* subtag) {
+        locale_str.append('-', errorCode).append(StringPiece(subtag), errorCode);
+    };
+    if (script_[0] != '\0') {
+        appendSubtag(script_);
+    }
+    if (region_[0] != '\0') {
+        appendSubtag(region_);
+    }
+    if (variant_ != nullptr) {
+        appendSubtag(variant_->data());
+    }
+    if (U_FAILURE(errorCode)) {
+        return makeBogusLocale();
+    }
+
+    Locale product(locale_str.data());
+    if (extensions_ != nullptr) {
+        _copyExtensions(*extensions_, &product, true, errorCode);
+        if (U_FAILURE(errorCode)) {
+            return makeBogusLocale();
+        }
+    }
+    return product;
+}
+
+U_NAMESPACE_END
}
static UBool
-_isLanguageSubtag(const char* s, int32_t len) {
+_isAlphaNumericStringLimitedLength(const char* s, int32_t len, int32_t min, int32_t max) {
+ if (len < 0) {
+ len = (int32_t)uprv_strlen(s);
+ }
+ if (len >= min && len <= max && _isAlphaNumericString(s, len)) {
+ return TRUE;
+ }
+ return FALSE;
+}
+
+U_CFUNC UBool
+ultag_isLanguageSubtag(const char* s, int32_t len) {
/*
- * language = 2*3ALPHA ; shortest ISO 639 code
- * ["-" extlang] ; sometimes followed by
- * ; extended language subtags
- * / 4ALPHA ; or reserved for future use
- * / 5*8ALPHA ; or registered language subtag
+ * unicode_language_subtag = alpha{2,3} | alpha{5,8};
+ * NOTE: Per ICUTC 2019/01/23- accepting alpha 4
+ * See ICU-20372
*/
if (len < 0) {
len = (int32_t)uprv_strlen(s);
return FALSE;
}
-static UBool
-_isScriptSubtag(const char* s, int32_t len) {
+U_CFUNC UBool
+ultag_isScriptSubtag(const char* s, int32_t len) {
/*
* script = 4ALPHA ; ISO 15924 code
*/
return FALSE;
}
-static UBool
-_isRegionSubtag(const char* s, int32_t len) {
+U_CFUNC UBool
+ultag_isRegionSubtag(const char* s, int32_t len) {
/*
* region = 2ALPHA ; ISO 3166-1 code
* / 3DIGIT ; UN M.49 code
if (len < 0) {
len = (int32_t)uprv_strlen(s);
}
- if (len >= 5 && len <= 8 && _isAlphaNumericString(s, len)) {
+ if (_isAlphaNumericStringLimitedLength(s, len, 5, 8)) {
return TRUE;
}
if (len == 4 && ISNUMERIC(*s) && _isAlphaNumericString(s + 1, 3)) {
return FALSE;
}
+/*
+ * Returns TRUE when s (len characters, or NUL-terminated when len < 0) is
+ * a non-empty list of subtags separated by SEP in which every subtag is
+ * accepted by test. An empty input, a leading or trailing SEP, and two
+ * adjacent SEPs all give FALSE, because each produces an empty subtag.
+ */
+static UBool
+_isSepListOf(UBool (*test)(const char*, int32_t), const char* s, int32_t len) {
+    const char *subtagStart = NULL;   /* NULL while between subtags */
+    int32_t i;
+
+    if (len < 0) {
+        len = (int32_t)uprv_strlen(s);
+    }
+
+    for (i = 0; i < len; i++) {
+        const char *cur = s + i;
+        if (*cur != SEP) {
+            if (subtagStart == NULL) {
+                subtagStart = cur;
+            }
+            continue;
+        }
+        /* A separator closes the current subtag, which must exist and be valid. */
+        if (subtagStart == NULL) {
+            return FALSE;
+        }
+        if (!test(subtagStart, (int32_t)(cur - subtagStart))) {
+            return FALSE;
+        }
+        subtagStart = NULL;
+    }
+    if (subtagStart == NULL) {
+        return FALSE;
+    }
+    /* The last subtag runs to the end of the input. */
+    return test(subtagStart, (int32_t)((s + len) - subtagStart));
+}
+
+/* TRUE when s is a SEP-separated list of one or more variant subtags. */
+U_CFUNC UBool
+ultag_isVariantSubtags(const char* s, int32_t len) {
+    return _isSepListOf(_isVariantSubtag, s, len);
+}
+
+// This is for the ICU-specific "lvariant" handling.
static UBool
_isPrivateuseVariantSubtag(const char* s, int32_t len) {
/*
* variant = 1*8alphanum ; registered variants
* / (DIGIT 3alphanum)
*/
- if (len < 0) {
- len = (int32_t)uprv_strlen(s);
- }
- if (len >= 1 && len <= 8 && _isAlphaNumericString(s, len)) {
- return TRUE;
- }
- return FALSE;
+ return _isAlphaNumericStringLimitedLength(s, len , 1, 8);
}
static UBool
/*
* extension = singleton 1*("-" (2*8alphanum))
*/
+ return _isAlphaNumericStringLimitedLength(s, len, 2, 8);
+}
+
+/* TRUE when s is a SEP-separated list of one or more extension subtags. */
+U_CFUNC UBool
+ultag_isExtensionSubtags(const char* s, int32_t len) {
+    return _isSepListOf(_isExtensionSubtag, s, len);
+}
+
+static UBool
+_isPrivateuseValueSubtag(const char* s, int32_t len) {
+    /*
+     * privateuse    = "x" 1*("-" (1*8alphanum))
+     *
+     * A single value subtag is therefore 1 to 8 alphanumeric characters.
+     */
+    const int32_t kMinLength = 1;
+    const int32_t kMaxLength = 8;
+    return _isAlphaNumericStringLimitedLength(s, len, kMinLength, kMaxLength);
+}
+
+/* TRUE when s is a SEP-separated list of one or more private-use value subtags. */
+U_CFUNC UBool
+ultag_isPrivateuseValueSubtags(const char* s, int32_t len) {
+    return _isSepListOf(_isPrivateuseValueSubtag, s, len);
+}
+
+U_CFUNC UBool
+ultag_isUnicodeLocaleAttribute(const char* s, int32_t len) {
+    /*
+     * attribute = alphanum{3,8} ;
+     *
+     * i.e. a single attribute is 3 to 8 alphanumeric characters.
+     */
+    const int32_t kMinLength = 3;
+    const int32_t kMaxLength = 8;
+    return _isAlphaNumericStringLimitedLength(s, len, kMinLength, kMaxLength);
+}
+
+/* TRUE when s is a SEP-separated list of one or more Unicode locale attributes. */
+U_CFUNC UBool
+ultag_isUnicodeLocaleAttributes(const char* s, int32_t len) {
+    return _isSepListOf(ultag_isUnicodeLocaleAttribute, s, len);
+}
+
+U_CFUNC UBool
+ultag_isUnicodeLocaleKey(const char* s, int32_t len) {
+ /*
+ * key = alphanum alpha ;
+ */
if (len < 0) {
len = (int32_t)uprv_strlen(s);
}
- if (len >= 2 && len <= 8 && _isAlphaNumericString(s, len)) {
+ if (len == 2 && (ISALPHA(*s) || ISNUMERIC(*s)) && ISALPHA(s[1])) {
return TRUE;
}
return FALSE;
}
-static UBool
-_isExtensionSubtags(const char* s, int32_t len) {
- const char *p = s;
- const char *pSubtag = NULL;
-
- if (len < 0) {
- len = (int32_t)uprv_strlen(s);
- }
+U_CFUNC UBool
+_isUnicodeLocaleTypeSubtag(const char*s, int32_t len) {
+    /*
+     * alphanum{3,8}
+     *
+     * One subtag of a Unicode locale type: 3 to 8 alphanumeric characters.
+     */
+    const int32_t kMinLength = 3;
+    const int32_t kMaxLength = 8;
+    return _isAlphaNumericStringLimitedLength(s, len, kMinLength, kMaxLength);
+}
- while ((p - s) < len) {
- if (*p == SEP) {
- if (pSubtag == NULL) {
- return FALSE;
- }
- if (!_isExtensionSubtag(pSubtag, (int32_t)(p - pSubtag))) {
- return FALSE;
- }
- pSubtag = NULL;
- } else if (pSubtag == NULL) {
- pSubtag = p;
- }
- p++;
- }
- if (pSubtag == NULL) {
- return FALSE;
- }
- return _isExtensionSubtag(pSubtag, (int32_t)(p - pSubtag));
+U_CFUNC UBool
+ultag_isUnicodeLocaleType(const char*s, int32_t len) {
+ /*
+ * type = alphanum{3,8} (sep alphanum{3,8})* ;
+ */
+ return _isSepListOf(&_isUnicodeLocaleTypeSubtag, s, len);
}
static UBool
-_isPrivateuseValueSubtag(const char* s, int32_t len) {
+_isTKey(const char* s, int32_t len)
+{
/*
- * privateuse = "x" 1*("-" (1*8alphanum))
+ * tkey = alpha digit ;
*/
if (len < 0) {
len = (int32_t)uprv_strlen(s);
}
- if (len >= 1 && len <= 8 && _isAlphaNumericString(s, len)) {
+ if (len == 2 && ISALPHA(*s) && ISNUMERIC(*(s + 1))) {
return TRUE;
}
return FALSE;
}
static UBool
-_isPrivateuseValueSubtags(const char* s, int32_t len) {
- const char *p = s;
- const char *pSubtag = NULL;
-
- if (len < 0) {
- len = (int32_t)uprv_strlen(s);
- }
+_isTValue(const char* s, int32_t len)
+{
+ /*
+ * tvalue = (sep alphanum{3,8})+ ;
+ */
+ return _isAlphaNumericStringLimitedLength(s, len , 3, 8);
+}
- while ((p - s) < len) {
- if (*p == SEP) {
- if (pSubtag == NULL) {
- return FALSE;
+static UBool
+_isTransformedExtensionSubtag(int32_t& state, const char* s, int32_t len)
+{
+ const int32_t kStart = 0; // Start, wait for unicode_language_subtag, tkey or end
+ const int32_t kGotLanguage = 1; // Got unicode_language_subtag, wait for unicode_script_subtag,
+ // unicode_region_subtag, unicode_variant_subtag, tkey or end
+ const int32_t kGotScript = 2; // Got unicode_script_subtag, wait for unicode_region_subtag,
+ // unicode_variant_subtag, tkey, or end
+ const int32_t kGotRegion = 3; // Got unicode_region_subtag, wait for unicode_variant_subtag,
+ // tkey, or end.
+ const int32_t kGotVariant = 4; // Got unicode_variant_subtag, wait for unicode_variant_subtag
+ // tkey or end.
+ const int32_t kGotTKey = -1; // Got tkey, wait for tvalue. ERROR if stop here.
+ const int32_t kGotTValue = 6; // Got tvalue, wait for tkey, tvalue or end
+
+ switch (state) {
+ case kStart:
+ if (ultag_isLanguageSubtag(s, len)) {
+ state = kGotLanguage;
+ return TRUE;
+ }
+ if (_isTKey(s, len)) {
+ state = kGotTKey;
+ return TRUE;
}
- if (!_isPrivateuseValueSubtag(pSubtag, (int32_t)(p - pSubtag))) {
- return FALSE;
+ return FALSE;
+ case kGotLanguage:
+ if (ultag_isScriptSubtag(s, len)) {
+ state = kGotScript;
+ return TRUE;
+ }
+ U_FALLTHROUGH;
+ case kGotScript:
+ if (ultag_isRegionSubtag(s, len)) {
+ state = kGotRegion;
+ return TRUE;
+ }
+ U_FALLTHROUGH;
+ case kGotRegion:
+ U_FALLTHROUGH;
+ case kGotVariant:
+ if (_isVariantSubtag(s, len)) {
+ state = kGotVariant;
+ return TRUE;
+ }
+ if (_isTKey(s, len)) {
+ state = kGotTKey;
+ return TRUE;
}
- pSubtag = NULL;
- } else if (pSubtag == NULL) {
- pSubtag = p;
- }
- p++;
- }
- if (pSubtag == NULL) {
- return FALSE;
+ return FALSE;
+ case kGotTKey:
+ if (_isTValue(s, len)) {
+ state = kGotTValue;
+ return TRUE;
+ }
+ return FALSE;
+ case kGotTValue:
+ if (_isTKey(s, len)) {
+ state = kGotTKey;
+ return TRUE;
+ }
+ if (_isTValue(s, len)) {
+ return TRUE;
+ }
+ return FALSE;
}
- return _isPrivateuseValueSubtag(pSubtag, (int32_t)(p - pSubtag));
+ return FALSE;
}
-U_CFUNC UBool
-ultag_isUnicodeLocaleKey(const char* s, int32_t len) {
- if (len < 0) {
- len = (int32_t)uprv_strlen(s);
- }
- if (len == 2 && (ISALPHA(*s) || ISNUMERIC(*s)) && ISALPHA(s[1])) {
- return TRUE;
+static UBool
+_isUnicodeExtensionSubtag(int32_t& state, const char* s, int32_t len)
+{
+ const int32_t kStart = 0; // Start, wait for a key or attribute or end
+ const int32_t kGotKey = 1; // Got a key, wait for type or key or end
+ const int32_t kGotType = 2; // Got a type, wait for key or end
+
+ switch (state) {
+ case kStart:
+ if (ultag_isUnicodeLocaleKey(s, len)) {
+ state = kGotKey;
+ return TRUE;
+ }
+ if (ultag_isUnicodeLocaleAttribute(s, len)) {
+ return TRUE;
+ }
+ return FALSE;
+ case kGotKey:
+ if (ultag_isUnicodeLocaleKey(s, len)) {
+ return TRUE;
+ }
+ if (_isUnicodeLocaleTypeSubtag(s, len)) {
+ state = kGotType;
+ return TRUE;
+ }
+ return FALSE;
+ case kGotType:
+ if (ultag_isUnicodeLocaleKey(s, len)) {
+ state = kGotKey;
+ return TRUE;
+ }
+ if (_isUnicodeLocaleTypeSubtag(s, len)) {
+ return TRUE;
+ }
+ return FALSE;
}
return FALSE;
}
-U_CFUNC UBool
-ultag_isUnicodeLocaleType(const char*s, int32_t len) {
+static UBool
+_isStatefulSepListOf(UBool (*test)(int32_t&, const char*, int32_t), const char* s, int32_t len)
+{
+ int32_t state = 0;
const char* p;
+ const char* start = s;
int32_t subtagLen = 0;
if (len < 0) {
for (p = s; len > 0; p++, len--) {
if (*p == SEP) {
- if (subtagLen < 3) {
+ if (!test(state, start, subtagLen)) {
return FALSE;
}
subtagLen = 0;
- } else if (ISALPHA(*p) || ISNUMERIC(*p)) {
- subtagLen++;
- if (subtagLen > 8) {
- return FALSE;
- }
+ start = p + 1;
} else {
- return FALSE;
+ subtagLen++;
}
}
- return (subtagLen >= 3);
+ if (test(state, start, subtagLen) && state >= 0) {
+ return TRUE;
+ }
+ return FALSE;
+}
+
+U_CFUNC UBool
+ultag_isTransformedExtensionSubtags(const char* s, int32_t len)
+{
+ return _isStatefulSepListOf(&_isTransformedExtensionSubtag, s, len);
}
+
+/*
+ * TRUE when s is a well-formed value for the 'u' extension, checked subtag
+ * by subtag with the _isUnicodeExtensionSubtag state machine.
+ */
+U_CFUNC UBool
+ultag_isUnicodeExtensionSubtags(const char* s, int32_t len) {
+    return _isStatefulSepListOf(_isUnicodeExtensionSubtag, s, len);
+}
+
+
/*
* -------------------------------------------------
*
if (len == 0) {
sink.Append(LANG_UND, LANG_UND_LEN);
- } else if (!_isLanguageSubtag(buf, len)) {
+ } else if (!ultag_isLanguageSubtag(buf, len)) {
/* invalid language code */
if (strict) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
}
if (len > 0) {
- if (!_isScriptSubtag(buf, len)) {
+ if (!ultag_isScriptSubtag(buf, len)) {
/* invalid script code */
if (strict) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
}
if (len > 0) {
- if (!_isRegionSubtag(buf, len)) {
+ if (!ultag_isRegionSubtag(buf, len)) {
/* invalid region code */
if (strict) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
}
} else {
if (*key == PRIVATEUSE) {
- if (!_isPrivateuseValueSubtags(buf.data(), len)) {
+ if (!ultag_isPrivateuseValueSubtags(buf.data(), len)) {
if (strict) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
break;
continue;
}
} else {
- if (!_isExtensionSingleton(key, keylen) || !_isExtensionSubtags(buf.data(), len)) {
+ if (!_isExtensionSingleton(key, keylen) || !ultag_isExtensionSubtags(buf.data(), len)) {
if (strict) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
break;
subtagLen = (int32_t)(pSep - pSubtag);
if (next & LANG) {
- if (_isLanguageSubtag(pSubtag, subtagLen)) {
+ if (ultag_isLanguageSubtag(pSubtag, subtagLen)) {
*pSep = 0; /* terminate */
// TODO: move deprecated language code handling here.
t->language = T_CString_toLowerCase(pSubtag);
}
}
if (next & SCRT) {
- if (_isScriptSubtag(pSubtag, subtagLen)) {
+ if (ultag_isScriptSubtag(pSubtag, subtagLen)) {
char *p = pSubtag;
*pSep = 0;
}
}
if (next & REGN) {
- if (_isRegionSubtag(pSubtag, subtagLen)) {
+ if (ultag_isRegionSubtag(pSubtag, subtagLen)) {
*pSep = 0;
// TODO: move deprecated region code handling here.
t->region = T_CString_toUpperCase(pSubtag);
buf[1] = SEP;
len = uloc_getKeywordValue(localeID, key, &buf[2], sizeof(buf) - 2, &tmpStatus);
if (U_SUCCESS(tmpStatus)) {
- if (_isPrivateuseValueSubtags(&buf[2], len)) {
+ if (ultag_isPrivateuseValueSubtags(&buf[2], len)) {
/* return private use only tag */
sink.Append(buf, len + 2);
done = TRUE;
--- /dev/null
+// © 2018 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include <memory>
+
+#include "cmemory.h"
+#include "cstring.h"
+#include "localebuildertest.h"
+#include "unicode/localebuilder.h"
+#include "unicode/strenum.h"
+
+// Constructor: performs no work of its own.
+LocaleBuilderTest::LocaleBuilderTest() {}
+
+// Destructor: performs no work of its own.
+LocaleBuilderTest::~LocaleBuilderTest() {}
+
+// Test-suite entry point: lists every test method of this class between
+// TESTCASE_AUTO_BEGIN and TESTCASE_AUTO_END, one TESTCASE_AUTO line per test.
+// NOTE(review): the macros are defined by the test framework, not in this
+// file; presumably they use index/exec/name to select and run one test, so
+// the order of the lines below should be treated as significant.
+void LocaleBuilderTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
+{
+ TESTCASE_AUTO_BEGIN;
+ TESTCASE_AUTO(TestAddRemoveUnicodeLocaleAttribute);
+ TESTCASE_AUTO(TestAddRemoveUnicodeLocaleAttributeWellFormed);
+ TESTCASE_AUTO(TestAddUnicodeLocaleAttributeIllFormed);
+ TESTCASE_AUTO(TestLocaleBuilder);
+ TESTCASE_AUTO(TestLocaleBuilderBasic);
+ TESTCASE_AUTO(TestPosixCases);
+ TESTCASE_AUTO(TestSetExtensionOthers);
+ TESTCASE_AUTO(TestSetExtensionPU);
+ TESTCASE_AUTO(TestSetExtensionT);
+ TESTCASE_AUTO(TestSetExtensionU);
+ TESTCASE_AUTO(TestSetExtensionValidateOthersIllFormed);
+ TESTCASE_AUTO(TestSetExtensionValidateOthersWellFormed);
+ TESTCASE_AUTO(TestSetExtensionValidatePUIllFormed);
+ TESTCASE_AUTO(TestSetExtensionValidatePUWellFormed);
+ TESTCASE_AUTO(TestSetExtensionValidateTIllFormed);
+ TESTCASE_AUTO(TestSetExtensionValidateTWellFormed);
+ TESTCASE_AUTO(TestSetExtensionValidateUIllFormed);
+ TESTCASE_AUTO(TestSetExtensionValidateUWellFormed);
+ TESTCASE_AUTO(TestSetLanguageIllFormed);
+ TESTCASE_AUTO(TestSetLanguageWellFormed);
+ TESTCASE_AUTO(TestSetLocale);
+ TESTCASE_AUTO(TestSetRegionIllFormed);
+ TESTCASE_AUTO(TestSetRegionWellFormed);
+ TESTCASE_AUTO(TestSetScriptIllFormed);
+ TESTCASE_AUTO(TestSetScriptWellFormed);
+ TESTCASE_AUTO(TestSetUnicodeLocaleKeywordIllFormedKey);
+ TESTCASE_AUTO(TestSetUnicodeLocaleKeywordIllFormedValue);
+ TESTCASE_AUTO(TestSetUnicodeLocaleKeywordWellFormed);
+ TESTCASE_AUTO(TestSetVariantIllFormed);
+ TESTCASE_AUTO(TestSetVariantWellFormed);
+ TESTCASE_AUTO_END;
+}
+
+void LocaleBuilderTest::Verify(LocaleBuilder& bld, const char* expected, const char* msg) {
+ UErrorCode status = U_ZERO_ERROR;
+ Locale loc = bld.build(status);
+ if (U_FAILURE(status)) {
+ errln(msg, u_errorName(status));
+ }
+ std::string tag = loc.toLanguageTag<std::string>(status);
+ if (U_FAILURE(status)) {
+ errln("loc.toLanguageTag() got Error: %s\n",
+ u_errorName(status));
+ }
+ if (tag != expected) {
+ errln("should get \"%s\", but got \"%s\"\n", expected, tag.c_str());
+ }
+}
+
+// Data-driven test: each row of TESTCASES is a small script of builder
+// operations followed by either "T" (expected language tag and locale name)
+// or "X" (an error is expected from one of the preceding operations).
+// Unused trailing slots of a row are nullptr.
+void LocaleBuilderTest::TestLocaleBuilder() {
+    // The following test data are copied from
+    // icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/LocaleBuilderTest.java
+    // "L": +1 = language
+    // "S": +1 = script
+    // "R": +1 = region
+    // "V": +1 = variant
+    // "K": +1 = Unicode locale key / +2 = Unicode locale type
+    // "A": +1 = Unicode locale attribute
+    // "E": +1 = extension letter / +2 = extension value
+    // "P": +1 = private use
+    // "U": +1 = ULocale
+    // "B": +1 = BCP47 language tag
+    // "C": Clear all
+    // "N": Clear extensions
+    // "D": +1 = Unicode locale attribute to be removed
+    // "X": indicates an error is expected (an exception in the ICU4J original)
+    // "T": +1 = expected language tag / +2 = expected locale string
+    const char* TESTCASES[][14] = {
+        {"L", "en", "R", "us", "T", "en-US", "en_US"},
+        {"L", "en", "R", "CA", "L", nullptr, "T", "und-CA", "_CA"},
+        {"L", "en", "R", "CA", "L", "", "T", "und-CA", "_CA"},
+        {"L", "en", "R", "FR", "L", "fr", "T", "fr-FR", "fr_FR"},
+        {"L", "123", "X"},
+        {"R", "us", "T", "und-US", "_US"},
+        {"R", "usa", "X"},
+        {"R", "123", "L", "it", "R", nullptr, "T", "it", "it"},
+        {"R", "123", "L", "it", "R", "", "T", "it", "it"},
+        {"R", "123", "L", "en", "T", "en-123", "en_123"},
+        {"S", "LATN", "L", "DE", "T", "de-Latn", "de_Latn"},
+        {"L", "De", "S", "latn", "R", "de", "S", "", "T", "de-DE", "de_DE"},
+        {"L", "De", "S", "Arab", "R", "de", "S", nullptr, "T", "de-DE", "de_DE"},
+        {"S", "latin", "X"},
+        {"V", "1234", "L", "en", "T", "en-1234", "en__1234"},
+        {"V", "1234", "L", "en", "V", "5678", "T", "en-5678", "en__5678"},
+        {"V", "1234", "L", "en", "V", nullptr, "T", "en", "en"},
+        {"V", "1234", "L", "en", "V", "", "T", "en", "en"},
+        {"V", "123", "X"},
+        {"U", "en_US", "T", "en-US", "en_US"},
+        {"U", "en_US_WIN", "X"},
+        {"B", "fr-FR-1606nict-u-ca-gregory-x-test", "T",
+         "fr-FR-1606nict-u-ca-gregory-x-test",
+         "fr_FR_1606NICT@calendar=gregorian;x=test"},
+        {"B", "ab-cde-fghij", "T", "cde-fghij", "cde__FGHIJ"},
+        {"B", "und-CA", "T", "und-CA", "_CA"},
+        // Blocked by ICU-20327
+        // {"B", "en-US-x-test-lvariant-var", "T", "en-US-x-test-lvariant-var",
+        // "en_US_VAR@x=test"},
+        {"B", "en-US-VAR", "X"},
+        {"U", "ja_JP@calendar=japanese;currency=JPY", "L", "ko", "T",
+         "ko-JP-u-ca-japanese-cu-jpy", "ko_JP@calendar=japanese;currency=JPY"},
+        {"U", "ja_JP@calendar=japanese;currency=JPY", "K", "ca", nullptr, "T",
+         "ja-JP-u-cu-jpy", "ja_JP@currency=JPY"},
+        {"U", "ja_JP@calendar=japanese;currency=JPY", "E", "u",
+         "attr1-ca-gregory", "T", "ja-JP-u-attr1-ca-gregory",
+         "ja_JP@attribute=attr1;calendar=gregorian"},
+        {"U", "en@colnumeric=yes", "K", "kn", "true", "T", "en-u-kn-true",
+         "en@colnumeric=yes"},
+        {"L", "th", "R", "th", "K", "nu", "thai", "T", "th-TH-u-nu-thai",
+         "th_TH@numbers=thai"},
+        {"U", "zh_Hans", "R", "sg", "K", "ca", "badcalendar", "X"},
+        {"U", "zh_Hans", "R", "sg", "K", "cal", "gregory", "X"},
+        {"E", "z", "ExtZ", "L", "en", "T", "en-z-extz", "en@z=extz"},
+        {"E", "z", "ExtZ", "L", "en", "E", "z", "", "T", "en", "en"},
+        {"E", "z", "ExtZ", "L", "en", "E", "z", nullptr, "T", "en", "en"},
+        {"E", "a", "x", "X"},
+        {"E", "a", "abc_def", "T", "und-a-abc-def", "@a=abc-def"},
+        // Design limitation - typeless u extension keyword 0a below is interpreted as a boolean value true/yes.
+        // With the legacy keyword syntax, "yes" is used for such boolean value instead of "true".
+        // However, once the legacy keyword is translated back to BCP 47 u extension, key "0a" is unknown,
+        // so "yes" is preserved - not mapped to "true". We could change the code to automatically transform
+        // key = alphanum alpha
+        {"L", "en", "E", "u", "bbb-aaa-0a", "T", "en-u-aaa-bbb-0a-yes",
+         "en@0a=yes;attribute=aaa-bbb"},
+        {"L", "fr", "R", "FR", "P", "Yoshito-ICU", "T", "fr-FR-x-yoshito-icu",
+         "fr_FR@x=yoshito-icu"},
+        {"L", "ja", "R", "jp", "K", "ca", "japanese", "T", "ja-JP-u-ca-japanese",
+         "ja_JP@calendar=japanese"},
+        {"K", "co", "PHONEBK", "K", "ca", "gregory", "L", "De", "T",
+         "de-u-ca-gregory-co-phonebk", "de@calendar=gregorian;collation=phonebook"},
+        {"E", "o", "OPQR", "E", "a", "aBcD", "T", "und-a-abcd-o-opqr", "@a=abcd;o=opqr"},
+        {"E", "u", "nu-thai-ca-gregory", "L", "TH", "T", "th-u-ca-gregory-nu-thai",
+         "th@calendar=gregorian;numbers=thai"},
+        {"L", "en", "K", "tz", "usnyc", "R", "US", "T", "en-US-u-tz-usnyc",
+         "en_US@timezone=America/New_York"},
+        {"L", "de", "K", "co", "phonebk", "K", "ks", "level1", "K", "kk",
+         "true", "T", "de-u-co-phonebk-kk-true-ks-level1",
+         "de@collation=phonebook;colnormalization=yes;colstrength=primary"},
+        {"L", "en", "R", "US", "K", "ca", "gregory", "T", "en-US-u-ca-gregory",
+         "en_US@calendar=gregorian"},
+        {"L", "en", "R", "US", "K", "cal", "gregory", "X"},
+        {"L", "en", "R", "US", "K", "ca", "gregorian", "X"},
+        {"L", "en", "R", "US", "K", "kn", "true", "T", "en-US-u-kn-true",
+         "en_US@colnumeric=yes"},
+        {"B", "de-DE-u-co-phonebk", "C", "L", "pt", "T", "pt", "pt"},
+        {"B", "ja-jp-u-ca-japanese", "N", "T", "ja-JP", "ja_JP"},
+        {"B", "es-u-def-abc-co-trad", "A", "hij", "D", "def", "T",
+         "es-u-abc-hij-co-trad", "es@attribute=abc-hij;collation=traditional"},
+        {"B", "es-u-def-abc-co-trad", "A", "hij", "D", "def", "D", "def", "T",
+         "es-u-abc-hij-co-trad", "es@attribute=abc-hij;collation=traditional"},
+        {"L", "en", "A", "aa", "X"},
+        {"B", "fr-u-attr1-cu-eur", "D", "attribute1", "X"},
+    };
+    UErrorCode status = U_ZERO_ERROR;
+    LocaleBuilder bld;
+    for (int tidx = 0; tidx < UPRV_LENGTHOF(TESTCASES); tidx++) {
+        const char* (&testCase)[14] = TESTCASES[tidx];
+        // Render the script for use in failure messages. Rendering stops at
+        // the first nullptr slot, whether that is an explicit nullptr
+        // argument or the unused tail of the row.
+        std::string actions;
+        for (int p = 0; p < UPRV_LENGTHOF(testCase); p++) {
+            if (testCase[p] == nullptr) {
+                actions += " (nullptr)";
+                break;
+            }
+            if (p > 0) actions += " ";
+            actions += testCase[p];
+        }
+        int i = 0;
+        const char* method;
+        status = U_ZERO_ERROR;
+        bld.clear();
+        while (true) {
+            method = testCase[i++];
+            if (method == nullptr) {
+                // The script ran out without a "T" or "X" terminator; do not
+                // hand a null pointer to strcmp().
+                errln("Unknown test case method: There is an error in the test case data.");
+                break;
+            }
+            if (strcmp("L", method) == 0) {
+                bld.setLanguage(testCase[i++]).build(status);
+            } else if (strcmp("S", method) == 0) {
+                bld.setScript(testCase[i++]).build(status);
+            } else if (strcmp("R", method) == 0) {
+                bld.setRegion(testCase[i++]).build(status);
+            } else if (strcmp("V", method) == 0) {
+                bld.setVariant(testCase[i++]).build(status);
+            } else if (strcmp("K", method) == 0) {
+                const char* key = testCase[i++];
+                const char* type = testCase[i++];
+                bld.setUnicodeLocaleKeyword(key, type).build(status);
+            } else if (strcmp("A", method) == 0) {
+                bld.addUnicodeLocaleAttribute(testCase[i++]).build(status);
+            } else if (strcmp("E", method) == 0) {
+                const char* key = testCase[i++];
+                const char* value = testCase[i++];
+                bld.setExtension(key[0], value).build(status);
+            } else if (strcmp("P", method) == 0) {
+                bld.setExtension('x', testCase[i++]).build(status);
+            } else if (strcmp("U", method) == 0) {
+                bld.setLocale(Locale(testCase[i++])).build(status);
+            } else if (strcmp("B", method) == 0) {
+                bld.setLanguageTag(testCase[i++]).build(status);
+            }
+            // clear / remove
+            else if (strcmp("C", method) == 0) {
+                bld.clear().build(status);
+            } else if (strcmp("N", method) == 0) {
+                bld.clearExtensions().build(status);
+            } else if (strcmp("D", method) == 0) {
+                bld.removeUnicodeLocaleAttribute(testCase[i++]).build(status);
+            }
+            // result
+            else if (strcmp("X", method) == 0) {
+                // "X" is only reached when every preceding step succeeded
+                // (a failing step leaves the loop below), so the expected
+                // error never happened.
+                if (U_SUCCESS(status)) {
+                    errln("FAIL: No error return - test case: %s", actions.c_str());
+                }
+                // "X" terminates the script; continuing would read the
+                // nullptr slot that follows it.
+                break;
+            } else if (strcmp("T", method) == 0) {
+                status = U_ZERO_ERROR;
+                Locale loc = bld.build(status);
+                if (U_FAILURE(status) ||
+                    strcmp(loc.getName(), testCase[i + 1]) != 0) {
+                    errln("FAIL: Wrong locale ID - %s %s %s", loc.getName(),
+                          " for test case: ", actions.c_str());
+                }
+                std::string langtag = loc.toLanguageTag<std::string>(status);
+                if (U_FAILURE(status) || langtag != testCase[i]) {
+                    errln("FAIL: Wrong language tag - %s %s %s", langtag.c_str(),
+                          " for test case: ", actions.c_str());
+                }
+                break;
+            } else {
+                // Unknown test method
+                errln("Unknown test case method: There is an error in the test case data.");
+                break;
+            }
+            if (U_FAILURE(status)) {
+                if (strcmp("X", testCase[i]) == 0) {
+                    // This failure is expected
+                    break;
+                } else {
+                    errln("FAIL: U_ILLEGAL_ARGUMENT_ERROR at offset %d %s %s", i,
+                          " in test case: ", actions.c_str());
+                    break;
+                }
+            }
+        }  // while(true)
+    }  // for TESTCASES
+}
+
+// Walks one builder through a sequence of setter calls and, after each step,
+// checks the resulting language tag with Verify(). The final argument of each
+// Verify() call is the message reported if build() fails at that step, so it
+// names the call that was just made.
+void LocaleBuilderTest::TestLocaleBuilderBasic() {
+    LocaleBuilder bld;
+    bld.setLanguage("zh");
+    Verify(bld, "zh", "setLanguage('zh') got Error: %s\n");
+
+    bld.setScript("Hant");
+    Verify(bld, "zh-Hant", "setScript('Hant') got Error: %s\n");
+
+    bld.setRegion("SG");
+    Verify(bld, "zh-Hant-SG", "setRegion('SG') got Error: %s\n");
+
+    bld.setRegion("HK");
+    bld.setScript("Hans");
+    Verify(bld, "zh-Hans-HK",
+           "setRegion('HK') and setScript('Hans') got Error: %s\n");
+
+    bld.setVariant("revised");
+    Verify(bld, "zh-Hans-HK-revised",
+           "setVariant('revised') got Error: %s\n");
+
+    bld.setUnicodeLocaleKeyword("nu", "thai");
+    Verify(bld, "zh-Hans-HK-revised-u-nu-thai",
+           "setUnicodeLocaleKeyword('nu', 'thai') got Error: %s\n");
+
+    bld.setUnicodeLocaleKeyword("co", "pinyin");
+    Verify(bld, "zh-Hans-HK-revised-u-co-pinyin-nu-thai",
+           "setUnicodeLocaleKeyword('co', 'pinyin') got Error: %s\n");
+
+    bld.setUnicodeLocaleKeyword("nu", "latn");
+    Verify(bld, "zh-Hans-HK-revised-u-co-pinyin-nu-latn",
+           "setUnicodeLocaleKeyword('nu', 'latn') got Error: %s\n");
+
+    // A nullptr type removes the keyword.
+    bld.setUnicodeLocaleKeyword("nu", nullptr);
+    Verify(bld, "zh-Hans-HK-revised-u-co-pinyin",
+           "setUnicodeLocaleKeyword('nu', nullptr) got Error: %s\n");
+
+    bld.setUnicodeLocaleKeyword("co", nullptr);
+    Verify(bld, "zh-Hans-HK-revised",
+           "setUnicodeLocaleKeyword('co', nullptr) got Error: %s\n");
+
+    // An empty string clears the corresponding field.
+    bld.setScript("");
+    Verify(bld, "zh-HK-revised",
+           "setScript('') got Error: %s\n");
+
+    bld.setVariant("");
+    Verify(bld, "zh-HK",
+           "setVariant('') got Error: %s\n");
+
+    bld.setRegion("");
+    Verify(bld, "zh",
+           "setRegion('') got Error: %s\n");
+}
+
+// Every entry below must be accepted by setLanguage(): building a fresh
+// LocaleBuilder after the call must not report a failure.
+void LocaleBuilderTest::TestSetLanguageWellFormed() {
+    // http://www.unicode.org/reports/tr35/tr35.html#unicode_language_subtag
+    // unicode_language_subtag = alpha{2,3} | alpha{5,8};
+    // ICUTC decided also support alpha{4}
+    static const char* wellFormedLanguages[] = {
+        "",
+
+        // alpha{2}
+        "en",
+        "NE",
+        "eN",
+        "Ne",
+
+        // alpha{3}
+        "aNe",
+        "zzz",
+        "AAA",
+
+        // alpha{4}
+        "ABCD",
+        "abcd",
+
+        // alpha{5}
+        "efgij",
+        "AbCAD",
+        "ZAASD",
+
+        // alpha{6}
+        "efgijk",
+        "AADGFE",
+        "AkDfFz",
+
+        // alpha{7}
+        "asdfads",
+        "ADSFADF",
+        "piSFkDk",
+
+        // alpha{8}
+        "oieradfz",
+        "IADSFJKR",
+        "kkDSFJkR",
+    };
+    for (int32_t idx = 0; idx < UPRV_LENGTHOF(wellFormedLanguages); ++idx) {
+        const char* candidate = wellFormedLanguages[idx];
+        LocaleBuilder builder;
+        builder.setLanguage(candidate);
+        UErrorCode errorCode = U_ZERO_ERROR;
+        Locale built = builder.build(errorCode);
+        if (U_SUCCESS(errorCode)) {
+            continue;
+        }
+        errln("setLanguage(\"%s\") got Error: %s\n",
+              candidate, u_errorName(errorCode));
+    }
+}
+
+// Every entry below must be rejected by setLanguage(): building a fresh
+// LocaleBuilder after the call must report U_ILLEGAL_ARGUMENT_ERROR.
+void LocaleBuilderTest::TestSetLanguageIllFormed() {
+    static const char* illFormed[] = {
+        "a",
+        "z",
+        "A",
+        "F",
+        "2",
+        "0",
+        // The comma after "9" matters: without it the adjacent literals are
+        // concatenated into the single input "9{".
+        "9",
+        "{",
+        ".",
+        "[",
+        "]",
+        "\\",
+
+        "e1",
+        "N2",
+        "3N",
+        "4e",
+        "e:",
+        "43",
+        "a9",
+
+        "aN0",
+        "z1z",
+        "2zz",
+        "3A3",
+        "456",
+        "af)",
+
+        // Per 2019-01-23 ICUTC, we still accept 4alpha as tlang. see ICU-20321.
+        // "latn",
+        // "Arab",
+        // "LATN",
+
+        "e)gij",
+        "Ab3AD",
+        "ZAAS8",
+
+        "efgi[]",
+        "AA9GFE",
+        "7kD3Fz",
+        "as8fads",
+        "0DSFADF",
+        "'iSFkDk",
+
+        "oieradf+",
+        "IADSFJK-",
+        "kkDSFJk0",
+
+        // alpha{9}
+        "oieradfab",
+        "IADSFJKDE",
+        "kkDSFJkzf",
+    };
+    for (const char* ill : illFormed) {
+        UErrorCode status = U_ZERO_ERROR;
+        LocaleBuilder bld;
+        bld.setLanguage(ill);
+        Locale loc = bld.build(status);
+        if (status != U_ILLEGAL_ARGUMENT_ERROR) {
+            errln("setLanguage(\"%s\") should fail but has no Error\n", ill);
+        }
+    }
+}
+
+// Every entry below must be accepted by setScript(): building a fresh
+// LocaleBuilder after the call must not report a failure.
+void LocaleBuilderTest::TestSetScriptWellFormed() {
+    // http://www.unicode.org/reports/tr35/tr35.html#unicode_script_subtag
+    // unicode_script_subtag = alpha{4} ;
+    static const char* wellFormedScripts[] = {
+        "",
+
+        "Latn",
+        "latn",
+        "lATN",
+        "laTN",
+        "arBN",
+        "ARbn",
+        "adsf",
+        "aADF",
+        "BSVS",
+        "LATn",
+    };
+    for (int32_t idx = 0; idx < UPRV_LENGTHOF(wellFormedScripts); ++idx) {
+        const char* candidate = wellFormedScripts[idx];
+        LocaleBuilder builder;
+        builder.setScript(candidate);
+        UErrorCode errorCode = U_ZERO_ERROR;
+        Locale built = builder.build(errorCode);
+        if (U_SUCCESS(errorCode)) {
+            continue;
+        }
+        errln("setScript(\"%s\") got Error: %s\n",
+              candidate, u_errorName(errorCode));
+    }
+}
+
+// Every entry below must be rejected by setScript(): building a fresh
+// LocaleBuilder after the call must report U_ILLEGAL_ARGUMENT_ERROR.
+void LocaleBuilderTest::TestSetScriptIllFormed() {
+    static const char* illFormed[] = {
+        "a",
+        "z",
+        "A",
+        "F",
+        "2",
+        "0",
+        // The comma after "9" matters: without it the adjacent literals are
+        // concatenated into the single input "9{".
+        "9",
+        "{",
+        ".",
+        "[",
+        "]",
+        "\\",
+
+        "e1",
+        "N2",
+        "3N",
+        "4e",
+        "e:",
+        "43",
+        "a9",
+
+        "aN0",
+        "z1z",
+        "2zz",
+        "3A3",
+        "456",
+        "af)",
+
+        "0atn",
+        "l1tn",
+        "lA2N",
+        "la4N",
+        "arB5",
+        "1234",
+
+        "e)gij",
+        "Ab3AD",
+        "ZAAS8",
+
+        "efgi[]",
+        "AA9GFE",
+        "7kD3Fz",
+
+        "as8fads",
+        "0DSFADF",
+        "'iSFkDk",
+
+        "oieradf+",
+        "IADSFJK-",
+        "kkDSFJk0",
+
+        // alpha{9}
+        "oieradfab",
+        "IADSFJKDE",
+        "kkDSFJkzf",
+    };
+    for (const char* ill : illFormed) {
+        UErrorCode status = U_ZERO_ERROR;
+        LocaleBuilder bld;
+        bld.setScript(ill);
+        Locale loc = bld.build(status);
+        if (status != U_ILLEGAL_ARGUMENT_ERROR) {
+            errln("setScript(\"%s\") should fail but has no Error\n", ill);
+        }
+    }
+}
+
+// Every entry below must be accepted by setRegion(): building a fresh
+// LocaleBuilder after the call must not report a failure.
+void LocaleBuilderTest::TestSetRegionWellFormed() {
+    // http://www.unicode.org/reports/tr35/tr35.html#unicode_region_subtag
+    // unicode_region_subtag = (alpha{2} | digit{3})
+    static const char* wellFormedRegions[] = {
+        "",
+
+        // alpha{2}
+        "en",
+        "NE",
+        "eN",
+        "Ne",
+
+        // digit{3}
+        "000",
+        "999",
+        "123",
+        "987"
+    };
+    for (int32_t idx = 0; idx < UPRV_LENGTHOF(wellFormedRegions); ++idx) {
+        const char* candidate = wellFormedRegions[idx];
+        LocaleBuilder builder;
+        builder.setRegion(candidate);
+        UErrorCode errorCode = U_ZERO_ERROR;
+        Locale built = builder.build(errorCode);
+        if (U_SUCCESS(errorCode)) {
+            continue;
+        }
+        errln("setRegion(\"%s\") got Error: %s\n",
+              candidate, u_errorName(errorCode));
+    }
+}
+
+// Every entry below must be rejected by setRegion(): building a fresh
+// LocaleBuilder after the call must report U_ILLEGAL_ARGUMENT_ERROR.
+void LocaleBuilderTest::TestSetRegionIllFormed() {
+    static const char* illFormed[] = {
+        "a",
+        "z",
+        "A",
+        "F",
+        "2",
+        "0",
+        // The comma after "9" matters: without it the adjacent literals are
+        // concatenated into the single input "9{".
+        "9",
+        "{",
+        ".",
+        "[",
+        "]",
+        "\\",
+
+        "e1",
+        "N2",
+        "3N",
+        "4e",
+        "e:",
+        "43",
+        "a9",
+
+        "aN0",
+        "z1z",
+        "2zz",
+        "3A3",
+        "4.6",
+        "af)",
+
+        "0atn",
+        "l1tn",
+        "lA2N",
+        "la4N",
+        "arB5",
+        "1234",
+
+        "e)gij",
+        "Ab3AD",
+        "ZAAS8",
+
+        "efgi[]",
+        "AA9GFE",
+        "7kD3Fz",
+
+        "as8fads",
+        "0DSFADF",
+        "'iSFkDk",
+
+        "oieradf+",
+        "IADSFJK-",
+        "kkDSFJk0",
+
+        // alpha{9}
+        "oieradfab",
+        "IADSFJKDE",
+        "kkDSFJkzf",
+    };
+    for (const char* ill : illFormed) {
+        UErrorCode status = U_ZERO_ERROR;
+        LocaleBuilder bld;
+        bld.setRegion(ill);
+        Locale loc = bld.build(status);
+        if (status != U_ILLEGAL_ARGUMENT_ERROR) {
+            errln("setRegion(\"%s\") should fail but has no Error\n", ill);
+        }
+    }
+}
+
+// Every entry below must be accepted by setVariant(): building a fresh
+// LocaleBuilder after the call must not report a failure. Both '-' and '_'
+// appear as separators between variant subtags in the data.
+void LocaleBuilderTest::TestSetVariantWellFormed() {
+    // http://www.unicode.org/reports/tr35/tr35.html#unicode_variant_subtag
+    // (sep unicode_variant_subtag)*
+    // unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) ;
+    static const char* wellFormedVariants[] = {
+        "",
+
+        // alphanum{5}
+        "efgij",
+        "AbCAD",
+        "ZAASD",
+        "0AASD",
+        "A1CAD",
+        "ef2ij",
+        "ads3X",
+        "owqF4",
+
+        // alphanum{6}
+        "efgijk",
+        "AADGFE",
+        "AkDfFz",
+        "0ADGFE",
+        "A9DfFz",
+        "AADG7E",
+
+        // alphanum{7}
+        "asdfads",
+        "ADSFADF",
+        "piSFkDk",
+        "a0dfads",
+        "ADSF3DF",
+        "piSFkD9",
+
+        // alphanum{8}
+        "oieradfz",
+        "IADSFJKR",
+        "kkDSFJkR",
+        "0ADSFJKR",
+        "12345679",
+
+        // digit alphanum{3}
+        "0123",
+        "1abc",
+        "20EF",
+        "30EF",
+        "8A03",
+        "3Ax3",
+        "9Axy",
+
+        // (sep unicode_variant_subtag)*
+        "0123-4567",
+        "0ab3-ABCDE",
+        "9ax3-xByD9",
+        "9ax3-xByD9-adfk934a",
+
+        "0123_4567",
+        "0ab3_ABCDE",
+        "9ax3_xByD9",
+        "9ax3_xByD9_adfk934a",
+
+        "9ax3-xByD9_adfk934a",
+        "9ax3_xByD9-adfk934a",
+    };
+    for (int32_t idx = 0; idx < UPRV_LENGTHOF(wellFormedVariants); ++idx) {
+        const char* candidate = wellFormedVariants[idx];
+        LocaleBuilder builder;
+        builder.setVariant(candidate);
+        UErrorCode errorCode = U_ZERO_ERROR;
+        Locale built = builder.build(errorCode);
+        if (U_SUCCESS(errorCode)) {
+            continue;
+        }
+        errln("setVariant(\"%s\") got Error: %s\n",
+              candidate, u_errorName(errorCode));
+    }
+}
+
+// Every entry below must be rejected by setVariant(): building a fresh
+// LocaleBuilder after the call must report U_ILLEGAL_ARGUMENT_ERROR.
+void LocaleBuilderTest::TestSetVariantIllFormed() {
+    static const char* illFormed[] = {
+        "a",
+        "z",
+        "A",
+        "F",
+        "2",
+        "0",
+        // The comma after "9" matters: without it the adjacent literals are
+        // concatenated into the single input "9{".
+        "9",
+        "{",
+        ".",
+        "[",
+        "]",
+        "\\",
+
+        "e1",
+        "N2",
+        "3N",
+        "4e",
+        "e:",
+        "43",
+        "a9",
+        "en",
+        "NE",
+        "eN",
+        "Ne",
+
+        "aNe",
+        "zzz",
+        "AAA",
+        "aN0",
+        "z1z",
+        "2zz",
+        "3A3",
+        "4.6",
+        "af)",
+        "345",
+        "923",
+
+        "Latn",
+        "latn",
+        "lATN",
+        "laTN",
+        "arBN",
+        "ARbn",
+        "adsf",
+        "aADF",
+        "BSVS",
+        "LATn",
+        "l1tn",
+        "lA2N",
+        "la4N",
+        "arB5",
+        "abc3",
+        "A3BC",
+
+        "e)gij",
+        "A+3AD",
+        "ZAA=8",
+
+        "efgi[]",
+        "AA9]FE",
+        "7k[3Fz",
+
+        "as8f/ds",
+        "0DSFAD{",
+        "'iSFkDk",
+
+        "oieradf+",
+        "IADSFJK-",
+        "k}DSFJk0",
+
+        // alpha{9}
+        "oieradfab",
+        "IADSFJKDE",
+        "kkDSFJkzf",
+        "123456789",
+
+        "-0123",
+        "-0123-4567",
+        "0123-4567-",
+        "-123-4567",
+        "_0123",
+        "_0123_4567",
+        "0123_4567_",
+        "_123_4567",
+
+        "-abcde-figjk",
+        "abcde-figjk-",
+        "-abcde-figjk-",
+        "_abcde_figjk",
+        "abcde_figjk_",
+        "_abcde_figjk_",
+    };
+    for (const char* ill : illFormed) {
+        UErrorCode status = U_ZERO_ERROR;
+        LocaleBuilder bld;
+        bld.setVariant(ill);
+        Locale loc = bld.build(status);
+        if (status != U_ILLEGAL_ARGUMENT_ERROR) {
+            errln("setVariant(\"%s\") should fail but has no Error\n", ill);
+        }
+    }
+}
+
+// Each (key, type) pair below must be accepted by setUnicodeLocaleKeyword():
+// building a fresh LocaleBuilder after the call must not report a failure.
+void LocaleBuilderTest::TestSetUnicodeLocaleKeywordWellFormed() {
+    // http://www.unicode.org/reports/tr35/tr35.html#unicode_locale_extensions
+    // keyword = key (sep type)? ;
+    // key = alphanum alpha ;
+    // type = alphanum{3,8} (sep alphanum{3,8})* ;
+    // The array is laid out flat: even slots hold keys, odd slots hold types.
+    static const char* wellFormed_key_value[] = {
+        "aa", "123",
+        "3b", "zyzbcdef",
+        "0Z", "1ZB30zk9-abc",
+        "cZ", "2ck30zfZ-adsf023-234kcZ",
+        "ZZ", "Lant",
+        "ko", "",
+    };
+    for (int pairStart = 0; pairStart < UPRV_LENGTHOF(wellFormed_key_value); pairStart += 2) {
+        const char* key = wellFormed_key_value[pairStart];
+        const char* type = wellFormed_key_value[pairStart + 1];
+        LocaleBuilder builder;
+        builder.setUnicodeLocaleKeyword(key, type);
+        UErrorCode errorCode = U_ZERO_ERROR;
+        Locale built = builder.build(errorCode);
+        if (U_SUCCESS(errorCode)) {
+            continue;
+        }
+        errln("setUnicodeLocaleKeyword(\"%s\", \"%s\") got Error: %s\n",
+              key,
+              type,
+              u_errorName(errorCode));
+    }
+}
+
+// Each key below, paired with the type "abc", must be rejected by
+// setUnicodeLocaleKeyword(): build() must report U_ILLEGAL_ARGUMENT_ERROR.
+void LocaleBuilderTest::TestSetUnicodeLocaleKeywordIllFormedKey() {
+    static const char* illFormed[] = {
+        "34",
+        "ab-cde",
+        "123",
+        "b3",
+        "zyzabcdef",
+        "Z0",
+    };
+    for (int32_t idx = 0; idx < UPRV_LENGTHOF(illFormed); ++idx) {
+        const char* badKey = illFormed[idx];
+        LocaleBuilder builder;
+        builder.setUnicodeLocaleKeyword(badKey, "abc");
+        UErrorCode errorCode = U_ZERO_ERROR;
+        Locale built = builder.build(errorCode);
+        if (errorCode == U_ILLEGAL_ARGUMENT_ERROR) {
+            continue;
+        }
+        errln("setUnicodeLocaleKeyword(\"%s\", \"abc\") should fail but has no Error\n",
+              badKey);
+    }
+}
+
+// Each type below, paired with the key "ab", must be rejected by
+// setUnicodeLocaleKeyword(): build() must report U_ILLEGAL_ARGUMENT_ERROR.
+void LocaleBuilderTest::TestSetUnicodeLocaleKeywordIllFormedValue() {
+    static const char* illFormed[] = {
+        "34",
+        "ab-",
+        "-cd",
+        "-ef-",
+        "zyzabcdef",
+        "ab-abc",
+        "1ZB30zfk9-abc",
+        "2ck30zfk9-adsf023-234kcZ",
+    };
+    for (int32_t idx = 0; idx < UPRV_LENGTHOF(illFormed); ++idx) {
+        const char* badType = illFormed[idx];
+        LocaleBuilder builder;
+        builder.setUnicodeLocaleKeyword("ab", badType);
+        UErrorCode errorCode = U_ZERO_ERROR;
+        Locale built = builder.build(errorCode);
+        if (errorCode == U_ILLEGAL_ARGUMENT_ERROR) {
+            continue;
+        }
+        errln("setUnicodeLocaleKeyword(\"ab\", \"%s\") should fail but has no Error\n",
+              badType);
+    }
+}
+
+// Adds a set of Unicode locale attributes (with repeats and mixed casing),
+// checks the resulting language tag, then removes attributes one at a time
+// and checks the tag after each removal. One UErrorCode is shared by all
+// steps and is never reset.
+void LocaleBuilderTest::TestAddRemoveUnicodeLocaleAttribute() {
+    LocaleBuilder bld;
+    UErrorCode status = U_ZERO_ERROR;
+    Locale loc = bld.setLanguage("fr")
+                    .addUnicodeLocaleAttribute("abc")
+                    .addUnicodeLocaleAttribute("aBc")
+                    .addUnicodeLocaleAttribute("EFG")
+                    .addUnicodeLocaleAttribute("efghi")
+                    .addUnicodeLocaleAttribute("efgh")
+                    .addUnicodeLocaleAttribute("efGhi")
+                    .addUnicodeLocaleAttribute("EFg")
+                    .addUnicodeLocaleAttribute("hijk")
+                    .addUnicodeLocaleAttribute("EFG")
+                    .addUnicodeLocaleAttribute("HiJK")
+                    .addUnicodeLocaleAttribute("aBc")
+                    .build(status);
+    if (U_FAILURE(status)) {
+        errln("addUnicodeLocaleAttribute() got Error: %s\n",
+              u_errorName(status));
+    }
+    {
+        const std::string wanted("fr-u-abc-efg-efgh-efghi-hijk");
+        const std::string got = loc.toLanguageTag<std::string>(status);
+        if (U_FAILURE(status) || wanted != got) {
+            errln("Should get \"%s\" but get \"%s\"\n", wanted.c_str(), got.c_str());
+        }
+    }
+
+    // Removes one attribute, rebuilds the locale and compares its language
+    // tag with the tag expected after that removal.
+    auto removeAndCheck = [&](const char* attribute, const std::string& wanted) {
+        loc = bld.removeUnicodeLocaleAttribute(attribute).build(status);
+        if (U_FAILURE(status)) {
+            errln("removeUnicodeLocaleAttribute() got Error: %s\n",
+                  u_errorName(status));
+        }
+        const std::string got = loc.toLanguageTag<std::string>(status);
+        if (U_FAILURE(status) || wanted != got) {
+            errln("Should get \"%s\" but get \"%s\"\n", wanted.c_str(), got.c_str());
+        }
+    };
+
+    // remove "efgh" in the middle with different casing.
+    removeAndCheck("eFgH", "fr-u-abc-efg-efghi-hijk");
+
+    // remove non-existing attributes: the tag stays as it was.
+    removeAndCheck("efgh", "fr-u-abc-efg-efghi-hijk");
+
+    // remove "abc" in the beginning with different casing.
+    removeAndCheck("ABC", "fr-u-efg-efghi-hijk");
+
+    // remove non-existing substring in the end: the tag stays as it was.
+    removeAndCheck("hij", "fr-u-efg-efghi-hijk");
+
+    // remove "hijk" in the end with different casing.
+    removeAndCheck("hIJK", "fr-u-efg-efghi");
+
+    // remove "efghi" in the end with different casing.
+    removeAndCheck("EFGhi", "fr-u-efg");
+
+    // remove "efg" in as the only one, with different casing.
+    removeAndCheck("EFG", "fr");
+}
+
+// Adds each well-formed attribute to a shared builder (cleared every fifth
+// entry) and, from the fourth entry on, also removes the entries one and
+// three positions back. Every build() along the way must succeed.
+void LocaleBuilderTest::TestAddRemoveUnicodeLocaleAttributeWellFormed() {
+    // http://www.unicode.org/reports/tr35/tr35.html#unicode_locale_extensions
+    // attribute = alphanum{3,8} ;
+    static const char* wellFormedAttributes[] = {
+        // alphanum{3}
+        "AbC",
+        "ZAA",
+        "0AA",
+        "x3A",
+        "xa8",
+
+        // alphanum{4}
+        "AbCA",
+        "ZASD",
+        "0ASD",
+        "A3a4",
+        "zK90",
+
+        // alphanum{5}
+        "efgij",
+        "AbCAD",
+        "ZAASD",
+        "0AASD",
+        "A1CAD",
+        "ef2ij",
+        "ads3X",
+        "owqF4",
+
+        // alphanum{6}
+        "efgijk",
+        "AADGFE",
+        "AkDfFz",
+        "0ADGFE",
+        "A9DfFz",
+        "AADG7E",
+
+        // alphanum{7}
+        "asdfads",
+        "ADSFADF",
+        "piSFkDk",
+        "a0dfads",
+        "ADSF3DF",
+        "piSFkD9",
+
+        // alphanum{8}
+        "oieradfz",
+        "IADSFJKR",
+        "kkDSFJkR",
+    };
+    LocaleBuilder builder;
+    for (int pos = 0; pos < UPRV_LENGTHOF(wellFormedAttributes); pos++) {
+        if (pos % 5 == 0) {
+            builder.clear();
+        }
+        const char* added = wellFormedAttributes[pos];
+        UErrorCode errorCode = U_ZERO_ERROR;
+        builder.addUnicodeLocaleAttribute(added);
+        Locale built = builder.build(errorCode);
+        if (U_FAILURE(errorCode)) {
+            errln("addUnicodeLocaleAttribute(\"%s\") got Error: %s\n",
+                  added, u_errorName(errorCode));
+        }
+        if (pos <= 2) {
+            continue;
+        }
+
+        // The error code is carried over from the add step above.
+        const char* previous = wellFormedAttributes[pos - 1];
+        builder.removeUnicodeLocaleAttribute(previous);
+        built = builder.build(errorCode);
+        if (U_FAILURE(errorCode)) {
+            errln("removeUnicodeLocaleAttribute(\"%s\") got Error: %s\n",
+                  previous, u_errorName(errorCode));
+        }
+
+        const char* threeBack = wellFormedAttributes[pos - 3];
+        builder.removeUnicodeLocaleAttribute(threeBack);
+        built = builder.build(errorCode);
+        if (U_FAILURE(errorCode)) {
+            errln("removeUnicodeLocaleAttribute(\"%s\") got Error: %s\n",
+                  threeBack, u_errorName(errorCode));
+        }
+    }
+}
+
+// Every entry below must be rejected by addUnicodeLocaleAttribute(): building
+// a fresh LocaleBuilder after the call must report U_ILLEGAL_ARGUMENT_ERROR.
+void LocaleBuilderTest::TestAddUnicodeLocaleAttributeIllFormed() {
+    static const char* illFormed[] = {
+        "aa",
+        "34",
+        "ab-",
+        "-cd",
+        "-ef-",
+        "zyzabcdef",
+        "123456789",
+        "ab-abc",
+        "1ZB30zfk9-abc",
+        "2ck30zfk9-adsf023-234kcZ",
+    };
+    for (int32_t idx = 0; idx < UPRV_LENGTHOF(illFormed); ++idx) {
+        const char* badAttribute = illFormed[idx];
+        LocaleBuilder builder;
+        builder.addUnicodeLocaleAttribute(badAttribute);
+        UErrorCode errorCode = U_ZERO_ERROR;
+        Locale built = builder.build(errorCode);
+        if (errorCode == U_ILLEGAL_ARGUMENT_ERROR) {
+            continue;
+        }
+        errln("addUnicodeLocaleAttribute(\"%s\") should fail but has no Error\n",
+              badAttribute);
+    }
+}
+
+// Exercises setExtension() with the 'u' / 'U' extension letter on a single
+// builder and checks the language tag after each step with Verify(). The
+// expected tags show that each call replaces the previous 'u' extension value
+// and that an empty value removes it. The final argument of each Verify()
+// call is the message reported if build() fails at that step, so it names the
+// call that was just made.
+void LocaleBuilderTest::TestSetExtensionU() {
+    LocaleBuilder bld;
+    bld.setLanguage("zh");
+    Verify(bld, "zh",
+           "setLanguage(\"zh\") got Error: %s\n");
+
+    bld.setExtension('u', "co-stroke");
+    Verify(bld, "zh-u-co-stroke",
+           "setExtension('u', \"co-stroke\") got Error: %s\n");
+
+    bld.setExtension('U', "ca-islamic");
+    Verify(bld, "zh-u-ca-islamic",
+           "setExtension('U', \"ca-islamic\") got Error: %s\n");
+
+    bld.setExtension('u', "ca-chinese");
+    Verify(bld, "zh-u-ca-chinese",
+           "setExtension('u', \"ca-chinese\") got Error: %s\n");
+
+    bld.setExtension('U', "co-pinyin");
+    Verify(bld, "zh-u-co-pinyin",
+           "setExtension('U', \"co-pinyin\") got Error: %s\n");
+
+    bld.setRegion("TW");
+    Verify(bld, "zh-TW-u-co-pinyin",
+           "setRegion(\"TW\") got Error: %s\n");
+
+    bld.setExtension('U', "");
+    Verify(bld, "zh-TW",
+           "setExtension('U', \"\") got Error: %s\n");
+
+    bld.setExtension('u', "abc-defg-kr-face");
+    Verify(bld, "zh-TW-u-abc-defg-kr-face",
+           "setExtension('u', \"abc-defg-kr-face\") got Error: %s\n");
+
+    bld.setExtension('U', "ca-japanese");
+    Verify(bld, "zh-TW-u-ca-japanese",
+           "setExtension('U', \"ca-japanese\") got Error: %s\n");
+
+}
+
+// Every entry below must be accepted as the value of the 'u' extension:
+// building a fresh LocaleBuilder after setExtension('u', ...) must not report
+// a failure.
+void LocaleBuilderTest::TestSetExtensionValidateUWellFormed() {
+    static const char* wellFormedExtensions[] = {
+        // keyword
+        // keyword = key (sep type)? ;
+        // key = alphanum alpha ;
+        // type = alphanum{3,8} (sep alphanum{3,8})* ;
+        "3A",
+        "ZA",
+        "az-abc",
+        "zz-123",
+        "7z-12345678",
+        "kb-A234567Z",
+        // (sep keyword)+
+        "1z-ZZ",
+        "2z-ZZ-123",
+        "3z-ZZ-123-cd",
+        "0z-ZZ-123-cd-efghijkl",
+        // attribute
+        "abc",
+        "456",
+        "87654321",
+        "ZABADFSD",
+        // (sep attribute)+
+        "abc-ZABADFSD",
+        "123-ZABADFSD",
+        "K2K-12345678",
+        "K2K-12345678-zzz",
+        // (sep attribute)+ (sep keyword)*
+        "K2K-12345678-zz",
+        "K2K-12345678-zz-0z",
+        "K2K-12345678-9z-AZ-abc",
+        "K2K-12345678-zz-9A-234",
+        "K2K-12345678-zk0-abc-efg-zz-9k-234",
+    };
+    for (int32_t idx = 0; idx < UPRV_LENGTHOF(wellFormedExtensions); ++idx) {
+        const char* value = wellFormedExtensions[idx];
+        LocaleBuilder builder;
+        builder.setExtension('u', value);
+        UErrorCode errorCode = U_ZERO_ERROR;
+        Locale built = builder.build(errorCode);
+        if (U_SUCCESS(errorCode)) {
+            continue;
+        }
+        errln("setExtension('u', \"%s\") got Error: %s\n",
+              value, u_errorName(errorCode));
+    }
+}
+
+// Every entry below must be rejected as the value of the 'u' extension:
+// building a fresh LocaleBuilder after setExtension('u', ...) must report
+// U_ILLEGAL_ARGUMENT_ERROR.
+void LocaleBuilderTest::TestSetExtensionValidateUIllFormed() {
+    static const char* illFormed[] = {
+        // bad key
+        "-",
+        "-ab",
+        "ab-",
+        "abc-",
+        "-abc",
+        "0",
+        "a",
+        "A0",
+        "z9",
+        "09",
+        "90",
+        // bad keyword
+        "AB-A0",
+        "AB-efg-A0",
+        "xy-123456789",
+        "AB-Aa-",
+        "AB-Aac-",
+        // bad attribute
+        "abcdefghi",
+        "abcdefgh-",
+        "abcdefgh-abcdefghi",
+        "abcdefgh-1",
+        "abcdefgh-a",
+        "abcdefgh-a2345678z",
+    };
+    for (int32_t idx = 0; idx < UPRV_LENGTHOF(illFormed); ++idx) {
+        const char* badValue = illFormed[idx];
+        LocaleBuilder builder;
+        builder.setExtension('u', badValue);
+        UErrorCode errorCode = U_ZERO_ERROR;
+        Locale built = builder.build(errorCode);
+        if (errorCode == U_ILLEGAL_ARGUMENT_ERROR) {
+            continue;
+        }
+        errln("setExtension('u', \"%s\") should fail but has no Error\n",
+              badValue);
+    }
+}
+
+// Exercises setExtension() with the 't' / 'T' extension letter, interleaved
+// with region/script/variant changes, and checks the language tag after each
+// step. Each setter returns the builder, so its result is handed straight to
+// Verify().
+void LocaleBuilderTest::TestSetExtensionT() {
+    LocaleBuilder builder;
+
+    Verify(builder.setLanguage("fr"),
+           "fr",
+           "setLanguage(\"fr\") got Error: %s\n");
+
+    Verify(builder.setExtension('T', "zh"),
+           "fr-t-zh",
+           "setExtension('T', \"zh\") got Error: %s\n");
+
+    Verify(builder.setExtension('t', "zh-Hant-TW-1234-A9-123-456ABCDE"),
+           "fr-t-zh-hant-tw-1234-a9-123-456abcde",
+           "setExtension('t', \"zh-Hant-TW-1234-A9-123-456ABCDE\") got Error: %s\n");
+
+    Verify(builder.setExtension('T', "a9-123"),
+           "fr-t-a9-123",
+           "setExtension('T', \"a9-123\") got Error: %s\n");
+
+    Verify(builder.setRegion("MX"),
+           "fr-MX-t-a9-123",
+           "setRegion(\"MX\") got Error: %s\n");
+
+    Verify(builder.setScript("Hans"),
+           "fr-Hans-MX-t-a9-123",
+           "setScript(\"Hans\") got Error: %s\n");
+
+    Verify(builder.setVariant("9abc-abcde"),
+           "fr-Hans-MX-9abc-abcde-t-a9-123",
+           "setVariant(\"9abc-abcde\") got Error: %s\n");
+
+    // An empty value drops the 't' extension again.
+    Verify(builder.setExtension('T', ""),
+           "fr-Hans-MX-9abc-abcde",
+           "bld.setExtension('T', \"\") got Error: %s\n");
+}
+
+// Verifies that well-formed values for the transformed-content ('t') extension,
+// i.e. ((sep tlang (sep tfield)*) | (sep tfield)+), are accepted by build().
+void LocaleBuilderTest::TestSetExtensionValidateTWellFormed() {
+    static const char* wellFormedExtensions[] = {
+        // tlang
+        // tlang = unicode_language_subtag (sep unicode_script_subtag)?
+        // (sep unicode_region_subtag)? (sep unicode_variant_subtag)* ;
+        // unicode_language_subtag
+        "en",
+        "abc",
+        "abcde",
+        "ABCDEFGH",
+        // unicode_language_subtag sep unicode_script_subtag
+        "en-latn",
+        "abc-arab",
+        "ABCDEFGH-Thai",
+        // unicode_language_subtag sep unicode_script_subtag sep unicode_region_subtag
+        "en-latn-ME",
+        "abc-arab-RU",
+        "ABCDEFGH-Thai-TH",
+        "en-latn-409",
+        "abc-arab-123",
+        "ABCDEFGH-Thai-456",
+        // unicode_language_subtag sep unicode_region_subtag
+        "en-ME",
+        "abc-RU",
+        "ABCDEFGH-TH",
+        "en-409",
+        "abc-123",
+        "ABCDEFGH-456",
+        // unicode_language_subtag sep unicode_script_subtag sep unicode_region_subtag
+        // (sep unicode_variant_subtag)*
+        "en-latn-ME-abcde",
+        "abc-arab-RU-3abc-abcdef",
+        "ABCDEFGH-Thai-TH-ADSFS-9xyz-abcdef",
+        "en-latn-409-xafsa",
+        "abc-arab-123-ADASDF",
+        "ABCDEFGH-Thai-456-9sdf-ADASFAS",
+        // (sep tfield)+
+        "A0-abcde",
+        "z9-abcde123",
+        "z9-abcde123-a1-abcde",
+        // tlang (sep tfield)*
+        "fr-A0-abcde",
+        "fr-FR-A0-abcde",
+        "fr-123-z9-abcde123-a1-abcde",
+        "fr-Latn-FR-z9-abcde123-a1-abcde",
+        "gab-Thai-TH-abcde-z9-abcde123-a1-abcde",
+        "gab-Thai-TH-0bde-z9-abcde123-a1-abcde",
+    };
+    for (int32_t idx = 0; idx < UPRV_LENGTHOF(wellFormedExtensions); ++idx) {
+        const char* value = wellFormedExtensions[idx];
+        UErrorCode errorCode = U_ZERO_ERROR;
+        LocaleBuilder builder;
+        Locale result = builder.setExtension('t', value).build(errorCode);
+        if (U_FAILURE(errorCode)) {
+            errln("setExtension('t', \"%s\") got Error: %s\n", value, u_errorName(errorCode));
+        }
+    }
+}
+
+// Verifies that ill-formed values for the transformed-content ('t') extension are
+// rejected: build() is expected to report U_ILLEGAL_ARGUMENT_ERROR for every entry,
+// e.g. for a dangling separator or for a tfield key that is not followed by a value.
+void LocaleBuilderTest::TestSetExtensionValidateTIllFormed() {
+    static const char* illFormed[] = {
+        "a",
+        "a-",
+        "0",
+        "9-",
+        "-9",
+        "-z",
+        // "Latn", // Per 2019-01-23 ICUTC, still accept 4alpha. See ICU-20321
+        "Latn-",
+        "en-",
+        "nob-",
+        "-z9",
+        "a3",
+        "a3-",
+        "3a",
+        "0z-",
+        "en-123-a1",
+        "en-TH-a1",
+        "gab-TH-a1",
+        "gab-Thai-a1",
+        "gab-Thai-TH-a1",
+        "gab-Thai-TH-0bde-a1",
+        "gab-Thai-TH-0bde-3b",
+        "gab-Thai-TH-0bde-z9-a1",
+        "gab-Thai-TH-0bde-z9-3b",
+        "gab-Thai-TH-0bde-z9-abcde123-3b",
+        "gab-Thai-TH-0bde-z9-abcde123-ab",
+        "gab-Thai-TH-0bde-z9-abcde123-a1",
+        "gab-Thai-TH-0bde-z9-abcde123-a1-",
+        "gab-Thai-TH-0bde-z9-abcde123-a1-a",
+        "gab-Thai-TH-0bde-z9-abcde123-a1-ab",
+    };
+    for (int32_t idx = 0; idx < UPRV_LENGTHOF(illFormed); ++idx) {
+        const char* value = illFormed[idx];
+        UErrorCode errorCode = U_ZERO_ERROR;
+        LocaleBuilder builder;
+        Locale result = builder.setExtension('t', value).build(errorCode);
+        if (errorCode == U_ILLEGAL_ARGUMENT_ERROR) { continue; }
+        errln("setExtension('t', \"%s\") should fail but has no Error\n", value);
+    }
+}
+
+// Exercises setExtension() with the private use ('x') extension: the key is given in
+// either case, every call replaces the previously set value, and an empty value
+// removes the extension. Each failure message names the call that was just made.
+void LocaleBuilderTest::TestSetExtensionPU() {
+    LocaleBuilder builder;
+    Verify(builder.setLanguage("ar"), "ar",
+           "setLanguage(\"ar\") got Error: %s\n");
+
+    // Each call overwrites the previous private use value.
+    Verify(builder.setExtension('X', "a-b-c-d-e"), "ar-x-a-b-c-d-e",
+           "setExtension('X', \"a-b-c-d-e\") got Error: %s\n");
+
+    Verify(builder.setExtension('x', "0-1-2-3"), "ar-x-0-1-2-3",
+           "setExtension('x', \"0-1-2-3\") got Error: %s\n");
+
+    Verify(builder.setExtension('X', "0-12345678-x-x"), "ar-x-0-12345678-x-x",
+           "setExtension('X', \"0-12345678-x-x\") got Error: %s\n");
+
+    // Setting another field leaves the private use extension in place.
+    Verify(builder.setRegion("TH"), "ar-TH-x-0-12345678-x-x",
+           "setRegion(\"TH\") got Error: %s\n");
+
+    // An empty value removes the private use extension.
+    Verify(builder.setExtension('X', ""), "ar-TH",
+           "setExtension('X', \"\") got Error: %s\n");
+}
+
+// Verifies that well-formed private use ('x') extension values, built from subtags
+// of one to eight alphanumeric characters, are accepted by build().
+void LocaleBuilderTest::TestSetExtensionValidatePUWellFormed() {
+    static const char* wellFormedExtensions[] = {
+        "a",  // Short subtag
+        "z",  // Short subtag
+        "0",  // Short subtag, digit
+        "9",  // Short subtag, digit
+        "a-0",  // Two short subtags, alpha and digit
+        "9-z",  // Two short subtags, digit and alpha
+        "ab",
+        "abc",
+        "abcefghi",  // Long subtag
+        "87654321",
+        "01",
+        "234",
+        "0a-ab-87654321",  // Three subtags
+        "87654321-ab-00-3A",  // Four subtags
+        "a-9-87654321",  // Three subtags with short and long subtags
+        "87654321-ab-0-3A",
+    };
+    for (int32_t idx = 0; idx < UPRV_LENGTHOF(wellFormedExtensions); ++idx) {
+        const char* value = wellFormedExtensions[idx];
+        UErrorCode errorCode = U_ZERO_ERROR;
+        LocaleBuilder builder;
+        Locale result = builder.setExtension('x', value).build(errorCode);
+        if (U_FAILURE(errorCode)) {
+            errln("setExtension('x', \"%s\") got Error: %s\n", value, u_errorName(errorCode));
+        }
+    }
+}
+
+// Verifies that private use ('x') extension values containing a subtag longer than
+// eight characters make build() report U_ILLEGAL_ARGUMENT_ERROR.
+void LocaleBuilderTest::TestSetExtensionValidatePUIllFormed() {
+    static const char* illFormed[] = {
+        "123456789",  // Too long
+        "abcdefghi",  // Too long
+        "ab-123456789",  // Second subtag too long
+        "abcdefghi-12",  // First subtag too long
+        "a-ab-987654321",  // Third subtag too long
+        "987654321-a-0-3",  // First subtag too long
+    };
+    for (int32_t idx = 0; idx < UPRV_LENGTHOF(illFormed); ++idx) {
+        const char* value = illFormed[idx];
+        UErrorCode errorCode = U_ZERO_ERROR;
+        LocaleBuilder builder;
+        Locale result = builder.setExtension('x', value).build(errorCode);
+        if (errorCode == U_ILLEGAL_ARGUMENT_ERROR) { continue; }
+        errln("setExtension('x', \"%s\") should fail but has no Error\n", value);
+    }
+}
+
+// Exercises setExtension() with keys other than 'u', 't' and 'x': the expected tags
+// list the extensions in key order, a call replaces the value of that key only, keys
+// are given in either case, and an empty value removes just that extension.
+void LocaleBuilderTest::TestSetExtensionOthers() {
+    LocaleBuilder builder;
+    Verify(builder.setLanguage("fr"), "fr",
+           "setLanguage(\"fr\") got Error: %s\n");
+
+    Verify(builder.setExtension('Z', "ab"), "fr-z-ab",
+           "setExtension('Z', \"ab\") got Error: %s\n");
+
+    // Extensions added later are expected in key order, not in insertion order.
+    Verify(builder.setExtension('0', "xyz12345-abcdefg"), "fr-0-xyz12345-abcdefg-z-ab",
+           "setExtension('0', \"xyz12345-abcdefg\") got Error: %s\n");
+
+    Verify(builder.setExtension('a', "01-12345678-ABcdef"),
+           "fr-0-xyz12345-abcdefg-a-01-12345678-abcdef-z-ab",
+           "setExtension('a', \"01-12345678-ABcdef\") got Error: %s\n");
+
+    Verify(builder.setRegion("TH"),
+           "fr-TH-0-xyz12345-abcdefg-a-01-12345678-abcdef-z-ab",
+           "setRegion(\"TH\") got Error: %s\n");
+
+    Verify(builder.setScript("Arab"),
+           "fr-Arab-TH-0-xyz12345-abcdefg-a-01-12345678-abcdef-z-ab",
+           "setScript(\"Arab\") got Error: %s\n");
+
+    // 'A' addresses the same extension as 'a' and replaces its value.
+    Verify(builder.setExtension('A', "97"), "fr-Arab-TH-0-xyz12345-abcdefg-a-97-z-ab",
+           "setExtension('A', \"97\") got Error: %s\n");
+
+    // Removing one extension leaves the remaining ones in place.
+    Verify(builder.setExtension('a', ""), "fr-Arab-TH-0-xyz12345-abcdefg-z-ab",
+           "setExtension('a', \"\") got Error: %s\n");
+
+    Verify(builder.setExtension('0', ""), "fr-Arab-TH-z-ab",
+           "setExtension('0', \"\") got Error: %s\n");
+}
+
+// Verifies that well-formed values are accepted for extension keys other than 'u', 't'
+// and 'x', and that build() rejects extension keys that are not ASCII alphanumerics.
+void LocaleBuilderTest::TestSetExtensionValidateOthersWellFormed() {
+    static const char* wellFormedExtensions[] = {
+        "ab",
+        "abc",
+        "abcefghi",
+        "01",
+        "234",
+        "87654321",
+        "0a-ab-87654321",
+        "87654321-ab-00-3A",
+    };
+    const int32_t extensionCount = UPRV_LENGTHOF(wellFormedExtensions);
+
+    // Pair the n-th value with the n-th lowercase letter used as the extension key.
+    const char* aToZ = "abcdefghijklmnopqrstuvwxyz";
+    const int32_t aToZLen = static_cast<int32_t>(uprv_strlen(aToZ));
+    for (int32_t idx = 0; idx < extensionCount; ++idx) {
+        const char key = aToZ[idx % aToZLen];
+        const char* value = wellFormedExtensions[idx];
+        UErrorCode errorCode = U_ZERO_ERROR;
+        LocaleBuilder builder;
+        Locale result = builder.setExtension(key, value).build(errorCode);
+        if (U_FAILURE(errorCode)) {
+            errln("setExtension('%c', \"%s\") got Error: %s\n", key, value, u_errorName(errorCode));
+        }
+    }
+
+    // Sweep over letters, digits and punctuation used as the extension key.
+    const char* someChars =
+        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789`~!@#$%^&*()-_=+;:,.<>?";
+    for (const char* cursor = someChars; *cursor != 0; ++cursor) {
+        const char key = *cursor;
+        const char* value = wellFormedExtensions[key % extensionCount];
+        UErrorCode errorCode = U_ZERO_ERROR;
+        LocaleBuilder builder;
+        Locale result = builder.setExtension(key, value).build(errorCode);
+        const bool isAlphanumeric = uprv_isASCIILetter(key) || ('0' <= key && key <= '9');
+        if (!isAlphanumeric) {
+            if (errorCode != U_ILLEGAL_ARGUMENT_ERROR) {
+                errln("setExtension('%c', \"%s\") should fail but has no Error\n", key, value);
+            }
+            continue;
+        }
+        // The outcome for the keys 't', 'u' and 'x' is deliberately not checked here.
+        const bool isSkippedKey = key == 't' || key == 'T' || key == 'u' || key == 'U' ||
+                                  key == 'x' || key == 'X';
+        if (!isSkippedKey && U_FAILURE(errorCode)) {
+            errln("setExtension('%c', \"%s\") got Error: %s\n", key, value, u_errorName(errorCode));
+        }
+    }
+}
+
+// Verifies that values with a subtag of a single character or of more than eight
+// characters are rejected (U_ILLEGAL_ARGUMENT_ERROR) for the extension keys 'a'..'j'.
+void LocaleBuilderTest::TestSetExtensionValidateOthersIllFormed() {
+    static const char* illFormed[] = {
+        "0",  // Too short
+        "a",  // Too short
+        "123456789",  // Too long
+        "abcdefghi",  // Too long
+        "ab-123456789",  // Second subtag too long
+        "abcdefghi-12",  // First subtag too long
+        "a-ab-87654321",  // First subtag too short
+        "87654321-a-0-3",  // Second, third and fourth subtags too short
+        "ab-cd-987654321",  // Third subtag too long
+        "987654321-ab-cd-ef",  // First of four subtags too long
+    };
+    const char* aToZ = "abcdefghijklmnopqrstuvwxyz";
+    const int32_t aToZLen = static_cast<int32_t>(uprv_strlen(aToZ));
+    for (int32_t idx = 0; idx < UPRV_LENGTHOF(illFormed); ++idx) {
+        const char key = aToZ[idx % aToZLen];
+        const char* value = illFormed[idx];
+        UErrorCode errorCode = U_ZERO_ERROR;
+        LocaleBuilder builder;
+        Locale result = builder.setExtension(key, value).build(errorCode);
+        if (errorCode == U_ILLEGAL_ARGUMENT_ERROR) { continue; }
+        errln("setExtension('%c', \"%s\") should fail but has no Error\n", key, value);
+    }
+}
+
+// Builds a locale carrying a variant, attributes and keywords with one builder, then
+// passes it to a second builder via setLocale() and expects an equal locale back.
+void LocaleBuilderTest::TestSetLocale() {
+    LocaleBuilder bld1, bld2;
+    UErrorCode status = U_ZERO_ERROR;
+    Locale l1 = bld1.setLanguage("en").setScript("Latn").setRegion("MX")
+                    .setVariant("3456-abcde")
+                    .addUnicodeLocaleAttribute("456").addUnicodeLocaleAttribute("123")
+                    .setUnicodeLocaleKeyword("nu", "thai")
+                    .setUnicodeLocaleKeyword("co", "stroke")
+                    .setUnicodeLocaleKeyword("ca", "chinese")
+                    .build(status);
+    if (U_FAILURE(status) || l1.isBogus()) {
+        errln("build got Error: %s\n", u_errorName(status));
+    }
+    status = U_ZERO_ERROR;
+    // Use the untouched second builder so the result depends only on setLocale(l1).
+    Locale l2 = bld2.setLocale(l1).build(status);
+    if (U_FAILURE(status) || l2.isBogus()) {
+        errln("build got Error: %s\n", u_errorName(status));
+    }
+
+    if (l1 != l2) {
+        errln("Two locales should be the same, but one is '%s' and the other is '%s'",
+              l1.getName(), l2.getName());
+    }
+}
+
+// Checks that setLocale() discards everything set earlier on the builder and that the
+// locale parsed from "en-US-u-va-posix" compares equal to Locale("en-US-POSIX").
+void LocaleBuilderTest::TestPosixCases() {
+    UErrorCode errorCode = U_ZERO_ERROR;
+    Locale fromTag = Locale::forLanguageTag("en-US-u-va-posix", errorCode);
+    if (U_FAILURE(errorCode) || fromTag.isBogus()) {
+        errln("build got Error: %s\n", u_errorName(errorCode));
+    }
+    // Pre-populate the builder; setLocale() below is expected to clear all of it.
+    LocaleBuilder builder;
+    builder.setLanguage("en").setRegion("MX").setScript("Arab");
+    builder.setUnicodeLocaleKeyword("nu", "Thai").setExtension('x', "1");
+    Locale rebuilt = builder.setLocale(fromTag).build(errorCode);
+    if (U_FAILURE(errorCode) || rebuilt.isBogus()) {
+        errln("build got Error: %s\n", u_errorName(errorCode));
+    }
+    if (fromTag != rebuilt) {
+        errln("The result locale should be the set as the setLocale %s but got %s\n",
+              fromTag.toLanguageTag<std::string>(errorCode).c_str(),
+              rebuilt.toLanguageTag<std::string>(errorCode).c_str());
+    }
+    // The rebuilt locale is also expected to equal the legacy "en-US-POSIX" spelling.
+    Locale posix("en-US-POSIX");
+    if (posix != rebuilt) {
+        errln("The result locale should be the set as %s but got %s\n",
+              posix.getName(), rebuilt.getName());
+    }
+}