From 0ffd26301d70a90c01cd8b2cb1cd200c4b0a7061 Mon Sep 17 00:00:00 2001 From: Yoshito Umaoka Date: Thu, 28 Aug 2014 01:27:49 +0000 Subject: [PATCH] ICU-8951 Legacy/BCP 47 keyword conversion APIs merged into ICU4J trunk. X-SVN-Rev: 36261 --- .gitattributes | 1 + .../com/ibm/icu/impl/locale/KeyTypeData.java | 542 ++++++++++++++++++ .../impl/locale/UnicodeLocaleExtension.java | 21 +- .../core/src/com/ibm/icu/util/ULocale.java | 311 +++++----- .../ibm/icu/dev/test/util/ULocaleTest.java | 106 +++- 5 files changed, 818 insertions(+), 163 deletions(-) create mode 100644 icu4j/main/classes/core/src/com/ibm/icu/impl/locale/KeyTypeData.java diff --git a/.gitattributes b/.gitattributes index e6f7aa7e4ca..ead94d5644a 100644 --- a/.gitattributes +++ b/.gitattributes @@ -268,6 +268,7 @@ icu4j/main/classes/core/.settings/org.eclipse.core.resources.prefs -text icu4j/main/classes/core/.settings/org.eclipse.jdt.core.prefs -text icu4j/main/classes/core/manifest.stub -text icu4j/main/classes/core/src/com/ibm/icu/impl/TZDBTimeZoneNames.java -text +icu4j/main/classes/core/src/com/ibm/icu/impl/locale/KeyTypeData.java -text icu4j/main/classes/currdata/.externalToolBuilders/copy-data-currdata.launch -text icu4j/main/classes/currdata/.settings/org.eclipse.core.resources.prefs -text icu4j/main/classes/currdata/.settings/org.eclipse.jdt.core.prefs -text diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/KeyTypeData.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/KeyTypeData.java new file mode 100644 index 00000000000..ce4ceb4f974 --- /dev/null +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/KeyTypeData.java @@ -0,0 +1,542 @@ +/* + ******************************************************************************* + * Copyright (C) 2014, International Business Machines Corporation and + * others. All Rights Reserved. 
+ ******************************************************************************* + */ +package com.ibm.icu.impl.locale; + +import java.util.EnumSet; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.MissingResourceException; +import java.util.Set; +import java.util.regex.Pattern; + +import com.ibm.icu.impl.ICUResourceBundle; +import com.ibm.icu.util.Output; +import com.ibm.icu.util.UResourceBundle; +import com.ibm.icu.util.UResourceBundleIterator; + +/** + */ +public class KeyTypeData { + + private static abstract class SpecialTypeHandler { + abstract boolean isValid(String value); + String canonicalize(String value) { + return AsciiUtil.toLowerString(value); + } + } + + private static class CodepointsTypeHandler extends SpecialTypeHandler { + private static final Pattern pat = Pattern.compile("[0-9a-fA-F]{4,6}(-[0-9a-fA-F]{4,6})*"); + boolean isValid(String value) { + return pat.matcher(value).matches(); + } + } + + private static class ReorderCodeTypeHandler extends SpecialTypeHandler { + private static final Pattern pat = Pattern.compile("[a-zA-Z]{3,8}(-[a-zA-Z]{3,8})*"); + boolean isValid(String value) { + return pat.matcher(value).matches(); + } + } + + private enum SpecialType { + CODEPOINTS(new CodepointsTypeHandler()), + REORDER_CODE(new ReorderCodeTypeHandler()); + + SpecialTypeHandler handler; + SpecialType(SpecialTypeHandler handler) { + this.handler = handler; + } + }; + + private static class KeyData { + String legacyId; + String bcpId; + Map typeMap; + EnumSet specialTypes; + + KeyData(String legacyId, String bcpId, Map typeMap, + EnumSet specialTypes) { + this.legacyId = legacyId; + this.bcpId = bcpId; + this.typeMap = typeMap; + this.specialTypes = specialTypes; + } + } + + private static class Type { + String legacyId; + String bcpId; + + Type(String legacyId, String bcpId) { + this.legacyId = legacyId; + this.bcpId = bcpId; + } + } + + public static String toBcpKey(String key) { + key = 
AsciiUtil.toLowerString(key); + KeyData keyData = KEYMAP.get(key); + if (keyData != null) { + return keyData.bcpId; + } + return null; + } + + public static String toLegacyKey(String key) { + key = AsciiUtil.toLowerString(key); + KeyData keyData = KEYMAP.get(key); + if (keyData != null) { + return keyData.legacyId; + } + return null; + } + + public static String toBcpType(String key, String type, + Output isKnownKey, Output isSpecialType) { + + if (isKnownKey != null) { + isKnownKey.value = false; + } + if (isSpecialType != null) { + isSpecialType.value = false; + } + + key = AsciiUtil.toLowerString(key); + type = AsciiUtil.toLowerString(type); + + KeyData keyData = KEYMAP.get(key); + if (keyData != null) { + if (isKnownKey != null) { + isKnownKey.value = Boolean.TRUE; + } + Type t = keyData.typeMap.get(type); + if (t != null) { + return t.bcpId; + } + if (keyData.specialTypes != null) { + for (SpecialType st : keyData.specialTypes) { + if (st.handler.isValid(type)) { + if (isSpecialType != null) { + isSpecialType.value = true; + } + return st.handler.canonicalize(type); + } + } + } + } + return null; + } + + + public static String toLegacyType(String key, String type, + Output isKnownKey, Output isSpecialType) { + + if (isKnownKey != null) { + isKnownKey.value = false; + } + if (isSpecialType != null) { + isSpecialType.value = false; + } + + key = AsciiUtil.toLowerString(key); + type = AsciiUtil.toLowerString(type); + + KeyData keyData = KEYMAP.get(key); + if (keyData != null) { + if (isKnownKey != null) { + isKnownKey.value = Boolean.TRUE; + } + Type t = keyData.typeMap.get(type); + if (t != null) { + return t.legacyId; + } + if (keyData.specialTypes != null) { + for (SpecialType st : keyData.specialTypes) { + if (st.handler.isValid(type)) { + if (isSpecialType != null) { + isSpecialType.value = true; + } + return st.handler.canonicalize(type); + } + } + } + } + return null; + } + + + private static void initFromResourceBundle() { + UResourceBundle keyTypeDataRes 
= UResourceBundle.getBundleInstance( + ICUResourceBundle.ICU_BASE_NAME, + "keyTypeData", + ICUResourceBundle.ICU_DATA_CLASS_LOADER); + UResourceBundle keyMapRes = keyTypeDataRes.get("keyMap"); + UResourceBundle typeMapRes = keyTypeDataRes.get("typeMap"); + + // alias data is optional + UResourceBundle typeAliasRes = null; + UResourceBundle bcpTypeAliasRes = null; + + try { + typeAliasRes = keyTypeDataRes.get("typeAlias"); + } catch (MissingResourceException e) { + // fall through + } + + try { + bcpTypeAliasRes = keyTypeDataRes.get("bcpTypeAlias"); + } catch (MissingResourceException e) { + // fall through + } + + // iterate through keyMap resource + UResourceBundleIterator keyMapItr = keyMapRes.getIterator(); + while (keyMapItr.hasNext()) { + UResourceBundle keyMapEntry = keyMapItr.next(); + String legacyKeyId = keyMapEntry.getKey(); + String bcpKeyId = keyMapEntry.getString(); + + boolean hasSameKey = false; + if (bcpKeyId.length() == 0) { + // Empty value indicates that BCP key is same with the legacy key. 
+ bcpKeyId = legacyKeyId; + hasSameKey = true; + } + + boolean isTZ = legacyKeyId.equals("timezone"); + + // reverse type alias map + Map> typeAliasMap = null; + if (typeAliasRes != null) { + UResourceBundle typeAliasResByKey = null; + try { + typeAliasResByKey = typeAliasRes.get(legacyKeyId); + } catch (MissingResourceException e) { + // fall through + } + if (typeAliasResByKey != null) { + typeAliasMap = new HashMap>(); + UResourceBundleIterator typeAliasResItr = typeAliasResByKey.getIterator(); + while (typeAliasResItr.hasNext()) { + UResourceBundle typeAliasDataEntry = typeAliasResItr.next(); + String from = typeAliasDataEntry.getKey(); + String to = typeAliasDataEntry.getString(); + if (isTZ) { + from = from.replace(':', '/'); + } + Set aliasSet = typeAliasMap.get(to); + if (aliasSet == null) { + aliasSet = new HashSet(); + typeAliasMap.put(to, aliasSet); + } + aliasSet.add(from); + } + } + } + + // reverse bcp type alias map + Map> bcpTypeAliasMap = null; + if (bcpTypeAliasRes != null) { + UResourceBundle bcpTypeAliasResByKey = null; + try { + bcpTypeAliasResByKey = bcpTypeAliasRes.get(bcpKeyId); + } catch (MissingResourceException e) { + // fall through + } + if (bcpTypeAliasResByKey != null) { + bcpTypeAliasMap = new HashMap>(); + UResourceBundleIterator bcpTypeAliasResItr = bcpTypeAliasResByKey.getIterator(); + while (bcpTypeAliasResItr.hasNext()) { + UResourceBundle bcpTypeAliasDataEntry = bcpTypeAliasResItr.next(); + String from = bcpTypeAliasDataEntry.getKey(); + String to = bcpTypeAliasDataEntry.getString(); + Set aliasSet = bcpTypeAliasMap.get(to); + if (aliasSet == null) { + aliasSet = new HashSet(); + bcpTypeAliasMap.put(to, aliasSet); + } + aliasSet.add(from); + } + } + } + + Map typeDataMap = new HashMap(); + Set specialTypeSet = null; + + // look up type map for the key, and walk through the mapping data + UResourceBundle typeMapResByKey = null; + try { + typeMapResByKey = typeMapRes.get(legacyKeyId); + } catch (MissingResourceException e) { + // 
type map for each key must exist + assert false; + } + if (typeMapResByKey != null) { + UResourceBundleIterator typeMapResByKeyItr = typeMapResByKey.getIterator(); + while (typeMapResByKeyItr.hasNext()) { + UResourceBundle typeMapEntry = typeMapResByKeyItr.next(); + String legacyTypeId = typeMapEntry.getKey(); + + // special types + boolean isSpecialType = false; + for (SpecialType st : SpecialType.values()) { + if (legacyTypeId.equals(st.toString())) { + isSpecialType = true; + if (specialTypeSet == null) { + specialTypeSet = new HashSet(); + } + specialTypeSet.add(st); + break; + } + } + if (isSpecialType) { + continue; + } + + if (isTZ) { + // a timezone key uses a colon instead of a slash in the resource. + // e.g. America:Los_Angeles + legacyTypeId = legacyTypeId.replace(':', '/'); + } + + String bcpTypeId = typeMapEntry.getString(); + + boolean hasSameType = false; + if (bcpTypeId.length() == 0) { + // Empty value indicates that BCP type is same with the legacy type. + bcpTypeId = legacyTypeId; + hasSameType = true; + } + + // Note: legacy type value should never be + // equivalent to bcp type value of a different + // type under the same key. So we use a single + // map for lookup. 
+ Type t = new Type(legacyTypeId, bcpTypeId); + typeDataMap.put(AsciiUtil.toLowerString(legacyTypeId), t); + if (!hasSameType) { + typeDataMap.put(AsciiUtil.toLowerString(bcpTypeId), t); + } + + // Also put aliases in the map + if (typeAliasMap != null) { + Set typeAliasSet = typeAliasMap.get(legacyTypeId); + if (typeAliasSet != null) { + for (String alias : typeAliasSet) { + typeDataMap.put(AsciiUtil.toLowerString(alias), t); + } + } + } + if (bcpTypeAliasMap != null) { + Set bcpTypeAliasSet = bcpTypeAliasMap.get(bcpTypeId); + if (bcpTypeAliasSet != null) { + for (String alias : bcpTypeAliasSet) { + typeDataMap.put(AsciiUtil.toLowerString(alias), t); + } + } + } + } + } + + EnumSet specialTypes = null; + if (specialTypeSet != null) { + specialTypes = EnumSet.copyOf(specialTypeSet); + } + + KeyData keyData = new KeyData(legacyKeyId, bcpKeyId, typeDataMap, specialTypes); + + KEYMAP.put(AsciiUtil.toLowerString(legacyKeyId), keyData); + if (!hasSameKey) { + KEYMAP.put(AsciiUtil.toLowerString(bcpKeyId), keyData); + } + } + } + + // + // Note: The key-type data is currently read from ICU resource bundle keyTypeData.res. + // In future, we may import the data into code like below directly from CLDR to + // avoid cyclic dependency between ULocale and UResourceBundle. For now, the code + // below is just for proof of concept, and commented out. 
+    //
+//    private static final String[][] TYPE_DATA_CA = {
+//     // {<legacy type>, <bcp type; null if same as the legacy type>},
+//        {"buddhist", null},
+//        {"chinese", null},
+//        {"coptic", null},
+//        {"dangi", null},
+//        {"ethiopic", null},
+//        {"ethiopic-amete-alem", "ethioaa"},
+//        {"gregorian", "gregory"},
+//        {"hebrew", null},
+//        {"indian", null},
+//        {"islamic", null},
+//        {"islamic-civil", null},
+//        {"islamic-rgsa", null},
+//        {"islamic-tbla", null},
+//        {"islamic-umalqura", null},
+//        {"iso8601", null},
+//        {"japanese", null},
+//        {"persian", null},
+//        {"roc", null},
+//    };
+//
+//    private static final String[][] TYPE_DATA_KS = {
+//     // {<legacy type>, <bcp type; null if same as the legacy type>},
+//        {"identical", "identic"},
+//        {"primary", "level1"},
+//        {"quaternary", "level4"},
+//        {"secondary", "level2"},
+//        {"tertiary", "level3"},
+//    };
+//
+//    private static final String[][] TYPE_ALIAS_KS = {
+//     // {<legacy type alias>, <canonical legacy type>},
+//        {"quarternary", "quaternary"},
+//    };
+//
+//    private static final String[][] BCP_TYPE_ALIAS_CA = {
+//     // {<bcp type alias>, <canonical bcp type>},
+//        {"islamicc", "islamic-civil"},
+//    };
+//
+//    private static final Object[][] KEY_DATA = {
+//     // {<legacy key>, <bcp key; null if same>, <type data>, <type alias data or null>, <bcp type alias data or null>},
+//        {"calendar", "ca", TYPE_DATA_CA, null, BCP_TYPE_ALIAS_CA},
+//        {"colstrength", "ks", TYPE_DATA_KS, TYPE_ALIAS_KS, null},
+//    };
+
+    private static final Object[][] KEY_DATA = {};
+
+    @SuppressWarnings("unused")
+    private static void initFromTables() {
+        for (Object[] keyDataEntry : KEY_DATA) {
+            String legacyKeyId = (String)keyDataEntry[0];
+            String bcpKeyId = (String)keyDataEntry[1];
+            String[][] typeData = (String[][])keyDataEntry[2];
+            String[][] typeAliasData = (String[][])keyDataEntry[3];
+            String[][] bcpTypeAliasData = (String[][])keyDataEntry[4];
+
+            boolean hasSameKey = false;
+            if (bcpKeyId == null) {
+                bcpKeyId = legacyKeyId;
+                hasSameKey = true;
+            }
+
+            // reverse type alias map (canonical legacy type -> its aliases); stays null without alias data
+            Map<String, Set<String>> typeAliasMap = null;
+            if (typeAliasData != null) {
+                typeAliasMap = new HashMap<String, Set<String>>();
+                for (String[] typeAliasDataEntry : typeAliasData) {
+                    String from = typeAliasDataEntry[0];
+                    String to = typeAliasDataEntry[1];
+                    Set<String> aliasSet = typeAliasMap.get(to);
+                    if (aliasSet == null) {
+                        aliasSet = new HashSet<String>();
+                        typeAliasMap.put(to, aliasSet);
+                    }
+                    aliasSet.add(from);
+                }
+            }
+
+            // reverse BCP type alias map (canonical bcp type -> its aliases); stays null without alias data
+            Map<String, Set<String>> bcpTypeAliasMap = null;
+            if (bcpTypeAliasData != null) {
+                bcpTypeAliasMap = new HashMap<String, Set<String>>();
+                for (String[] bcpTypeAliasDataEntry : bcpTypeAliasData) {
+                    String from = bcpTypeAliasDataEntry[0];
+                    String to = bcpTypeAliasDataEntry[1];
+                    Set<String> aliasSet = bcpTypeAliasMap.get(to);
+                    if (aliasSet == null) {
+                        aliasSet = new HashSet<String>();
+                        bcpTypeAliasMap.put(to, aliasSet);
+                    }
+                    aliasSet.add(from);
+                }
+            }
+
+            // Type map data
+            assert typeData != null;
+            Map<String, Type> typeDataMap = new HashMap<String, Type>();
+            Set<SpecialType> specialTypeSet = null;
+
+            for (String[] typeDataEntry : typeData) {
+                String legacyTypeId = typeDataEntry[0];
+                String bcpTypeId = typeDataEntry[1];
+
+                // special types
+                boolean isSpecialType = false;
+                for (SpecialType st : SpecialType.values()) {
+                    if (legacyTypeId.equals(st.toString())) {
+                        isSpecialType = true;
+                        if (specialTypeSet == null) {
+                            specialTypeSet = new HashSet<SpecialType>();
+                        }
+                        specialTypeSet.add(st);
+                        break;
+                    }
+                }
+                if (isSpecialType) {
+                    continue;
+                }
+
+                boolean hasSameType = false;
+                if (bcpTypeId == null) {
+                    bcpTypeId = legacyTypeId;
+                    hasSameType = true;
+                }
+
+                // Note: legacy type value should never be
+                // equivalent to bcp type value of a different
+                // type under the same key. So we use a single
+                // map for lookup.
+                Type t = new Type(legacyTypeId, bcpTypeId);
+                typeDataMap.put(AsciiUtil.toLowerString(legacyTypeId), t);
+                if (!hasSameType) {
+                    typeDataMap.put(AsciiUtil.toLowerString(bcpTypeId), t);
+                }
+
+                // Also put aliases in the index. Alias data is optional, so either map may be null here.
+                Set<String> typeAliasSet = (typeAliasMap == null) ? null : typeAliasMap.get(legacyTypeId);
+                if (typeAliasSet != null) {
+                    for (String alias : typeAliasSet) {
+                        typeDataMap.put(AsciiUtil.toLowerString(alias), t);
+                    }
+                }
+                Set<String> bcpTypeAliasSet = (bcpTypeAliasMap == null) ? null : bcpTypeAliasMap.get(bcpTypeId);
+                if (bcpTypeAliasSet != null) {
+                    for (String alias : bcpTypeAliasSet) {
+                        typeDataMap.put(AsciiUtil.toLowerString(alias), t);
+                    }
+                }
+            }
+
+            EnumSet<SpecialType> specialTypes = null;
+            if (specialTypeSet != null) {
+                specialTypes = EnumSet.copyOf(specialTypeSet);
+            }
+
+            KeyData keyData = new KeyData(legacyKeyId, bcpKeyId, typeDataMap, specialTypes);
+
+            KEYMAP.put(AsciiUtil.toLowerString(legacyKeyId), keyData);
+            if (!hasSameKey) {
+                KEYMAP.put(AsciiUtil.toLowerString(bcpKeyId), keyData);
+            }
+        }
+    }
+
+    private static final Map<String, KeyData> KEYMAP;
+
+    static {
+        KEYMAP = new HashMap<String, KeyData>();
+//        initFromTables();
+        initFromResourceBundle();
+    }
+
+}
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/UnicodeLocaleExtension.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/UnicodeLocaleExtension.java
index 5a3e3f35499..96b50b74ddc 100644
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/UnicodeLocaleExtension.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/UnicodeLocaleExtension.java
@@ -1,6 +1,6 @@
 /*
  *******************************************************************************
- * Copyright (C) 2009-2010, International Business Machines Corporation and *
+ * Copyright (C) 2009-2014, International Business Machines Corporation and *
  * others. All Rights Reserved.
* ******************************************************************************* */ @@ -99,4 +99,23 @@ public class UnicodeLocaleExtension extends Extension { // 3*8alphanum return (s.length() >= 3) && (s.length() <= 8) && AsciiUtil.isAlphaNumericString(s); } + + public static boolean isType(String s) { + // sequence of type subtags delimited by '-' + int startIdx = 0; + boolean sawSubtag = false; + while (true) { + int idx = s.indexOf(LanguageTag.SEP, startIdx); + String subtag = idx < 0 ? s.substring(startIdx) : s.substring(startIdx, idx); + if (!isTypeSubtag(subtag)) { + return false; + } + sawSubtag = true; + if (idx < 0) { + break; + } + startIdx = idx + 1; + } + return sawSubtag && startIdx < s.length(); + } } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/util/ULocale.java b/icu4j/main/classes/core/src/com/ibm/icu/util/ULocale.java index b9c5c01538f..bc57266c4fd 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/util/ULocale.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/util/ULocale.java @@ -40,6 +40,7 @@ import com.ibm.icu.impl.locale.LocaleExtensions; import com.ibm.icu.impl.locale.LocaleSyntaxException; import com.ibm.icu.impl.locale.ParseStatus; import com.ibm.icu.impl.locale.UnicodeLocaleExtension; +import com.ibm.icu.impl.locale.KeyTypeData; import com.ibm.icu.text.LocaleDisplayNames; import com.ibm.icu.text.LocaleDisplayNames.DialectHandling; @@ -3218,6 +3219,149 @@ public final class ULocale implements Serializable, Comparable { return getInstance(bldr.getBaseLocale(), bldr.getLocaleExtensions()); } + /** + * Converts the specified keyword (legacy key, or BCP 47 Unicode locale + * extension key) to the equivalent BCP 47 Unicode locale extension key. + * For example, BCP 47 Unicode locale extension key "co" is returned for + * the input keyword "collation". + *

+ * When the specified keyword is unknown, but satisfies the BCP syntax, + * then the lower-case version of the input keyword will be returned. + * For example, + * toUnicodeLocaleKey("ZZ") returns "zz". + * + * @param keyword the input locale keyword (either legacy key + * such as "collation" or BCP 47 Unicode locale extension + * key such as "co"). + * @return the well-formed BCP 47 Unicode locale extension key, + * or null if the specified locale keyword cannot be mapped + * to a well-formed BCP 47 Unicode locale extension key. + * @see #toLegacyKey(String) + * @draft ICU 54 + * @provisional This API might change or be removed in a future release. + */ + public static String toUnicodeLocaleKey(String keyword) { + String uniLocKey = KeyTypeData.toBcpKey(keyword); + if (uniLocKey == null && UnicodeLocaleExtension.isKey(keyword)) { + // unknown keyword, but syntax is fine.. + uniLocKey = AsciiUtil.toLowerString(keyword); + } + return uniLocKey; + } + + /** + * Converts the specified keyword value (legacy type, or BCP 47 + * Unicode locale extension type) to the well-formed BCP 47 Unicode locale + * extension type for the specified keyword (category). For example, BCP 47 + * Unicode locale extension type "phonebk" is returned for the input + * keyword value "phonebook", with the keyword "collation" (or "co"). + *

+ * When the specified keyword is not recognized, but the specified value + * satisfies the syntax of the BCP 47 Unicode locale extension type, + * or when the specified keyword allows 'variable' type and the specified + * value satisfies the syntax, the lower-case version of the input value + * will be returned. For example, + * toUnicodeLocaleType("Foo", "Bar") returns "bar", + * toUnicodeLocaleType("variableTop", "00A4") returns "00a4". + * + * @param keyword the locale keyword (either legacy key such as + * "collation" or BCP 47 Unicode locale extension + * key such as "co"). + * @param value the locale keyword value (either legacy type + * such as "phonebook" or BCP 47 Unicode locale extension + * type such as "phonebk"). + * @return the well-formed BCP47 Unicode locale extension type, + * or null if the locale keyword value cannot be mapped to + * a well-formed BCP 47 Unicode locale extension type. + * @see #toLegacyType(String, String) + * @draft ICU 54 + * @provisional This API might change or be removed in a future release. + */ + public static String toUnicodeLocaleType(String keyword, String value) { + String bcpType = KeyTypeData.toBcpType(keyword, value, null, null); + if (bcpType == null && UnicodeLocaleExtension.isType(value)) { + // unknown keyword, or known keyword with an unmapped type, but the value syntax is fine.. + bcpType = AsciiUtil.toLowerString(value); + } + return bcpType; + } + + /** + * Converts the specified keyword (BCP 47 Unicode locale extension key, or + * legacy key) to the legacy key. For example, legacy key "collation" is + * returned for the input BCP 47 Unicode locale extension key "co". + * + * @param keyword the input locale keyword (either BCP 47 Unicode locale + * extension key or legacy key). + * @return the well-formed legacy key, or null if the specified + * keyword cannot be mapped to a well-formed legacy key. + * @see #toUnicodeLocaleKey(String) + * @draft ICU 54 + * @provisional This API might change or be removed in a future release. 
+     */
+    public static String toLegacyKey(String keyword) {
+        String legacyKey = KeyTypeData.toLegacyKey(keyword);
+        if (legacyKey == null) {
+            // Checks if the specified locale key is well-formed with the legacy locale syntax.
+            //
+            // Note:
+            //  Neither ICU nor LDML/CLDR provides the definition of keyword syntax.
+            //  However, a key should not contain '=' obviously. For now, all existing
+            //  keys are using ASCII alphabetic letters only. We won't add any new key
+            //  that is not compatible with the BCP 47 syntax. Therefore, we assume
+            //  a valid key consists of one or more [0-9a-zA-Z] characters (never empty), no symbols.
+            if (keyword.matches("[0-9a-zA-Z]+")) {
+                legacyKey = AsciiUtil.toLowerString(keyword);
+            }
+        }
+        return legacyKey;
+    }
+
+    /**
+     * Converts the specified keyword value (BCP 47 Unicode locale extension type,
+     * or legacy type or type alias) to the canonical legacy type. For example,
+     * the legacy type "phonebook" is returned for the input BCP 47 Unicode
+     * locale extension type "phonebk" with the keyword "collation" (or "co").
+     *

+ * When the specified keyword is not recognized, but the specified value + * satisfies the syntax of legacy key, or when the specified keyword + * allows 'variable' type and the specified value satisfies the syntax, + * the lower-case version of the input value will be returned. + * For example, + * toLegacyType("Foo", "Bar") returns "bar", + * toLegacyType("vt", "00A4") returns "00a4". + * + * @param keyword the locale keyword (either legacy keyword such as + * "collation" or BCP 47 Unicode locale extension + * key such as "co"). + * @param value the locale keyword value (either BCP 47 Unicode locale + * extension type such as "phonebk" or legacy keyword value + * such as "phonebook"). + * @return the well-formed legacy type, or null if the specified + * keyword value cannot be mapped to a well-formed legacy + * type. + * @see #toUnicodeLocaleType(String, String) + * @draft ICU 54 + * @provisional This API might change or be removed in a future release. + */ + public static String toLegacyType(String keyword, String value) { + String legacyType = KeyTypeData.toLegacyType(keyword, value, null, null); + if (legacyType == null) { + // Checks if the specified locale type is well-formed with the legacy locale syntax. + // + // Note: + // Neither ICU nor LDML/CLDR provides the definition of keyword syntax. + // However, a type should not contain '=' obviously. For now, all existing + // types are using ASCII alphabetic letters with a few symbol letters. We won't + // add any new type that is not compatible with the BCP 47 syntax except timezone + // IDs. For now, we assume a valid type start with [0-9a-zA-Z], but may contain + // '-' '_' '/' in the middle. 
+ if (value.matches("[0-9a-zA-Z]+([_/\\-][0-9a-zA-Z]+)*")) { + legacyType = AsciiUtil.toLowerString(value); + } + } + return legacyType; + } /** * Builder is used to build instances of ULocale @@ -3591,8 +3735,8 @@ public final class ULocale implements Serializable, Comparable { for (String bcpKey : ukeys) { String bcpType = uext.getUnicodeLocaleType(bcpKey); // convert to legacy key/type - String lkey = bcp47ToLDMLKey(bcpKey); - String ltype = bcp47ToLDMLType(lkey, ((bcpType.length() == 0) ? "yes" : bcpType)); // use "yes" as the value of typeless keywords + String lkey = toLegacyKey(bcpKey); + String ltype = toLegacyType(bcpKey, ((bcpType.length() == 0) ? "yes" : bcpType)); // use "yes" as the value of typeless keywords // special handling for u-va-posix, since this is a variant, not a keyword if (lkey.equals("va") && ltype.equals("posix") && base.getVariant().length() == 0) { id = id + "_POSIX"; @@ -3675,8 +3819,8 @@ public final class ULocale implements Serializable, Comparable { } } } else if (key.length() >= 2) { - String bcpKey = ldmlKeyToBCP47(key); - String bcpType = ldmlTypeToBCP47(key, getKeywordValue(key)); + String bcpKey = toUnicodeLocaleKey(key); + String bcpType = toUnicodeLocaleType(key, getKeywordValue(key)); if (bcpKey != null && bcpType != null) { try { intbld.setUnicodeLocaleKeyword(bcpKey, bcpType); @@ -3699,161 +3843,6 @@ public final class ULocale implements Serializable, Comparable { return extensions; } - // - // LDML legacy/BCP47 key and type mapping functions - // - private static String ldmlKeyToBCP47(String key) { - UResourceBundle keyTypeData = UResourceBundle.getBundleInstance( - ICUResourceBundle.ICU_BASE_NAME, - "keyTypeData", - ICUResourceBundle.ICU_DATA_CLASS_LOADER); - UResourceBundle keyMap = keyTypeData.get("keyMap"); - - // normalize key to lowercase - key = AsciiUtil.toLowerString(key); - String bcpKey = null; - try { - bcpKey = keyMap.getString(key); - if (bcpKey.length() == 0) { - // empty value indicates the BCP47 key is 
same with the legacy key - bcpKey = key; - } - } catch (MissingResourceException mre) { - // fall through - } - - if (bcpKey == null) { - if (key.length() == 2 && LanguageTag.isExtensionSubtag(key)) { - return key; - } - return null; - } - return bcpKey; - } - - private static String bcp47ToLDMLKey(String bcpKey) { - UResourceBundle keyTypeData = UResourceBundle.getBundleInstance( - ICUResourceBundle.ICU_BASE_NAME, - "keyTypeData", - ICUResourceBundle.ICU_DATA_CLASS_LOADER); - UResourceBundle keyMap = keyTypeData.get("keyMap"); - - // normalize bcp key to lowercase - bcpKey = AsciiUtil.toLowerString(bcpKey); - String key = null; - for (int i = 0; i < keyMap.getSize(); i++) { - UResourceBundle mapData = keyMap.get(i); - String tmpBcpKey = mapData.getString(); - if (tmpBcpKey.length() == 0) { - // empty value indicates the BCP47 key is same with the legacy key - tmpBcpKey = mapData.getKey(); - } - if (bcpKey.equals(tmpBcpKey)) { - key = mapData.getKey(); - break; - } - } - if (key == null) { - return bcpKey; - } - return key; - } - - private static String ldmlTypeToBCP47(String key, String type) { - UResourceBundle keyTypeData = UResourceBundle.getBundleInstance( - ICUResourceBundle.ICU_BASE_NAME, - "keyTypeData", - ICUResourceBundle.ICU_DATA_CLASS_LOADER); - UResourceBundle typeMap = keyTypeData.get("typeMap"); - - // keys are case-insensitive, while types are case-sensitive - // TODO: make types case insensitive - key = AsciiUtil.toLowerString(key); - UResourceBundle typeMapForKey = null; - String bcpType = null; - String typeResKey = key.equals("timezone") ? type.replace('/', ':') : type; - try { - typeMapForKey = typeMap.get(key); - bcpType = typeMapForKey.getString(typeResKey); - if (bcpType.length() == 0) { - // empty value indicates the BCP47 type is same with the legacy type - bcpType = type; - } - } catch (MissingResourceException mre) { - // fall through - } - - if (bcpType == null && typeMapForKey != null) { - // is this type alias? 
- UResourceBundle typeAlias = keyTypeData.get("typeAlias"); - try { - UResourceBundle typeAliasForKey = typeAlias.get(key); - typeResKey = typeAliasForKey.getString(typeResKey); - bcpType = typeMapForKey.getString(typeResKey.replace('/', ':')); - if (bcpType.length() == 0) { - // empty value indicates the BCP47 type is same with the legacy type - bcpType = typeResKey; - } - } catch (MissingResourceException mre) { - // fall through - } - } - - if (bcpType == null) { - int typeLen = type.length(); - if (typeLen >= 3 && typeLen <= 8 && LanguageTag.isExtensionSubtag(type)) { - return type; - } - return null; - } - return bcpType; - } - - private static String bcp47ToLDMLType(String key, String bcpType) { - UResourceBundle keyTypeData = UResourceBundle.getBundleInstance( - ICUResourceBundle.ICU_BASE_NAME, - "keyTypeData", - ICUResourceBundle.ICU_DATA_CLASS_LOADER); - UResourceBundle typeMap = keyTypeData.get("typeMap"); - - // normalize key/bcpType to lowercase - key = AsciiUtil.toLowerString(key); - bcpType = AsciiUtil.toLowerString(bcpType); - - String type = null; - try { - UResourceBundle typeMapForKey = typeMap.get(key); - - // Note: Linear search for time zone ID might be too slow. - // ICU services do not use timezone keywords for now. - // In future, we may need to build the optimized inverse - // lookup table. 
- - for (int i = 0; i < typeMapForKey.getSize(); i++) { - UResourceBundle mapData = typeMapForKey.get(i); - String tmpBcpType = mapData.getString(); - if (tmpBcpType.length() == 0) { - // empty value indicates the BCP47 type is same with the legacy type - tmpBcpType = mapData.getKey(); - } - if (bcpType.equals(tmpBcpType)) { - type = mapData.getKey(); - if (key.equals("timezone")) { - type = type.replace(':', '/'); - } - break; - } - } - } catch (MissingResourceException mre) { - // fall through - } - - if (type == null) { - return bcpType; - } - return type; - } - /* * JDK Locale Helper */ @@ -4073,9 +4062,9 @@ public final class ULocale implements Serializable, Comparable { if (kwKey.length() != 1) { // Unicode locale key - kwKey = bcp47ToLDMLKey(kwKey); + kwKey = toLegacyKey(kwKey); // use "yes" as the value of typeless keywords - kwVal = bcp47ToLDMLType(kwKey, ((kwVal.length() == 0) ? "yes" : kwVal)); + kwVal = toLegacyType(kwKey, ((kwVal.length() == 0) ? "yes" : kwVal)); } if (addSep) { diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java index 027980554cc..93c3b3a0991 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java @@ -3874,7 +3874,7 @@ public class ULocaleTest extends TestFmwk { {"en@timezone=America/New_York;calendar=japanese", "en-u-ca-japanese-tz-usnyc"}, {"en@timezone=US/Eastern", "en-u-tz-usnyc"}, {"en@x=x-y-z;a=a-b-c", "en-x-x-y-z"}, - {"it@collation=badcollationtype;colStrength=identical;cu=usd-eur", "it-u-ks-identic"}, + {"it@collation=badcollationtype;colStrength=identical;cu=usd-eur", "it-u-cu-usd-eur-ks-identic"}, {"en_US_POSIX", "en-US-u-va-posix"}, {"en_US_POSIX@calendar=japanese;currency=EUR","en-US-u-ca-japanese-cu-eur-va-posix"}, {"@x=elmer", "x-elmer"}, @@ -4449,4 +4449,108 @@ public class ULocaleTest extends TestFmwk { } } } + + public 
void TestToUnicodeLocaleKey() { + String[][] DATA = { + {"calendar", "ca"}, + {"CALEndar", "ca"}, // difference casing + {"ca", "ca"}, // bcp key itself + {"kv", "kv"}, // no difference between legacy and bcp + {"foo", null}, // unknown, bcp ill-formed + {"ZZ", "zz"}, // unknown, bcp well-formed + }; + + for (String[] d : DATA) { + String keyword = d[0]; + String expected = d[1]; + + String bcpKey = ULocale.toUnicodeLocaleKey(keyword); + assertEquals("keyword=" + keyword, expected, bcpKey); + } + } + + public void TestToLegacyKey() { + String[][] DATA = { + {"kb", "colbackwards"}, + {"kB", "colbackwards"}, // different casing + {"Collation", "collation"}, // keyword itself with different casing + {"kv", "kv"}, // no difference between legacy and bcp + {"foo", "foo"}, // unknown, bcp ill-formed + {"ZZ", "zz"}, // unknown, bcp well-formed + {"e=mc2", null}, // unknown, bcp/legacy ill-formed + }; + + for (String[] d : DATA) { + String keyword = d[0]; + String expected = d[1]; + + String legacyKey = ULocale.toLegacyKey(keyword); + assertEquals("bcpKey=" + keyword, expected, legacyKey); + } + } + + public void TestToUnicodeLocaleType() { + String[][] DATA = { + {"tz", "Asia/Kolkata", "inccu"}, + {"calendar", "gregorian", "gregory"}, + {"ca", "gregorian", "gregory"}, + {"ca", "Gregorian", "gregory"}, + {"ca", "buddhist", "buddhist"}, + {"Calendar", "Japanese", "japanese"}, + {"calendar", "Islamic-Civil", "islamic-civil"}, + {"calendar", "islamicc", "islamic-civil"}, // bcp type alias + {"colalternate", "NON-IGNORABLE", "noignore"}, + {"colcaselevel", "yes", "true"}, + {"tz", "america/new_york", "usnyc"}, + {"tz", "Asia/Kolkata", "inccu"}, + {"timezone", "navajo", "usden"}, + {"ca", "aaaa", "aaaa"}, // unknown type, well-formed type + {"ca", "gregory-japanese-islamic", "gregory-japanese-islamic"}, // unknown type, well-formed type + {"zz", "gregorian", null}, // unknown key, ill-formed type + {"co", "foo-", null}, // unknown type, ill-formed type + }; + + for (String[] d 
: DATA) { + String keyword = d[0]; + String value = d[1]; + String expected = d[2]; + + String bcpType = ULocale.toUnicodeLocaleType(keyword, value); + assertEquals("keyword=" + keyword + ", value=" + value, expected, bcpType); + } + + } + + public void TestToLegacyType() { + String[][] DATA = { + {"calendar", "gregory", "gregorian"}, + {"ca", "gregory", "gregorian"}, + {"ca", "Gregory", "gregorian"}, + {"ca", "buddhist", "buddhist"}, + {"Calendar", "Japanese", "japanese"}, + {"calendar", "Islamic-Civil", "islamic-civil"}, + {"calendar", "islamicc", "islamic-civil"}, // bcp type alias + {"colalternate", "noignore", "non-ignorable"}, + {"colcaselevel", "true", "yes"}, + {"tz", "usnyc", "America/New_York"}, + {"tz", "inccu", "Asia/Calcutta"}, + {"timezone", "usden", "America/Denver"}, + {"timezone", "usnavajo", "America/Denver"}, // bcp type alias + {"colstrength", "quarternary", "quaternary"}, // type alias + {"ca", "aaaa", "aaaa"}, // unknown type + {"calendar", "gregory-japanese-islamic", "gregory-japanese-islamic"}, // unknown type, well-formed type + {"zz", "gregorian", "gregorian"}, // unknown key, bcp ill-formed type + {"ca", "gregorian-calendar", "gregorian-calendar"}, // known key, bcp ill-formed type + {"co", "e=mc2", null}, // known key, ill-formed bcp/legacy type + }; + + for (String[] d : DATA) { + String keyword = d[0]; + String value = d[1]; + String expected = d[2]; + + String legacyType = ULocale.toLegacyType(keyword, value); + assertEquals("keyword=" + keyword + ", value=" + value, expected, legacyType); + } + } } -- 2.40.0