From 565abe480896735739b21164407ab287c883709e Mon Sep 17 00:00:00 2001 From: Mark Davis Date: Mon, 14 Dec 2015 16:14:40 +0000 Subject: [PATCH] ICU-4229 Enhanced the checking further. X-SVN-Rev: 38128 --- .../impl/locale/LocaleValidityChecker.java | 76 ++++++++++++--- .../icu/dev/test/util/TestLocaleValidity.java | 96 ++++++++++++++----- 2 files changed, 132 insertions(+), 40 deletions(-) diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LocaleValidityChecker.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LocaleValidityChecker.java index 4e9598d0afa..b51a48f8ca1 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LocaleValidityChecker.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LocaleValidityChecker.java @@ -93,7 +93,7 @@ public class LocaleValidityChecker { if (!isValidT(locale.getExtension(c), where)) return false; break; case u: - if (!isValidU(locale.getExtension(c), where)) return false; + if (!isValidU(locale, locale.getExtension(c), where)) return false; break; } } catch (Exception e) { @@ -104,12 +104,14 @@ public class LocaleValidityChecker { } enum SpecialCase { - normal, anything, reorder, codepoints; + normal, anything, reorder, codepoints, subdivision; static SpecialCase get(String key) { if (key.equals("kr")) { return SpecialCase.reorder; } else if (key.equals("vt")) { return SpecialCase.codepoints; + } else if (key.equals("sd")) { + return subdivision; } else if (key.equals("x0")) { return anything; } else { @@ -118,15 +120,17 @@ public class LocaleValidityChecker { } } /** + * @param locale * @param extension * @param where * @return */ - private boolean isValidU(String extensionString, Where where) { + private boolean isValidU(ULocale locale, String extensionString, Where where) { String key = ""; int typeCount = 0; ValueType valueType = null; SpecialCase specialCase = null; + StringBuilder prefix = new StringBuilder(); // TODO: is empty -u- valid? 
for (String subtag : SEPARATOR.split(extensionString)) { if (subtag.length() == 2) { @@ -142,8 +146,20 @@ public class LocaleValidityChecker { typeCount = 0; } else { ++typeCount; - if (valueType == ValueType.single && typeCount > 1) { - return where.set(Datatype.u, key+"-"+subtag); + switch (valueType) { + case single: + if (typeCount > 1) { + return where.set(Datatype.u, key+"-"+subtag); + } + break; + case incremental: + if (typeCount == 1) { + prefix.setLength(0); + prefix.append(subtag); + } else { + prefix.append('-').append(subtag); + subtag = prefix.toString(); + } } switch (specialCase) { case anything: @@ -162,8 +178,13 @@ public class LocaleValidityChecker { return where.set(Datatype.u, key+"-"+subtag); } continue; + case subdivision: + if (!isSubdivision(locale, subtag)) { + return where.set(Datatype.u, key+"-"+subtag); + } + continue; } - + // en-u-sd-usca // en-US-u-sd-usca Output isKnownKey = new Output(); @@ -180,6 +201,33 @@ public class LocaleValidityChecker { return true; } + /** + * @param locale + * @param subtag + * @return + */ + private boolean isSubdivision(ULocale locale, String subtag) { + // First check if the subtag is valid + if (subtag.length() < 3) { + return false; + } + String region = subtag.substring(0, subtag.charAt(0) <= '9' ? 
3 : 2); + String subdivision = subtag.substring(region.length()); + if (ValidIdentifiers.isValid(Datatype.subdivision, datasubtypes, region, subdivision) == null) { + return false; + } + // Then check for consistency with the locale's region + String localeRegion = locale.getCountry(); + if (localeRegion.isEmpty()) { + ULocale max = ULocale.addLikelySubtags(locale); + localeRegion = max.getCountry(); + } + if (!region.equalsIgnoreCase(localeRegion)) { + return false; + } + return true; + } + static final Set REORDERING_INCLUDE = new HashSet(Arrays.asList("space", "punct", "symbol", "currency", "digit", "others")); static final Set REORDERING_EXCLUDE = new HashSet(Arrays.asList("zinh", "zyyy")); /** @@ -194,12 +242,12 @@ public class LocaleValidityChecker { return false; } return ValidIdentifiers.isValid(Datatype.script, datasubtypes, subtag) != null; -// space, punct, symbol, currency, digit - core groups of characters below 'a' -// any script code except Common and Inherited. -// sc ; Zinh ; Inherited ; Qaai -// sc ; Zyyy ; Common -// Some pairs of scripts sort primary-equal and always reorder together. For example, Katakana characters are are always reordered with Hiragana. -// others - where all codes not explicitly mentioned should be ordered. The script code Zzzz (Unknown Script) is a synonym for others. return false; + // space, punct, symbol, currency, digit - core groups of characters below 'a' + // any script code except Common and Inherited. + // sc ; Zinh ; Inherited ; Qaai + // sc ; Zyyy ; Common + // Some pairs of scripts sort primary-equal and always reorder together. For example, Katakana characters are always reordered with Hiragana. + // others - where all codes not explicitly mentioned should be ordered. The script code Zzzz (Unknown Script) is a synonym for others. 
return false; } /** @@ -235,14 +283,14 @@ public class LocaleValidityChecker { } public enum ValueType { - single, multiple, specific; + single, multiple, incremental; private static Set multipleValueTypes = new HashSet(Arrays.asList("x0", "kr", "vt")); private static Set specificValueTypes = new HashSet(Arrays.asList("ca")); static ValueType get(String key) { if (multipleValueTypes.contains(key)) { return multiple; } else if (specificValueTypes.contains(key)) { - return specific; + return incremental; } else { return single; } diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/TestLocaleValidity.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/TestLocaleValidity.java index 1f2803d181a..b7ba2b52f7d 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/TestLocaleValidity.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/TestLocaleValidity.java @@ -34,15 +34,11 @@ public class TestLocaleValidity extends TestFmwk { public void testBasic() { String[][] tests = { - {"OK", "en-u-kr-latn-digit"}, - {"Incomplete extension 'u' [at index 3]", "en-u"}, - {"Incomplete extension 't' [at index 3]", "en-t"}, {"OK", "en-u-ca-chinese"}, {"OK", "en-x-abcdefg"}, {"OK", "x-abcdefg"}, {"OK", "en-u-sd-usca"}, {"OK", "en-US-u-sd-usca"}, - {"OK", "en-AQ-u-sd-usca"}, {"OK", "en-t-it"}, {"OK", "und-Cyrl-t-und-latn"}, {"OK", "root"}, @@ -53,22 +49,9 @@ public class TestLocaleValidity extends TestFmwk { {"OK", "zh-Hant"}, {"OK", "zh-Hant-AQ"}, {"OK", "x-abcdefg-g-foobar"}, - {"Empty subtag [at index 0]", ""}, - {"{u, ca-chinesx}", "en-u-ca-chinesx"}, - {"{illegal, q}", "en-q-abcdefg"}, - {"Incomplete privateuse [at index 0]", "x-abc$defg"}, - {"{script, Latx}", "und-Cyrl-t-und-latx"}, - {"{variant, FOOBAR}", "zh-Hant-1606nict-1694acad-foobar"}, - {"{region, AB}", "zh-Hant-AB"}, - {"{language, ex}", "ex"}, - {"{script, Hanx}", "zh-Hanx"}, - {"{language, qaa}", "qaa"}, - {"Invalid subtag: $ [at index 3]", "EN-$"}, - {"Invalid subtag: $ [at index 
0]", "$"}, - // too many items - {"{u, cu-usd}", "en-u-cu-adp-usd"}, {"OK", "en-u-ca-buddhist"}, + {"OK", "en-u-ca-islamic-umalqura"}, // additive {"OK", "en-u-cf-account"}, {"OK", "en-u-co-big5han"}, {"OK", "en-u-cu-adp"}, @@ -80,17 +63,80 @@ public class TestLocaleValidity extends TestFmwk { {"OK", "en-u-kf-false"}, {"OK", "en-u-kk-false"}, {"OK", "en-u-kn-false"}, - {"OK", "en-u-kr-latn-digit-symbol"}, + {"OK", "en-u-kr-latn-digit-symbol"}, // reorder codes, multiple {"OK", "en-u-ks-identic"}, {"OK", "en-u-kv-currency"}, {"OK", "en-u-nu-ahom"}, {"OK", "en-u-sd-usny"}, {"OK", "en-u-tz-adalv"}, {"OK", "en-u-va-posix"}, - {"{u, ca-civil}", "en-u-ca-islamicc"}, // deprecated + + // really long case + + {"OK", "en-u-ca-buddhist-ca-islamic-umalqura-cf-account-co-big5han-cu-adp-fw-fri-hc-h11-ka-noignore-kb-false-kc-false-kf-false-kk-false-kn-false-kr-latn-digit-symbol-ks-identic-kv-currency-nu-ahom-sd-usny-tz-adalv-va-posix"}, + + // deprecated, but turned into valid by ULocale.Builder() + {"OK", "en-u-ca-islamicc"}, // deprecated + {"OK", "en-u-tz-aqams"}, // deprecated + + // Bad syntax (caught by ULocale.Builder()) + + {"Incomplete extension 'u' [at index 3]", "en-u"}, + {"Incomplete extension 't' [at index 3]", "en-t"}, + {"Empty subtag [at index 0]", ""}, + {"Incomplete privateuse [at index 0]", "x-abc$defg"}, + {"Invalid subtag: $ [at index 3]", "EN-$"}, + {"Invalid subtag: $ [at index 0]", "$"}, + + // bad extension + + {"{illegal, q}", "en-q-abcdefg"}, + + // bad subtags + + {"{variant, FOOBAR}", "zh-Hant-1606nict-1694acad-foobar"}, + {"{region, AB}", "zh-Hant-AB"}, + {"{language, ex}", "ex"}, + {"{script, Hanx}", "zh-Hanx"}, + {"{language, qaa}", "qaa"}, + + // bad types for keys + + {"{u, ca-chinesx}", "en-u-ca-chinesx"}, + {"{script, Latx}", "und-Cyrl-t-und-latx"}, + {"{u, sd-usca}", "en-AQ-u-sd-usca"}, + + {"{u, ca-buddhisx}", "en-u-ca-buddhisx"}, + {"{u, ca-islamic-umalqurx}", "en-u-ca-islamic-umalqurx"}, // additive + {"{u, cf-accounx}", 
"en-u-cf-accounx"}, + {"{u, co-big5hax}", "en-u-co-big5hax"}, + {"{u, cu-adx}", "en-u-cu-adx"}, + {"{u, fw-frx}", "en-u-fw-frx"}, + {"{u, hc-h1x}", "en-u-hc-h1x"}, + {"{u, ka-noignorx}", "en-u-ka-noignorx"}, + {"{u, kb-falsx}", "en-u-kb-falsx"}, + {"{u, kc-falsx}", "en-u-kc-falsx"}, + {"{u, kf-falsx}", "en-u-kf-falsx"}, + {"{u, kk-falsx}", "en-u-kk-falsx"}, + {"{u, kn-falsx}", "en-u-kn-falsx"}, + {"{u, kr-symbox}", "en-u-kr-latn-digit-symbox"}, // reorder codes, multiple + {"{u, ks-identix}", "en-u-ks-identix"}, + {"{u, kv-currencx}", "en-u-kv-currencx"}, + {"{u, nu-ahox}", "en-u-nu-ahox"}, + {"{u, sd-usnx}", "en-u-sd-usnx"}, + {"{u, tz-adalx}", "en-u-tz-adalx"}, + {"{u, va-posit}", "en-u-va-posit"}, + + + // too many items + + {"{u, cu-usd}", "en-u-cu-adp-usd"}, + + // use deprecated subtags. testDeprecated checks if they work when Datasubtype.deprecated is added + //{"{u, ca-civil}", "en-u-ca-islamicc"}, // deprecated, but turns into valid {"{u, co-direct}", "en-u-co-direct"}, // deprecated {"{u, kh}", "en-u-kh-false"}, // deprecated - {"{u, tz-aqams}", "en-u-tz-aqams"}, // deprecated + {"{u, tz-camtr}", "en-u-tz-camtr"}, // deprecated {"{u, vt}", "en-u-vt-0020-0041"}, // deprecated }; check(tests, Datasubtype.regular, Datasubtype.unknown); @@ -120,12 +166,10 @@ public class TestLocaleValidity extends TestFmwk { } public void testDeprecated() { - LocaleValidityChecker regularAndDeprecated = new LocaleValidityChecker(EnumSet.of(Datasubtype.regular, Datasubtype.deprecated)); String[][] tests = { - {"OK", "en-u-ca-islamicc"}, // deprecated {"OK", "en-u-co-direct"}, // deprecated {"OK", "en-u-kh-false"}, // deprecated - {"OK", "en-u-tz-aqams"}, // deprecated + {"OK", "en-u-tz-camtr"}, // deprecated {"OK", "en-u-vt-0020"}, // deprecated }; check(tests, Datasubtype.regular, Datasubtype.unknown, Datasubtype.deprecated); @@ -133,9 +177,9 @@ public class TestLocaleValidity extends TestFmwk { private void check(String[][] tests, Datasubtype... 
datasubtypes) { int count = 0; - LocaleValidityChecker regularAndUnknown = new LocaleValidityChecker(datasubtypes); + LocaleValidityChecker localeValidityChecker = new LocaleValidityChecker(datasubtypes); for (String[] test : tests) { - check(++count, regularAndUnknown, test[0], test[1]); + check(++count, localeValidityChecker, test[0], test[1]); } } -- 2.40.0