ICU-20108 Adding new parseLenients from CLDR 34 to ICU.

author Shane Carr <shane@unicode.org>

Fri, 15 Feb 2019 05:43:32 +0000 (21:43 -0800)

committer Shane F. Carr <shane@unicode.org>

Sat, 16 Feb 2019 00:51:17 +0000 (16:51 -0800)
author Shane Carr <shane@unicode.org>
Fri, 15 Feb 2019 05:43:32 +0000 (21:43 -0800)
committer Shane F. Carr <shane@unicode.org>
Sat, 16 Feb 2019 00:51:17 +0000 (16:51 -0800)
diff --git a/icu4c/source/common/static_unicode_sets.cpp b/icu4c/source/common/static_unicode_sets.cpp

index 5d598a0e33b6d46be0f650539d4c4dccde0e63de..5dab3931a707fdd7b44e8d7f1977e674051a6eea 100644 (file)
--- a/icu4c/source/common/static_unicode_sets.cpp
+++ b/icu4c/source/common/static_unicode_sets.cpp
@@ -23,7 +23,7 @@ using namespace icu::unisets;
  
  namespace {
  
-UnicodeSet* gUnicodeSets[COUNT] = {};
+UnicodeSet* gUnicodeSets[UNISETS_KEY_COUNT] = {};
  
  // Save the empty instance in static memory to have well-defined behavior if a
  // regular UnicodeSet cannot be allocated.
@@ -97,14 +97,28 @@ class ParseDataSink : public ResourceSink {
                              saveSet(isLenient ? COMMA : STRICT_COMMA, str, status);
                          } else if (str.indexOf(u'+') != -1) {
                              saveSet(PLUS_SIGN, str, status);
-                        } else if (str.indexOf(u'‒') != -1) {
+                        } else if (str.indexOf(u'-') != -1) {
                              saveSet(MINUS_SIGN, str, status);
                          } else if (str.indexOf(u'$') != -1) {
                              saveSet(DOLLAR_SIGN, str, status);
                          } else if (str.indexOf(u'£') != -1) {
                              saveSet(POUND_SIGN, str, status);
-                        } else if (str.indexOf(u'₨') != -1) {
+                        } else if (str.indexOf(u'₹') != -1) {
                              saveSet(RUPEE_SIGN, str, status);
+                        } else if (str.indexOf(u'¥') != -1) {
+                            saveSet(YEN_SIGN, str, status);
+                        } else if (str.indexOf(u'₩') != -1) {
+                            saveSet(WON_SIGN, str, status);
+                        } else if (str.indexOf(u'%') != -1) {
+                            saveSet(PERCENT_SIGN, str, status);
+                        } else if (str.indexOf(u'‰') != -1) {
+                            saveSet(PERMILLE_SIGN, str, status);
+                        } else if (str.indexOf(u'’') != -1) {
+                            saveSet(APOSTROPHE_SIGN, str, status);
+                        } else {
+                            // Unknown class of parse lenients
+                            // TODO(ICU-20428): Make ICU automatically accept new classes?
+                            U_ASSERT(FALSE);
                          }
                          if (U_FAILURE(status)) { return; }
                      }
@@ -122,7 +136,7 @@ UBool U_CALLCONV cleanupNumberParseUniSets() {
          reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet)->~UnicodeSet();
          gEmptyUnicodeSetInitialized = FALSE;
      }
-    for (int32_t i = 0; i < COUNT; i++) {
+    for (int32_t i = 0; i < UNISETS_KEY_COUNT; i++) {
          delete gUnicodeSets[i];
          gUnicodeSets[i] = nullptr;
      }
@@ -155,27 +169,35 @@ void U_CALLCONV initNumberParseUniSets(UErrorCode& status) {
      U_ASSERT(gUnicodeSets[STRICT_COMMA] != nullptr);
      U_ASSERT(gUnicodeSets[PERIOD] != nullptr);
      U_ASSERT(gUnicodeSets[STRICT_PERIOD] != nullptr);
+    U_ASSERT(gUnicodeSets[APOSTROPHE_SIGN] != nullptr);
  
-    gUnicodeSets[OTHER_GROUPING_SEPARATORS] = new UnicodeSet(
-            u"['٬‘’＇\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]", status);
+    LocalPointer<UnicodeSet> otherGrouping(new UnicodeSet(
+        u"[٬‘\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]",
+        status
+    ), status);
+    if (U_FAILURE(status)) { return; }
+    otherGrouping->addAll(*gUnicodeSets[APOSTROPHE_SIGN]);
+    gUnicodeSets[OTHER_GROUPING_SEPARATORS] = otherGrouping.orphan();
      gUnicodeSets[ALL_SEPARATORS] = computeUnion(COMMA, PERIOD, OTHER_GROUPING_SEPARATORS);
      gUnicodeSets[STRICT_ALL_SEPARATORS] = computeUnion(
              STRICT_COMMA, STRICT_PERIOD, OTHER_GROUPING_SEPARATORS);
  
      U_ASSERT(gUnicodeSets[MINUS_SIGN] != nullptr);
      U_ASSERT(gUnicodeSets[PLUS_SIGN] != nullptr);
+    U_ASSERT(gUnicodeSets[PERCENT_SIGN] != nullptr);
+    U_ASSERT(gUnicodeSets[PERMILLE_SIGN] != nullptr);
  
-    gUnicodeSets[PERCENT_SIGN] = new UnicodeSet(u"[%٪]", status);
-    gUnicodeSets[PERMILLE_SIGN] = new UnicodeSet(u"[‰؉]", status);
-    gUnicodeSets[INFINITY_KEY] = new UnicodeSet(u"[∞]", status);
+    gUnicodeSets[INFINITY_SIGN] = new UnicodeSet(u"[∞]", status);
+    if (U_FAILURE(status)) { return; }
  
      U_ASSERT(gUnicodeSets[DOLLAR_SIGN] != nullptr);
      U_ASSERT(gUnicodeSets[POUND_SIGN] != nullptr);
      U_ASSERT(gUnicodeSets[RUPEE_SIGN] != nullptr);
-    gUnicodeSets[YEN_SIGN] = new UnicodeSet(u"[¥\\uffe5]", status);
+    U_ASSERT(gUnicodeSets[YEN_SIGN] != nullptr);
+    U_ASSERT(gUnicodeSets[WON_SIGN] != nullptr);
  
      gUnicodeSets[DIGITS] = new UnicodeSet(u"[:digit:]", status);
-
+    if (U_FAILURE(status)) { return; }
      gUnicodeSets[DIGITS_OR_ALL_SEPARATORS] = computeUnion(DIGITS, ALL_SEPARATORS);
      gUnicodeSets[DIGITS_OR_STRICT_ALL_SEPARATORS] = computeUnion(DIGITS, STRICT_ALL_SEPARATORS);
  
diff --git a/icu4c/source/common/static_unicode_sets.h b/icu4c/source/common/static_unicode_sets.h

index 0332ee663730cff7a30858bb56cd4ee305e6ea11..5d90ce5908de9817be4069ed717ff56f04d6fa0d 100644 (file)
--- a/icu4c/source/common/static_unicode_sets.h
+++ b/icu4c/source/common/static_unicode_sets.h
@@ -45,6 +45,7 @@ enum Key {
      PERIOD,
      STRICT_COMMA,
      STRICT_PERIOD,
+    APOSTROPHE_SIGN,
      OTHER_GROUPING_SEPARATORS,
      ALL_SEPARATORS,
      STRICT_ALL_SEPARATORS,
@@ -54,13 +55,14 @@ enum Key {
      PLUS_SIGN,
      PERCENT_SIGN,
      PERMILLE_SIGN,
-    INFINITY_KEY, // INFINITY is defined in cmath
+    INFINITY_SIGN,
  
      // Currency Symbols
      DOLLAR_SIGN,
      POUND_SIGN,
      RUPEE_SIGN,
-    YEN_SIGN, // not in CLDR data, but Currency.java wants it
+    YEN_SIGN,
+    WON_SIGN,
  
      // Other
      DIGITS,
@@ -70,7 +72,7 @@ enum Key {
      DIGITS_OR_STRICT_ALL_SEPARATORS,
  
      // The number of elements in the enum.
-    COUNT
+    UNISETS_KEY_COUNT
  };
  
  /**
@@ -126,8 +128,9 @@ static const struct {
  } kCurrencyEntries[] = {
      {DOLLAR_SIGN, u'$'},
      {POUND_SIGN, u'£'},
-    {RUPEE_SIGN, u'₨'},
+    {RUPEE_SIGN, u'₹'},
      {YEN_SIGN, u'¥'},
+    {WON_SIGN, u'₩'},
  };
  
  } // namespace unisets
diff --git a/icu4c/source/i18n/numparse_symbols.cpp b/icu4c/source/i18n/numparse_symbols.cpp

index 9ccceec8475d01a141cd1e1ebc1dd48b105334a5..e0daab9374f8b1b8ec0e357829160b433d3d6d18 100644 (file)
--- a/icu4c/source/i18n/numparse_symbols.cpp
+++ b/icu4c/source/i18n/numparse_symbols.cpp
@@ -90,7 +90,7 @@ void IgnorablesMatcher::accept(StringSegment&, ParsedNumber&) const {
  
  
  InfinityMatcher::InfinityMatcher(const DecimalFormatSymbols& dfs)
-        : SymbolMatcher(dfs.getConstSymbol(DecimalFormatSymbols::kInfinitySymbol), unisets::INFINITY_KEY) {
+        : SymbolMatcher(dfs.getConstSymbol(DecimalFormatSymbols::kInfinitySymbol), unisets::INFINITY_SIGN) {
  }
  
  bool InfinityMatcher::isDisabled(const ParsedNumber& result) const {
diff --git a/icu4c/source/test/intltest/numbertest_parse.cpp b/icu4c/source/test/intltest/numbertest_parse.cpp

index e391f5904e17725af58ebb11e573a60e90086af0..53c527cc06a08cd92e6830d66d7902ab1078c8c6 100644 (file)
--- a/icu4c/source/test/intltest/numbertest_parse.cpp
+++ b/icu4c/source/test/intltest/numbertest_parse.cpp
@@ -14,8 +14,6 @@
  #include <cmath>
  #include <numparse_affixes.h>
  
-using icu::unisets::get;
-
  void NumberParserTest::runIndexedTest(int32_t index, UBool exec, const char*& name, char*) {
      if (exec) {
          logln("TestSuite NumberParserTest: ");
diff --git a/icu4c/source/test/intltest/static_unisets_test.cpp b/icu4c/source/test/intltest/static_unisets_test.cpp

index bfe699635269efd54937027558cb489943ac5c20..5cc946bc225d4475f8b0bb6e0a2c09a94e705d9c 100644 (file)
--- a/icu4c/source/test/intltest/static_unisets_test.cpp
+++ b/icu4c/source/test/intltest/static_unisets_test.cpp
@@ -34,7 +34,10 @@ void StaticUnicodeSetsTest::runIndexedTest(int32_t index, UBool exec, const char
          logln("TestSuite StaticUnicodeSetsTest: ");
      }
      TESTCASE_AUTO_BEGIN;
-        TESTCASE_AUTO(testSetCoverage);
+        if (!quick) {
+            // Slow test: run in exhaustive mode only
+            TESTCASE_AUTO(testSetCoverage);
+        }
          TESTCASE_AUTO(testNonEmpty);
      TESTCASE_AUTO_END;
  }
@@ -64,7 +67,7 @@ void StaticUnicodeSetsTest::testSetCoverage() {
      const UnicodeSet &minusSign = *get(unisets::MINUS_SIGN);
      const UnicodeSet &percent = *get(unisets::PERCENT_SIGN);
      const UnicodeSet &permille = *get(unisets::PERMILLE_SIGN);
-    const UnicodeSet &infinity = *get(unisets::INFINITY_KEY);
+    const UnicodeSet &infinity = *get(unisets::INFINITY_SIGN);
  
      int32_t localeCount;
      const Locale* allAvailableLocales = Locale::getAvailableLocales(localeCount);
@@ -87,7 +90,7 @@ void StaticUnicodeSetsTest::testSetCoverage() {
  }
  
  void StaticUnicodeSetsTest::testNonEmpty() {
-    for (int32_t i=0; i<unisets::COUNT; i++) {
+    for (int32_t i=0; i<unisets::UNISETS_KEY_COUNT; i++) {
          if (i == unisets::EMPTY) {
              continue;
          }
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/StaticUnicodeSets.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/StaticUnicodeSets.java

index 63f250a012aca982699d07fe1d71c5458f22b2e9..18c8c9bf10acd94e0a670d9aa804c26330666176 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/StaticUnicodeSets.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/StaticUnicodeSets.java
@@ -38,6 +38,7 @@ public class StaticUnicodeSets {
          PERIOD,
          STRICT_COMMA,
          STRICT_PERIOD,
+        APOSTROPHE_SIGN,
          OTHER_GROUPING_SEPARATORS,
          ALL_SEPARATORS,
          STRICT_ALL_SEPARATORS,
@@ -48,13 +49,14 @@ public class StaticUnicodeSets {
          PLUS_SIGN,
          PERCENT_SIGN,
          PERMILLE_SIGN,
-        INFINITY,
+        INFINITY_SIGN,
  
          // Currency Symbols
          DOLLAR_SIGN,
          POUND_SIGN,
          RUPEE_SIGN,
-        YEN_SIGN, // not in CLDR data, but Currency.java wants it
+        YEN_SIGN,
+        WON_SIGN,
  
          // Other
          DIGITS,
@@ -64,7 +66,7 @@ public class StaticUnicodeSets {
          DIGITS_OR_STRICT_ALL_SEPARATORS,
      };
  
-    private static final Map<Key, UnicodeSet> unicodeSets = new EnumMap<Key, UnicodeSet>(Key.class);
+    private static final Map<Key, UnicodeSet> unicodeSets = new EnumMap<>(Key.class);
  
      /**
       * Gets the static-allocated UnicodeSet according to the provided key.
@@ -126,6 +128,8 @@ public class StaticUnicodeSets {
              return Key.RUPEE_SIGN;
          } else if (get(Key.YEN_SIGN).contains(str)) {
              return Key.YEN_SIGN;
+        } else if (get(Key.WON_SIGN).contains(str)) {
+            return Key.WON_SIGN;
          } else {
              return null;
          }
@@ -197,14 +201,27 @@ public class StaticUnicodeSets {
                                  saveSet(isLenient ? Key.COMMA : Key.STRICT_COMMA, str);
                              } else if (str.indexOf('+') != -1) {
                                  saveSet(Key.PLUS_SIGN, str);
-                            } else if (str.indexOf('‒') != -1) {
+                            } else if (str.indexOf('-') != -1) {
                                  saveSet(Key.MINUS_SIGN, str);
                              } else if (str.indexOf('$') != -1) {
                                  saveSet(Key.DOLLAR_SIGN, str);
                              } else if (str.indexOf('£') != -1) {
                                  saveSet(Key.POUND_SIGN, str);
-                            } else if (str.indexOf('₨') != -1) {
+                            } else if (str.indexOf('₹') != -1) {
                                  saveSet(Key.RUPEE_SIGN, str);
+                            } else if (str.indexOf('¥') != -1) {
+                                saveSet(Key.YEN_SIGN, str);
+                            } else if (str.indexOf('₩') != -1) {
+                                saveSet(Key.WON_SIGN, str);
+                            } else if (str.indexOf('%') != -1) {
+                                saveSet(Key.PERCENT_SIGN, str);
+                            } else if (str.indexOf('‰') != -1) {
+                                saveSet(Key.PERMILLE_SIGN, str);
+                            } else if (str.indexOf('’') != -1) {
+                                saveSet(Key.APOSTROPHE_SIGN, str);
+                            } else {
+                                // TODO(ICU-20428): Make ICU automatically accept new classes?
+                                throw new AssertionError("Unknown class of parse lenients: " + str);
                              }
                          }
                      }
@@ -230,9 +247,12 @@ public class StaticUnicodeSets {
          assert unicodeSets.containsKey(Key.STRICT_COMMA);
          assert unicodeSets.containsKey(Key.PERIOD);
          assert unicodeSets.containsKey(Key.STRICT_PERIOD);
+        assert unicodeSets.containsKey(Key.APOSTROPHE_SIGN);
  
-        unicodeSets.put(Key.OTHER_GROUPING_SEPARATORS,
-                new UnicodeSet("['٬‘’＇\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]").freeze());
+        UnicodeSet otherGrouping = new UnicodeSet(
+                "[٬‘\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]");
+        otherGrouping.addAll(unicodeSets.get(Key.APOSTROPHE_SIGN));
+        unicodeSets.put(Key.OTHER_GROUPING_SEPARATORS, otherGrouping.freeze());
          unicodeSets.put(Key.ALL_SEPARATORS,
                  computeUnion(Key.COMMA, Key.PERIOD, Key.OTHER_GROUPING_SEPARATORS));
          unicodeSets.put(Key.STRICT_ALL_SEPARATORS,
@@ -240,15 +260,16 @@ public class StaticUnicodeSets {
  
          assert unicodeSets.containsKey(Key.MINUS_SIGN);
          assert unicodeSets.containsKey(Key.PLUS_SIGN);
+        assert unicodeSets.containsKey(Key.PERCENT_SIGN);
+        assert unicodeSets.containsKey(Key.PERMILLE_SIGN);
  
-        unicodeSets.put(Key.PERCENT_SIGN, new UnicodeSet("[%٪]").freeze());
-        unicodeSets.put(Key.PERMILLE_SIGN, new UnicodeSet("[‰؉]").freeze());
-        unicodeSets.put(Key.INFINITY, new UnicodeSet("[∞]").freeze());
+        unicodeSets.put(Key.INFINITY_SIGN, new UnicodeSet("[∞]").freeze());
  
          assert unicodeSets.containsKey(Key.DOLLAR_SIGN);
          assert unicodeSets.containsKey(Key.POUND_SIGN);
          assert unicodeSets.containsKey(Key.RUPEE_SIGN);
-        unicodeSets.put(Key.YEN_SIGN, new UnicodeSet("[¥\\uffe5]").freeze());
+        assert unicodeSets.containsKey(Key.YEN_SIGN);
+        assert unicodeSets.containsKey(Key.WON_SIGN);
  
          unicodeSets.put(Key.DIGITS, new UnicodeSet("[:digit:]").freeze());
  
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/InfinityMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/InfinityMatcher.java

index 0aa915aca63db764c6abbd99c18e72124e34f2a6..54d683aceedf7ed8f0667f71dd5f88bcaed81963 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/InfinityMatcher.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/InfinityMatcher.java
@@ -30,7 +30,7 @@ public class InfinityMatcher extends SymbolMatcher {
      }
  
      private InfinityMatcher() {
-        super(StaticUnicodeSets.Key.INFINITY);
+        super(StaticUnicodeSets.Key.INFINITY_SIGN);
      }
  
      @Override
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/ExhaustiveNumberTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/ExhaustiveNumberTest.java

index 8813857517e08097a1809198c0e625a01fcf0d5c..450f08ce725cc398cfff58610c8a2f7bfd803e12 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/ExhaustiveNumberTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/ExhaustiveNumberTest.java
@@ -50,7 +50,7 @@ public class ExhaustiveNumberTest extends TestFmwk {
          UnicodeSet minusSign = get(Key.MINUS_SIGN);
          UnicodeSet percent = get(Key.PERCENT_SIGN);
          UnicodeSet permille = get(Key.PERMILLE_SIGN);
-        UnicodeSet infinity = get(Key.INFINITY);
+        UnicodeSet infinity = get(Key.INFINITY_SIGN);
  
          for (ULocale locale : ULocale.getAvailableLocales()) {
              DecimalFormatSymbols dfs = DecimalFormatSymbols.getInstance(locale);
author	Shane Carr <shane@unicode.org>
	Fri, 15 Feb 2019 05:43:32 +0000 (21:43 -0800)
committer	Shane F. Carr <shane@unicode.org>
	Sat, 16 Feb 2019 00:51:17 +0000 (16:51 -0800)
icu4c/source/common/static_unicode_sets.cpp		patch \| blob \| history
icu4c/source/common/static_unicode_sets.h		patch \| blob \| history
icu4c/source/i18n/numparse_symbols.cpp		patch \| blob \| history
icu4c/source/test/intltest/numbertest_parse.cpp		patch \| blob \| history
icu4c/source/test/intltest/static_unisets_test.cpp		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/StaticUnicodeSets.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/InfinityMatcher.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/ExhaustiveNumberTest.java		patch \| blob \| history