ICU-13697 Adding data-loading logic for parseLenients sets in CLDR. Ties the sets...

author Shane Carr <shane@unicode.org>

Thu, 12 Apr 2018 10:59:37 +0000 (10:59 +0000)

committer Shane Carr <shane@unicode.org>

Thu, 12 Apr 2018 10:59:37 +0000 (10:59 +0000)
author Shane Carr <shane@unicode.org>
Thu, 12 Apr 2018 10:59:37 +0000 (10:59 +0000)
committer Shane Carr <shane@unicode.org>
Thu, 12 Apr 2018 10:59:37 +0000 (10:59 +0000)
diff --git a/icu4c/source/common/Makefile.in b/icu4c/source/common/Makefile.in

index cf0799aed148f9646fd66cf1b78bf640b635aa5e..2025b85ee1121b136a45ee978a0fbbe58c2f2e9c 100644 (file)
--- a/icu4c/source/common/Makefile.in
+++ b/icu4c/source/common/Makefile.in
@@ -111,7 +111,8 @@ util.o util_props.o parsepos.o locbased.o cwchar.o wintz.o dtintrv.o ucnvsel.o p
  ulist.o uloc_tag.o icudataver.o icuplug.o listformatter.o ulistformatter.o \
  sharedobject.o simpleformatter.o unifiedcache.o uloc_keytype.o \
  ubiditransform.o \
-pluralmap.o
+pluralmap.o \
+numparse_unisets.o
  
  ## Header files to install
  HEADERS = $(srcdir)/unicode/*.h
diff --git a/icu4c/source/common/numparse_unisets.cpp b/icu4c/source/common/numparse_unisets.cpp

new file mode 100644 (file)

index 0000000..3aa5b5b
--- /dev/null
+++ b/icu4c/source/common/numparse_unisets.cpp
@@ -0,0 +1,200 @@
+// © 2018 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
+
+// Allow implicit conversion from char16_t* to UnicodeString for this file:
+// Helpful in toString methods and elsewhere.
+#define UNISTR_FROM_STRING_EXPLICIT
+
+#include "numparse_unisets.h"
+#include "umutex.h"
+#include "ucln_cmn.h"
+#include "unicode/uniset.h"
+#include "uresimp.h"
+#include "cstring.h"
+#include "uassert.h"
+
+using namespace icu;
+using namespace icu::numparse;
+using namespace icu::numparse::impl;
+using namespace icu::numparse::impl::unisets;
+
+
+namespace {
+
+// Table of shared sets, indexed by unisets::Key. Every entry starts out null; entries are
+// filled in by saveSet() and initNumberParseUniSets() and deleted by cleanupNumberParseUniSets().
+static UnicodeSet* gUnicodeSets[COUNT] = {};
+
+/**
+ * Allocates a new frozen set holding the union of gUnicodeSets[k1] and gUnicodeSets[k2].
+ * Both source entries must already be populated; they are dereferenced unconditionally.
+ *
+ * @return The new set (ownership passes to the caller), or nullptr if allocation failed.
+ */
+UnicodeSet* computeUnion(Key k1, Key k2) {
+    UnicodeSet* merged = new UnicodeSet();
+    if (merged != nullptr) {
+        const Key sources[] = {k1, k2};
+        for (Key source : sources) {
+            merged->addAll(*gUnicodeSets[source]);
+        }
+        merged->freeze();
+    }
+    return merged;
+}
+
+/**
+ * Three-way variant: allocates a new frozen set holding the union of gUnicodeSets[k1],
+ * gUnicodeSets[k2] and gUnicodeSets[k3]. All three source entries must already be
+ * populated; they are dereferenced unconditionally.
+ *
+ * @return The new set (ownership passes to the caller), or nullptr if allocation failed.
+ */
+UnicodeSet* computeUnion(Key k1, Key k2, Key k3) {
+    UnicodeSet* merged = new UnicodeSet();
+    if (merged != nullptr) {
+        const Key sources[] = {k1, k2, k3};
+        for (Key source : sources) {
+            merged->addAll(*gUnicodeSets[source]);
+        }
+        merged->freeze();
+    }
+    return merged;
+}
+
+
+/**
+ * Builds a UnicodeSet from a pattern string and stores it in gUnicodeSets[key],
+ * taking the place of (and freeing) any set previously stored under that key.
+ *
+ * @param key               Slot of gUnicodeSets to fill.
+ * @param unicodeSetPattern Pattern handed to the UnicodeSet constructor.
+ * @param status            Passed to the UnicodeSet constructor; additionally set to
+ *                          U_MEMORY_ALLOCATION_ERROR when the set cannot be allocated, in
+ *                          which case the slot is left untouched.
+ */
+void saveSet(Key key, const UnicodeString& unicodeSetPattern, UErrorCode& status) {
+    UnicodeSet* parsed = new UnicodeSet(unicodeSetPattern, status);
+    if (parsed == nullptr) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+        return;
+    }
+    // Each key is expected to be saved only once, but if the data ever yields a second
+    // pattern for the same key, release the earlier set instead of leaking it.
+    delete gUnicodeSets[key];
+    gUnicodeSets[key] = parsed;
+}
+
+/**
+ * Resource sink that walks the table handed to put() and registers the UnicodeSet
+ * patterns it finds through saveSet().
+ *
+ * Layout as traversed below: context table -> strictness table -> array of pattern
+ * strings. The "date" context is skipped. A strictness key of "lenient" selects the
+ * lenient comma/period slots; any other key selects the strict ones.
+ */
+class ParseDataSink : public ResourceSink {
+  public:
+    void put(const char* key, ResourceValue& value, UBool /*noFallback*/, UErrorCode& status) U_OVERRIDE {
+        ResourceTable contexts = value.getTable(status);
+        if (U_FAILURE(status)) { return; }
+        for (int ctx = 0; contexts.getKeyAndValue(ctx, key, value); ctx++) {
+            if (uprv_strcmp(key, "date") == 0) {
+                // Date data is not consumed here.
+                continue;
+            }
+            ResourceTable levels = value.getTable(status);
+            if (U_FAILURE(status)) { return; }
+            for (int lvl = 0; levels.getKeyAndValue(lvl, key, value); lvl++) {
+                const bool lenient = (uprv_strcmp(key, "lenient") == 0);
+                ResourceArray patterns = value.getArray(status);
+                if (U_FAILURE(status)) { return; }
+                for (int idx = 0; idx < patterns.getSize(); idx++) {
+                    patterns.getValue(idx, value);
+                    UnicodeString pattern = value.getUnicodeString(status);
+                    if (U_FAILURE(status)) { return; }
+
+                    // Each pattern is recognized by a marker character it contains; the first
+                    // match wins. Only comma and period come in both a lenient and a strict
+                    // flavor; the remaining symbols have a single slot.
+                    Key target = COUNT;  // COUNT doubles as "no match"
+                    if (pattern.indexOf(u'.') != -1) {
+                        target = lenient ? PERIOD : STRICT_PERIOD;
+                    } else if (pattern.indexOf(u',') != -1) {
+                        target = lenient ? COMMA : STRICT_COMMA;
+                    } else if (pattern.indexOf(u'+') != -1) {
+                        target = PLUS_SIGN;
+                    } else if (pattern.indexOf(u'‒') != -1) {
+                        target = MINUS_SIGN;
+                    } else if (pattern.indexOf(u'$') != -1) {
+                        target = DOLLAR_SIGN;
+                    } else if (pattern.indexOf(u'£') != -1) {
+                        target = POUND_SIGN;
+                    } else if (pattern.indexOf(u'₨') != -1) {
+                        target = RUPEE_SIGN;
+                    }
+
+                    if (target != COUNT) {
+                        saveSet(target, pattern, status);
+                    }
+                    if (U_FAILURE(status)) { return; }
+                }
+            }
+        }
+    }
+};
+
+
+// Init-once guard handed to umtx_initOnce() in unisets::get() to run initNumberParseUniSets().
+icu::UInitOnce gNumberParseUniSetsInitOnce = U_INITONCE_INITIALIZER;
+
+/**
+ * Library cleanup hook registered by initNumberParseUniSets(): frees every cached set
+ * and re-arms the init-once guard so that a later unisets::get() rebuilds the table
+ * rather than handing out the nulled entries.
+ *
+ * @return Always TRUE.
+ */
+UBool U_CALLCONV cleanupNumberParseUniSets() {
+    for (int32_t i = 0; i < COUNT; i++) {
+        delete gUnicodeSets[i];
+        gUnicodeSets[i] = nullptr;
+    }
+    // Without this reset the guard would still read as "initialized" after cleanup, and
+    // initNumberParseUniSets() would never run again to repopulate gUnicodeSets.
+    gNumberParseUniSetsInitOnce.reset();
+    return TRUE;
+}
+
+/**
+ * One-time initializer for gUnicodeSets, run through umtx_initOnce() from unisets::get().
+ *
+ * Registers the cleanup hook, builds the hard-coded sets, loads the data-driven sets from
+ * the "parse" table of the root bundle via ParseDataSink, derives the union sets, and
+ * finally freezes everything that was populated.
+ *
+ * @param status Set to a failure code if the bundle cannot be read, a pattern cannot be
+ *               parsed, a set needed as a union input is missing from the data
+ *               (U_MISSING_RESOURCE_ERROR), or an allocation fails
+ *               (U_MEMORY_ALLOCATION_ERROR). On failure the table may be partially filled;
+ *               unisets::get() does not expose it in that case.
+ */
+void U_CALLCONV initNumberParseUniSets(UErrorCode& status) {
+    ucln_common_registerCleanup(UCLN_COMMON_NUMPARSE_UNISETS, cleanupNumberParseUniSets);
+
+    gUnicodeSets[EMPTY] = new UnicodeSet();
+
+    // These sets were decided after discussion with icu-design@. See tickets #13084 and #13309.
+    // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property).
+    gUnicodeSets[DEFAULT_IGNORABLES] = new UnicodeSet(
+            u"[[:Zs:][\\u0009][:Bidi_Control:][:Variation_Selector:]]", status);
+    gUnicodeSets[STRICT_IGNORABLES] = new UnicodeSet(u"[[:Bidi_Control:]]", status);
+
+    LocalUResourceBundlePointer rb(ures_open(nullptr, "root", &status));
+    if (U_FAILURE(status)) { return; }
+    ParseDataSink sink;
+    ures_getAllItemsWithFallback(rb.getAlias(), "parse", sink, status);
+    if (U_FAILURE(status)) { return; }
+
+    // TODO: Should there be fallback behavior if for some reason these sets didn't get populated?
+    U_ASSERT(gUnicodeSets[COMMA] != nullptr);
+    U_ASSERT(gUnicodeSets[STRICT_COMMA] != nullptr);
+    U_ASSERT(gUnicodeSets[PERIOD] != nullptr);
+    U_ASSERT(gUnicodeSets[STRICT_PERIOD] != nullptr);
+    // U_ASSERT is a debug-build check only, and computeUnion() dereferences its inputs, so
+    // report missing data as an error instead of crashing in release builds.
+    if (gUnicodeSets[COMMA] == nullptr || gUnicodeSets[STRICT_COMMA] == nullptr ||
+            gUnicodeSets[PERIOD] == nullptr || gUnicodeSets[STRICT_PERIOD] == nullptr) {
+        status = U_MISSING_RESOURCE_ERROR;
+        return;
+    }
+
+    gUnicodeSets[OTHER_GROUPING_SEPARATORS] = new UnicodeSet(
+            u"['٬‘’＇\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]", status);
+    if (gUnicodeSets[OTHER_GROUPING_SEPARATORS] == nullptr) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+        return;
+    }
+    gUnicodeSets[ALL_SEPARATORS] = computeUnion(COMMA, PERIOD, OTHER_GROUPING_SEPARATORS);
+    gUnicodeSets[STRICT_ALL_SEPARATORS] = computeUnion(
+            STRICT_COMMA, STRICT_PERIOD, OTHER_GROUPING_SEPARATORS);
+
+    U_ASSERT(gUnicodeSets[MINUS_SIGN] != nullptr);
+    U_ASSERT(gUnicodeSets[PLUS_SIGN] != nullptr);
+
+    gUnicodeSets[PERCENT_SIGN] = new UnicodeSet(u"[%٪]", status);
+    gUnicodeSets[PERMILLE_SIGN] = new UnicodeSet(u"[‰؉]", status);
+    gUnicodeSets[INFINITY_KEY] = new UnicodeSet(u"[∞]", status);
+
+    U_ASSERT(gUnicodeSets[DOLLAR_SIGN] != nullptr);
+    U_ASSERT(gUnicodeSets[POUND_SIGN] != nullptr);
+    U_ASSERT(gUnicodeSets[RUPEE_SIGN] != nullptr);
+    gUnicodeSets[YEN_SIGN] = new UnicodeSet(u"[¥\\uffe5]", status);
+
+    gUnicodeSets[DIGITS] = new UnicodeSet(u"[:digit:]", status);
+
+    // The second-level unions dereference these three entries; each is null only if its
+    // allocation failed above.
+    if (gUnicodeSets[DIGITS] == nullptr || gUnicodeSets[ALL_SEPARATORS] == nullptr ||
+            gUnicodeSets[STRICT_ALL_SEPARATORS] == nullptr) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+        return;
+    }
+    gUnicodeSets[DIGITS_OR_ALL_SEPARATORS] = computeUnion(DIGITS, ALL_SEPARATORS);
+    gUnicodeSets[DIGITS_OR_STRICT_ALL_SEPARATORS] = computeUnion(DIGITS, STRICT_ALL_SEPARATORS);
+
+    // Freeze whatever was populated. A slot can still be null here (failed allocation, or a
+    // symbol set absent from the data), so it must not be dereferenced blindly.
+    for (int32_t i = 0; i < COUNT; i++) {
+        if (gUnicodeSets[i] != nullptr) {
+            gUnicodeSets[i]->freeze();
+        }
+    }
+}
+
+}
+
+/**
+ * Returns the shared set stored for the given key, running the one-time initialization
+ * on first use.
+ *
+ * @param key Index into the set table; it is not range-checked here.
+ * @return The table entry for key, or nullptr if initialization reported a failure.
+ */
+const UnicodeSet* unisets::get(Key key) {
+    UErrorCode initStatus = U_ZERO_ERROR;
+    umtx_initOnce(gNumberParseUniSetsInitOnce, &initNumberParseUniSets, initStatus);
+    // TODO: This returns non-null in Java, and callers assume that.
+    return U_SUCCESS(initStatus) ? gUnicodeSets[key] : nullptr;
+}
+
+/**
+ * Returns key1 if the set stored under key1 contains str, otherwise COUNT (the "null" key).
+ * Also returns COUNT when the set is unavailable, since get() can return nullptr.
+ */
+Key unisets::chooseFrom(UnicodeString str, Key key1) {
+    const UnicodeSet* candidates = get(key1);
+    return (candidates != nullptr && candidates->contains(str)) ? key1 : COUNT;
+}
+
+/**
+ * Returns the first of key1, key2 whose set contains str, or COUNT if neither does.
+ * An unavailable set for key1 (get() returning nullptr) is treated as "does not contain".
+ */
+Key unisets::chooseFrom(UnicodeString str, Key key1, Key key2) {
+    const UnicodeSet* candidates = get(key1);
+    if (candidates != nullptr && candidates->contains(str)) {
+        return key1;
+    }
+    return chooseFrom(str, key2);
+}
+
+//Key unisets::chooseCurrency(UnicodeString str) {
+//    if (get(DOLLAR_SIGN)->contains(str)) {
+//        return DOLLAR_SIGN;
+//    } else if (get(POUND_SIGN)->contains(str)) {
+//        return POUND_SIGN;
+//    } else if (get(RUPEE_SIGN)->contains(str)) {
+//        return RUPEE_SIGN;
+//    } else if (get(YEN_SIGN)->contains(str)) {
+//        return YEN_SIGN;
+//    } else {
+//        return COUNT;
+//    }
+//}
+
+
+#endif /* #if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT */
diff --git a/icu4c/source/i18n/numparse_unisets.h b/icu4c/source/common/numparse_unisets.h

similarity index 71%

rename from icu4c/source/i18n/numparse_unisets.h

rename to icu4c/source/common/numparse_unisets.h

index 97a44ea860d6543449f7b860d07496f4adf697f7..7cf3f6aeb1588d15c0b19f741bf43c4233b9a16e 100644 (file)
--- a/icu4c/source/i18n/numparse_unisets.h
+++ b/icu4c/source/common/numparse_unisets.h
@@ -1,14 +1,16 @@
  // © 2018 and later: Unicode, Inc. and others.
  // License & terms of use: http://www.unicode.org/copyright.html
  
+// This file is in common instead of i18n because it is needed by ucurr.cpp.
+
  #include "unicode/utypes.h"
  
  #if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
  #ifndef __NUMPARSE_UNISETS_H__
  #define __NUMPARSE_UNISETS_H__
  
-#include "numparse_types.h"
  #include "unicode/uniset.h"
+#include "unicode/unistr.h"
  
  U_NAMESPACE_BEGIN namespace numparse {
  namespace impl {
@@ -18,8 +20,6 @@ enum Key {
      EMPTY,
  
      // Ignorables
-            BIDI,
-    WHITESPACE,
      DEFAULT_IGNORABLES,
      STRICT_IGNORABLES,
  
@@ -29,7 +29,7 @@ enum Key {
      // - PERIOD is a superset of SCRICT_PERIOD
      // - ALL_SEPARATORS is the union of COMMA, PERIOD, and OTHER_GROUPING_SEPARATORS
      // - STRICT_ALL_SEPARATORS is the union of STRICT_COMMA, STRICT_PERIOD, and OTHER_GRP_SEPARATORS
-            COMMA,
+    COMMA,
      PERIOD,
      STRICT_COMMA,
      STRICT_PERIOD,
@@ -38,23 +38,27 @@ enum Key {
      STRICT_ALL_SEPARATORS,
  
      // Symbols
-    // TODO: NaN?
-            MINUS_SIGN,
+    MINUS_SIGN,
      PLUS_SIGN,
      PERCENT_SIGN,
      PERMILLE_SIGN,
      INFINITY_KEY, // INFINITY is defined in cmath
  
+    // Currency Symbols
+    DOLLAR_SIGN,
+    POUND_SIGN,
+    RUPEE_SIGN,
+    YEN_SIGN, // not in CLDR data, but Currency.java wants it
+
      // Other
-            DIGITS,
-    CWCF,
+    DIGITS,
  
      // Combined Separators with Digits (for lead code points)
-            DIGITS_OR_ALL_SEPARATORS,
+    DIGITS_OR_ALL_SEPARATORS,
      DIGITS_OR_STRICT_ALL_SEPARATORS,
  
      // The number of elements in the enum.  Also used to indicate null.
-            COUNT
+    COUNT
  };
  
  const UnicodeSet* get(Key key);
@@ -63,6 +67,19 @@ Key chooseFrom(UnicodeString str, Key key1);
  
  Key chooseFrom(UnicodeString str, Key key1, Key key2);
  
+// Unused in C++:
+// Key chooseCurrency(UnicodeString str);
+// Used instead:
+// Table pairing each currency-symbol set with one representative code point.
+// populateCurrSymbolsEquiv() in ucurr.cpp iterates over it and marks every other
+// element of get(key) as equivalent to the exemplar.
+static const struct {
+    // Set to look up with unisets::get().
+    Key key;
+    // Representative symbol for that set.
+    UChar32 exemplar;
+} kCurrencyEntries[] = {
+    {DOLLAR_SIGN, u'$'},
+    {POUND_SIGN, u'£'},
+    {RUPEE_SIGN, u'₨'},
+    {YEN_SIGN, u'¥'},
+};
+
  } // namespace unisets
  } // namespace impl
  } // namespace numparse
diff --git a/icu4c/source/common/ucln_cmn.h b/icu4c/source/common/ucln_cmn.h

index 5db94945172c3d5a47fd9b2ec67ce3f62bf2f3aa..9b6c2058135c5261dcf7a0bd554aa2f5f9f973e4 100644 (file)
--- a/icu4c/source/common/ucln_cmn.h
+++ b/icu4c/source/common/ucln_cmn.h
@@ -33,6 +33,7 @@ Please keep the order of enums declared in same order
  as the cleanup functions are suppose to be called. */
  typedef enum ECleanupCommonType {
      UCLN_COMMON_START = -1,
+    UCLN_COMMON_NUMPARSE_UNISETS,
      UCLN_COMMON_USPREP,
      UCLN_COMMON_BREAKITERATOR,
      UCLN_COMMON_RBBI,
diff --git a/icu4c/source/common/ucurr.cpp b/icu4c/source/common/ucurr.cpp

index 6ce53c2d5e5f733ca8c552ec908ac423823c8768..1fd02ec30b56553f788b59571a71a4ec5dc5a4ae 100644 (file)
--- a/icu4c/source/common/ucurr.cpp
+++ b/icu4c/source/common/ucurr.cpp
@@ -17,11 +17,13 @@
  #include "unicode/ustring.h"
  #include "unicode/parsepos.h"
  #include "unicode/uniset.h"
+#include "unicode/usetiter.h"
  #include "unicode/utf16.h"
  #include "ustr_imp.h"
  #include "charstr.h"
  #include "cmemory.h"
  #include "cstring.h"
+#include "numparse_unisets.h"
  #include "uassert.h"
  #include "umutex.h"
  #include "ucln_cmn.h"
@@ -67,14 +69,6 @@ static const int32_t POW10[] = { 1, 10, 100, 1000, 10000, 100000,
  
  static const int32_t MAX_POW10 = UPRV_LENGTHOF(POW10) - 1;
  
-// Defines equivalent currency symbols.
-static const char *EQUIV_CURRENCY_SYMBOLS[][2] = {
-    {"\\u00a5", "\\uffe5"},
-    {"$", "\\ufe69"},
-    {"$", "\\uff04"},
-    {"\\u20a8", "\\u20b9"},
-    {"\\u00a3", "\\u20a4"}};
-
  #define ISO_CURRENCY_CODE_LENGTH 3
  
  //------------------------------------------------------------
@@ -2207,16 +2201,21 @@ static void U_CALLCONV initIsoCodes(UErrorCode &status) {
  }
  
  static void populateCurrSymbolsEquiv(icu::Hashtable *hash, UErrorCode &status) {
-    if (U_FAILURE(status)) {
-        return;
-    }
-    int32_t length = UPRV_LENGTHOF(EQUIV_CURRENCY_SYMBOLS);
-    for (int32_t i = 0; i < length; ++i) {
-        icu::UnicodeString lhs(EQUIV_CURRENCY_SYMBOLS[i][0], -1, US_INV);
-        icu::UnicodeString rhs(EQUIV_CURRENCY_SYMBOLS[i][1], -1, US_INV);
-        makeEquivalent(lhs.unescape(), rhs.unescape(), hash, status);
-        if (U_FAILURE(status)) {
-            return;
+    // Fills the equivalence table from the shared parse sets: for each entry of
+    // unisets::kCurrencyEntries, every element of the entry's set other than the exemplar
+    // itself is passed to makeEquivalent() together with the exemplar.
+    using namespace icu::numparse::impl;
+    if (U_FAILURE(status)) { return; }
+    for (auto& entry : unisets::kCurrencyEntries) {
+        UnicodeString exemplar(entry.exemplar);
+        const UnicodeSet* set = unisets::get(entry.key);
+        // unisets::get() returns null when its one-time initialization failed; in that case
+        // stop here without touching status.
+        if (set == nullptr) { return; }
+        UnicodeSetIterator it(*set);
+        while (it.next()) {
+            UnicodeString value = it.getString();
+            if (value == exemplar) {
+                // No need to mark the exemplar character as an equivalent
+                continue;
+            }
+            makeEquivalent(exemplar, value, hash, status);
+            if (U_FAILURE(status)) { return; }
          }
      }
  }
diff --git a/icu4c/source/i18n/Makefile.in b/icu4c/source/i18n/Makefile.in

index b4fafdf72f8479bcfbdd3789dbea2051c38c18bb..a66b65a8744847e4db003a121119b188543eb459 100644 (file)
--- a/icu4c/source/i18n/Makefile.in
+++ b/icu4c/source/i18n/Makefile.in
@@ -92,7 +92,7 @@ csdetect.o csmatch.o csr2022.o csrecog.o csrmbcs.o csrsbcs.o csrucode.o csrutf8.
  wintzimpl.o windtfmt.o winnmfmt.o basictz.o dtrule.o rbtz.o tzrule.o tztrans.o vtzone.o zonemeta.o \
  standardplural.o upluralrules.o plurrule.o plurfmt.o selfmt.o dtitvfmt.o dtitvinf.o udateintervalformat.o \
  tmunit.o tmutamt.o tmutfmt.o currpinf.o \
-uspoof.o uspoof_impl.o uspoof_build.o uspoof_conf.o decfmtst.o smpdtfst.o \
+uspoof.o uspoof_impl.o uspoof_build.o uspoof_conf.o smpdtfst.o \
  ztrans.o zrule.o vzone.o fphdlimp.o fpositer.o ufieldpositer.o \
  decNumber.o decContext.o alphaindex.o tznames.o tznames_impl.o tzgnames.o \
  tzfmt.o compactdecimalformat.o gender.o region.o scriptset.o \
@@ -107,7 +107,7 @@ number_mapper.o number_multiplier.o number_currencysymbols.o number_skeletons.o
  double-conversion.o double-conversion-bignum-dtoa.o double-conversion-bignum.o \
  double-conversion-cached-powers.o double-conversion-diy-fp.o \
  double-conversion-fast-dtoa.o double-conversion-strtod.o \
-numparse_stringsegment.o numparse_unisets.o numparse_parsednumber.o numparse_impl.o \
+numparse_stringsegment.o numparse_parsednumber.o numparse_impl.o \
  numparse_symbols.o numparse_decimal.o numparse_scientific.o numparse_currency.o \
  numparse_affixes.o numparse_compositions.o numparse_validators.o \
  
diff --git a/icu4c/source/i18n/decfmtst.cpp b/icu4c/source/i18n/decfmtst.cpp

deleted file mode 100644 (file)

index e939ab4..0000000
--- a/icu4c/source/i18n/decfmtst.cpp
+++ /dev/null
@@ -1,251 +0,0 @@
-// © 2016 and later: Unicode, Inc. and others.
-// License & terms of use: http://www.unicode.org/copyright.html
-/*
-*******************************************************************************
-* Copyright (C) 2009-2016, International Business Machines Corporation and
-* others. All Rights Reserved.
-*******************************************************************************
-*
-* This file contains the class DecimalFormatStaticSets
-*
-* DecimalFormatStaticSets holds the UnicodeSets that are needed for lenient
-* parsing of decimal and group separators.
-********************************************************************************
-*/
-
-#include "unicode/utypes.h"
-
-#if !UCONFIG_NO_FORMATTING
-
-#include "unicode/unistr.h"
-#include "unicode/uniset.h"
-#include "unicode/uchar.h"
-#include "cmemory.h"
-#include "cstring.h"
-#include "uassert.h"
-#include "ucln_in.h"
-#include "umutex.h"
-
-#include "decfmtst.h"
-
-U_NAMESPACE_BEGIN
-
-
-//------------------------------------------------------------------------------
-//
-// Unicode Set pattern strings for all of the required constant sets.
-//               Initialized with hex values for portability to EBCDIC based machines.
-//                Really ugly, but there's no good way to avoid it.
-//
-//------------------------------------------------------------------------------
-
-static const UChar gDotEquivalentsPattern[] = {
-        // [       .    \u2024  \u3002  \uFE12  \uFE52  \uFF0E  \uFF61     ]
-        0x005B, 0x002E, 0x2024, 0x3002, 0xFE12, 0xFE52, 0xFF0E, 0xFF61, 0x005D, 0x0000};
-
-static const UChar gCommaEquivalentsPattern[] = {
-        // [       ,    \u060C  \u066B  \u3001  \uFE10  \uFE11  \uFE50  \uFE51  \uFF0C  \uFF64    ]
-        0x005B, 0x002C, 0x060C, 0x066B, 0x3001, 0xFE10, 0xFE11, 0xFE50, 0xFE51, 0xFF0C, 0xFF64, 0x005D, 0x0000};
-
-static const UChar gOtherGroupingSeparatorsPattern[] = {
-        // [       \     SPACE     '      NBSP  \u066C  \u2000     -    \u200A  \u2018  \u2019  \u202F  \u205F  \u3000  \uFF07     ]
-        0x005B, 0x005C, 0x0020, 0x0027, 0x00A0, 0x066C, 0x2000, 0x002D, 0x200A, 0x2018, 0x2019, 0x202F, 0x205F, 0x3000, 0xFF07, 0x005D, 0x0000};
-
-static const UChar gDashEquivalentsPattern[] = {
-        // [       \      -     HYPHEN  F_DASH  N_DASH   MINUS     ]
-        0x005B, 0x005C, 0x002D, 0x2010, 0x2012, 0x2013, 0x2212, 0x005D, 0x0000};
-
-static const UChar gStrictDotEquivalentsPattern[] = {
-        // [      .     \u2024  \uFE52  \uFF0E  \uFF61    ]
-        0x005B, 0x002E, 0x2024, 0xFE52, 0xFF0E, 0xFF61, 0x005D, 0x0000};
-
-static const UChar gStrictCommaEquivalentsPattern[] = {
-        // [       ,    \u066B  \uFE10  \uFE50  \uFF0C     ]
-        0x005B, 0x002C, 0x066B, 0xFE10, 0xFE50, 0xFF0C, 0x005D, 0x0000};
-
-static const UChar gStrictOtherGroupingSeparatorsPattern[] = {
-        // [       \     SPACE     '      NBSP  \u066C  \u2000     -    \u200A  \u2018  \u2019  \u202F  \u205F  \u3000  \uFF07     ]
-        0x005B, 0x005C, 0x0020, 0x0027, 0x00A0, 0x066C, 0x2000, 0x002D, 0x200A, 0x2018, 0x2019, 0x202F, 0x205F, 0x3000, 0xFF07, 0x005D, 0x0000};
-
-static const UChar gStrictDashEquivalentsPattern[] = {
-        // [       \      -      MINUS     ]
-        0x005B, 0x005C, 0x002D, 0x2212, 0x005D, 0x0000};
-
-static const UChar32 gMinusSigns[] = {
-    0x002D,
-    0x207B,
-    0x208B,
-    0x2212,
-    0x2796,
-    0xFE63,
-    0xFF0D};
-
-static const UChar32 gPlusSigns[] = {
-    0x002B,
-    0x207A,
-    0x208A,
-    0x2795,
-    0xfB29,
-    0xFE62,
-    0xFF0B};
-
-static void initUnicodeSet(const UChar32 *raw, int32_t len, UnicodeSet *s) {
-    for (int32_t i = 0; i < len; ++i) {
-        s->add(raw[i]);
-    }
-}
-
-DecimalFormatStaticSets::DecimalFormatStaticSets(UErrorCode &status)
-: fDotEquivalents(NULL),
-  fCommaEquivalents(NULL),
-  fOtherGroupingSeparators(NULL),
-  fDashEquivalents(NULL),
-  fStrictDotEquivalents(NULL),
-  fStrictCommaEquivalents(NULL),
-  fStrictOtherGroupingSeparators(NULL),
-  fStrictDashEquivalents(NULL),
-  fDefaultGroupingSeparators(NULL),
-  fStrictDefaultGroupingSeparators(NULL),
-  fMinusSigns(NULL),
-  fPlusSigns(NULL)
-{
-    fDotEquivalents                = new UnicodeSet(UnicodeString(TRUE, gDotEquivalentsPattern, -1),                status);
-    fCommaEquivalents              = new UnicodeSet(UnicodeString(TRUE, gCommaEquivalentsPattern, -1),              status);
-    fOtherGroupingSeparators       = new UnicodeSet(UnicodeString(TRUE, gOtherGroupingSeparatorsPattern, -1),       status);
-    fDashEquivalents               = new UnicodeSet(UnicodeString(TRUE, gDashEquivalentsPattern, -1),               status);
-    
-    fStrictDotEquivalents          = new UnicodeSet(UnicodeString(TRUE, gStrictDotEquivalentsPattern, -1),          status);
-    fStrictCommaEquivalents        = new UnicodeSet(UnicodeString(TRUE, gStrictCommaEquivalentsPattern, -1),        status);
-    fStrictOtherGroupingSeparators = new UnicodeSet(UnicodeString(TRUE, gStrictOtherGroupingSeparatorsPattern, -1), status);
-    fStrictDashEquivalents         = new UnicodeSet(UnicodeString(TRUE, gStrictDashEquivalentsPattern, -1),         status);
-
-
-    fDefaultGroupingSeparators = new UnicodeSet(*fDotEquivalents);
-    fDefaultGroupingSeparators->addAll(*fCommaEquivalents);
-    fDefaultGroupingSeparators->addAll(*fOtherGroupingSeparators);
-
-    fStrictDefaultGroupingSeparators = new UnicodeSet(*fStrictDotEquivalents);
-    fStrictDefaultGroupingSeparators->addAll(*fStrictCommaEquivalents);
-    fStrictDefaultGroupingSeparators->addAll(*fStrictOtherGroupingSeparators);
-
-    fMinusSigns = new UnicodeSet();
-    fPlusSigns = new UnicodeSet();
-
-    // Check for null pointers
-    if (fDotEquivalents == NULL || fCommaEquivalents == NULL || fOtherGroupingSeparators == NULL || fDashEquivalents == NULL ||
-        fStrictDotEquivalents == NULL || fStrictCommaEquivalents == NULL || fStrictOtherGroupingSeparators == NULL || fStrictDashEquivalents == NULL ||
-        fDefaultGroupingSeparators == NULL || fStrictOtherGroupingSeparators == NULL ||
-        fMinusSigns == NULL || fPlusSigns == NULL) {
-      cleanup();
-      status = U_MEMORY_ALLOCATION_ERROR;
-      return;
-    }
-
-    initUnicodeSet(
-            gMinusSigns,
-            UPRV_LENGTHOF(gMinusSigns),
-            fMinusSigns);
-    initUnicodeSet(
-            gPlusSigns,
-            UPRV_LENGTHOF(gPlusSigns),
-            fPlusSigns);
-
-    // Freeze all the sets
-    fDotEquivalents->freeze();
-    fCommaEquivalents->freeze();
-    fOtherGroupingSeparators->freeze();
-    fDashEquivalents->freeze();
-    fStrictDotEquivalents->freeze();
-    fStrictCommaEquivalents->freeze();
-    fStrictOtherGroupingSeparators->freeze();
-    fStrictDashEquivalents->freeze();
-    fDefaultGroupingSeparators->freeze();
-    fStrictDefaultGroupingSeparators->freeze();
-    fMinusSigns->freeze();
-    fPlusSigns->freeze();
-}
-
-DecimalFormatStaticSets::~DecimalFormatStaticSets() {
-  cleanup();
-}
-
-void DecimalFormatStaticSets::cleanup() { // Be sure to clean up newly added fields!
-    delete fDotEquivalents; fDotEquivalents = NULL;
-    delete fCommaEquivalents; fCommaEquivalents = NULL;
-    delete fOtherGroupingSeparators; fOtherGroupingSeparators = NULL;
-    delete fDashEquivalents; fDashEquivalents = NULL;
-    delete fStrictDotEquivalents; fStrictDotEquivalents = NULL;
-    delete fStrictCommaEquivalents; fStrictCommaEquivalents = NULL;
-    delete fStrictOtherGroupingSeparators; fStrictOtherGroupingSeparators = NULL;
-    delete fStrictDashEquivalents; fStrictDashEquivalents = NULL;
-    delete fDefaultGroupingSeparators; fDefaultGroupingSeparators = NULL;
-    delete fStrictDefaultGroupingSeparators; fStrictDefaultGroupingSeparators = NULL;
-    delete fStrictOtherGroupingSeparators; fStrictOtherGroupingSeparators = NULL;
-    delete fMinusSigns; fMinusSigns = NULL;
-    delete fPlusSigns; fPlusSigns = NULL;
-}
-
-static DecimalFormatStaticSets *gStaticSets;
-static icu::UInitOnce gStaticSetsInitOnce = U_INITONCE_INITIALIZER;
-
-
-//------------------------------------------------------------------------------
-//
-//   decfmt_cleanup     Memory cleanup function, free/delete all
-//                      cached memory.  Called by ICU's u_cleanup() function.
-//
-//------------------------------------------------------------------------------
-U_CDECL_BEGIN
-static UBool U_CALLCONV
-decimfmt_cleanup(void)
-{
-    delete gStaticSets;
-    gStaticSets = NULL;
-    gStaticSetsInitOnce.reset();
-    return TRUE;
-}
-
-static void U_CALLCONV initSets(UErrorCode &status) {
-    U_ASSERT(gStaticSets == NULL);
-    ucln_i18n_registerCleanup(UCLN_I18N_DECFMT, decimfmt_cleanup);
-    gStaticSets = new DecimalFormatStaticSets(status);
-    if (U_FAILURE(status)) {
-        delete gStaticSets;
-        gStaticSets = NULL;
-        return;
-    }
-    if (gStaticSets == NULL) {
-        status = U_MEMORY_ALLOCATION_ERROR;
-    }
-}
-U_CDECL_END
-
-const DecimalFormatStaticSets *DecimalFormatStaticSets::getStaticSets(UErrorCode &status) {
-    umtx_initOnce(gStaticSetsInitOnce, initSets, status);
-    return gStaticSets;
-}
-
-
-const UnicodeSet *DecimalFormatStaticSets::getSimilarDecimals(UChar32 decimal, UBool strictParse)
-{
-    UErrorCode status = U_ZERO_ERROR;
-    umtx_initOnce(gStaticSetsInitOnce, initSets, status);
-    if (U_FAILURE(status)) {
-        return NULL;
-    }
-
-    if (gStaticSets->fDotEquivalents->contains(decimal)) {
-        return strictParse ? gStaticSets->fStrictDotEquivalents : gStaticSets->fDotEquivalents;
-    }
-
-    if (gStaticSets->fCommaEquivalents->contains(decimal)) {
-        return strictParse ? gStaticSets->fStrictCommaEquivalents : gStaticSets->fCommaEquivalents;
-    }
-
-    // if there is no match, return NULL
-    return NULL;
-}
-
-
-U_NAMESPACE_END
-#endif   // !UCONFIG_NO_FORMATTING
diff --git a/icu4c/source/i18n/decfmtst.h b/icu4c/source/i18n/decfmtst.h

deleted file mode 100644 (file)

index 63ae50c..0000000
--- a/icu4c/source/i18n/decfmtst.h
+++ /dev/null
@@ -1,69 +0,0 @@
-// © 2016 and later: Unicode, Inc. and others.
-// License & terms of use: http://www.unicode.org/copyright.html
-/*
-*******************************************************************************
-* Copyright (C) 2009-2016, International Business Machines Corporation and
-* others. All Rights Reserved.
-*******************************************************************************
-*
-* This file contains declarations for the class DecimalFormatStaticSets
-*
-* DecimalFormatStaticSets holds the UnicodeSets that are needed for lenient
-* parsing of decimal and group separators.
-********************************************************************************
-*/
-
-#ifndef DECFMTST_H
-#define DECFMTST_H
-
-#include "unicode/utypes.h"
-
-#if !UCONFIG_NO_FORMATTING
-
-#include "unicode/uobject.h"
-
-U_NAMESPACE_BEGIN
-
-class  UnicodeSet;
-
-
-class DecimalFormatStaticSets : public UMemory
-{
-public:
-    // Constructor and Destructor not for general use.
-    //   Public to permit access from plain C implementation functions.
-    DecimalFormatStaticSets(UErrorCode &status);
-    ~DecimalFormatStaticSets();
-
-    /**
-      * Return a pointer to a lazy-initialized singleton instance of this class.
-      */
-    static const DecimalFormatStaticSets *getStaticSets(UErrorCode &status);
-
-    static const UnicodeSet *getSimilarDecimals(UChar32 decimal, UBool strictParse);
-
-    UnicodeSet *fDotEquivalents;
-    UnicodeSet *fCommaEquivalents;
-    UnicodeSet *fOtherGroupingSeparators;
-    UnicodeSet *fDashEquivalents;
-
-    UnicodeSet *fStrictDotEquivalents;
-    UnicodeSet *fStrictCommaEquivalents;
-    UnicodeSet *fStrictOtherGroupingSeparators;
-    UnicodeSet *fStrictDashEquivalents;
-
-    UnicodeSet *fDefaultGroupingSeparators;
-    UnicodeSet *fStrictDefaultGroupingSeparators;
-
-    UnicodeSet *fMinusSigns;
-    UnicodeSet *fPlusSigns;
-private:
-    void cleanup();
-
-};
-
-
-U_NAMESPACE_END
-
-#endif   // !UCONFIG_NO_FORMATTING
-#endif   // DECFMTST_H
diff --git a/icu4c/source/i18n/numparse_unisets.cpp b/icu4c/source/i18n/numparse_unisets.cpp

deleted file mode 100644 (file)

index eb2f6c1..0000000
--- a/icu4c/source/i18n/numparse_unisets.cpp
+++ /dev/null
@@ -1,127 +0,0 @@
-// © 2018 and later: Unicode, Inc. and others.
-// License & terms of use: http://www.unicode.org/copyright.html
-
-#include "unicode/utypes.h"
-
-#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
-
-// Allow implicit conversion from char16_t* to UnicodeString for this file:
-// Helpful in toString methods and elsewhere.
-#define UNISTR_FROM_STRING_EXPLICIT
-
-#include "numparse_unisets.h"
-#include "numparse_types.h"
-#include "umutex.h"
-#include "ucln_in.h"
-#include "unicode/uniset.h"
-
-using namespace icu;
-using namespace icu::numparse;
-using namespace icu::numparse::impl;
-using namespace icu::numparse::impl::unisets;
-
-
-namespace {
-
-static UnicodeSet* gUnicodeSets[COUNT] = {};
-
-UnicodeSet* computeUnion(Key k1, Key k2) {
-    UnicodeSet* result = new UnicodeSet();
-    if (result == nullptr) {
-        return nullptr;
-    }
-    result->addAll(*gUnicodeSets[k1]);
-    result->addAll(*gUnicodeSets[k2]);
-    result->freeze();
-    return result;
-}
-
-UnicodeSet* computeUnion(Key k1, Key k2, Key k3) {
-    UnicodeSet* result = new UnicodeSet();
-    if (result == nullptr) {
-        return nullptr;
-    }
-    result->addAll(*gUnicodeSets[k1]);
-    result->addAll(*gUnicodeSets[k2]);
-    result->addAll(*gUnicodeSets[k3]);
-    result->freeze();
-    return result;
-}
-
-icu::UInitOnce gNumberParseUniSetsInitOnce = U_INITONCE_INITIALIZER;
-
-UBool U_CALLCONV cleanupNumberParseUniSets() {
-    for (int32_t i = 0; i < COUNT; i++) {
-        delete gUnicodeSets[i];
-        gUnicodeSets[i] = nullptr;
-    }
-    return TRUE;
-}
-
-void U_CALLCONV initNumberParseUniSets(UErrorCode& status) {
-    ucln_i18n_registerCleanup(UCLN_I18N_NUMPARSE_UNISETS, cleanupNumberParseUniSets);
-
-    gUnicodeSets[EMPTY] = new UnicodeSet();
-
-    // These characters are skipped over and ignored at any point in the string, even in strict mode.
-    // See ticket #13084.
-    gUnicodeSets[BIDI] = new UnicodeSet(u"[[:DI:]]", status);
-
-    // This set was decided after discussion with icu-design@. See ticket #13309.
-    // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property).
-    gUnicodeSets[WHITESPACE] = new UnicodeSet(u"[[:Zs:][\\u0009]]", status);
-
-    gUnicodeSets[DEFAULT_IGNORABLES] = computeUnion(BIDI, WHITESPACE);
-    gUnicodeSets[STRICT_IGNORABLES] = new UnicodeSet(*gUnicodeSets[BIDI]);
-
-    // TODO: Re-generate these sets from the UCD. They probably haven't been updated in a while.
-    gUnicodeSets[COMMA] = new UnicodeSet(u"[,،٫、︐︑﹐﹑，､]", status);
-    gUnicodeSets[STRICT_COMMA] = new UnicodeSet(u"[,٫︐﹐，]", status);
-    gUnicodeSets[PERIOD] = new UnicodeSet(u"[.․。︒﹒．｡]", status);
-    gUnicodeSets[STRICT_PERIOD] = new UnicodeSet(u"[.․﹒．｡]", status);
-    gUnicodeSets[OTHER_GROUPING_SEPARATORS] = new UnicodeSet(
-            u"['٬‘’＇\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]", status);
-    gUnicodeSets[ALL_SEPARATORS] = computeUnion(COMMA, PERIOD, OTHER_GROUPING_SEPARATORS);
-    gUnicodeSets[STRICT_ALL_SEPARATORS] = computeUnion(
-            STRICT_COMMA, STRICT_PERIOD, OTHER_GROUPING_SEPARATORS);
-
-    gUnicodeSets[MINUS_SIGN] = new UnicodeSet(u"[-⁻₋−➖﹣－]", status);
-    gUnicodeSets[PLUS_SIGN] = new UnicodeSet(u"[+⁺₊➕﬩﹢＋]", status);
-
-    gUnicodeSets[PERCENT_SIGN] = new UnicodeSet(u"[%٪]", status);
-    gUnicodeSets[PERMILLE_SIGN] = new UnicodeSet(u"[‰؉]", status);
-    gUnicodeSets[INFINITY_KEY] = new UnicodeSet(u"[∞]", status);
-
-    gUnicodeSets[DIGITS] = new UnicodeSet(u"[:digit:]", status);
-    gUnicodeSets[CWCF] = new UnicodeSet(u"[:CWCF:]", status);
-
-    gUnicodeSets[DIGITS_OR_ALL_SEPARATORS] = computeUnion(DIGITS, ALL_SEPARATORS);
-    gUnicodeSets[DIGITS_OR_STRICT_ALL_SEPARATORS] = computeUnion(DIGITS, STRICT_ALL_SEPARATORS);
-
-    for (int32_t i = 0; i < COUNT; i++) {
-        gUnicodeSets[i]->freeze();
-    }
-}
-
-}
-
-const UnicodeSet* unisets::get(Key key) {
-    UErrorCode localStatus = U_ZERO_ERROR;
-    umtx_initOnce(gNumberParseUniSetsInitOnce, &initNumberParseUniSets, localStatus);
-    if (U_FAILURE(localStatus)) {
-        // TODO: This returns non-null in Java, and callers assume that.
-        return nullptr;
-    }
-    return gUnicodeSets[key];
-}
-
-Key unisets::chooseFrom(UnicodeString str, Key key1) {
-    return get(key1)->contains(str) ? key1 : COUNT;
-}
-
-Key unisets::chooseFrom(UnicodeString str, Key key1, Key key2) {
-    return get(key1)->contains(str) ? key1 : chooseFrom(str, key2);
-}
-
-
-#endif /* #if !UCONFIG_NO_FORMATTING */
diff --git a/icu4c/source/i18n/scientificnumberformatter.cpp b/icu4c/source/i18n/scientificnumberformatter.cpp

index adf032d989dd9068531dbbd4143d5ceb6be7c7b5..a63f15f6fb638bd8cf6e7039382564e1c40962ef 100644 (file)
--- a/icu4c/source/i18n/scientificnumberformatter.cpp
+++ b/icu4c/source/i18n/scientificnumberformatter.cpp
@@ -15,8 +15,8 @@
  #include "unicode/fpositer.h"
  #include "unicode/utf16.h"
  #include "unicode/uniset.h"
-#include "decfmtst.h"
  #include "unicode/decimfmt.h"
+#include "numparse_unisets.h"
  
  U_NAMESPACE_BEGIN
  
@@ -129,7 +129,6 @@ UnicodeString &ScientificNumberFormatter::SuperscriptStyle::format(
          const UnicodeString &original,
          FieldPositionIterator &fpi,
          const UnicodeString &preExponent,
-        const DecimalFormatStaticSets &staticSets,
          UnicodeString &appendTo,
          UErrorCode &status) const {
      if (U_FAILURE(status)) {
@@ -149,16 +148,17 @@ UnicodeString &ScientificNumberFormatter::SuperscriptStyle::format(
              break;
          case UNUM_EXPONENT_SIGN_FIELD:
              {
+                using namespace icu::numparse::impl;
                  int32_t beginIndex = fp.getBeginIndex();
                  int32_t endIndex = fp.getEndIndex();
                  UChar32 aChar = original.char32At(beginIndex);
-                if (staticSets.fMinusSigns->contains(aChar)) {
+                if (unisets::get(unisets::MINUS_SIGN)->contains(aChar)) {
                      appendTo.append(
                              original,
                              copyFromOffset,
                              beginIndex - copyFromOffset);
                      appendTo.append(kSuperscriptMinusSign);
-                } else if (staticSets.fPlusSigns->contains(aChar)) {
+                } else if (unisets::get(unisets::PLUS_SIGN)->contains(aChar)) {
                      appendTo.append(
                             original,
                             copyFromOffset,
@@ -203,7 +203,6 @@ UnicodeString &ScientificNumberFormatter::MarkupStyle::format(
          const UnicodeString &original,
          FieldPositionIterator &fpi,
          const UnicodeString &preExponent,
-        const DecimalFormatStaticSets & /*unusedDecimalFormatSets*/,
          UnicodeString &appendTo,
          UErrorCode &status) const {
      if (U_FAILURE(status)) {
@@ -243,8 +242,7 @@ ScientificNumberFormatter::ScientificNumberFormatter(
          DecimalFormat *fmtToAdopt, Style *styleToAdopt, UErrorCode &status)
          : fPreExponent(),
            fDecimalFormat(fmtToAdopt),
-          fStyle(styleToAdopt),
-          fStaticSets(NULL) {
+          fStyle(styleToAdopt) {
      if (U_FAILURE(status)) {
          return;
      }
@@ -258,7 +256,6 @@ ScientificNumberFormatter::ScientificNumberFormatter(
          return;
      }
      getPreExponent(*sym, fPreExponent);
-    fStaticSets = DecimalFormatStaticSets::getStaticSets(status);
  }
  
  ScientificNumberFormatter::ScientificNumberFormatter(
@@ -266,8 +263,7 @@ ScientificNumberFormatter::ScientificNumberFormatter(
          : UObject(other),
            fPreExponent(other.fPreExponent),
            fDecimalFormat(NULL),
-          fStyle(NULL),
-          fStaticSets(other.fStaticSets) {
+          fStyle(NULL) {
      fDecimalFormat = static_cast<DecimalFormat *>(
              other.fDecimalFormat->clone());
      fStyle = other.fStyle->clone();
@@ -292,7 +288,6 @@ UnicodeString &ScientificNumberFormatter::format(
              original,
              fpi,
              fPreExponent,
-            *fStaticSets,
              appendTo,
              status);
  }
diff --git a/icu4c/source/i18n/ucln_in.h b/icu4c/source/i18n/ucln_in.h

index dc447ca89879997aca919ed5b90d0eaf78066035..318eafc143c9686a1c06c3bbcb6cdc9b4a1edb45 100644 (file)
--- a/icu4c/source/i18n/ucln_in.h
+++ b/icu4c/source/i18n/ucln_in.h
@@ -27,7 +27,6 @@ It's usually best to have child dependencies called first. */
  typedef enum ECleanupI18NType {
      UCLN_I18N_START = -1,
      UCLN_I18N_NUMBER_SKELETONS,
-    UCLN_I18N_NUMPARSE_UNISETS,
      UCLN_I18N_CURRENCY_SPACING,
      UCLN_I18N_SPOOF,
      UCLN_I18N_SPOOFDATA,
diff --git a/icu4c/source/i18n/unicode/scientificnumberformatter.h b/icu4c/source/i18n/unicode/scientificnumberformatter.h

index 15023d5141a470a652221532024816a357275bd9..6c34d2ce29a416c960fdd5b855fc6f7114f356d9 100644 (file)
--- a/icu4c/source/i18n/unicode/scientificnumberformatter.h
+++ b/icu4c/source/i18n/unicode/scientificnumberformatter.h
@@ -24,7 +24,6 @@
  U_NAMESPACE_BEGIN
  
  class FieldPositionIterator;
-class DecimalFormatStaticSets;
  class DecimalFormatSymbols;
  class DecimalFormat;
  class Formattable;
@@ -150,7 +149,6 @@ public:
                  const UnicodeString &original,
                  FieldPositionIterator &fpi,
                  const UnicodeString &preExponent,
-                const DecimalFormatStaticSets &decimalFormatSets,
                  UnicodeString &appendTo,
                  UErrorCode &status) const = 0;
      private:
@@ -165,7 +163,6 @@ public:
                  const UnicodeString &original,
                  FieldPositionIterator &fpi,
                  const UnicodeString &preExponent,
-                const DecimalFormatStaticSets &decimalFormatSets,
                  UnicodeString &appendTo,
                  UErrorCode &status) const;
      };
@@ -184,7 +181,6 @@ public:
                  const UnicodeString &original,
                  FieldPositionIterator &fpi,
                  const UnicodeString &preExponent,
-                const DecimalFormatStaticSets &decimalFormatSets,
                  UnicodeString &appendTo,
                  UErrorCode &status) const;
      private:
@@ -211,7 +207,6 @@ public:
      UnicodeString fPreExponent;
      DecimalFormat *fDecimalFormat;
      Style *fStyle;
-    const DecimalFormatStaticSets *fStaticSets;
  
  };
  
diff --git a/icu4c/source/test/intltest/numfmtst.cpp b/icu4c/source/test/intltest/numfmtst.cpp

index 0d07750a4eaa7cabae82a47602c115ba1eb7b1a5..0e259e0d2920360045305ab9dab71aeaaf062da2 100644 (file)
--- a/icu4c/source/test/intltest/numfmtst.cpp
+++ b/icu4c/source/test/intltest/numfmtst.cpp
@@ -1412,7 +1412,7 @@ static const char *lenientAffixTestCases[] = {
  static const char *lenientMinusTestCases[] = {
      "-5",
      "\\u22125",
-    "\\u20105"
+    "\\u27965"
  };
  
  static const char *lenientCurrencyTestCases[] = {
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java

index cba2dc9384911756a1daaad2ba2d14199b434fe7..0148b36347ded699d8a83f7c97cecbd09cc9e66f 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java
@@ -5,7 +5,13 @@ package com.ibm.icu.impl.number.parse;
  import java.util.EnumMap;
  import java.util.Map;
  
+import com.ibm.icu.impl.ICUData;
+import com.ibm.icu.impl.ICUResourceBundle;
+import com.ibm.icu.impl.UResource;
+import com.ibm.icu.impl.UResource.Value;
  import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.util.ULocale;
+import com.ibm.icu.util.UResourceBundle;
  
  /**
   * This class statically initializes UnicodeSets useful for number parsing. Microbenchmarks show this to
@@ -20,8 +26,6 @@ import com.ibm.icu.text.UnicodeSet;
  public class UnicodeSetStaticCache {
      public static enum Key {
          // Ignorables
-        BIDI,
-        WHITESPACE,
          DEFAULT_IGNORABLES,
          STRICT_IGNORABLES,
  
@@ -47,9 +51,14 @@ public class UnicodeSetStaticCache {
          PERMILLE_SIGN,
          INFINITY,
  
+        // Currency Symbols
+        DOLLAR_SIGN,
+        POUND_SIGN,
+        RUPEE_SIGN,
+        YEN_SIGN, // not in CLDR data, but Currency.java wants it
+
          // Other
          DIGITS,
-        CWCF, // TODO: Check if this is being used and remove it if not.
  
          // Combined Separators with Digits (for lead code points)
          DIGITS_OR_ALL_SEPARATORS,
@@ -70,6 +79,20 @@ public class UnicodeSetStaticCache {
          return get(key1).contains(str) ? key1 : chooseFrom(str, key2);
      }
  
+    public static Key chooseCurrency(String str) {
+        if (get(Key.DOLLAR_SIGN).contains(str)) {
+            return Key.DOLLAR_SIGN;
+        } else if (get(Key.POUND_SIGN).contains(str)) {
+            return Key.POUND_SIGN;
+        } else if (get(Key.RUPEE_SIGN).contains(str)) {
+            return Key.RUPEE_SIGN;
+        } else if (get(Key.YEN_SIGN).contains(str)) {
+            return Key.YEN_SIGN;
+        } else {
+            return null;
+        }
+    }
+
      private static UnicodeSet computeUnion(Key k1, Key k2) {
          return new UnicodeSet().addAll(get(k1)).addAll(get(k2)).freeze();
      }
@@ -78,23 +101,98 @@ public class UnicodeSetStaticCache {
          return new UnicodeSet().addAll(get(k1)).addAll(get(k2)).addAll(get(k3)).freeze();
      }
  
-    static {
-        // These characters are skipped over and ignored at any point in the string, even in strict mode.
-        // See ticket #13084.
-        unicodeSets.put(Key.BIDI, new UnicodeSet("[[:DI:]]").freeze());
+    private static void saveSet(Key key, String unicodeSetPattern) {
+        assert unicodeSets.get(key) == null;
+        unicodeSets.put(key, new UnicodeSet(unicodeSetPattern).freeze());
+    }
  
-        // This set was decided after discussion with icu-design@. See ticket #13309.
+    /*
+    parse{
+        date{
+            lenient{
+                "[\\--/]",
+                "[\\:∶]",
+            }
+        }
+        general{
+            lenient{
+                "[.․。︒﹒．｡]",
+                "[\$﹩＄$]",
+                "[£₤]",
+                "[₨₹{Rp}{Rs}]",
+            }
+        }
+        number{
+            lenient{
+                "[\\-‒⁻₋−➖﹣－]",
+                "[,،٫、︐︑﹐﹑，､]",
+                "[+⁺₊➕﬩﹢＋]",
+            }
+            stricter{
+                "[,٫︐﹐，]",
+                "[.․﹒．｡]",
+            }
+        }
+    }
+     */
+    static class ParseDataSink extends UResource.Sink {
+        @Override
+        public void put(com.ibm.icu.impl.UResource.Key key, Value value, boolean noFallback) {
+            UResource.Table contextsTable = value.getTable();
+            for (int i = 0; contextsTable.getKeyAndValue(i, key, value); i++) {
+                if (key.contentEquals("date")) {
+                    // ignore
+                } else {
+                    assert key.contentEquals("general") || key.contentEquals("number");
+                    UResource.Table strictnessTable = value.getTable();
+                    for (int j = 0; strictnessTable.getKeyAndValue(j, key, value); j++) {
+                        boolean isLenient = key.contentEquals("lenient");
+                        UResource.Array array = value.getArray();
+                        for (int k = 0; k < array.getSize(); k++) {
+                            array.getValue(k, value);
+                            String str = value.toString();
+                            // There is both lenient and strict data for comma/period,
+                            // but not for any of the other symbols.
+                            if (str.indexOf('.') != -1) {
+                                saveSet(isLenient ? Key.PERIOD : Key.STRICT_PERIOD, str);
+                            } else if (str.indexOf(',') != -1) {
+                                saveSet(isLenient ? Key.COMMA : Key.STRICT_COMMA, str);
+                            } else if (str.indexOf('+') != -1) {
+                                saveSet(Key.PLUS_SIGN, str);
+                            } else if (str.indexOf('‒') != -1) {
+                                saveSet(Key.MINUS_SIGN, str);
+                            } else if (str.indexOf('$') != -1) {
+                                saveSet(Key.DOLLAR_SIGN, str);
+                            } else if (str.indexOf('£') != -1) {
+                                saveSet(Key.POUND_SIGN, str);
+                            } else if (str.indexOf('₨') != -1) {
+                                saveSet(Key.RUPEE_SIGN, str);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    static {
+        // These sets were decided after discussion with icu-design@. See tickets #13084 and #13309.
          // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property).
-        unicodeSets.put(Key.WHITESPACE, new UnicodeSet("[[:Zs:][\\u0009]]").freeze());
+        unicodeSets.put(Key.DEFAULT_IGNORABLES,
+                new UnicodeSet("[[:Zs:][\\u0009][:Bidi_Control:][:Variation_Selector:]]").freeze());
+        unicodeSets.put(Key.STRICT_IGNORABLES, new UnicodeSet("[[:Bidi_Control:]]").freeze());
  
-        unicodeSets.put(Key.DEFAULT_IGNORABLES, computeUnion(Key.BIDI, Key.WHITESPACE));
-        unicodeSets.put(Key.STRICT_IGNORABLES, get(Key.BIDI));
+        // CLDR provides data for comma, period, minus sign, and plus sign.
+        ICUResourceBundle rb = (ICUResourceBundle) UResourceBundle
+                .getBundleInstance(ICUData.ICU_BASE_NAME, ULocale.ROOT);
+        rb.getAllItemsWithFallback("parse", new ParseDataSink());
+
+        // TODO: Should there be fallback behavior if for some reason these sets didn't get populated?
+        assert unicodeSets.containsKey(Key.COMMA);
+        assert unicodeSets.containsKey(Key.STRICT_COMMA);
+        assert unicodeSets.containsKey(Key.PERIOD);
+        assert unicodeSets.containsKey(Key.STRICT_PERIOD);
  
-        // TODO: Re-generate these sets from the UCD. They probably haven't been updated in a while.
-        unicodeSets.put(Key.COMMA, new UnicodeSet("[,،٫、︐︑﹐﹑，､]").freeze());
-        unicodeSets.put(Key.STRICT_COMMA, new UnicodeSet("[,٫︐﹐，]").freeze());
-        unicodeSets.put(Key.PERIOD, new UnicodeSet("[.․。︒﹒．｡]").freeze());
-        unicodeSets.put(Key.STRICT_PERIOD, new UnicodeSet("[.․﹒．｡]").freeze());
          unicodeSets.put(Key.OTHER_GROUPING_SEPARATORS,
                  new UnicodeSet("['٬‘’＇\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]").freeze());
          unicodeSets.put(Key.ALL_SEPARATORS,
@@ -102,15 +200,19 @@ public class UnicodeSetStaticCache {
          unicodeSets.put(Key.STRICT_ALL_SEPARATORS,
                  computeUnion(Key.STRICT_COMMA, Key.STRICT_PERIOD, Key.OTHER_GROUPING_SEPARATORS));
  
-        unicodeSets.put(Key.MINUS_SIGN, new UnicodeSet("[-⁻₋−➖﹣－]").freeze());
-        unicodeSets.put(Key.PLUS_SIGN, new UnicodeSet("[+⁺₊➕﬩﹢＋]").freeze());
+        assert unicodeSets.containsKey(Key.MINUS_SIGN);
+        assert unicodeSets.containsKey(Key.PLUS_SIGN);
  
          unicodeSets.put(Key.PERCENT_SIGN, new UnicodeSet("[%٪]").freeze());
          unicodeSets.put(Key.PERMILLE_SIGN, new UnicodeSet("[‰؉]").freeze());
          unicodeSets.put(Key.INFINITY, new UnicodeSet("[∞]").freeze());
  
+        assert unicodeSets.containsKey(Key.DOLLAR_SIGN);
+        assert unicodeSets.containsKey(Key.POUND_SIGN);
+        assert unicodeSets.containsKey(Key.RUPEE_SIGN);
+        unicodeSets.put(Key.YEN_SIGN, new UnicodeSet("[¥\\uffe5]").freeze());
+
          unicodeSets.put(Key.DIGITS, new UnicodeSet("[:digit:]").freeze());
-        unicodeSets.put(Key.CWCF, new UnicodeSet("[:CWCF:]").freeze());
  
          unicodeSets.put(Key.DIGITS_OR_ALL_SEPARATORS, computeUnion(Key.DIGITS, Key.ALL_SEPARATORS));
          unicodeSets.put(Key.DIGITS_OR_STRICT_ALL_SEPARATORS,
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/util/Currency.java b/icu4j/main/classes/core/src/com/ibm/icu/util/Currency.java

index a05cebd804da31352d59225cc01b89804a4a73da..03febbe1433fa2da2fdd439ee7f0de4a96c8b424 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/util/Currency.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/util/Currency.java
@@ -14,7 +14,6 @@ import java.text.ParsePosition;
  import java.util.ArrayList;
  import java.util.Collections;
  import java.util.Date;
-import java.util.HashMap;
  import java.util.HashSet;
  import java.util.Iterator;
  import java.util.List;
@@ -31,10 +30,12 @@ import com.ibm.icu.impl.ICUResourceBundle;
  import com.ibm.icu.impl.SimpleCache;
  import com.ibm.icu.impl.SoftCache;
  import com.ibm.icu.impl.TextTrieMap;
+import com.ibm.icu.impl.number.parse.UnicodeSetStaticCache;
  import com.ibm.icu.text.CurrencyDisplayNames;
  import com.ibm.icu.text.CurrencyMetaInfo;
  import com.ibm.icu.text.CurrencyMetaInfo.CurrencyDigits;
  import com.ibm.icu.text.CurrencyMetaInfo.CurrencyFilter;
+import com.ibm.icu.text.UnicodeSet;
  import com.ibm.icu.util.ULocale.Category;
  
  /**
@@ -98,13 +99,6 @@ public class Currency extends MeasureUnit {
       */
      public static final int NARROW_SYMBOL_NAME = 3;
  
-    private static final EquivalenceRelation<String> EQUIVALENT_CURRENCY_SYMBOLS =
-            new EquivalenceRelation<String>()
-            .add("\u00a5", "\uffe5")
-            .add("$", "\ufe69", "\uff04")
-            .add("\u20a8", "\u20b9")
-            .add("\u00a3", "\u20a4");
-
      /**
       * Currency Usage used for Decimal Format
       * @stable ICU 54
@@ -778,8 +772,16 @@ public class Currency extends MeasureUnit {
              String isoCode = e.getValue();
              // Register under not just symbol, but under every equivalent symbol as well
              // e.g short width yen and long width yen.
-            for (String equivalentSymbol : EQUIVALENT_CURRENCY_SYMBOLS.get(symbol)) {
-                symTrie.put(equivalentSymbol, new CurrencyStringInfo(isoCode, symbol));
+            UnicodeSetStaticCache.Key key = UnicodeSetStaticCache.chooseCurrency(symbol);
+            CurrencyStringInfo value = new CurrencyStringInfo(isoCode, symbol);
+            if (key != null) {
+                UnicodeSet equivalents = UnicodeSetStaticCache.get(key);
+                // The symbol itself is included in the UnicodeSet
+                for (String equivalentSymbol : equivalents) {
+                    symTrie.put(equivalentSymbol, value);
+                }
+            } else {
+                symTrie.put(symbol, value);
              }
          }
          for (Map.Entry<String, String> e : names.nameMap().entrySet()) {
@@ -1039,34 +1041,6 @@ public class Currency extends MeasureUnit {
          return info.currencies(filter.withTender());
      }
  
-    private static final class EquivalenceRelation<T> {
-
-        private Map<T, Set<T>> data = new HashMap<T, Set<T>>();
-
-        @SuppressWarnings("unchecked")  // See ticket #11395, this is safe.
-        public EquivalenceRelation<T> add(T... items) {
-            Set<T> group = new HashSet<T>();
-            for (T item : items) {
-                if (data.containsKey(item)) {
-                    throw new IllegalArgumentException("All groups passed to add must be disjoint.");
-                }
-                group.add(item);
-            }
-            for (T item : items) {
-                data.put(item, group);
-            }
-            return this;
-        }
-
-        public Set<T> get(T item) {
-            Set<T> result = data.get(item);
-            if (result == null) {
-                return Collections.singleton(item);
-            }
-            return Collections.unmodifiableSet(result);
-        }
-    }
-
      private Object writeReplace() throws ObjectStreamException {
          return new MeasureUnitProxy(type, subType);
      }
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java

index b442d3b3029405cea6ec3576f27e949c668984ef..de6c39277790afb05a1307612255977ae4a80b7f 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java
@@ -1764,7 +1764,7 @@ public class NumberFormatTest extends TestFmwk {
          }
  
          // Test default ignorable characters.  These should work in both lenient and strict.
-        UnicodeSet defaultIgnorables = new UnicodeSet("[[:Default_Ignorable_Code_Point:]]").freeze();
+        UnicodeSet defaultIgnorables = new UnicodeSet("[[:Bidi_Control:]]").freeze();
          fmt.setParseStrict(false);
          for (String ignorable : defaultIgnorables) {
              String str = "a b " + ignorable + "1234c  ";
author	Shane Carr <shane@unicode.org>
	Thu, 12 Apr 2018 10:59:37 +0000 (10:59 +0000)
committer	Shane Carr <shane@unicode.org>
	Thu, 12 Apr 2018 10:59:37 +0000 (10:59 +0000)
icu4c/source/common/Makefile.in		patch \| blob \| history
icu4c/source/common/numparse_unisets.cpp	[new file with mode: 0644]	patch \| blob
icu4c/source/common/numparse_unisets.h	[moved from icu4c/source/i18n/numparse_unisets.h with 71% similarity]	patch \| blob \| history
icu4c/source/common/ucln_cmn.h		patch \| blob \| history
icu4c/source/common/ucurr.cpp		patch \| blob \| history
icu4c/source/i18n/Makefile.in		patch \| blob \| history
icu4c/source/i18n/decfmtst.cpp	[deleted file]	patch \| blob \| history
icu4c/source/i18n/decfmtst.h	[deleted file]	patch \| blob \| history
icu4c/source/i18n/numparse_unisets.cpp	[deleted file]	patch \| blob \| history
icu4c/source/i18n/scientificnumberformatter.cpp		patch \| blob \| history
icu4c/source/i18n/ucln_in.h		patch \| blob \| history
icu4c/source/i18n/unicode/scientificnumberformatter.h		patch \| blob \| history
icu4c/source/test/intltest/numfmtst.cpp		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/util/Currency.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java		patch \| blob \| history