]> granicus.if.org Git - icu/commitdiff
ICU-13574 Adding composition matchers (SeriesMatcher and AnyMatcher) to ICU4C in...
authorShane Carr <shane@unicode.org>
Sat, 10 Feb 2018 06:36:07 +0000 (06:36 +0000)
committerShane Carr <shane@unicode.org>
Sat, 10 Feb 2018 06:36:07 +0000 (06:36 +0000)
X-SVN-Rev: 40890

25 files changed:
icu4c/source/i18n/Makefile.in
icu4c/source/i18n/numparse_affixes.cpp [new file with mode: 0644]
icu4c/source/i18n/numparse_affixes.h [new file with mode: 0644]
icu4c/source/i18n/numparse_compositions.cpp [new file with mode: 0644]
icu4c/source/i18n/numparse_compositions.h [new file with mode: 0644]
icu4c/source/i18n/numparse_currency.cpp
icu4c/source/i18n/numparse_currency.h
icu4c/source/i18n/numparse_decimal.cpp
icu4c/source/i18n/numparse_decimal.h
icu4c/source/i18n/numparse_impl.cpp
icu4c/source/i18n/numparse_impl.h
icu4c/source/i18n/numparse_scientific.cpp
icu4c/source/i18n/numparse_scientific.h
icu4c/source/i18n/numparse_symbols.cpp
icu4c/source/i18n/numparse_symbols.h
icu4c/source/i18n/numparse_types.h
icu4c/source/test/intltest/intltest.cpp
icu4c/source/test/intltest/intltest.h
icu4c/source/test/intltest/numbertest.h
icu4c/source/test/intltest/numbertest_parse.cpp
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyCustomMatcher.java [moved from icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyMatcher.java with 86% similarity]
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyNamesMatcher.java [moved from icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyTrieMatcher.java with 77% similarity]
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/MatcherFactory.java
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java
icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberParserTest.java

index a24adbeb08dedab03cadbd36c2e6b525a8248ce4..d05b907c368a32cfdbba2e9cafaa386c6684c762 100644 (file)
@@ -110,7 +110,7 @@ number_padding.o number_patternmodifier.o number_patternstring.o \
 number_rounding.o number_scientific.o number_stringbuilder.o \
 numparse_stringsegment.o numparse_unisets.o numparse_parsednumber.o \
 numparse_impl.o numparse_symbols.o numparse_decimal.o numparse_scientific.o \
-numparse_currency.o
+numparse_currency.o numparse_affixes.o numparse_compositions.o
 
 
 ## Header files to install
diff --git a/icu4c/source/i18n/numparse_affixes.cpp b/icu4c/source/i18n/numparse_affixes.cpp
new file mode 100644 (file)
index 0000000..2ac929d
--- /dev/null
@@ -0,0 +1,20 @@
+// © 2018 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
+
+#include "numparse_types.h"
+#include "numparse_affixes.h"
+
+using namespace icu;
+using namespace icu::numparse;
+using namespace icu::numparse::impl;
+
+
+
+
+
+
+#endif /* #if !UCONFIG_NO_FORMATTING */
diff --git a/icu4c/source/i18n/numparse_affixes.h b/icu4c/source/i18n/numparse_affixes.h
new file mode 100644 (file)
index 0000000..677b50c
--- /dev/null
@@ -0,0 +1,25 @@
+// © 2018 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
+#ifndef __NUMPARSE_AFFIXES_H__
+#define __NUMPARSE_AFFIXES_H__
+
+#include "numparse_types.h"
+
+U_NAMESPACE_BEGIN
+namespace numparse {
+namespace impl {
+
+
+
+
+
+} // namespace impl
+} // namespace numparse
+U_NAMESPACE_END
+
+#endif //__NUMPARSE_AFFIXES_H__
+#endif /* #if !UCONFIG_NO_FORMATTING */
diff --git a/icu4c/source/i18n/numparse_compositions.cpp b/icu4c/source/i18n/numparse_compositions.cpp
new file mode 100644 (file)
index 0000000..5d4a92b
--- /dev/null
@@ -0,0 +1,108 @@
+// © 2018 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
+
+#include "numparse_types.h"
+#include "numparse_compositions.h"
+#include "unicode/uniset.h"
+
+using namespace icu;
+using namespace icu::numparse;
+using namespace icu::numparse::impl;
+
+
+bool AnyMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const {
+    // Runs the child matchers in list order and stops at the first one that consumes input.
+    // Returns true if any matcher that was run reported that more input could change its answer.
+    int32_t initialOffset = segment.getOffset();
+    bool maybeMore = false;
+
+    // NOTE: The range-based for loop calls the virtual begin() and end() methods.
+    for (auto* matcher : *this) {
+        // Call match() unconditionally. Writing `maybeMore || matcher->match(...)` would
+        // short-circuit and skip every remaining matcher once one of them asked for more input.
+        bool matcherMaybeMore = matcher->match(segment, result, status);
+        maybeMore = maybeMore || matcherMaybeMore;
+        if (segment.getOffset() != initialOffset) {
+            // Match succeeded.
+            // NOTE: Except for a couple edge cases, if a matcher accepted string A, then it will
+            // accept any string starting with A. Therefore, there is no possibility that matchers
+            // later in the list may be evaluated on longer strings, and we can exit the loop here.
+            break;
+        }
+    }
+
+    // Reached both when a matcher consumed input (early exit above) and when none did.
+    return maybeMore;
+}
+
+void AnyMatcher::postProcess(ParsedNumber& result) const {
+    // Forward the post-processing step to every child matcher in list order.
+    // begin() and end() are virtual and come from the concrete subclass.
+    auto* last = end();
+    for (auto* it = begin(); it != last; ++it) {
+        (*it)->postProcess(result);
+    }
+}
+
+
+bool SeriesMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const {
+    // Snapshot of the outputs on entry, so that a failed series leaves both of them untouched.
+    const ParsedNumber snapshot(result);
+    const int32_t seriesStart = segment.getOffset();
+
+    bool maybeMore = true;
+    auto* cursor = begin();
+    while (cursor < end()) {
+        const NumberParseMatcher* current = *cursor;
+        const int offsetBefore = segment.getOffset();
+
+        // With an exhausted segment there is nothing for this matcher to look at; ask for more.
+        maybeMore = (segment.length() == 0) ? true : current->match(segment, result, status);
+
+        const bool consumed = (segment.getOffset() != offsetBefore);
+        const bool flexible = current->isFlexible();
+
+        if (!consumed && !flexible) {
+            // A required (non-flexible) matcher failed: roll everything back and exit.
+            segment.setOffset(seriesStart);
+            result = snapshot;
+            return maybeMore;
+        }
+
+        // A flexible matcher that consumed input is run again on the remaining input.
+        // In every other surviving case (required matcher succeeded, or flexible matcher
+        // found nothing) the series moves on to the next matcher.
+        if (!(consumed && flexible)) {
+            ++cursor;
+        }
+    }
+
+    // Every matcher in the series has been processed without a required one failing.
+    return maybeMore;
+}
+
+void SeriesMatcher::postProcess(ParsedNumber& result) const {
+    // Walk the series front to back and let each matcher post-process the result.
+    // The bounds come from the virtual begin() and end() methods.
+    const NumberParseMatcher* const* stop = end();
+    for (const NumberParseMatcher* const* pos = begin(); pos != stop; ++pos) {
+        const NumberParseMatcher* child = *pos;
+        child->postProcess(result);
+    }
+}
+
+
+// Stores the array pointer in fMatchers and remembers how many entries it holds.
+ArraySeriesMatcher::ArraySeriesMatcher(NumberParseMatcher** matchers, int32_t matchersLen)
+        : fMatchers(matchers),
+          fMatchersLen(matchersLen) {
+}
+
+const UnicodeSet& ArraySeriesMatcher::getLeadCodePoints() {
+    // The lead code points of the series are those of its first matcher, so the array must
+    // contain at least one entry before fMatchers[0] may be read.
+    U_ASSERT(fMatchersLen > 0);
+    // SeriesMatchers are never allowed to start with a Flexible matcher.
+    U_ASSERT(!fMatchers[0]->isFlexible());
+    return fMatchers[0]->getLeadCodePoints();
+}
+
+const NumberParseMatcher* const* ArraySeriesMatcher::begin() const {
+    // First slot of the matcher array.
+    NumberParseMatcher* const* first = fMatchers.getAlias();
+    return first;
+}
+
+const NumberParseMatcher* const* ArraySeriesMatcher::end() const {
+    // One past the last slot of the matcher array.
+    NumberParseMatcher* const* first = fMatchers.getAlias();
+    return first + fMatchersLen;
+}
+
+
+#endif /* #if !UCONFIG_NO_FORMATTING */
diff --git a/icu4c/source/i18n/numparse_compositions.h b/icu4c/source/i18n/numparse_compositions.h
new file mode 100644 (file)
index 0000000..b52bb2f
--- /dev/null
@@ -0,0 +1,100 @@
+// © 2018 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
+#ifndef __SOURCE_NUMPARSE_COMPOSITIONS__
+#define __SOURCE_NUMPARSE_COMPOSITIONS__
+
+#include "numparse_types.h"
+
+U_NAMESPACE_BEGIN namespace numparse {
+namespace impl {
+
+
+/**
+ * Base class for AnyMatcher and SeriesMatcher.
+ *
+ * Declares the begin()/end() hooks through which a composition iterates over its child
+ * matchers. Both are pure virtual, so the concrete subclass decides where the pointers live.
+ */
+class CompositionMatcher : public NumberParseMatcher {
+  protected:
+    // No construction except by subclasses!
+    CompositionMatcher() = default;
+
+    // To be overridden by subclasses (used for iteration):
+    // start of the range of child matcher pointers.
+    virtual const NumberParseMatcher* const* begin() const = 0;
+
+    // To be overridden by subclasses (used for iteration):
+    // end of the range of child matcher pointers (paired with begin()).
+    virtual const NumberParseMatcher* const* end() const = 0;
+};
+
+
+/**
+ * Composes a number of matchers, and succeeds if any of the matchers succeed. Always greedily chooses
+ * the first matcher in the list to succeed.
+ *
+ * NOTE: In C++, this is a base class, unlike ICU4J, which uses a factory-style interface.
+ * The child matchers are obtained through the begin()/end() hooks inherited from
+ * CompositionMatcher, which a concrete subclass must implement.
+ *
+ * @author sffc
+ * @see SeriesMatcher
+ */
+class AnyMatcher : public CompositionMatcher {
+  public:
+    /**
+     * Iterates over the child matchers and stops as soon as the segment offset has advanced.
+     * The return value is built from the values returned by the child matchers that were run.
+     */
+    bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override;
+
+    /** Calls postProcess() on every child matcher. */
+    void postProcess(ParsedNumber& result) const override;
+
+  protected:
+    // No construction except by subclasses!
+    AnyMatcher() = default;
+};
+
+
+/**
+ * Composes a number of matchers, running one after another. Matches the input string only if all of the
+ * matchers in the series succeed. Performs greedy matches within the context of the series.
+ *
+ * Matchers whose isFlexible() returns true are optional within the series: they are re-run while
+ * they keep consuming input and are skipped when they consume nothing.
+ *
+ * @author sffc
+ * @see AnyMatcher
+ */
+class SeriesMatcher : public CompositionMatcher {
+  public:
+    /**
+     * Runs the series against the segment. If a non-flexible matcher consumes nothing, the segment
+     * offset and the result are restored to the values they had on entry.
+     */
+    bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override;
+
+    /** Calls postProcess() on every matcher in the series. */
+    void postProcess(ParsedNumber& result) const override;
+
+  protected:
+    // No construction except by subclasses!
+    SeriesMatcher() = default;
+};
+
+
+/**
+ * An implementation of SeriesMatcher that references an array of matchers.
+ *
+ * The object adopts the array, but NOT the matchers contained inside the array.
+ */
+class ArraySeriesMatcher : public SeriesMatcher {
+  public:
+    /**
+     * The array is adopted, but NOT the matchers inside the array.
+     *
+     * @param matchers Array of matcher pointers; stored in a LocalArray member.
+     * @param matchersLen Number of entries in the array.
+     */
+    ArraySeriesMatcher(NumberParseMatcher** matchers, int32_t matchersLen);
+
+    /** Returns the lead code points of the first matcher in the array. */
+    const UnicodeSet& getLeadCodePoints() override;
+
+  protected:
+    // Start of the matcher array.
+    const NumberParseMatcher* const* begin() const override;
+
+    // Start of the matcher array plus fMatchersLen.
+    const NumberParseMatcher* const* end() const override;
+
+  private:
+    // Holds the array passed to the constructor.
+    LocalArray<NumberParseMatcher*> fMatchers;
+    // Number of entries in fMatchers.
+    int32_t fMatchersLen;
+};
+
+
+} // namespace impl
+} // namespace numparse
+U_NAMESPACE_END
+
+#endif //__SOURCE_NUMPARSE_COMPOSITIONS__
+#endif /* #if !UCONFIG_NO_FORMATTING */
index 7a78a3bbb7d3d03ce3bd52954e803c469759b7ea..90b6bed6dd03afa32aed7c519a8c350139eae4ea 100644 (file)
@@ -9,12 +9,23 @@
 #include "numparse_currency.h"
 #include "ucurrimp.h"
 #include "unicode/errorcode.h"
+#include "numparse_utils.h"
 
 using namespace icu;
 using namespace icu::numparse;
 using namespace icu::numparse::impl;
 
 
+namespace {
+
+// Copies three code units from src into dest and writes a terminating zero after them,
+// so dest must have room for four UChars.
+inline void copyCurrencyCode(UChar* dest, const UChar* src) {
+    constexpr int32_t kCodeLength = 3;
+    uprv_memcpy(dest, src, kCodeLength * sizeof(UChar));
+    dest[kCodeLength] = 0;
+}
+
+}
+
+
 CurrencyNamesMatcher::CurrencyNamesMatcher(const Locale& locale, UErrorCode& status)
         : fLocaleName(locale.getName(), -1, status) {}
 
@@ -52,15 +63,84 @@ bool CurrencyNamesMatcher::match(StringSegment& segment, ParsedNumber& result, U
     return partialMatch;
 }
 
-const UnicodeSet* CurrencyNamesMatcher::getLeadCodePoints() const {
-    ErrorCode status;
-    UnicodeSet* leadCodePoints = new UnicodeSet();
-    uprv_currencyLeads(fLocaleName.data(), *leadCodePoints, status);
-    // Always apply case mapping closure for currencies
-    leadCodePoints->closeOver(USET_ADD_CASE_MAPPINGS);
-    leadCodePoints->freeze();
+const UnicodeSet& CurrencyNamesMatcher::getLeadCodePoints() {
+    // The set is built on the first call and kept in fLocalLeadCodePoints afterwards.
+    if (!fLocalLeadCodePoints.isNull()) {
+        return *fLocalLeadCodePoints;
+    }
+
+    ErrorCode status;
+    auto* leads = new UnicodeSet();
+    uprv_currencyLeads(fLocaleName.data(), *leads, status);
+    // Always apply case mapping closure for currencies
+    leads->closeOver(USET_ADD_CASE_MAPPINGS);
+    leads->freeze();
+    fLocalLeadCodePoints.adoptInstead(leads);
+    return *fLocalLeadCodePoints;
+}
+
+
+// Keeps copies of the two currency strings and of the three-code-unit currency code.
+CurrencyCustomMatcher::CurrencyCustomMatcher(const char16_t* currencyCode, const UnicodeString& currency1,
+                                             const UnicodeString& currency2)
+        : fCurrency1(currency1),
+          fCurrency2(currency2) {
+    // fCurrencyCode is a plain array, so it is filled in here rather than in the initializer list.
+    copyCurrencyCode(fCurrencyCode, currencyCode);
+}
+
+bool CurrencyCustomMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode&) const {
+    // Accepts fCurrency1 or fCurrency2 at the start of the segment and records fCurrencyCode.
+    // Returns true when more input could still produce or extend a match.
+
+    // A currency code has already been recorded in the result; do not match another one.
+    if (result.currencyCode[0] != 0) {
+        return false;
+    }
+
+    int overlap1 = segment.getCommonPrefixLength(fCurrency1);
+    if (overlap1 == fCurrency1.length()) {
+        copyCurrencyCode(result.currencyCode, fCurrencyCode);
+        segment.adjustOffset(overlap1);
+        result.setCharsConsumed(segment);
+        // Stop after the first full match. Falling through would test fCurrency2 against the
+        // already-advanced segment and could consume a second currency string in the same call.
+        return segment.length() == 0;
+    }
+
+    int overlap2 = segment.getCommonPrefixLength(fCurrency2);
+    if (overlap2 == fCurrency2.length()) {
+        copyCurrencyCode(result.currencyCode, fCurrencyCode);
+        segment.adjustOffset(overlap2);
+        result.setCharsConsumed(segment);
+        return segment.length() == 0;
+    }
+
+    // Neither string matched in full, so the segment offset is unchanged here. Ask for more
+    // input only if the whole segment is a prefix of one of the two strings.
+    return overlap1 == segment.length() || overlap2 == segment.length();
+}
+
+const UnicodeSet& CurrencyCustomMatcher::getLeadCodePoints() {
+    // Built on the first call from the lead code point of each currency string, then reused.
+    if (!fLocalLeadCodePoints.isNull()) {
+        return *fLocalLeadCodePoints;
+    }
+
+    auto* leads = new UnicodeSet();
+    utils::putLeadCodePoint(fCurrency1, leads);
+    utils::putLeadCodePoint(fCurrency2, leads);
+    leads->freeze();
+    fLocalLeadCodePoints.adoptInstead(leads);
+    return *fLocalLeadCodePoints;
+}
+
+
+// Moves both matchers into this object and points the iteration array at the stored copies.
+CurrencyAnyMatcher::CurrencyAnyMatcher(CurrencyNamesMatcher namesMatcher,
+                                       CurrencyCustomMatcher customMatcher)
+        : fNamesMatcher(std::move(namesMatcher)),
+          fCustomMatcher(std::move(customMatcher)) {
+    // Order matters: the names matcher is tried before the custom matcher.
+    fMatcherArray[0] = &fNamesMatcher;
+    fMatcherArray[1] = &fCustomMatcher;
+}
+
+const UnicodeSet& CurrencyAnyMatcher::getLeadCodePoints() {
+    // Union of the lead code points of both child matchers, built on the first call and reused.
+    if (!fLocalLeadCodePoints.isNull()) {
+        return *fLocalLeadCodePoints;
+    }
+
+    auto* leads = new UnicodeSet();
+    leads->addAll(fNamesMatcher.getLeadCodePoints());
+    leads->addAll(fCustomMatcher.getLeadCodePoints());
+    leads->freeze();
+    fLocalLeadCodePoints.adoptInstead(leads);
+    return *fLocalLeadCodePoints;
+}
+
+const NumberParseMatcher* const* CurrencyAnyMatcher::begin() const {
+    // First of the two slots filled in by the constructor.
+    return &fMatcherArray[0];
+}
 
-    return leadCodePoints;
+const NumberParseMatcher* const* CurrencyAnyMatcher::end() const {
+    return fMatcherArray + 2;
 }
 
 
index 49b367a8964509616092e9cc1caa4604bd617ee7..f5f56c860049dd85f0151390d24072b6873538d6 100644 (file)
@@ -8,6 +8,7 @@
 #define __NUMPARSE_CURRENCY_H__
 
 #include "numparse_types.h"
+#include "numparse_compositions.h"
 #include "charstr.h"
 
 U_NAMESPACE_BEGIN namespace numparse {
@@ -29,7 +30,7 @@ class CurrencyNamesMatcher : public NumberParseMatcher, public UMemory {
 
     bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override;
 
-    const UnicodeSet* getLeadCodePoints() const override;
+    const UnicodeSet& getLeadCodePoints() override;
 
   private:
     // We could use Locale instead of CharString here, but
@@ -39,6 +40,45 @@ class CurrencyNamesMatcher : public NumberParseMatcher, public UMemory {
 };
 
 
+/**
+ * Matches either of two caller-supplied currency strings at the start of the segment and, on a
+ * match, writes the caller-supplied currency code into the ParsedNumber.
+ */
+class CurrencyCustomMatcher : public NumberParseMatcher, public UMemory {
+  public:
+    /**
+     * @param currencyCode Code to record on a match; three code units are copied from it.
+     * @param currency1 First string to accept.
+     * @param currency2 Second string to accept.
+     */
+    CurrencyCustomMatcher(const char16_t* currencyCode, const UnicodeString& currency1,
+                          const UnicodeString& currency2);
+
+    /** Does nothing and returns false if the result already carries a currency code. */
+    bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override;
+
+    /** Returns the set containing the lead code point of each of the two currency strings. */
+    const UnicodeSet& getLeadCodePoints() override;
+
+  private:
+    // Three code units followed by a terminating zero.
+    UChar fCurrencyCode[4];
+    UnicodeString fCurrency1;
+    UnicodeString fCurrency2;
+};
+
+
+/**
+ * An implementation of AnyMatcher, allowing for either currency data or locale currency matches.
+ *
+ * The two child matchers are stored by value inside this object; fMatcherArray exposes them to
+ * the AnyMatcher iteration hooks.
+ */
+class CurrencyAnyMatcher : public AnyMatcher, public UMemory {
+  public:
+    /** Calls std::move on the two arguments. */
+    CurrencyAnyMatcher(CurrencyNamesMatcher namesMatcher, CurrencyCustomMatcher customMatcher);
+
+    /** Returns the union of the lead code points of the two child matchers. */
+    const UnicodeSet& getLeadCodePoints() override;
+
+  protected:
+    // Start of fMatcherArray.
+    const NumberParseMatcher* const* begin() const override;
+
+    // End of fMatcherArray.
+    const NumberParseMatcher* const* end() const override;
+
+  private:
+    CurrencyNamesMatcher fNamesMatcher;
+    CurrencyCustomMatcher fCustomMatcher;
+
+    // Filled by the constructor with the addresses of fNamesMatcher and fCustomMatcher.
+    // NOTE(review): no copy or move operations are declared in this class. If the compiler
+    // generates one, the new object's array would still hold the addresses of the source
+    // object's members. Confirm that instances are never copied or moved, or add explicit
+    // operations that rebuild this array.
+    const NumberParseMatcher* fMatcherArray[2];
+};
+
+
 } // namespace impl
 } // namespace numparse
 U_NAMESPACE_END
index bfc9c4f8a712e2792bae3286eb350277e573ce7c..e80014fa59135b292bac0cc74f512c8302a5ea39 100644 (file)
@@ -291,22 +291,25 @@ bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, int8_t
     return segment.length() == 0 || hasPartialPrefix;
 }
 
-const UnicodeSet* DecimalMatcher::getLeadCodePoints() const {
+const UnicodeSet& DecimalMatcher::getLeadCodePoints() {
     if (fLocalDigitStrings.isNull() && leadSet != nullptr) {
-        return new UnicodeSet(*leadSet);
+        return *leadSet;
     }
 
-    auto* leadCodePoints = new UnicodeSet();
-    // Assumption: the sets are all single code points.
-    leadCodePoints->addAll(*unisets::get(unisets::DIGITS));
-    leadCodePoints->addAll(*separatorSet);
-    if (!fLocalDigitStrings.isNull()) {
-        for (int i = 0; i < 10; i++) {
-            utils::putLeadCodePoint(fLocalDigitStrings[i], leadCodePoints);
+    if (fLocalLeadCodePoints.isNull()) {
+        auto* leadCodePoints = new UnicodeSet();
+        // Assumption: the sets are all single code points.
+        leadCodePoints->addAll(*unisets::get(unisets::DIGITS));
+        leadCodePoints->addAll(*separatorSet);
+        if (!fLocalDigitStrings.isNull()) {
+            for (int i = 0; i < 10; i++) {
+                utils::putLeadCodePoint(fLocalDigitStrings[i], leadCodePoints);
+            }
         }
+        leadCodePoints->freeze();
+        fLocalLeadCodePoints.adoptInstead(leadCodePoints);
     }
-    leadCodePoints->freeze();
-    return leadCodePoints;
+    return *fLocalLeadCodePoints;
 }
 
 
index 9423a7786c94d0ce6b51e73b63363ae4836a911b..203cb66b4b44ef24b718cf2cca7beeaabb43e474 100644 (file)
@@ -27,7 +27,7 @@ class DecimalMatcher : public NumberParseMatcher, public UMemory {
     bool
     match(StringSegment& segment, ParsedNumber& result, int8_t exponentSign, UErrorCode& status) const;
 
-    const UnicodeSet* getLeadCodePoints() const override;
+    const UnicodeSet& getLeadCodePoints() override;
 
   private:
     /** If true, only accept strings whose grouping sizes match the locale */
@@ -56,7 +56,7 @@ class DecimalMatcher : public NumberParseMatcher, public UMemory {
     const UnicodeSet* leadSet;
 
     // Make this class the owner of a few objects that could be allocated.
-    // The first two LocalPointers are used for assigning ownership only.
+    // The first three LocalPointers are used for assigning ownership only.
     LocalPointer<const UnicodeSet> fLocalDecimalUniSet;
     LocalPointer<const UnicodeSet> fLocalSeparatorSet;
     LocalArray<const UnicodeString> fLocalDigitStrings;
index 2fe84fcbc978d85fe4e3746ae1f98561310d8ace..efa9b3cab2f2555ef08061fd551c2ba891c1d565 100644 (file)
@@ -32,7 +32,7 @@ NumberParserImpl::createSimpleParser(const Locale& locale, const UnicodeString&
     auto* parser = new NumberParserImpl(parseFlags, true);
     DecimalFormatSymbols symbols(locale, status);
 
-    parser->fLocalMatchers.ignorables = {unisets::DEFAULT_IGNORABLES};
+    parser->fLocalMatchers.ignorables = std::move(IgnorablesMatcher(unisets::DEFAULT_IGNORABLES));
 
 //    MatcherFactory factory = new MatcherFactory();
 //    factory.currency = Currency.getInstance("USD");
@@ -78,7 +78,7 @@ NumberParserImpl::~NumberParserImpl() {
     fNumMatchers = 0;
 }
 
-void NumberParserImpl::addMatcher(const NumberParseMatcher& matcher) {
+void NumberParserImpl::addMatcher(NumberParseMatcher& matcher) {
     if (fNumMatchers + 1 > fMatchers.getCapacity()) {
         fMatchers.resize(fNumMatchers * 2, fNumMatchers);
         if (fComputeLeads) {
@@ -97,17 +97,17 @@ void NumberParserImpl::addMatcher(const NumberParseMatcher& matcher) {
     fNumMatchers++;
 }
 
-void NumberParserImpl::addLeadCodePointsForMatcher(const NumberParseMatcher& matcher) {
-    const UnicodeSet* leadCodePoints = matcher.getLeadCodePoints();
+void NumberParserImpl::addLeadCodePointsForMatcher(NumberParseMatcher& matcher) {
+    const UnicodeSet& leadCodePoints = matcher.getLeadCodePoints();
     // TODO: Avoid the clone operation here.
     if (0 != (fParseFlags & PARSE_FLAG_IGNORE_CASE)) {
-        UnicodeSet* copy = static_cast<UnicodeSet*>(leadCodePoints->cloneAsThawed());
-        delete leadCodePoints;
+        auto* copy = dynamic_cast<UnicodeSet*>(leadCodePoints.cloneAsThawed());
         copy->closeOver(USET_ADD_CASE_MAPPINGS);
         copy->freeze();
         fLeads[fNumMatchers] = copy;
     } else {
-        fLeads[fNumMatchers] = leadCodePoints;
+        // FIXME: new here because we still take ownership
+        fLeads[fNumMatchers] = new UnicodeSet(leadCodePoints);
     }
 }
 
index 0fe45fa5f4246144110f04e84aebbaf098d4c623..abc826f590bc7d5e43cf29d17c6a571fe0f33799 100644 (file)
@@ -24,7 +24,7 @@ class NumberParserImpl {
     static NumberParserImpl* createSimpleParser(const Locale& locale, const UnicodeString& patternString,
                                                 parse_flags_t parseFlags, UErrorCode& status);
 
-    void addMatcher(const NumberParseMatcher& matcher);
+    void addMatcher(NumberParseMatcher& matcher);
 
     void freeze();
 
@@ -62,7 +62,7 @@ class NumberParserImpl {
 
     NumberParserImpl(parse_flags_t parseFlags, bool computeLeads);
 
-    void addLeadCodePointsForMatcher(const NumberParseMatcher& matcher);
+    void addLeadCodePointsForMatcher(NumberParseMatcher& matcher);
 
     void parseGreedyRecursive(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const;
 
index 3b69dcdc9973213d429ca1513afa9a9f53bd2011..18ade048fb2e851e8828e0e04be22f97958b3e6b 100644 (file)
@@ -67,17 +67,20 @@ bool ScientificMatcher::match(StringSegment& segment, ParsedNumber& result, UErr
     return false;
 }
 
-const UnicodeSet* ScientificMatcher::getLeadCodePoints() const {
+const UnicodeSet& ScientificMatcher::getLeadCodePoints() {
     UChar32 leadCp = fExponentSeparatorString.char32At(0);
     const UnicodeSet* s = unisets::get(unisets::SCIENTIFIC_LEAD);
     if (s->contains(leadCp)) {
-        return new UnicodeSet(*s);
-    } else {
-        UnicodeSet* leadCodePoints = new UnicodeSet();
+        return *s;
+    }
+
+    if (fLocalLeadCodePoints.isNull()) {
+        auto* leadCodePoints = new UnicodeSet();
         leadCodePoints->add(leadCp);
         leadCodePoints->freeze();
-        return leadCodePoints;
+        fLocalLeadCodePoints.adoptInstead(leadCodePoints);
     }
+    return *fLocalLeadCodePoints;
 }
 
 
index 544386c7c39c4b391ebe17685b20af6228496044..2f4118ff61874af6b7d104d527ab81b9ff8ff672 100644 (file)
@@ -25,7 +25,7 @@ class ScientificMatcher : public NumberParseMatcher, public UMemory {
 
     bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override;
 
-    const UnicodeSet* getLeadCodePoints() const override;
+    const UnicodeSet& getLeadCodePoints() override;
 
   private:
     UnicodeString fExponentSeparatorString;
index 8e192cf7736756416602b06cb1973c8da69097d5..6654bea7de088ef0704f200830a160f3d4f69846 100644 (file)
@@ -54,17 +54,20 @@ bool SymbolMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCo
     return overlap == segment.length();
 }
 
-const UnicodeSet* SymbolMatcher::getLeadCodePoints() const {
+const UnicodeSet& SymbolMatcher::getLeadCodePoints() {
     if (fString.isEmpty()) {
         // Assumption: for sets from UnicodeSetStaticCache, uniSet == leadCodePoints.
-        return new UnicodeSet(*fUniSet);
+        return *fUniSet;
     }
 
-    UnicodeSet* leadCodePoints = new UnicodeSet();
-    utils::putLeadCodePoints(fUniSet, leadCodePoints);
-    utils::putLeadCodePoint(fString, leadCodePoints);
-    leadCodePoints->freeze();
-    return leadCodePoints;
+    if (fLocalLeadCodePoints.isNull()) {
+        auto* leadCodePoints = new UnicodeSet();
+        utils::putLeadCodePoints(fUniSet, leadCodePoints);
+        utils::putLeadCodePoint(fString, leadCodePoints);
+        leadCodePoints->freeze();
+        fLocalLeadCodePoints.adoptInstead(leadCodePoints);
+    }
+    return *fLocalLeadCodePoints;
 }
 
 
@@ -86,7 +89,7 @@ void IgnorablesMatcher::accept(StringSegment&, ParsedNumber&) const {
 
 
 InfinityMatcher::InfinityMatcher(const DecimalFormatSymbols& dfs)
-        : SymbolMatcher(dfs.getConstSymbol(DecimalFormatSymbols::kNaNSymbol), unisets::INFINITY) {
+        : SymbolMatcher(dfs.getConstSymbol(DecimalFormatSymbols::kInfinitySymbol), unisets::INFINITY) {
 }
 
 bool InfinityMatcher::isDisabled(const ParsedNumber& result) const {
@@ -118,15 +121,15 @@ NanMatcher::NanMatcher(const DecimalFormatSymbols& dfs)
         : SymbolMatcher(dfs.getConstSymbol(DecimalFormatSymbols::kNaNSymbol), unisets::EMPTY) {
 }
 
-const UnicodeSet* NanMatcher::getLeadCodePoints() const {
+const UnicodeSet& NanMatcher::getLeadCodePoints() {
     // Overriding this here to allow use of statically allocated sets
     int leadCp = fString.char32At(0);
     const UnicodeSet* s = unisets::get(unisets::NAN_LEAD);
     if (s->contains(leadCp)) {
-        return new UnicodeSet(*s);
-    } else {
-        return SymbolMatcher::getLeadCodePoints();
+        return *s;
     }
+
+    return SymbolMatcher::getLeadCodePoints();
 }
 
 bool NanMatcher::isDisabled(const ParsedNumber& result) const {
@@ -146,11 +149,11 @@ bool PaddingMatcher::isFlexible() const {
     return true;
 }
 
-bool PaddingMatcher::isDisabled(const ParsedNumber& result) const {
+bool PaddingMatcher::isDisabled(const ParsedNumber&) const {
     return false;
 }
 
-void PaddingMatcher::accept(StringSegment& segment, ParsedNumber& result) const {
+void PaddingMatcher::accept(StringSegment&, ParsedNumber&) const {
     // No-op
 }
 
index 40a57f02baf7f046a75221572280b57c39d662dc..289b8902d963961d0463fa1a0acc6484203dcd2b 100644 (file)
@@ -28,7 +28,8 @@ class SymbolMatcher : public NumberParseMatcher, public UMemory {
 
     bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override;
 
-    const UnicodeSet* getLeadCodePoints() const override;
+    /** NOTE: This method is not guaranteed to be thread-safe. */
+    const UnicodeSet& getLeadCodePoints() override;
 
     virtual bool isDisabled(const ParsedNumber& result) const = 0;
 
@@ -92,7 +93,7 @@ class NanMatcher : public SymbolMatcher {
 
     NanMatcher(const DecimalFormatSymbols& dfs);
 
-    const UnicodeSet* getLeadCodePoints() const override;
+    const UnicodeSet& getLeadCodePoints() override;
 
   protected:
     bool isDisabled(const ParsedNumber& result) const override;
index 30ad92d371366901269322c5a104c1206e68b11f..76aa75e0fcbfe0e3a3669443381b2f6e1e67eb04 100644 (file)
@@ -244,8 +244,6 @@ class StringSegment : public UMemory, public ::icu::number::impl::CharSequence {
  */
 class NumberParseMatcher {
   public:
-    virtual ~NumberParseMatcher() = default;
-
     /**
      * Matchers can override this method to return true to indicate that they are optional and can be run
      * repeatedly. Used by SeriesMatcher, primarily in the context of IgnorablesMatcher.
@@ -259,6 +257,8 @@ class NumberParseMatcher {
      * something interesting in the StringSegment, it should update the offset of the StringSegment
      * corresponding to how many chars were matched.
      *
+     * This method is thread-safe.
+     *
      * @param segment
      *            The StringSegment to match against. Matches always start at the beginning of the
      *            segment. The segment is guaranteed to contain at least one char.
@@ -275,9 +275,12 @@ class NumberParseMatcher {
      * return value is used to skip this matcher unless a segment begins with a char in this set. To make
      * this matcher always run, return {@link UnicodeSet#ALL_CODE_POINTS}.
      *
-     * The returned UnicodeSet needs adoption!
+     * The returned UnicodeSet does not need adoption and is guaranteed to be alive for as long as the
+     * object that returned it.
+     *
+     * This method is NOT thread-safe.
      */
-    virtual const UnicodeSet* getLeadCodePoints() const = 0;
+    virtual const UnicodeSet& getLeadCodePoints() = 0;
 
     /**
      * Method called at the end of a parse, after all matchers have failed to consume any more chars.
@@ -290,6 +293,13 @@ class NumberParseMatcher {
     virtual void postProcess(ParsedNumber&) const {
         // Default implementation: no-op
     };
+
+  protected:
+    // No construction except by subclasses!
+    NumberParseMatcher() = default;
+
+    // Optional ownership of the leadCodePoints set
+    LocalPointer<const UnicodeSet> fLocalLeadCodePoints;
 };
 
 
index c7d67565b2379f9a036c4866daf3be0e6d384ce8..47b220afad7900ac1e6d2e40db45f1c72b39eadf 100644 (file)
@@ -238,6 +238,12 @@ UnicodeString toString(UBool b) {
   return b ? UnicodeString("TRUE"):UnicodeString("FALSE");
 }
 
+// Returns the pattern string of the given set, for use in test failure messages.
+// The status parameter is kept for callers but is not used: UnicodeSet::toPattern() takes no
+// error code, and its second argument is the escapeUnprintable flag. Passing status there
+// would silently convert the error code into that flag.
+UnicodeString toString(const UnicodeSet& uniset, UErrorCode& status) {
+    (void) status;
+    UnicodeString result;
+    uniset.toPattern(result, FALSE);
+    return result;
+}
+
 // stephen - cleaned up 05/05/99
 UnicodeString operator+(const UnicodeString& left, char num)
 { return left + (long)num; }
@@ -2050,6 +2056,24 @@ UBool IntlTest::assertEquals(const char* message,
     return TRUE;
 }
 
+// Compares two UnicodeSets; on mismatch reports both as pattern strings and returns FALSE.
+UBool IntlTest::assertEquals(const char* message,
+                             const UnicodeSet& expected,
+                             const UnicodeSet& actual) {
+    IcuTestErrorCode status(*this, "assertEqualsUniSet");
+    if (expected == actual) {
+#ifdef VERBOSE_ASSERTIONS
+        logln((UnicodeString)"Ok: " + message + "; got " + toString(actual, status));
+#endif
+        return TRUE;
+    }
+
+    errln((UnicodeString)"FAIL: " + message + "; got " +
+          toString(actual, status) +
+          "; expected " + toString(expected, status));
+    return FALSE;
+}
+
 
 #if !UCONFIG_NO_FORMATTING
 UBool IntlTest::assertEquals(const char* message,
@@ -2136,6 +2160,11 @@ UBool IntlTest::assertEquals(const UnicodeString& message,
                              UErrorCode actual) {
     return assertEquals(extractToAssertBuf(message), expected, actual);
 }
+// UnicodeString-message overload: converts the message and defers to the const char* overload.
+UBool IntlTest::assertEquals(const UnicodeString& message,
+                             const UnicodeSet& expected,
+                             const UnicodeSet& actual) {
+    const char* text = extractToAssertBuf(message);
+    return assertEquals(text, expected, actual);
+}
 
 #if !UCONFIG_NO_FORMATTING
 UBool IntlTest::assertEquals(const UnicodeString& message,
index 08765b707d0ebed84291831dc023bfde055bbad6..5d4b661f392601f238e2cbfae9a64f855f004487 100644 (file)
@@ -16,6 +16,7 @@
 // The following includes utypes.h, uobject.h and unistr.h
 #include "unicode/fmtable.h"
 #include "unicode/testlog.h"
+#include "unicode/uniset.h"
 
 U_NAMESPACE_USE
 
@@ -295,6 +296,7 @@ public:
     UBool assertEquals(const char* message, int64_t expected, int64_t actual);
     UBool assertEquals(const char* message, double expected, double actual);
     UBool assertEquals(const char* message, UErrorCode expected, UErrorCode actual);
+    UBool assertEquals(const char* message, const UnicodeSet& expected, const UnicodeSet& actual);
 #if !UCONFIG_NO_FORMATTING
     UBool assertEquals(const char* message, const Formattable& expected,
                        const Formattable& actual, UBool possibleDataError=FALSE);
@@ -312,6 +314,7 @@ public:
     UBool assertEquals(const UnicodeString& message, int64_t expected, int64_t actual);
     UBool assertEquals(const UnicodeString& message, double expected, double actual);
     UBool assertEquals(const UnicodeString& message, UErrorCode expected, UErrorCode actual);
+    UBool assertEquals(const UnicodeString& message, const UnicodeSet& expected, const UnicodeSet& actual);
 
     virtual void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL ); // overide !
 
index 5da55bbe9c717e4543238d649827676a9705cb3d..945d76d9b32de50aaccbabe3f1750ee2b40204b2 100644 (file)
@@ -212,6 +212,7 @@ class NumberParserTest : public IntlTest {
     void testBasic();
     void testLocaleFi();
     void testSeriesMatcher();
+    void testCurrencyAnyMatcher();
     void testGroupingDisabled();
 
     void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par = 0);
index 4140320bc9cbdc56408840cd237e6dcf9f3b4f04..1dbf73a3d169f2e82b24a39d7c3e0a241df44560 100644 (file)
@@ -21,6 +21,7 @@ void NumberParserTest::runIndexedTest(int32_t index, UBool exec, const char*& na
     }
     TESTCASE_AUTO_BEGIN;
         TESTCASE_AUTO(testBasic);
+        TESTCASE_AUTO(testSeriesMatcher);
     TESTCASE_AUTO_END;
 }
 
@@ -99,7 +100,7 @@ void NumberParserTest::testBasic() {
                  {3, u"0", u"0", 1, 0.0}};
 
     parse_flags_t parseFlags = PARSE_FLAG_IGNORE_CASE | PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES;
-    for (auto cas : cases) {
+    for (auto& cas : cases) {
         UnicodeString inputString(cas.inputString);
         UnicodeString patternString(cas.patternString);
         LocalPointer<const NumberParserImpl> parser(
@@ -153,5 +154,54 @@ void NumberParserTest::testBasic() {
     }
 }
 
+void NumberParserTest::testSeriesMatcher() {
+    IcuTestErrorCode status(*this, "testSeriesMatcher");
+    // Checks ArraySeriesMatcher's lead code points and its match() results over a table of inputs.
+    DecimalFormatSymbols symbols("en", status);
+    // Five component matchers, handed to the series in this order: plus, minus, ignorables, percent, ignorables.
+    PlusSignMatcher m0(symbols, false);
+    MinusSignMatcher m1(symbols, false);
+    IgnorablesMatcher m2(unisets::DEFAULT_IGNORABLES);
+    PercentMatcher m3(symbols);
+    IgnorablesMatcher m4(unisets::DEFAULT_IGNORABLES);
+    // NOTE(review): the heap-allocated pointer array is never freed in this test; presumably ArraySeriesMatcher adopts it -- confirm.
+    ArraySeriesMatcher series(new NumberParseMatcher* [5]{&m0, &m1, &m2, &m3, &m4}, 5);
+    // The series' lead code points are expected to equal the PLUS_SIGN set (m0, the first matcher, is a PlusSignMatcher).
+    assertEquals(
+            "Lead set should be equal to lead set of lead matcher",
+            *unisets::get(unisets::PLUS_SIGN),
+            series.getLeadCodePoints());
+    // Each case: input text, expected segment offset after match(), expected return value of match().
+    static const struct TestCase {
+        const char16_t* input;
+        int32_t expectedOffset;
+        bool expectedMaybeMore;
+    } cases[] = {{u"", 0, true},
+                 {u" ", 0, false},
+                 {u"$", 0, false},
+                 {u"+", 0, true},
+                 {u" +", 0, false},
+                 {u"+-", 0, true},
+                 {u"+ -", 0, false},
+                 {u"+-  ", 0, true},
+                 {u"+-  $", 0, false},
+                 {u"+-%", 3, true},
+                 {u"  +-  %  ", 0, false},
+                 {u"+-  %  ", 7, true},
+                 {u"+-%$", 3, false}};
+    // Run every case against a fresh StringSegment and ParsedNumber.
+    for (auto& cas : cases) {
+        UnicodeString input(cas.input);
+
+        StringSegment segment(input, 0);
+        ParsedNumber result;
+        bool actualMaybeMore = series.match(segment, result, status);
+        int actualOffset = segment.getOffset();
+
+        assertEquals("'" + input + "'", cas.expectedOffset, actualOffset);
+        assertEquals("'" + input + "'", cas.expectedMaybeMore, actualMaybeMore);
+    }
+}
+
 
 #endif
similarity index 86%
rename from icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyMatcher.java
rename to icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyCustomMatcher.java
index d81c2e9f81a5e4ee1fcb44e68472526cb8963c01..3df201889b3d6e4f5adddedae94529eea09cbfe0 100644 (file)
@@ -9,19 +9,19 @@ import com.ibm.icu.util.ULocale;
 /**
  * A matcher for a single currency instance (not the full trie).
  */
-public class CurrencyMatcher implements NumberParseMatcher {
+public class CurrencyCustomMatcher implements NumberParseMatcher {
 
     private final String isoCode;
     private final String currency1;
     private final String currency2;
 
-    public static CurrencyMatcher getInstance(Currency currency, ULocale loc) {
-        return new CurrencyMatcher(currency.getSubtype(),
+    public static CurrencyCustomMatcher getInstance(Currency currency, ULocale loc) {
+        return new CurrencyCustomMatcher(currency.getSubtype(),
                 currency.getSymbol(loc),
                 currency.getCurrencyCode());
     }
 
-    private CurrencyMatcher(String isoCode, String currency1, String currency2) {
+    private CurrencyCustomMatcher(String isoCode, String currency1, String currency2) {
         this.isoCode = isoCode;
         this.currency1 = currency1;
         this.currency2 = currency2;
similarity index 77%
rename from icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyTrieMatcher.java
rename to icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyNamesMatcher.java
index 239949ec13db525e61d11fa6bd7a7ff416832ff4..9fdef2250456edb02d7ac0856470167c81e66c85 100644 (file)
@@ -11,21 +11,24 @@ import com.ibm.icu.util.Currency.CurrencyStringInfo;
 import com.ibm.icu.util.ULocale;
 
 /**
- * @author sffc
+ * Matches currencies according to all available strings in locale data.
+ *
+ * The implementation of this class is different between J and C. See #13584 for a follow-up.
  *
+ * @author sffc
  */
-public class CurrencyTrieMatcher implements NumberParseMatcher {
+public class CurrencyNamesMatcher implements NumberParseMatcher {
 
     private final TextTrieMap<CurrencyStringInfo> longNameTrie;
     private final TextTrieMap<CurrencyStringInfo> symbolTrie;
 
-    public static CurrencyTrieMatcher getInstance(ULocale locale) {
+    public static CurrencyNamesMatcher getInstance(ULocale locale) {
         // TODO: Pre-compute some of the more popular locales?
-        return new CurrencyTrieMatcher(locale);
+        return new CurrencyNamesMatcher(locale);
     }
 
-    private CurrencyTrieMatcher(ULocale locale) {
-        // TODO: Currency trie does not currently have an option for case folding.  It defaults to use
+    private CurrencyNamesMatcher(ULocale locale) {
+        // TODO: Currency trie does not currently have an option for case folding. It defaults to use
         // case folding on long-names but not symbols.
         longNameTrie = Currency.getParsingTrie(locale, Currency.LONG_NAME);
         symbolTrie = Currency.getParsingTrie(locale, Currency.SYMBOL_NAME);
@@ -55,6 +58,8 @@ public class CurrencyTrieMatcher implements NumberParseMatcher {
         UnicodeSet leadCodePoints = new UnicodeSet();
         longNameTrie.putLeadCodePoints(leadCodePoints);
         symbolTrie.putLeadCodePoints(leadCodePoints);
+        // Always apply case mapping closure for currencies
+        leadCodePoints.closeOver(UnicodeSet.ADD_CASE_MAPPINGS);
         return leadCodePoints.freeze();
     }
 
index d5640d4aadb08d60c23defe429cc862a3a88c3d1..63c37b916edc4758924b26235ed7c722a2d3f170 100644 (file)
@@ -7,14 +7,15 @@ import com.ibm.icu.util.Currency;
 import com.ibm.icu.util.ULocale;
 
 /**
- * @author sffc
+ * Small helper class that generates matchers for SeriesMatcher.
  *
+ * @author sffc
  */
 public class MatcherFactory {
-    Currency currency;
-    DecimalFormatSymbols symbols;
-    IgnorablesMatcher ignorables;
-    ULocale locale;
+    public Currency currency;
+    public DecimalFormatSymbols symbols;
+    public IgnorablesMatcher ignorables;
+    public ULocale locale;
 
     public MinusSignMatcher minusSign(boolean allowTrailing) {
         return MinusSignMatcher.getInstance(symbols, allowTrailing);
@@ -34,8 +35,8 @@ public class MatcherFactory {
 
     public AnyMatcher currency() {
         AnyMatcher any = new AnyMatcher();
-        any.addMatcher(CurrencyMatcher.getInstance(currency, locale));
-        any.addMatcher(CurrencyTrieMatcher.getInstance(locale));
+        any.addMatcher(CurrencyCustomMatcher.getInstance(currency, locale));
+        any.addMatcher(CurrencyNamesMatcher.getInstance(locale));
         any.freeze();
         return any;
     }
index 5060b9518d39a62247f2d1354763010dacc6c6ee..ed75d2d514e5eca924b80869ebed89a1e6f96a3d 100644 (file)
@@ -95,7 +95,7 @@ public class NumberParserImpl {
         parser.addMatcher(InfinityMatcher.getInstance(symbols));
         parser.addMatcher(PaddingMatcher.getInstance("@"));
         parser.addMatcher(ScientificMatcher.getInstance(symbols, grouper));
-        parser.addMatcher(CurrencyTrieMatcher.getInstance(locale));
+        parser.addMatcher(CurrencyNamesMatcher.getInstance(locale));
         parser.addMatcher(new RequireNumberMatcher());
 
         parser.freeze();
@@ -213,8 +213,8 @@ public class NumberParserImpl {
         ////////////////////////
 
         if (parseCurrency || patternInfo.hasCurrencySign()) {
-            parser.addMatcher(CurrencyMatcher.getInstance(currency, locale));
-            parser.addMatcher(CurrencyTrieMatcher.getInstance(locale));
+            parser.addMatcher(CurrencyCustomMatcher.getInstance(currency, locale));
+            parser.addMatcher(CurrencyNamesMatcher.getInstance(locale));
         }
 
         ///////////////////////////////
index 912529479a513e41631c123765922a7100e708a1..41312d8399a506880fc43556165fb2ceb5ccf9ef 100644 (file)
@@ -7,8 +7,11 @@ import static org.junit.Assert.assertTrue;
 
 import org.junit.Test;
 
+import com.ibm.icu.impl.number.CustomSymbolCurrency;
 import com.ibm.icu.impl.number.DecimalFormatProperties;
+import com.ibm.icu.impl.number.parse.AnyMatcher;
 import com.ibm.icu.impl.number.parse.IgnorablesMatcher;
+import com.ibm.icu.impl.number.parse.MatcherFactory;
 import com.ibm.icu.impl.number.parse.MinusSignMatcher;
 import com.ibm.icu.impl.number.parse.NumberParserImpl;
 import com.ibm.icu.impl.number.parse.ParsedNumber;
@@ -222,6 +225,38 @@ public class NumberParserTest {
         }
     }
 
+    @Test
+    public void testCurrencyAnyMatcher() {
+        MatcherFactory factory = new MatcherFactory();
+        factory.locale = ULocale.ENGLISH;
+        CustomSymbolCurrency currency = new CustomSymbolCurrency("ICU", "IU$", "ICU");
+        factory.currency = currency;
+        AnyMatcher matcher = factory.currency();  // combines CurrencyCustomMatcher and CurrencyNamesMatcher (see MatcherFactory.currency())
+        // Each case: { input string, expected currency code (null when no currency code should be recorded) }.
+        Object[][] cases = new Object[][] {
+                { "", null },
+                { "FOO", null },
+                { "USD", "USD" },
+                { "$", "USD" },
+                { "US dollars", "USD" },
+                { "eu", null },
+                { "euros", "EUR" },
+                { "ICU", "ICU" },
+                { "IU$", "ICU" } };
+        for (Object[] cas : cases) {
+            String input = (String) cas[0];
+            String expectedCurrencyCode = (String) cas[1];
+            // Match against a fresh segment and result, then check the recorded code and end position.
+            StringSegment segment = new StringSegment(input, 0);
+            ParsedNumber result = new ParsedNumber();
+            matcher.match(segment, result);
+            assertEquals("Parsing " + input, expectedCurrencyCode, result.currencyCode);
+            assertEquals("Whole string on " + input,
+                    expectedCurrencyCode == null ? 0 : input.length(),  // expected charEnd: 0 when no code is expected, else the full input length
+                    result.charEnd);
+        }
+    }
+
     @Test
     public void testGroupingDisabled() {
         DecimalFormatProperties properties = new DecimalFormatProperties();