From: Shane Carr <shane@unicode.org>
Date: Fri, 9 Feb 2018 06:57:37 +0000 (+0000)
Subject: ICU-13574 Adding scientific matcher to ICU4C.
X-Git-Tag: release-62-rc~200^2~128
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=e91ff603debb71799d5182fccfd9358326fb8b4d;p=icu

ICU-13574 Adding scientific matcher to ICU4C.

X-SVN-Rev: 40880
---

diff --git a/icu4c/source/i18n/Makefile.in b/icu4c/source/i18n/Makefile.in
index 94dac4e235b..75c5565d4de 100644
--- a/icu4c/source/i18n/Makefile.in
+++ b/icu4c/source/i18n/Makefile.in
@@ -109,7 +109,7 @@ number_integerwidth.o number_longnames.o number_modifiers.o number_notation.o \
 number_padding.o number_patternmodifier.o number_patternstring.o \
 number_rounding.o number_scientific.o number_stringbuilder.o \
 numparse_stringsegment.o numparse_unisets.o numparse_parsednumber.o \
-numparse_impl.o numparse_symbols.o numparse_decimal.o
+numparse_impl.o numparse_symbols.o numparse_decimal.o numparse_scientific.o
 
 
 ## Header files to install
diff --git a/icu4c/source/i18n/numparse_impl.cpp b/icu4c/source/i18n/numparse_impl.cpp
index 575e0e16799..68707439fa2 100644
--- a/icu4c/source/i18n/numparse_impl.cpp
+++ b/icu4c/source/i18n/numparse_impl.cpp
@@ -57,7 +57,7 @@ NumberParserImpl::createSimpleParser(const Locale& locale, const UnicodeString&
     parser->addMatcher(parser->fLocalMatchers.nan = {symbols});
     parser->addMatcher(parser->fLocalMatchers.infinity = {symbols});
     parser->addMatcher(parser->fLocalMatchers.padding = {u"@"});
-//    parser.addMatcher(ScientificMatcher.getInstance(symbols, grouper, parseFlags));
+    parser->addMatcher(parser->fLocalMatchers.scientific = {symbols, grouper});
 //    parser.addMatcher(CurrencyTrieMatcher.getInstance(locale));
 //    parser.addMatcher(new RequireNumberMatcher());
 
diff --git a/icu4c/source/i18n/numparse_impl.h b/icu4c/source/i18n/numparse_impl.h
index 3f9b5d4b355..4745bf152a8 100644
--- a/icu4c/source/i18n/numparse_impl.h
+++ b/icu4c/source/i18n/numparse_impl.h
@@ -10,6 +10,7 @@
 #include "numparse_types.h"
 #include "numparse_decimal.h"
 #include "numparse_symbols.h"
+#include "numparse_scientific.h"
 #include "unicode/uniset.h"
 
 U_NAMESPACE_BEGIN namespace numparse {
@@ -54,6 +55,7 @@ class NumberParserImpl {
         PermilleMatcher permille;
         PlusSignMatcher plusSign;
         DecimalMatcher decimal;
+        ScientificMatcher scientific;
     } fLocalMatchers;
 
     NumberParserImpl(parse_flags_t parseFlags, bool computeLeads);
diff --git a/icu4c/source/i18n/numparse_scientific.cpp b/icu4c/source/i18n/numparse_scientific.cpp
new file mode 100644
index 00000000000..3b69dcdc997
--- /dev/null
+++ b/icu4c/source/i18n/numparse_scientific.cpp
@@ -0,0 +1,84 @@
+// © 2018 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
+
+#include "numparse_types.h"
+#include "numparse_scientific.h"
+#include "numparse_unisets.h"
+
+using namespace icu;
+using namespace icu::numparse;
+using namespace icu::numparse::impl;
+
+
+// Copies the exponent separator from dfs and builds the exponent digit matcher with PARSE_FLAG_INTEGER_ONLY.
+ScientificMatcher::ScientificMatcher(const DecimalFormatSymbols& dfs, const Grouper& grouper)
+    : fExponentSeparatorString(dfs.getConstSymbol(DecimalFormatSymbols::kExponentialSymbol)),
+      fExponentMatcher(dfs, grouper, PARSE_FLAG_INTEGER_ONLY) {}
+
+bool ScientificMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const {
+    // Matches "<exponent separator>[sign]<digits>"; only accepted after the mantissa.
+    // Most places use result.hasNumber(), but we need a stronger condition here (i.e., exponent is
+    // not well-defined after NaN or infinity).
+    if (result.quantity.bogus) {
+        return false;
+    }
+
+    // Remember where we started so that a failed attempt can be rolled back completely.
+    const int32_t initialOffset = segment.getOffset();
+    const int32_t overlap1 = segment.getCommonPrefixLength(fExponentSeparatorString);
+    if (overlap1 == fExponentSeparatorString.length()) {
+        // Full exponent separator match.
+        if (segment.length() == overlap1) {
+            // Nothing follows the separator: report a possible match without consuming it.
+            return true;
+        }
+        segment.adjustOffset(overlap1);
+
+        // Allow a sign, and then try to match digits.
+        int8_t exponentSign = 1;
+        if (segment.matches(*unisets::get(unisets::MINUS_SIGN))) {
+            exponentSign = -1;
+            segment.adjustOffsetByCodePoint();
+        } else if (segment.matches(*unisets::get(unisets::PLUS_SIGN))) {
+            segment.adjustOffsetByCodePoint();
+        }
+
+        const int32_t digitsOffset = segment.getOffset();
+        bool digitsReturnValue = fExponentMatcher.match(segment, result, exponentSign, status);
+        if (segment.getOffset() != digitsOffset) {
+            // At least one exponent digit was matched.
+            result.flags |= FLAG_HAS_EXPONENT;
+        } else {
+            // No exponent digits were matched; un-match the separator AND any sign consumed above.
+            segment.adjustOffset(initialOffset - segment.getOffset());
+        }
+        return digitsReturnValue;
+
+    } else if (overlap1 == segment.length()) {
+        // Partial exponent separator match: all remaining input is a prefix of the separator.
+        return true;
+    }
+
+    // No match
+    return false;
+}
+
+const UnicodeSet* ScientificMatcher::getLeadCodePoints() const {
+    // Returns a newly allocated set chosen by the first code point of the exponent separator.
+    const UChar32 firstCp = fExponentSeparatorString.char32At(0);
+    const UnicodeSet* sharedLeads = unisets::get(unisets::SCIENTIFIC_LEAD);
+    if (!sharedLeads->contains(firstCp)) {
+        UnicodeSet* singleton = new UnicodeSet();
+        singleton->add(firstCp);
+        singleton->freeze();
+        return singleton;
+    }
+    return new UnicodeSet(*sharedLeads);  // separator is covered by the shared set: copy it
+}
+
+
+#endif /* #if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT */
diff --git a/icu4c/source/i18n/numparse_scientific.h b/icu4c/source/i18n/numparse_scientific.h
new file mode 100644
index 00000000000..544386c7c39
--- /dev/null
+++ b/icu4c/source/i18n/numparse_scientific.h
@@ -0,0 +1,41 @@
+// © 2018 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
+#ifndef __NUMPARSE_SCIENTIFIC_H__
+#define __NUMPARSE_SCIENTIFIC_H__
+
+#include "numparse_types.h"
+#include "numparse_decimal.h"
+#include "unicode/numberformatter.h"
+
+using icu::number::impl::Grouper;
+
+U_NAMESPACE_BEGIN namespace numparse {
+namespace impl {
+
+
+class ScientificMatcher : public NumberParseMatcher, public UMemory {
+  public:
+    ScientificMatcher() = default;  // WARNING: Leaves the object in an unusable state
+    // Copies the exponent separator from dfs and builds the exponent digit matcher from dfs and grouper.
+    ScientificMatcher(const DecimalFormatSymbols& dfs, const Grouper& grouper);
+    // Matches "<separator>[sign]<digits>" after a mantissa; sets FLAG_HAS_EXPONENT when digits are consumed.
+    bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override;
+    // Returns a newly allocated set chosen by the first code point of the exponent separator.
+    const UnicodeSet* getLeadCodePoints() const override;
+
+  private:
+    UnicodeString fExponentSeparatorString;  // exponent separator read from DecimalFormatSymbols
+    DecimalMatcher fExponentMatcher;  // parses the exponent digits (built with PARSE_FLAG_INTEGER_ONLY)
+};
+
+
+} // namespace impl
+} // namespace numparse
+U_NAMESPACE_END
+
+#endif //__NUMPARSE_SCIENTIFIC_H__
+#endif /* #if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT */
diff --git a/icu4c/source/test/intltest/numbertest_parse.cpp b/icu4c/source/test/intltest/numbertest_parse.cpp
index 018296a2c44..76e193a04be 100644
--- a/icu4c/source/test/intltest/numbertest_parse.cpp
+++ b/icu4c/source/test/intltest/numbertest_parse.cpp
@@ -85,9 +85,9 @@ void NumberParserTest::testBasic() {
 //                 {3, u"{𝟱𝟭𝟰𝟮𝟯}", u"{0};{0}", 12, 51423.},
 //                 {1, u"a40b", u"a0'0b'", 3, 40.}, // greedy code path thinks "40" is the number
 //                 {2, u"a40b", u"a0'0b'", 4, 4.}, // slow code path finds the suffix "0b"
-//                 {3, u"𝟱.𝟭𝟰𝟮E𝟯", u"0", 12, 5142.},
-//                 {3, u"𝟱.𝟭𝟰𝟮E-𝟯", u"0", 13, 0.005142},
-//                 {3, u"𝟱.𝟭𝟰𝟮e-𝟯", u"0", 13, 0.005142},
+                 {3, u"𝟱.𝟭𝟰𝟮E𝟯", u"0", 12, 5142.},
+                 {3, u"𝟱.𝟭𝟰𝟮E-𝟯", u"0", 13, 0.005142},
+                 {3, u"𝟱.𝟭𝟰𝟮e-𝟯", u"0", 13, 0.005142},
 //                 {7, u"5,142.50 Canadian dollars", u"#,##,##0 ¤¤¤", 25, 5142.5},
 //                 {3, u"a$ b5", u"a ¤ b0", 5, 5.0},
 //                 {3, u"📺1.23", u"📺0;📻0", 6, 1.23},