ICU-13574 Adding scientific matcher to ICU4C.

author Shane Carr <shane@unicode.org>

Fri, 9 Feb 2018 06:57:37 +0000 (06:57 +0000)

committer Shane Carr <shane@unicode.org>

Fri, 9 Feb 2018 06:57:37 +0000 (06:57 +0000)
author Shane Carr <shane@unicode.org>
Fri, 9 Feb 2018 06:57:37 +0000 (06:57 +0000)
committer Shane Carr <shane@unicode.org>
Fri, 9 Feb 2018 06:57:37 +0000 (06:57 +0000)
diff --git a/icu4c/source/i18n/Makefile.in b/icu4c/source/i18n/Makefile.in

index 94dac4e235bca7533785f580660f043bf248eced..75c5565d4deced348a0b7cdcd0121f52789f757e 100644 (file)
--- a/icu4c/source/i18n/Makefile.in
+++ b/icu4c/source/i18n/Makefile.in
@@ -109,7 +109,7 @@ number_integerwidth.o number_longnames.o number_modifiers.o number_notation.o \
  number_padding.o number_patternmodifier.o number_patternstring.o \
  number_rounding.o number_scientific.o number_stringbuilder.o \
  numparse_stringsegment.o numparse_unisets.o numparse_parsednumber.o \
-numparse_impl.o numparse_symbols.o numparse_decimal.o
+numparse_impl.o numparse_symbols.o numparse_decimal.o numparse_scientific.o
  
  
  ## Header files to install
diff --git a/icu4c/source/i18n/numparse_impl.cpp b/icu4c/source/i18n/numparse_impl.cpp

index 575e0e16799e698b83de7c2591e97d814be82953..68707439fa2233a30a2cb63d0dd881e70c324b4c 100644 (file)
--- a/icu4c/source/i18n/numparse_impl.cpp
+++ b/icu4c/source/i18n/numparse_impl.cpp
@@ -57,7 +57,7 @@ NumberParserImpl::createSimpleParser(const Locale& locale, const UnicodeString&
      parser->addMatcher(parser->fLocalMatchers.nan = {symbols});
      parser->addMatcher(parser->fLocalMatchers.infinity = {symbols});
      parser->addMatcher(parser->fLocalMatchers.padding = {u"@"});
-//    parser.addMatcher(ScientificMatcher.getInstance(symbols, grouper, parseFlags));
+    parser->addMatcher(parser->fLocalMatchers.scientific = {symbols, grouper});
  //    parser.addMatcher(CurrencyTrieMatcher.getInstance(locale));
  //    parser.addMatcher(new RequireNumberMatcher());
  
diff --git a/icu4c/source/i18n/numparse_impl.h b/icu4c/source/i18n/numparse_impl.h

index 3f9b5d4b355179622590b2884875deac97c08630..4745bf152a8f56e382c54d1b90ba04694b368166 100644 (file)
--- a/icu4c/source/i18n/numparse_impl.h
+++ b/icu4c/source/i18n/numparse_impl.h
@@ -10,6 +10,7 @@
  #include "numparse_types.h"
  #include "numparse_decimal.h"
  #include "numparse_symbols.h"
+#include "numparse_scientific.h"
  #include "unicode/uniset.h"
  
  U_NAMESPACE_BEGIN namespace numparse {
@@ -54,6 +55,7 @@ class NumberParserImpl {
          PermilleMatcher permille;
          PlusSignMatcher plusSign;
          DecimalMatcher decimal;
+        ScientificMatcher scientific;
      } fLocalMatchers;
  
      NumberParserImpl(parse_flags_t parseFlags, bool computeLeads);
diff --git a/icu4c/source/i18n/numparse_scientific.cpp b/icu4c/source/i18n/numparse_scientific.cpp

new file mode 100644 (file)

index 0000000..3b69dcd
--- /dev/null
+++ b/icu4c/source/i18n/numparse_scientific.cpp
@@ -0,0 +1,84 @@
+// © 2018 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
+
+#include "numparse_types.h"
+#include "numparse_scientific.h"
+#include "numparse_unisets.h"
+
+using namespace icu;
+using namespace icu::numparse;
+using namespace icu::numparse::impl;
+
+
+// Builds a matcher for the exponent part of a number.
+//  - dfs:     supplies the exponent separator string (kExponentialSymbol) and is also handed
+//             to the nested matcher that parses the exponent digits.
+//  - grouper: forwarded unchanged to the nested exponent digit matcher.
+// The nested matcher is created with PARSE_FLAG_INTEGER_ONLY.
+ScientificMatcher::ScientificMatcher(
+        const DecimalFormatSymbols& dfs,
+        const Grouper& grouper)
+        : fExponentSeparatorString(
+                  dfs.getConstSymbol(DecimalFormatSymbols::kExponentialSymbol)),
+          fExponentMatcher(
+                  dfs,
+                  grouper,
+                  PARSE_FLAG_INTEGER_ONLY) {
+}
+
+// Tries to match an exponent at the current position of the segment: the locale's exponent
+// separator, an optional minus or plus sign, and then exponent digits.
+//
+//  - segment: remaining input. Its offset is advanced only when the separator is followed by
+//             at least one exponent digit; otherwise it is left exactly where it started.
+//  - result:  number parsed so far. FLAG_HAS_EXPONENT is set when exponent digits are consumed.
+//  - status:  passed through to the exponent digit matcher.
+//
+// Returns false when there is no usable mantissa or the separator does not match. Returns true
+// when the segment ends inside or directly after the separator. Otherwise returns whatever the
+// exponent digit matcher returns.
+bool ScientificMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const {
+    // Only accept scientific notation after the mantissa.
+    // Most places use result.hasNumber(), but we need a stronger condition here (i.e., exponent is
+    // not well-defined after NaN or infinity).
+    if (result.quantity.bogus) {
+        return false;
+    }
+
+    // Remember the starting position so that everything consumed here (separator AND sign) can
+    // be un-matched if no exponent digits follow.
+    int initialOffset = segment.getOffset();
+
+    // First match the scientific separator, and then match another number after it.
+    int overlap1 = segment.getCommonPrefixLength(fExponentSeparatorString);
+    if (overlap1 == fExponentSeparatorString.length()) {
+        // Full exponent separator match.
+
+        // If nothing follows the separator, there is no exponent to parse. Return true without
+        // consuming the separator, so a trailing separator is never swallowed on its own.
+        if (segment.length() == overlap1) {
+            return true;
+        }
+        segment.adjustOffset(overlap1);
+
+        // Allow a sign, and then try to match digits.
+        int8_t exponentSign = 1;
+        if (segment.matches(*unisets::get(unisets::MINUS_SIGN))) {
+            exponentSign = -1;
+            segment.adjustOffsetByCodePoint();
+        } else if (segment.matches(*unisets::get(unisets::PLUS_SIGN))) {
+            segment.adjustOffsetByCodePoint();
+        }
+
+        int digitsOffset = segment.getOffset();
+        bool digitsReturnValue = fExponentMatcher.match(segment, result, exponentSign, status);
+        if (segment.getOffset() != digitsOffset) {
+            // At least one exponent digit was matched.
+            result.flags |= FLAG_HAS_EXPONENT;
+        } else {
+            // No exponent digits were matched; un-match the exponent separator and any sign
+            // consumed after it by returning to the offset we started from.
+            segment.adjustOffset(initialOffset - digitsOffset);
+        }
+        return digitsReturnValue;
+
+    } else if (overlap1 == segment.length()) {
+        // Partial exponent separator match
+        return true;
+    }
+
+    // No match
+    return false;
+}
+
+// Returns a newly allocated set of code points that can start an exponent.
+// If the shared SCIENTIFIC_LEAD set already contains the first code point of the exponent
+// separator, a copy of that shared set is returned. Otherwise the result is a frozen set
+// holding only that first code point.
+// NOTE(review): the result comes from `new`; presumably the caller takes ownership — confirm.
+const UnicodeSet* ScientificMatcher::getLeadCodePoints() const {
+    const UnicodeSet* sharedLeads = unisets::get(unisets::SCIENTIFIC_LEAD);
+    UChar32 firstCodePoint = fExponentSeparatorString.char32At(0);
+
+    if (!sharedLeads->contains(firstCodePoint)) {
+        // Separator starts with a code point outside the shared set: build a dedicated one.
+        UnicodeSet* customLeads = new UnicodeSet();
+        customLeads->add(firstCodePoint);
+        customLeads->freeze();
+        return customLeads;
+    }
+
+    return new UnicodeSet(*sharedLeads);
+}
+
+
+#endif /* #if !UCONFIG_NO_FORMATTING */
diff --git a/icu4c/source/i18n/numparse_scientific.h b/icu4c/source/i18n/numparse_scientific.h

new file mode 100644 (file)

index 0000000..544386c
--- /dev/null
+++ b/icu4c/source/i18n/numparse_scientific.h
@@ -0,0 +1,41 @@
+// © 2018 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
+#ifndef __NUMPARSE_SCIENTIFIC_H__
+#define __NUMPARSE_SCIENTIFIC_H__
+
+#include "numparse_types.h"
+#include "numparse_decimal.h"
+#include "unicode/numberformatter.h"
+
+using icu::number::impl::Grouper;
+
+U_NAMESPACE_BEGIN namespace numparse {
+namespace impl {
+
+
+class ScientificMatcher : public NumberParseMatcher, public UMemory {  // Matches the exponent part of a number: separator, optional sign, then digits.
+  public:
+    ScientificMatcher() = default;  // WARNING: Leaves the object in an unusable state
+
+    ScientificMatcher(const DecimalFormatSymbols& dfs, const Grouper& grouper);  // Takes the exponent separator from dfs; dfs and grouper also configure the exponent digit matcher.
+
+    bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override;  // Tries to consume an exponent at the start of segment; sets FLAG_HAS_EXPONENT on result when digits are matched.
+
+    const UnicodeSet* getLeadCodePoints() const override;  // Returns a newly allocated set that contains the first code point of the exponent separator.
+
+  private:
+    UnicodeString fExponentSeparatorString;  // Exponent separator string, read from the kExponentialSymbol symbol.
+    DecimalMatcher fExponentMatcher;  // Nested matcher for the exponent digits, built with PARSE_FLAG_INTEGER_ONLY.
+};
+
+
+} // namespace impl
+} // namespace numparse
+U_NAMESPACE_END
+
+#endif //__NUMPARSE_SCIENTIFIC_H__
+#endif /* #if !UCONFIG_NO_FORMATTING */
diff --git a/icu4c/source/test/intltest/numbertest_parse.cpp b/icu4c/source/test/intltest/numbertest_parse.cpp

index 018296a2c447aacb2d62b87787475cba3c5b4e1d..76e193a04bec137b2a372e672f3f705503945188 100644 (file)
--- a/icu4c/source/test/intltest/numbertest_parse.cpp
+++ b/icu4c/source/test/intltest/numbertest_parse.cpp
@@ -85,9 +85,9 @@ void NumberParserTest::testBasic() {
  //                 {3, u"{𝟱𝟭𝟰𝟮𝟯}", u"{0};{0}", 12, 51423.},
  //                 {1, u"a40b", u"a0'0b'", 3, 40.}, // greedy code path thinks "40" is the number
  //                 {2, u"a40b", u"a0'0b'", 4, 4.}, // slow code path finds the suffix "0b"
-//                 {3, u"𝟱.𝟭𝟰𝟮E𝟯", u"0", 12, 5142.},
-//                 {3, u"𝟱.𝟭𝟰𝟮E-𝟯", u"0", 13, 0.005142},
-//                 {3, u"𝟱.𝟭𝟰𝟮e-𝟯", u"0", 13, 0.005142},
+                 {3, u"𝟱.𝟭𝟰𝟮E𝟯", u"0", 12, 5142.},
+                 {3, u"𝟱.𝟭𝟰𝟮E-𝟯", u"0", 13, 0.005142},
+                 {3, u"𝟱.𝟭𝟰𝟮e-𝟯", u"0", 13, 0.005142},
  //                 {7, u"5,142.50 Canadian dollars", u"#,##,##0 ¤¤¤", 25, 5142.5},
  //                 {3, u"a$ b5", u"a ¤ b0", 5, 5.0},
  //                 {3, u"📺1.23", u"📺0;📻0", 6, 1.23},
author	Shane Carr <shane@unicode.org>
	Fri, 9 Feb 2018 06:57:37 +0000 (06:57 +0000)
committer	Shane Carr <shane@unicode.org>
	Fri, 9 Feb 2018 06:57:37 +0000 (06:57 +0000)
icu4c/source/i18n/Makefile.in		patch \| blob \| history
icu4c/source/i18n/numparse_impl.cpp		patch \| blob \| history
icu4c/source/i18n/numparse_impl.h		patch \| blob \| history
icu4c/source/i18n/numparse_scientific.cpp	[new file with mode: 0644]	patch \| blob
icu4c/source/i18n/numparse_scientific.h	[new file with mode: 0644]	patch \| blob
icu4c/source/test/intltest/numbertest_parse.cpp		patch \| blob \| history