From: Shane Carr Date: Fri, 9 Feb 2018 06:57:37 +0000 (+0000) Subject: ICU-13574 Adding scientific matcher to ICU4C. X-Git-Tag: release-62-rc~200^2~128 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=e91ff603debb71799d5182fccfd9358326fb8b4d;p=icu ICU-13574 Adding scientific matcher to ICU4C. X-SVN-Rev: 40880 --- diff --git a/icu4c/source/i18n/Makefile.in b/icu4c/source/i18n/Makefile.in index 94dac4e235b..75c5565d4de 100644 --- a/icu4c/source/i18n/Makefile.in +++ b/icu4c/source/i18n/Makefile.in @@ -109,7 +109,7 @@ number_integerwidth.o number_longnames.o number_modifiers.o number_notation.o \ number_padding.o number_patternmodifier.o number_patternstring.o \ number_rounding.o number_scientific.o number_stringbuilder.o \ numparse_stringsegment.o numparse_unisets.o numparse_parsednumber.o \ -numparse_impl.o numparse_symbols.o numparse_decimal.o +numparse_impl.o numparse_symbols.o numparse_decimal.o numparse_scientific.o ## Header files to install diff --git a/icu4c/source/i18n/numparse_impl.cpp b/icu4c/source/i18n/numparse_impl.cpp index 575e0e16799..68707439fa2 100644 --- a/icu4c/source/i18n/numparse_impl.cpp +++ b/icu4c/source/i18n/numparse_impl.cpp @@ -57,7 +57,7 @@ NumberParserImpl::createSimpleParser(const Locale& locale, const UnicodeString& parser->addMatcher(parser->fLocalMatchers.nan = {symbols}); parser->addMatcher(parser->fLocalMatchers.infinity = {symbols}); parser->addMatcher(parser->fLocalMatchers.padding = {u"@"}); -// parser.addMatcher(ScientificMatcher.getInstance(symbols, grouper, parseFlags)); + parser->addMatcher(parser->fLocalMatchers.scientific = {symbols, grouper}); // parser.addMatcher(CurrencyTrieMatcher.getInstance(locale)); // parser.addMatcher(new RequireNumberMatcher()); diff --git a/icu4c/source/i18n/numparse_impl.h b/icu4c/source/i18n/numparse_impl.h index 3f9b5d4b355..4745bf152a8 100644 --- a/icu4c/source/i18n/numparse_impl.h +++ b/icu4c/source/i18n/numparse_impl.h @@ -10,6 +10,7 @@ #include "numparse_types.h" #include "numparse_decimal.h" #include "numparse_symbols.h" +#include "numparse_scientific.h" #include "unicode/uniset.h" U_NAMESPACE_BEGIN namespace numparse { @@ -54,6 +55,7 @@ class NumberParserImpl { PermilleMatcher permille; PlusSignMatcher plusSign; DecimalMatcher decimal; + ScientificMatcher scientific; } fLocalMatchers; NumberParserImpl(parse_flags_t parseFlags, bool computeLeads); diff --git a/icu4c/source/i18n/numparse_scientific.cpp b/icu4c/source/i18n/numparse_scientific.cpp new file mode 100644 index 00000000000..3b69dcdc997 --- /dev/null +++ b/icu4c/source/i18n/numparse_scientific.cpp @@ -0,0 +1,84 @@ +// © 2018 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT + +#include "numparse_types.h" +#include "numparse_scientific.h" +#include "numparse_unisets.h" + +using namespace icu; +using namespace icu::numparse; +using namespace icu::numparse::impl; + + +ScientificMatcher::ScientificMatcher(const DecimalFormatSymbols& dfs, const Grouper& grouper) + : fExponentSeparatorString(dfs.getConstSymbol(DecimalFormatSymbols::kExponentialSymbol)), + fExponentMatcher(dfs, grouper, PARSE_FLAG_INTEGER_ONLY) { +} + +bool ScientificMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const { + // Only accept scientific notation after the mantissa. + // Most places use result.hasNumber(), but we need a stronger condition here (i.e., exponent is + // not well-defined after NaN or infinity). + if (result.quantity.bogus) { + return false; + } + + // First match the scientific separator, and then match another number after it. + int overlap1 = segment.getCommonPrefixLength(fExponentSeparatorString); + if (overlap1 == fExponentSeparatorString.length()) { + // Full exponent separator match. + + // First attempt to get a code point, returning true if we can't get one. + segment.adjustOffset(overlap1); + if (segment.length() == 0) { + return true; + } + + // Allow a sign, and then try to match digits. + int8_t exponentSign = 1; + if (segment.matches(*unisets::get(unisets::MINUS_SIGN))) { + exponentSign = -1; + segment.adjustOffsetByCodePoint(); + } else if (segment.matches(*unisets::get(unisets::PLUS_SIGN))) { + segment.adjustOffsetByCodePoint(); + } + + int digitsOffset = segment.getOffset(); + bool digitsReturnValue = fExponentMatcher.match(segment, result, exponentSign, status); + if (segment.getOffset() != digitsOffset) { + // At least one exponent digit was matched. + result.flags |= FLAG_HAS_EXPONENT; + } else { + // No exponent digits were matched; un-match the exponent separator. + segment.adjustOffset(-overlap1); + } + return digitsReturnValue; + + } else if (overlap1 == segment.length()) { + // Partial exponent separator match + return true; + } + + // No match + return false; +} + +const UnicodeSet* ScientificMatcher::getLeadCodePoints() const { + UChar32 leadCp = fExponentSeparatorString.char32At(0); + const UnicodeSet* s = unisets::get(unisets::SCIENTIFIC_LEAD); + if (s->contains(leadCp)) { + return new UnicodeSet(*s); + } else { + UnicodeSet* leadCodePoints = new UnicodeSet(); + leadCodePoints->add(leadCp); + leadCodePoints->freeze(); + return leadCodePoints; + } +} + + +#endif /* #if !UCONFIG_NO_FORMATTING */ diff --git a/icu4c/source/i18n/numparse_scientific.h b/icu4c/source/i18n/numparse_scientific.h new file mode 100644 index 00000000000..544386c7c39 --- /dev/null +++ b/icu4c/source/i18n/numparse_scientific.h @@ -0,0 +1,41 @@ +// © 2018 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT +#ifndef __NUMPARSE_SCIENTIFIC_H__ +#define __NUMPARSE_SCIENTIFIC_H__ + +#include "numparse_types.h" +#include "numparse_decimal.h" +#include "unicode/numberformatter.h" + +using icu::number::impl::Grouper; + +U_NAMESPACE_BEGIN namespace numparse { +namespace impl { + + +class ScientificMatcher : public NumberParseMatcher, public UMemory { + public: + ScientificMatcher() = default; // WARNING: Leaves the object in an unusable state + + ScientificMatcher(const DecimalFormatSymbols& dfs, const Grouper& grouper); + + bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override; + + const UnicodeSet* getLeadCodePoints() const override; + + private: + UnicodeString fExponentSeparatorString; + DecimalMatcher fExponentMatcher; +}; + + +} // namespace impl +} // namespace numparse +U_NAMESPACE_END + +#endif //__NUMPARSE_SCIENTIFIC_H__ +#endif /* #if !UCONFIG_NO_FORMATTING */ diff --git a/icu4c/source/test/intltest/numbertest_parse.cpp b/icu4c/source/test/intltest/numbertest_parse.cpp index 018296a2c44..76e193a04be 100644 --- a/icu4c/source/test/intltest/numbertest_parse.cpp +++ b/icu4c/source/test/intltest/numbertest_parse.cpp @@ -85,9 +85,9 @@ void NumberParserTest::testBasic() { // {3, u"{𝟱𝟭𝟰𝟮𝟯}", u"{0};{0}", 12, 51423.}, // {1, u"a40b", u"a0'0b'", 3, 40.}, // greedy code path thinks "40" is the number // {2, u"a40b", u"a0'0b'", 4, 4.}, // slow code path finds the suffix "0b" -// {3, u"𝟱.𝟭𝟰𝟮E𝟯", u"0", 12, 5142.}, -// {3, u"𝟱.𝟭𝟰𝟮E-𝟯", u"0", 13, 0.005142}, -// {3, u"𝟱.𝟭𝟰𝟮e-𝟯", u"0", 13, 0.005142}, + {3, u"𝟱.𝟭𝟰𝟮E𝟯", u"0", 12, 5142.}, + {3, u"𝟱.𝟭𝟰𝟮E-𝟯", u"0", 13, 0.005142}, + {3, u"𝟱.𝟭𝟰𝟮e-𝟯", u"0", 13, 0.005142}, // {7, u"5,142.50 Canadian dollars", u"#,##,##0 ¤¤¤", 25, 5142.5}, // {3, u"a$ b5", u"a ¤ b0", 5, 5.0}, // {3, u"📺1.23", u"📺0;📻0", 6, 1.23},