From b8bab89cb5fae5de161dabe06bdce9c7b6aebda2 Mon Sep 17 00:00:00 2001 From: Shane Carr Date: Fri, 9 Feb 2018 06:30:40 +0000 Subject: [PATCH] ICU-13574 Implementing final two SymbolMatchers in ICU4C (infinity and padding). X-SVN-Rev: 40878 --- icu4c/source/i18n/numparse_impl.cpp | 5 +++ icu4c/source/i18n/numparse_impl.h | 2 + icu4c/source/i18n/numparse_symbols.cpp | 30 +++++++++++++ icu4c/source/i18n/numparse_symbols.h | 33 +++++++++++++++ icu4c/source/i18n/numparse_unisets.cpp | 42 ++++++++++--------- .../source/test/intltest/numbertest_parse.cpp | 4 ++ .../impl/number/parse/InfinityMatcher.java | 2 +- .../impl/number/parse/NumberParserImpl.java | 2 + .../icu/impl/number/parse/SymbolMatcher.java | 3 +- .../icu/dev/test/number/NumberParserTest.java | 4 ++ 10 files changed, 105 insertions(+), 22 deletions(-) diff --git a/icu4c/source/i18n/numparse_impl.cpp b/icu4c/source/i18n/numparse_impl.cpp index 1df9c56b43b..575e0e16799 100644 --- a/icu4c/source/i18n/numparse_impl.cpp +++ b/icu4c/source/i18n/numparse_impl.cpp @@ -5,6 +5,9 @@ #if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT +// Allow implicit conversion from char16_t* to UnicodeString for this file +#define UNISTR_FROM_STRING_EXPLICIT + #include "number_types.h" #include "number_patternstring.h" #include "numparse_types.h" @@ -52,6 +55,8 @@ NumberParserImpl::createSimpleParser(const Locale& locale, const UnicodeString& parser->addMatcher(parser->fLocalMatchers.percent = {symbols}); parser->addMatcher(parser->fLocalMatchers.permille = {symbols}); parser->addMatcher(parser->fLocalMatchers.nan = {symbols}); + parser->addMatcher(parser->fLocalMatchers.infinity = {symbols}); + parser->addMatcher(parser->fLocalMatchers.padding = {u"@"}); // parser.addMatcher(ScientificMatcher.getInstance(symbols, grouper, parseFlags)); // parser.addMatcher(CurrencyTrieMatcher.getInstance(locale)); // parser.addMatcher(new RequireNumberMatcher()); diff --git a/icu4c/source/i18n/numparse_impl.h 
b/icu4c/source/i18n/numparse_impl.h index 105f8c71abe..3f9b5d4b355 100644 --- a/icu4c/source/i18n/numparse_impl.h +++ b/icu4c/source/i18n/numparse_impl.h @@ -46,8 +46,10 @@ class NumberParserImpl { // You must use an assignment operator on them before using. struct { IgnorablesMatcher ignorables; + InfinityMatcher infinity; MinusSignMatcher minusSign; NanMatcher nan; + PaddingMatcher padding; PercentMatcher percent; PermilleMatcher permille; PlusSignMatcher plusSign; diff --git a/icu4c/source/i18n/numparse_symbols.cpp b/icu4c/source/i18n/numparse_symbols.cpp index 5fabd2fb17f..8e192cf7736 100644 --- a/icu4c/source/i18n/numparse_symbols.cpp +++ b/icu4c/source/i18n/numparse_symbols.cpp @@ -85,6 +85,20 @@ void IgnorablesMatcher::accept(StringSegment&, ParsedNumber&) const { } +InfinityMatcher::InfinityMatcher(const DecimalFormatSymbols& dfs) + : SymbolMatcher(dfs.getConstSymbol(DecimalFormatSymbols::kInfinitySymbol), unisets::INFINITY) { +} + +bool InfinityMatcher::isDisabled(const ParsedNumber& result) const { + return 0 != (result.flags & FLAG_INFINITY); +} + +void InfinityMatcher::accept(StringSegment& segment, ParsedNumber& result) const { + result.flags |= FLAG_INFINITY; + result.setCharsConsumed(segment); +} + + MinusSignMatcher::MinusSignMatcher(const DecimalFormatSymbols& dfs, bool allowTrailing) : SymbolMatcher(dfs.getConstSymbol(DecimalFormatSymbols::kMinusSignSymbol), unisets::MINUS_SIGN), fAllowTrailing(allowTrailing) { @@ -125,6 +139,22 @@ void NanMatcher::accept(StringSegment& segment, ParsedNumber& result) const { } +PaddingMatcher::PaddingMatcher(const UnicodeString& padString) + : SymbolMatcher(padString, unisets::EMPTY) {} + +bool PaddingMatcher::isFlexible() const { + return true; +} + +bool PaddingMatcher::isDisabled(const ParsedNumber& result) const { + return false; +} + +void PaddingMatcher::accept(StringSegment& segment, ParsedNumber& result) const { + // No-op +} + + PercentMatcher::PercentMatcher(const DecimalFormatSymbols& dfs) : 
SymbolMatcher(dfs.getConstSymbol(DecimalFormatSymbols::kPercentSymbol), unisets::PERCENT_SIGN) { } diff --git a/icu4c/source/i18n/numparse_symbols.h b/icu4c/source/i18n/numparse_symbols.h index c8ba913911f..40a57f02baf 100644 --- a/icu4c/source/i18n/numparse_symbols.h +++ b/icu4c/source/i18n/numparse_symbols.h @@ -15,6 +15,11 @@ U_NAMESPACE_BEGIN namespace numparse { namespace impl { +/** + * A base class for many matchers that performs a simple match against a UnicodeString and/or UnicodeSet. + * + * @author sffc + */ class SymbolMatcher : public NumberParseMatcher, public UMemory { public: SymbolMatcher() = default; // WARNING: Leaves the object in an unusable state @@ -52,6 +57,19 @@ class IgnorablesMatcher : public SymbolMatcher { }; +class InfinityMatcher : public SymbolMatcher { + public: + InfinityMatcher() = default; // WARNING: Leaves the object in an unusable state + + InfinityMatcher(const DecimalFormatSymbols& dfs); + + protected: + bool isDisabled(const ParsedNumber& result) const override; + + void accept(StringSegment& segment, ParsedNumber& result) const override; +}; + + class MinusSignMatcher : public SymbolMatcher { public: MinusSignMatcher() = default; // WARNING: Leaves the object in an unusable state @@ -83,6 +101,21 @@ class NanMatcher : public SymbolMatcher { }; +class PaddingMatcher : public SymbolMatcher { + public: + PaddingMatcher() = default; // WARNING: Leaves the object in an unusable state + + PaddingMatcher(const UnicodeString& padString); + + bool isFlexible() const override; + + protected: + bool isDisabled(const ParsedNumber& result) const override; + + void accept(StringSegment& segment, ParsedNumber& result) const override; +}; + + class PercentMatcher : public SymbolMatcher { public: PercentMatcher() = default; // WARNING: Leaves the object in an unusable state diff --git a/icu4c/source/i18n/numparse_unisets.cpp b/icu4c/source/i18n/numparse_unisets.cpp index f259f7a6467..625e1ac31dc 100644 --- 
a/icu4c/source/i18n/numparse_unisets.cpp +++ b/icu4c/source/i18n/numparse_unisets.cpp @@ -5,6 +5,10 @@ #if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT +// Allow implicit conversion from char16_t* to UnicodeString for this file +// (useful for UnicodeSet constructor) +#define UNISTR_FROM_STRING_EXPLICIT + #include "numparse_unisets.h" #include "numparse_types.h" #include "umutex.h" @@ -56,44 +60,42 @@ UBool U_CALLCONV cleanupNumberParseUniSets() { void U_CALLCONV initNumberParseUniSets(UErrorCode& status) { ucln_i18n_registerCleanup(UCLN_I18N_NUMPARSE_UNISETS, cleanupNumberParseUniSets); -#define NEW_UNISET(pattern, status) new UnicodeSet(UnicodeString(pattern), status) gUnicodeSets[EMPTY] = new UnicodeSet(); // BiDi characters are skipped over and ignored at any point in the string, even in strict mode. - gUnicodeSets[BIDI] = NEW_UNISET(u"[[\\u200E\\u200F\\u061C]]", status); + gUnicodeSets[BIDI] = new UnicodeSet(u"[[\\u200E\\u200F\\u061C]]", status); // This set was decided after discussion with icu-design@. See ticket #13309. // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property). - gUnicodeSets[WHITESPACE] = NEW_UNISET(u"[[:Zs:][\\u0009]]", status); + gUnicodeSets[WHITESPACE] = new UnicodeSet(u"[[:Zs:][\\u0009]]", status); gUnicodeSets[DEFAULT_IGNORABLES] = computeUnion(BIDI, WHITESPACE); gUnicodeSets[STRICT_IGNORABLES] = new UnicodeSet(*gUnicodeSets[BIDI]); // TODO: Re-generate these sets from the UCD. They probably haven't been updated in a while. 
- gUnicodeSets[COMMA] = NEW_UNISET(u"[,،٫、︐︑﹐﹑,、]", status); - gUnicodeSets[STRICT_COMMA] = NEW_UNISET(u"[,٫︐﹐,]", status); - gUnicodeSets[PERIOD] = NEW_UNISET(u"[.․。︒﹒.。]", status); - gUnicodeSets[STRICT_PERIOD] = NEW_UNISET(u"[.․﹒.。]", status); - gUnicodeSets[OTHER_GROUPING_SEPARATORS] = NEW_UNISET( - u"['٬‘’'\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]", - status); + gUnicodeSets[COMMA] = new UnicodeSet(u"[,،٫、︐︑﹐﹑,、]", status); + gUnicodeSets[STRICT_COMMA] = new UnicodeSet(u"[,٫︐﹐,]", status); + gUnicodeSets[PERIOD] = new UnicodeSet(u"[.․。︒﹒.。]", status); + gUnicodeSets[STRICT_PERIOD] = new UnicodeSet(u"[.․﹒.。]", status); + gUnicodeSets[OTHER_GROUPING_SEPARATORS] = new UnicodeSet( + u"['٬‘’'\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]", status); gUnicodeSets[ALL_SEPARATORS] = computeUnion(COMMA, PERIOD, OTHER_GROUPING_SEPARATORS); gUnicodeSets[STRICT_ALL_SEPARATORS] = computeUnion( STRICT_COMMA, STRICT_PERIOD, OTHER_GROUPING_SEPARATORS); - gUnicodeSets[MINUS_SIGN] = NEW_UNISET(u"[-⁻₋−➖﹣-]", status); - gUnicodeSets[PLUS_SIGN] = NEW_UNISET(u"[+⁺₊➕﬩﹢+]", status); + gUnicodeSets[MINUS_SIGN] = new UnicodeSet(u"[-⁻₋−➖﹣-]", status); + gUnicodeSets[PLUS_SIGN] = new UnicodeSet(u"[+⁺₊➕﬩﹢+]", status); - gUnicodeSets[PERCENT_SIGN] = NEW_UNISET(u"[%٪]", status); - gUnicodeSets[PERMILLE_SIGN] = NEW_UNISET(u"[‰؉]", status); - gUnicodeSets[INFINITY] = NEW_UNISET(u"[∞]", status); + gUnicodeSets[PERCENT_SIGN] = new UnicodeSet(u"[%٪]", status); + gUnicodeSets[PERMILLE_SIGN] = new UnicodeSet(u"[‰؉]", status); + gUnicodeSets[INFINITY] = new UnicodeSet(u"[∞]", status); - gUnicodeSets[DIGITS] = NEW_UNISET(u"[:digit:]", status); - gUnicodeSets[NAN_LEAD] = NEW_UNISET(u"[NnТтmeՈոс¤НнчTtsҳ\u975e\u1002\u0e9a\u10d0\u0f68\u0644\u0646]", - status); - gUnicodeSets[SCIENTIFIC_LEAD] = NEW_UNISET(u"[Ee×·е\u0627]", status); - gUnicodeSets[CWCF] = NEW_UNISET(u"[:CWCF:]", status); + gUnicodeSets[DIGITS] = new UnicodeSet(u"[:digit:]", status); + gUnicodeSets[NAN_LEAD] = new UnicodeSet( 
+ u"[NnТтmeՈոс¤НнчTtsҳ\u975e\u1002\u0e9a\u10d0\u0f68\u0644\u0646]", status); + gUnicodeSets[SCIENTIFIC_LEAD] = new UnicodeSet(u"[Ee×·е\u0627]", status); + gUnicodeSets[CWCF] = new UnicodeSet(u"[:CWCF:]", status); gUnicodeSets[DIGITS_OR_ALL_SEPARATORS] = computeUnion(DIGITS, ALL_SEPARATORS); gUnicodeSets[DIGITS_OR_STRICT_ALL_SEPARATORS] = computeUnion(DIGITS, STRICT_ALL_SEPARATORS); diff --git a/icu4c/source/test/intltest/numbertest_parse.cpp b/icu4c/source/test/intltest/numbertest_parse.cpp index 4e091048d84..018296a2c44 100644 --- a/icu4c/source/test/intltest/numbertest_parse.cpp +++ b/icu4c/source/test/intltest/numbertest_parse.cpp @@ -61,6 +61,10 @@ void NumberParserTest::testBasic() { {3, u"‰51423", u"0", 6, 51.423}, {3, u"51423‰", u"0", 6, 51.423}, {3, u"51423‰‰", u"0", 6, 51.423}, + {3, u"∞", u"0", 1, INFINITY}, + {3, u"-∞", u"0", 2, -INFINITY}, + {3, u"@@@123 @@", u"0", 6, 123.}, // TODO: Should padding be strong instead of weak? + {3, u"@@@123@@ ", u"0", 6, 123.}, // TODO: Should padding be strong instead of weak? 
// {3, u"a51423US dollars", u"a0¤¤¤", 16, 51423.}, // {3, u"a 51423 US dollars", u"a0¤¤¤", 18, 51423.}, // {3, u"514.23 USD", u"¤0", 10, 514.23}, diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/InfinityMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/InfinityMatcher.java index 843a1c7db1f..2317435947e 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/InfinityMatcher.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/InfinityMatcher.java @@ -42,6 +42,6 @@ public class InfinityMatcher extends SymbolMatcher { @Override public String toString() { - return ""; + return ""; } } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java index ae5650c3119..5060b9518d3 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java @@ -92,6 +92,8 @@ public class NumberParserImpl { parser.addMatcher(PercentMatcher.getInstance(symbols)); parser.addMatcher(PermilleMatcher.getInstance(symbols)); parser.addMatcher(NanMatcher.getInstance(symbols, parseFlags)); + parser.addMatcher(InfinityMatcher.getInstance(symbols)); + parser.addMatcher(PaddingMatcher.getInstance("@")); parser.addMatcher(ScientificMatcher.getInstance(symbols, grouper)); parser.addMatcher(CurrencyTrieMatcher.getInstance(locale)); parser.addMatcher(new RequireNumberMatcher()); diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/SymbolMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/SymbolMatcher.java index bf15d726b7a..94f3035574c 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/SymbolMatcher.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/SymbolMatcher.java @@ -5,8 +5,9 @@ package com.ibm.icu.impl.number.parse; import 
com.ibm.icu.text.UnicodeSet; /** - * @author sffc + * A base class for many matchers that performs a simple match against a UnicodeString and/or UnicodeSet. * + * @author sffc */ public abstract class SymbolMatcher implements NumberParseMatcher { protected final String string; diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberParserTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberParserTest.java index 541ead1945a..912529479a5 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberParserTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberParserTest.java @@ -67,6 +67,10 @@ public class NumberParserTest { { 3, "‰51423", "0", 6, 51.423 }, { 3, "51423‰", "0", 6, 51.423 }, { 3, "51423‰‰", "0", 6, 51.423 }, + { 3, "∞", "0", 1, Double.POSITIVE_INFINITY }, + { 3, "-∞", "0", 2, Double.NEGATIVE_INFINITY }, + { 3, "@@@123 @@", "0", 6, 123. }, // TODO: Should padding be strong instead of weak? + { 3, "@@@123@@ ", "0", 6, 123. }, // TODO: Should padding be strong instead of weak? { 3, "a51423US dollars", "a0¤¤¤", 16, 51423. }, { 3, "a 51423 US dollars", "a0¤¤¤", 18, 51423. }, { 3, "514.23 USD", "¤0", 10, 514.23 }, -- 2.40.0