From 01916cad11f284b2423846ab4ebfa47f33bc6f76 Mon Sep 17 00:00:00 2001 From: Shane Carr Date: Wed, 21 Mar 2018 06:30:29 +0000 Subject: [PATCH] ICU-13634 Changes NumberParseMatcher getLeadCodePoints() to smokeTest() in C++ and Java. The new method is more versatile and eliminates the requirement to maintain two code paths for "lead chars" and "no lead chars". X-SVN-Rev: 41131 --- icu4c/source/i18n/numparse_affixes.cpp | 28 ++------ icu4c/source/i18n/numparse_affixes.h | 4 +- icu4c/source/i18n/numparse_compositions.cpp | 29 ++++++-- icu4c/source/i18n/numparse_compositions.h | 8 ++- icu4c/source/i18n/numparse_currency.cpp | 42 +++--------- icu4c/source/i18n/numparse_currency.h | 8 +-- icu4c/source/i18n/numparse_decimal.cpp | 28 ++++---- icu4c/source/i18n/numparse_decimal.h | 2 +- icu4c/source/i18n/numparse_impl.cpp | 47 +++---------- icu4c/source/i18n/numparse_impl.h | 11 +-- icu4c/source/i18n/numparse_scientific.cpp | 20 ++---- icu4c/source/i18n/numparse_scientific.h | 2 +- icu4c/source/i18n/numparse_stringsegment.cpp | 13 +++- icu4c/source/i18n/numparse_symbols.cpp | 27 +------- icu4c/source/i18n/numparse_symbols.h | 4 +- icu4c/source/i18n/numparse_types.h | 35 +++++----- icu4c/source/i18n/numparse_unisets.cpp | 3 - icu4c/source/i18n/numparse_unisets.h | 2 - icu4c/source/i18n/numparse_validators.h | 6 +- .../source/test/intltest/numbertest_parse.cpp | 7 +- .../test/intltest/numbertest_unisets.cpp | 7 -- .../src/com/ibm/icu/impl/StringSegment.java | 13 ++++ .../icu/impl/number/parse/AffixMatcher.java | 13 +--- .../ibm/icu/impl/number/parse/AnyMatcher.java | 17 ++--- .../impl/number/parse/CodePointMatcher.java | 5 +- .../number/parse/CurrencyCustomMatcher.java | 8 +-- .../number/parse/CurrencyNamesMatcher.java | 20 ++++-- .../icu/impl/number/parse/DecimalMatcher.java | 24 ++++--- .../ibm/icu/impl/number/parse/NanMatcher.java | 12 ---- .../impl/number/parse/NumberParseMatcher.java | 17 +++-- .../impl/number/parse/NumberParserImpl.java | 35 ++-------- .../impl/number/parse/ScientificMatcher.java | 11 +-- .../icu/impl/number/parse/SeriesMatcher.java | 7 +- .../icu/impl/number/parse/SymbolMatcher.java | 12 +--- .../number/parse/UnicodeSetStaticCache.java | 6 -- .../impl/number/parse/ValidationMatcher.java | 5 +- .../dev/test/number/ExhaustiveNumberTest.java | 60 +++++++++++++++++ .../icu/dev/test/number/NumberParserTest.java | 7 +- .../icu/dev/test/number/PropertiesTest.java | 2 +- .../number/UnicodeSetStaticCacheTest.java | 67 +------------------ 40 files changed, 266 insertions(+), 408 deletions(-) diff --git a/icu4c/source/i18n/numparse_affixes.cpp b/icu4c/source/i18n/numparse_affixes.cpp index e4dd8b76626..159d70dcda9 100644 --- a/icu4c/source/i18n/numparse_affixes.cpp +++ b/icu4c/source/i18n/numparse_affixes.cpp @@ -206,21 +206,15 @@ CodePointMatcher::CodePointMatcher(UChar32 cp) : fCp(cp) {} bool CodePointMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode&) const { - if (segment.matches(fCp)) { + if (segment.startsWith(fCp)) { segment.adjustOffsetByCodePoint(); result.setCharsConsumed(segment); } return false; } -const UnicodeSet& CodePointMatcher::getLeadCodePoints() { - if (fLocalLeadCodePoints.isNull()) { - auto* leadCodePoints = new UnicodeSet(); - leadCodePoints->add(fCp); - leadCodePoints->freeze(); - fLocalLeadCodePoints.adoptInstead(leadCodePoints); - } - return *fLocalLeadCodePoints; +bool CodePointMatcher::smokeTest(const StringSegment& segment) const { + return segment.startsWith(fCp); } UnicodeString CodePointMatcher::toString() const { @@ -427,19 +421,9 @@ bool AffixMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCod } } -const UnicodeSet& AffixMatcher::getLeadCodePoints() { - if (fLocalLeadCodePoints.isNull()) { - auto* leadCodePoints = new UnicodeSet(); - if (fPrefix != nullptr) { - leadCodePoints->addAll(fPrefix->getLeadCodePoints()); - } - if (fSuffix != nullptr) { - leadCodePoints->addAll(fSuffix->getLeadCodePoints()); - } - leadCodePoints->freeze(); - fLocalLeadCodePoints.adoptInstead(leadCodePoints); - } - return *fLocalLeadCodePoints; +bool AffixMatcher::smokeTest(const StringSegment& segment) const { + return (fPrefix != nullptr && fPrefix->smokeTest(segment)) || + (fSuffix != nullptr && fSuffix->smokeTest(segment)); } void AffixMatcher::postProcess(ParsedNumber& result) const { diff --git a/icu4c/source/i18n/numparse_affixes.h b/icu4c/source/i18n/numparse_affixes.h index 8a09983c9e1..3b79457b6d3 100644 --- a/icu4c/source/i18n/numparse_affixes.h +++ b/icu4c/source/i18n/numparse_affixes.h @@ -35,7 +35,7 @@ class CodePointMatcher : public NumberParseMatcher, public UMemory { bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override; - const UnicodeSet& getLeadCodePoints() override; + bool smokeTest(const StringSegment& segment) const override; UnicodeString toString() const override; @@ -190,7 +190,7 @@ class AffixMatcher : public NumberParseMatcher, public UMemory { void postProcess(ParsedNumber& result) const override; - const UnicodeSet& getLeadCodePoints() override; + bool smokeTest(const StringSegment& segment) const override; int8_t compareTo(const AffixMatcher& rhs) const; diff --git a/icu4c/source/i18n/numparse_compositions.cpp b/icu4c/source/i18n/numparse_compositions.cpp index 11e1cbcdf04..d254c07349d 100644 --- a/icu4c/source/i18n/numparse_compositions.cpp +++ b/icu4c/source/i18n/numparse_compositions.cpp @@ -38,9 +38,19 @@ bool AnyMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& return maybeMore; } +bool AnyMatcher::smokeTest(const StringSegment& segment) const { + // NOTE: The range-based for loop calls the virtual begin() and end() methods. + for (auto& matcher : *this) { + if (matcher->smokeTest(segment)) { + return true; + } + } + return false; +} + void AnyMatcher::postProcess(ParsedNumber& result) const { // NOTE: The range-based for loop calls the virtual begin() and end() methods. - for (auto* matcher : *this) { + for (auto& matcher : *this) { matcher->postProcess(result); } } @@ -83,6 +93,17 @@ bool SeriesMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCo return maybeMore; } +bool SeriesMatcher::smokeTest(const StringSegment& segment) const { + // NOTE: The range-based for loop calls the virtual begin() and end() methods. + // NOTE: We only want the first element. Use the for loop for boundary checking. + for (auto& matcher : *this) { + // SeriesMatchers are never allowed to start with a Flexible matcher. + U_ASSERT(!matcher->isFlexible()); + return matcher->smokeTest(segment); + } + return false; +} + void SeriesMatcher::postProcess(ParsedNumber& result) const { // NOTE: The range-based for loop calls the virtual begin() and end() methods. for (auto* matcher : *this) { @@ -99,12 +120,6 @@ ArraySeriesMatcher::ArraySeriesMatcher(MatcherArray& matchers, int32_t matchersL : fMatchers(std::move(matchers)), fMatchersLen(matchersLen) { } -const UnicodeSet& ArraySeriesMatcher::getLeadCodePoints() { - // SeriesMatchers are never allowed to start with a Flexible matcher. - U_ASSERT(!fMatchers[0]->isFlexible()); - return fMatchers[0]->getLeadCodePoints(); -} - int32_t ArraySeriesMatcher::length() const { return fMatchersLen; } diff --git a/icu4c/source/i18n/numparse_compositions.h b/icu4c/source/i18n/numparse_compositions.h index 91db6fa2caa..8d61ab46d32 100644 --- a/icu4c/source/i18n/numparse_compositions.h +++ b/icu4c/source/i18n/numparse_compositions.h @@ -42,6 +42,8 @@ class AnyMatcher : public CompositionMatcher { public: bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override; + bool smokeTest(const StringSegment& segment) const override; + void postProcess(ParsedNumber& result) const override; protected: @@ -61,6 +63,8 @@ class SeriesMatcher : public CompositionMatcher { public: bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override; + bool smokeTest(const StringSegment& segment) const override; + void postProcess(ParsedNumber& result) const override; virtual int32_t length() const = 0; @@ -80,13 +84,11 @@ class ArraySeriesMatcher : public SeriesMatcher { public: ArraySeriesMatcher(); // WARNING: Leaves the object in an unusable state - typedef MaybeStackArray MatcherArray; + typedef MaybeStackArray MatcherArray; /** The array is std::move'd */ ArraySeriesMatcher(MatcherArray& matchers, int32_t matchersLen); - const UnicodeSet& getLeadCodePoints() override; - UnicodeString toString() const override; int32_t length() const override; diff --git a/icu4c/source/i18n/numparse_currency.cpp b/icu4c/source/i18n/numparse_currency.cpp index 064ba73fc00..61f07acce2f 100644 --- a/icu4c/source/i18n/numparse_currency.cpp +++ b/icu4c/source/i18n/numparse_currency.cpp @@ -21,7 +21,12 @@ using namespace icu::numparse::impl; CurrencyNamesMatcher::CurrencyNamesMatcher(const Locale& locale, UErrorCode& status) - : fLocaleName(locale.getName(), -1, status) {} + : fLocaleName(locale.getName(), -1, status) { + uprv_currencyLeads(fLocaleName.data(), fLeadCodePoints, status); + // Always apply case mapping closure for currencies + fLeadCodePoints.closeOver(USET_ADD_CASE_MAPPINGS); + fLeadCodePoints.freeze(); +} bool CurrencyNamesMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const { if (result.currencyCode[0] != 0) { @@ -57,17 +62,8 @@ bool CurrencyNamesMatcher::match(StringSegment& segment, ParsedNumber& result, U return partialMatch; } -const UnicodeSet& CurrencyNamesMatcher::getLeadCodePoints() { - if (fLocalLeadCodePoints.isNull()) { - ErrorCode status; - auto* leadCodePoints = new UnicodeSet(); - uprv_currencyLeads(fLocaleName.data(), *leadCodePoints, status); - // Always apply case mapping closure for currencies - leadCodePoints->closeOver(USET_ADD_CASE_MAPPINGS); - leadCodePoints->freeze(); - fLocalLeadCodePoints.adoptInstead(leadCodePoints); - } - return *fLocalLeadCodePoints; +bool CurrencyNamesMatcher::smokeTest(const StringSegment& segment) const { + return segment.startsWith(fLeadCodePoints); } UnicodeString CurrencyNamesMatcher::toString() const { @@ -103,15 +99,8 @@ bool CurrencyCustomMatcher::match(StringSegment& segment, ParsedNumber& result, return overlap1 == segment.length() || overlap2 == segment.length(); } -const UnicodeSet& CurrencyCustomMatcher::getLeadCodePoints() { - if (fLocalLeadCodePoints.isNull()) { - auto* leadCodePoints = new UnicodeSet(); - utils::putLeadCodePoint(fCurrency1, leadCodePoints); - utils::putLeadCodePoint(fCurrency2, leadCodePoints); - leadCodePoints->freeze(); - fLocalLeadCodePoints.adoptInstead(leadCodePoints); - } - return *fLocalLeadCodePoints; +bool CurrencyCustomMatcher::smokeTest(const StringSegment& segment) const { + return segment.startsWith(fCurrency1) || segment.startsWith(fCurrency2); } UnicodeString CurrencyCustomMatcher::toString() const { @@ -144,17 +133,6 @@ CurrencyAnyMatcher& CurrencyAnyMatcher::operator=(CurrencyAnyMatcher&& src) U_NO return *this; } -const UnicodeSet& CurrencyAnyMatcher::getLeadCodePoints() { - if (fLocalLeadCodePoints.isNull()) { - auto* leadCodePoints = new UnicodeSet(); - leadCodePoints->addAll(fNamesMatcher.getLeadCodePoints()); - leadCodePoints->addAll(fCustomMatcher.getLeadCodePoints()); - leadCodePoints->freeze(); - fLocalLeadCodePoints.adoptInstead(leadCodePoints); - } - return *fLocalLeadCodePoints; -} - const NumberParseMatcher* const* CurrencyAnyMatcher::begin() const { return fMatcherArray; } diff --git a/icu4c/source/i18n/numparse_currency.h b/icu4c/source/i18n/numparse_currency.h index dd0490a9af5..1c2a57d2c14 100644 --- a/icu4c/source/i18n/numparse_currency.h +++ b/icu4c/source/i18n/numparse_currency.h @@ -11,6 +11,7 @@ #include "numparse_compositions.h" #include "charstr.h" #include "number_currencysymbols.h" +#include "unicode/uniset.h" U_NAMESPACE_BEGIN namespace numparse { namespace impl { @@ -32,7 +33,7 @@ class CurrencyNamesMatcher : public NumberParseMatcher, public UMemory { bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override; - const UnicodeSet& getLeadCodePoints() override; + bool smokeTest(const StringSegment& segment) const override; UnicodeString toString() const override; @@ -41,6 +42,7 @@ class CurrencyNamesMatcher : public NumberParseMatcher, public UMemory { // Locale has a non-trivial default constructor. CharString fLocaleName; + UnicodeSet fLeadCodePoints; }; @@ -52,7 +54,7 @@ class CurrencyCustomMatcher : public NumberParseMatcher, public UMemory { bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override; - const UnicodeSet& getLeadCodePoints() override; + bool smokeTest(const StringSegment& segment) const override; UnicodeString toString() const override; @@ -78,8 +80,6 @@ class CurrencyAnyMatcher : public AnyMatcher, public UMemory { CurrencyAnyMatcher& operator=(CurrencyAnyMatcher&& src) U_NOEXCEPT; - const UnicodeSet& getLeadCodePoints() override; - UnicodeString toString() const override; protected: diff --git a/icu4c/source/i18n/numparse_decimal.cpp b/icu4c/source/i18n/numparse_decimal.cpp index 33b6821ff2c..7ce0b190441 100644 --- a/icu4c/source/i18n/numparse_decimal.cpp +++ b/icu4c/source/i18n/numparse_decimal.cpp @@ -295,25 +295,23 @@ bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, int8_t return segment.length() == 0 || hasPartialPrefix; } -const UnicodeSet& DecimalMatcher::getLeadCodePoints() { +bool DecimalMatcher::smokeTest(const StringSegment& segment) const { + // The common case uses a static leadSet for efficiency. if (fLocalDigitStrings.isNull() && leadSet != nullptr) { - return *leadSet; + return segment.startsWith(*leadSet); } - - if (fLocalLeadCodePoints.isNull()) { - auto* leadCodePoints = new UnicodeSet(); - // Assumption: the sets are all single code points. - leadCodePoints->addAll(*unisets::get(unisets::DIGITS)); - leadCodePoints->addAll(*separatorSet); - if (!fLocalDigitStrings.isNull()) { - for (int i = 0; i < 10; i++) { - utils::putLeadCodePoint(fLocalDigitStrings[i], leadCodePoints); - } + if (segment.startsWith(*separatorSet) || u_isdigit(segment.getCodePoint())) { + return true; + } + if (fLocalDigitStrings.isNull()) { + return false; + } + for (int i = 0; i < 10; i++) { + if (segment.startsWith(fLocalDigitStrings[i])) { + return true; } - leadCodePoints->freeze(); - fLocalLeadCodePoints.adoptInstead(leadCodePoints); } - return *fLocalLeadCodePoints; + return false; } UnicodeString DecimalMatcher::toString() const { diff --git a/icu4c/source/i18n/numparse_decimal.h b/icu4c/source/i18n/numparse_decimal.h index 78ad074f190..f3ddcd8d61a 100644 --- a/icu4c/source/i18n/numparse_decimal.h +++ b/icu4c/source/i18n/numparse_decimal.h @@ -27,7 +27,7 @@ class DecimalMatcher : public NumberParseMatcher, public UMemory { bool match(StringSegment& segment, ParsedNumber& result, int8_t exponentSign, UErrorCode& status) const; - const UnicodeSet& getLeadCodePoints() override; + bool smokeTest(const StringSegment& segment) const override; UnicodeString toString() const override; diff --git a/icu4c/source/i18n/numparse_impl.cpp b/icu4c/source/i18n/numparse_impl.cpp index 32d904f99a7..82c69d40249 100644 --- a/icu4c/source/i18n/numparse_impl.cpp +++ b/icu4c/source/i18n/numparse_impl.cpp @@ -34,7 +34,7 @@ NumberParserImpl* NumberParserImpl::createSimpleParser(const Locale& locale, const UnicodeString& patternString, parse_flags_t parseFlags, UErrorCode& status) { - LocalPointer parser(new NumberParserImpl(parseFlags, true)); + LocalPointer parser(new NumberParserImpl(parseFlags)); DecimalFormatSymbols symbols(locale, status); parser->fLocalMatchers.ignorables = {unisets::DEFAULT_IGNORABLES}; @@ -117,7 +117,7 @@ NumberParserImpl::createParserFromProperties(const number::impl::DecimalFormatPr } IgnorablesMatcher ignorables(isStrict ? unisets::DEFAULT_IGNORABLES : unisets::STRICT_IGNORABLES); - LocalPointer parser(new NumberParserImpl(parseFlags, status)); + LocalPointer parser(new NumberParserImpl(parseFlags)); ////////////////////// /// AFFIX MATCHERS /// @@ -197,52 +197,22 @@ NumberParserImpl::createParserFromProperties(const number::impl::DecimalFormatPr return parser.orphan(); } -NumberParserImpl::NumberParserImpl(parse_flags_t parseFlags, bool computeLeads) - : fParseFlags(parseFlags), fComputeLeads(computeLeads) { +NumberParserImpl::NumberParserImpl(parse_flags_t parseFlags) + : fParseFlags(parseFlags) { } NumberParserImpl::~NumberParserImpl() { - if (fComputeLeads) { - for (int32_t i = 0; i < fNumMatchers; i++) { - delete (fLeads[i]); - } - } fNumMatchers = 0; } void NumberParserImpl::addMatcher(NumberParseMatcher& matcher) { if (fNumMatchers + 1 > fMatchers.getCapacity()) { fMatchers.resize(fNumMatchers * 2, fNumMatchers); - if (fComputeLeads) { - // The two arrays should grow in tandem: - U_ASSERT(fNumMatchers >= fLeads.getCapacity()); - fLeads.resize(fNumMatchers * 2, fNumMatchers); - } } - fMatchers[fNumMatchers] = &matcher; - - if (fComputeLeads) { - addLeadCodePointsForMatcher(matcher); - } - fNumMatchers++; } -void NumberParserImpl::addLeadCodePointsForMatcher(NumberParseMatcher& matcher) { - const UnicodeSet& leadCodePoints = matcher.getLeadCodePoints(); - // TODO: Avoid the clone operation here. - if (0 != (fParseFlags & PARSE_FLAG_IGNORE_CASE)) { - auto* copy = dynamic_cast(leadCodePoints.cloneAsThawed()); - copy->closeOver(USET_ADD_CASE_MAPPINGS); - copy->freeze(); - fLeads[fNumMatchers] = copy; - } else { - // FIXME: new here because we still take ownership - fLeads[fNumMatchers] = new UnicodeSet(leadCodePoints); - } -} - void NumberParserImpl::freeze() { fFrozen = true; } @@ -276,12 +246,11 @@ void NumberParserImpl::parseGreedyRecursive(StringSegment& segment, ParsedNumber } int initialOffset = segment.getOffset(); - int leadCp = segment.getCodePoint(); for (int32_t i = 0; i < fNumMatchers; i++) { - if (fComputeLeads && !fLeads[i]->contains(leadCp)) { + const NumberParseMatcher* matcher = fMatchers[i]; + if (!matcher->smokeTest(segment)) { continue; } - const NumberParseMatcher* matcher = fMatchers[i]; matcher->match(segment, result, status); if (U_FAILURE(status)) { return; @@ -313,8 +282,10 @@ void NumberParserImpl::parseLongestRecursive(StringSegment& segment, ParsedNumbe int initialOffset = segment.getOffset(); for (int32_t i = 0; i < fNumMatchers; i++) { - // TODO: Check leadChars here? const NumberParseMatcher* matcher = fMatchers[i]; + if (!matcher->smokeTest(segment)) { + continue; + } // In a non-greedy parse, we attempt all possible matches and pick the best. for (int32_t charsToConsume = 0; charsToConsume < segment.length();) { diff --git a/icu4c/source/i18n/numparse_impl.h b/icu4c/source/i18n/numparse_impl.h index d05d7a24ac1..e740038414a 100644 --- a/icu4c/source/i18n/numparse_impl.h +++ b/icu4c/source/i18n/numparse_impl.h @@ -32,6 +32,11 @@ class NumberParserImpl : public MutableMatcherCollection { const number::impl::DecimalFormatProperties& properties, const DecimalFormatSymbols& symbols, bool parseCurrency, bool optimize, UErrorCode& status); + /** + * Does NOT take ownership of the matcher. The matcher MUST remain valid for the lifespan of the + * NumberParserImpl. + * @param matcher The matcher to reference. + */ void addMatcher(NumberParseMatcher& matcher) override; void freeze(); @@ -48,8 +53,6 @@ class NumberParserImpl : public MutableMatcherCollection { int32_t fNumMatchers = 0; // NOTE: The stack capacity for fMatchers and fLeads should be the same MaybeStackArray fMatchers; - MaybeStackArray fLeads; - bool fComputeLeads; bool fFrozen = false; // WARNING: All of these matchers start in an undefined state (default-constructed). @@ -78,9 +81,7 @@ class NumberParserImpl : public MutableMatcherCollection { RequireNumberValidator number; } fLocalValidators; - NumberParserImpl(parse_flags_t parseFlags, bool computeLeads); - - void addLeadCodePointsForMatcher(NumberParseMatcher& matcher); + explicit NumberParserImpl(parse_flags_t parseFlags); void parseGreedyRecursive(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const; diff --git a/icu4c/source/i18n/numparse_scientific.cpp b/icu4c/source/i18n/numparse_scientific.cpp index bcb777101a4..7e2dc52c949 100644 --- a/icu4c/source/i18n/numparse_scientific.cpp +++ b/icu4c/source/i18n/numparse_scientific.cpp @@ -44,10 +44,10 @@ bool ScientificMatcher::match(StringSegment& segment, ParsedNumber& result, UErr // Allow a sign, and then try to match digits. int8_t exponentSign = 1; - if (segment.matches(*unisets::get(unisets::MINUS_SIGN))) { + if (segment.startsWith(*unisets::get(unisets::MINUS_SIGN))) { exponentSign = -1; segment.adjustOffsetByCodePoint(); - } else if (segment.matches(*unisets::get(unisets::PLUS_SIGN))) { + } else if (segment.startsWith(*unisets::get(unisets::PLUS_SIGN))) { segment.adjustOffsetByCodePoint(); } @@ -71,20 +71,8 @@ bool ScientificMatcher::match(StringSegment& segment, ParsedNumber& result, UErr return false; } -const UnicodeSet& ScientificMatcher::getLeadCodePoints() { - UChar32 leadCp = fExponentSeparatorString.char32At(0); - const UnicodeSet* s = unisets::get(unisets::SCIENTIFIC_LEAD); - if (s->contains(leadCp)) { - return *s; - } - - if (fLocalLeadCodePoints.isNull()) { - auto* leadCodePoints = new UnicodeSet(); - leadCodePoints->add(leadCp); - leadCodePoints->freeze(); - fLocalLeadCodePoints.adoptInstead(leadCodePoints); - } - return *fLocalLeadCodePoints; +bool ScientificMatcher::smokeTest(const StringSegment& segment) const { + return segment.startsWith(fExponentSeparatorString); } UnicodeString ScientificMatcher::toString() const { diff --git a/icu4c/source/i18n/numparse_scientific.h b/icu4c/source/i18n/numparse_scientific.h index 0265c106087..4084fda1a74 100644 --- a/icu4c/source/i18n/numparse_scientific.h +++ b/icu4c/source/i18n/numparse_scientific.h @@ -25,7 +25,7 @@ class ScientificMatcher : public NumberParseMatcher, public UMemory { bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override; - const UnicodeSet& getLeadCodePoints() override; + bool smokeTest(const StringSegment& segment) const override; UnicodeString toString() const override; diff --git a/icu4c/source/i18n/numparse_stringsegment.cpp b/icu4c/source/i18n/numparse_stringsegment.cpp index b2dcdc36423..a3d6f15102d 100644 --- a/icu4c/source/i18n/numparse_stringsegment.cpp +++ b/icu4c/source/i18n/numparse_stringsegment.cpp @@ -75,11 +75,11 @@ UChar32 StringSegment::getCodePoint() const { } } -bool StringSegment::matches(UChar32 otherCp) const { +bool StringSegment::startsWith(UChar32 otherCp) const { return codePointsEqual(getCodePoint(), otherCp, fFoldCase); } -bool StringSegment::matches(const UnicodeSet& uniset) const { +bool StringSegment::startsWith(const UnicodeSet& uniset) const { // TODO: Move UnicodeSet case-folding logic here. // TODO: Handle string matches here instead of separately. UChar32 cp = getCodePoint(); @@ -89,6 +89,15 @@ bool StringSegment::matches(const UnicodeSet& uniset) const { return uniset.contains(cp); } +bool StringSegment::startsWith(const UnicodeString& other) const { + if (other.isBogus() || other.length() == 0 || length() == 0) { + return false; + } + int cp1 = getCodePoint(); + int cp2 = other.char32At(0); + return codePointsEqual(cp1, cp2, fFoldCase); +} + int32_t StringSegment::getCommonPrefixLength(const UnicodeString& other) { return getPrefixLengthInternal(other, fFoldCase); } diff --git a/icu4c/source/i18n/numparse_symbols.cpp b/icu4c/source/i18n/numparse_symbols.cpp index 0f35e313536..51922752b62 100644 --- a/icu4c/source/i18n/numparse_symbols.cpp +++ b/icu4c/source/i18n/numparse_symbols.cpp @@ -58,20 +58,8 @@ bool SymbolMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCo return overlap == segment.length(); } -const UnicodeSet& SymbolMatcher::getLeadCodePoints() { - if (fString.isEmpty()) { - // Assumption: for sets from UnicodeSetStaticCache, uniSet == leadCodePoints. - return *fUniSet; - } - - if (fLocalLeadCodePoints.isNull()) { - auto* leadCodePoints = new UnicodeSet(); - utils::putLeadCodePoints(fUniSet, leadCodePoints); - utils::putLeadCodePoint(fString, leadCodePoints); - leadCodePoints->freeze(); - fLocalLeadCodePoints.adoptInstead(leadCodePoints); - } - return *fLocalLeadCodePoints; +bool SymbolMatcher::smokeTest(const StringSegment& segment) const { + return segment.startsWith(*fUniSet) || segment.startsWith(fString); } UnicodeString SymbolMatcher::toString() const { @@ -134,17 +122,6 @@ NanMatcher::NanMatcher(const DecimalFormatSymbols& dfs) : SymbolMatcher(dfs.getConstSymbol(DecimalFormatSymbols::kNaNSymbol), unisets::EMPTY) { } -const UnicodeSet& NanMatcher::getLeadCodePoints() { - // Overriding this here to allow use of statically allocated sets - int leadCp = fString.char32At(0); - const UnicodeSet* s = unisets::get(unisets::NAN_LEAD); - if (s->contains(leadCp)) { - return *s; - } - - return SymbolMatcher::getLeadCodePoints(); -} - bool NanMatcher::isDisabled(const ParsedNumber& result) const { return result.seenNumber(); } diff --git a/icu4c/source/i18n/numparse_symbols.h b/icu4c/source/i18n/numparse_symbols.h index 2cc107101da..7264734fc0d 100644 --- a/icu4c/source/i18n/numparse_symbols.h +++ b/icu4c/source/i18n/numparse_symbols.h @@ -28,7 +28,7 @@ class SymbolMatcher : public NumberParseMatcher, public UMemory { bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override; - const UnicodeSet& getLeadCodePoints() override; + bool smokeTest(const StringSegment& segment) const override; UnicodeString toString() const override; @@ -96,8 +96,6 @@ class NanMatcher : public SymbolMatcher { NanMatcher(const DecimalFormatSymbols& dfs); - const UnicodeSet& getLeadCodePoints() override; - protected: bool isDisabled(const ParsedNumber& result) const override; diff --git a/icu4c/source/i18n/numparse_types.h b/icu4c/source/i18n/numparse_types.h index 3f27da05e28..2f8d31fc76f 100644 --- a/icu4c/source/i18n/numparse_types.h +++ b/icu4c/source/i18n/numparse_types.h @@ -26,7 +26,7 @@ enum ResultFlags { FLAG_PERCENT = 0x0002, FLAG_PERMILLE = 0x0004, FLAG_HAS_EXPONENT = 0x0008, - FLAG_HAS_DEFAULT_CURRENCY = 0x0010, + // FLAG_HAS_DEFAULT_CURRENCY = 0x0010, // no longer used FLAG_HAS_DECIMAL_SEPARATOR = 0x0020, FLAG_NAN = 0x0040, FLAG_INFINITY = 0x0080, @@ -46,6 +46,7 @@ enum ParseFlags { PARSE_FLAG_USE_FULL_AFFIXES = 0x0100, PARSE_FLAG_EXACT_AFFIX = 0x0200, PARSE_FLAG_PLUS_SIGN_ALLOWED = 0x0400, + PARSE_FLAG_OPTIMIZE = 0x0800, }; @@ -216,12 +217,18 @@ class StringSegment : public UMemory, public ::icu::number::impl::CharSequence { *

* This method will perform case folding if case folding is enabled for the parser. */ - bool matches(UChar32 otherCp) const; + bool startsWith(UChar32 otherCp) const; /** * Returns true if the first code point of this StringSegment is in the given UnicodeSet. */ - bool matches(const UnicodeSet& uniset) const; + bool startsWith(const UnicodeSet& uniset) const; + + /** + * Returns true if there is at least one code point of overlap between this StringSegment and the + * given UnicodeString. + */ + bool startsWith(const UnicodeString& other) const; /** * Returns the length of the prefix shared by this StringSegment and the given CharSequence. For @@ -294,17 +301,18 @@ class NumberParseMatcher { virtual bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const = 0; /** - * Should return a set representing all possible chars (UTF-16 code units) that could be the first - * char that this matcher can consume. This method is only called during construction phase, and its - * return value is used to skip this matcher unless a segment begins with a char in this set. To make - * this matcher always run, return {@link UnicodeSet#ALL_CODE_POINTS}. + * Performs a fast "smoke check" for whether or not this matcher could possibly match against the + * given string segment. The test should be as fast as possible but also as restrictive as possible. + * For example, matchers can maintain a UnicodeSet of all code points that count possibly start a + * match. Matchers should use the {@link StringSegment#startsWith} method in order to correctly + * handle case folding. * - * The returned UnicodeSet does not need adoption and is guaranteed to be alive for as long as the - * object that returned it. - * - * This method is NOT thread-safe. + * @param segment + * The segment to check against. + * @return true if the matcher might be able to match against this segment; false if it definitely + * will not be able to match. */ - virtual const UnicodeSet& getLeadCodePoints() = 0; + virtual bool smokeTest(const StringSegment& segment) const = 0; /** * Method called at the end of a parse, after all matchers have failed to consume any more chars. @@ -324,9 +332,6 @@ class NumberParseMatcher { protected: // No construction except by subclasses! NumberParseMatcher() = default; - - // Optional ownership of the leadCodePoints set - LocalPointer fLocalLeadCodePoints; }; diff --git a/icu4c/source/i18n/numparse_unisets.cpp b/icu4c/source/i18n/numparse_unisets.cpp index 3e7c9b0253f..eb2f6c1e9d5 100644 --- a/icu4c/source/i18n/numparse_unisets.cpp +++ b/icu4c/source/i18n/numparse_unisets.cpp @@ -93,9 +93,6 @@ void U_CALLCONV initNumberParseUniSets(UErrorCode& status) { gUnicodeSets[INFINITY_KEY] = new UnicodeSet(u"[∞]", status); gUnicodeSets[DIGITS] = new UnicodeSet(u"[:digit:]", status); - gUnicodeSets[NAN_LEAD] = new UnicodeSet( - u"[NnТтmeՈոс¤НнчTtsҳ\u975e\u1002\u0e9a\u10d0\u0f68\u0644\u0646]", status); - gUnicodeSets[SCIENTIFIC_LEAD] = new UnicodeSet(u"[Ee×·е\u0627]", status); gUnicodeSets[CWCF] = new UnicodeSet(u"[:CWCF:]", status); gUnicodeSets[DIGITS_OR_ALL_SEPARATORS] = computeUnion(DIGITS, ALL_SEPARATORS); diff --git a/icu4c/source/i18n/numparse_unisets.h b/icu4c/source/i18n/numparse_unisets.h index d66d4adab91..97a44ea860d 100644 --- a/icu4c/source/i18n/numparse_unisets.h +++ b/icu4c/source/i18n/numparse_unisets.h @@ -47,8 +47,6 @@ enum Key { // Other DIGITS, - NAN_LEAD, - SCIENTIFIC_LEAD, CWCF, // Combined Separators with Digits (for lead code points) diff --git a/icu4c/source/i18n/numparse_validators.h b/icu4c/source/i18n/numparse_validators.h index d158b234fd5..817ec9cb8d6 100644 --- a/icu4c/source/i18n/numparse_validators.h +++ b/icu4c/source/i18n/numparse_validators.h @@ -21,12 +21,12 @@ class ValidationMatcher : public NumberParseMatcher { return false; } - const UnicodeSet& getLeadCodePoints() U_OVERRIDE { + bool smokeTest(const StringSegment&) const U_OVERRIDE { // No-op - return *unisets::get(unisets::EMPTY); + return false; } - virtual void postProcess(ParsedNumber& result) const U_OVERRIDE = 0; + void postProcess(ParsedNumber& result) const U_OVERRIDE = 0; }; diff --git a/icu4c/source/test/intltest/numbertest_parse.cpp b/icu4c/source/test/intltest/numbertest_parse.cpp index 160bab4fbcf..d97141a489a 100644 --- a/icu4c/source/test/intltest/numbertest_parse.cpp +++ b/icu4c/source/test/intltest/numbertest_parse.cpp @@ -176,10 +176,9 @@ void NumberParserTest::testSeriesMatcher() { matchers[4] = &m4; ArraySeriesMatcher series(matchers, 5); - assertEquals( - "Lead set should be equal to lead set of lead matcher", - *unisets::get(unisets::PLUS_SIGN), - series.getLeadCodePoints()); + assertFalse("", series.smokeTest(StringSegment(u"x", false))); + assertFalse("", series.smokeTest(StringSegment(u"-", false))); + assertTrue("", series.smokeTest(StringSegment(u"+", false))); static const struct TestCase { const char16_t* input; diff --git a/icu4c/source/test/intltest/numbertest_unisets.cpp b/icu4c/source/test/intltest/numbertest_unisets.cpp index f0623b2bd17..74f65e4b410 100644 --- a/icu4c/source/test/intltest/numbertest_unisets.cpp +++ b/icu4c/source/test/intltest/numbertest_unisets.cpp @@ -46,8 +46,6 @@ void UniSetsTest::testSetCoverage() { const UnicodeSet &percent = *get(unisets::PERCENT_SIGN); const UnicodeSet &permille = *get(unisets::PERMILLE_SIGN); const UnicodeSet &infinity = *get(unisets::INFINITY_KEY); - const UnicodeSet &nanLead = *get(unisets::NAN_LEAD); - const UnicodeSet &scientificLead = *get(unisets::SCIENTIFIC_LEAD); int32_t localeCount; const Locale* allAvailableLocales = Locale::getAvailableLocales(localeCount); @@ -66,11 +64,6 @@ void UniSetsTest::testSetCoverage() { ASSERT_IN_SET(percent, dfs.getConstSymbol(DecimalFormatSymbols::kPercentSymbol)); ASSERT_IN_SET(permille, dfs.getConstSymbol(DecimalFormatSymbols::kPerMillSymbol)); ASSERT_IN_SET(infinity, dfs.getConstSymbol(DecimalFormatSymbols::kInfinitySymbol)); - ASSERT_IN_SET(nanLead, dfs.getConstSymbol(DecimalFormatSymbols::kNaNSymbol).char32At(0)); - ASSERT_IN_SET(nanLead, - u_foldCase(dfs.getConstSymbol(DecimalFormatSymbols::kNaNSymbol).char32At(0), 0)); - ASSERT_IN_SET(scientificLead, - u_foldCase(dfs.getConstSymbol(DecimalFormatSymbols::kExponentialSymbol).char32At(0), 0)); } } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/StringSegment.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/StringSegment.java index 750ad1d4bdd..7808bddc6b0 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/StringSegment.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/StringSegment.java @@ -134,6 +134,19 @@ public class StringSegment implements CharSequence { return uniset.contains(cp); } + /** + * Returns true if there is at least one code point of overlap between this StringSegment and the + * given CharSequence. Null-safe. + */ + public boolean startsWith(CharSequence other) { + if (other == null || other.length() == 0 || length() == 0) { + return false; + } + int cp1 = Character.codePointAt(this, 0); + int cp2 = Character.codePointAt(other, 0); + return codePointsEqual(cp1, cp2, foldCase); + } + /** * Returns the length of the prefix shared by this StringSegment and the given CharSequence. For * example, if this string segment is "aab", and the char sequence is "aac", this method returns 2, diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/AffixMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/AffixMatcher.java index e8c0acfb8e1..6f511581407 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/AffixMatcher.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/AffixMatcher.java @@ -13,7 +13,6 @@ import com.ibm.icu.impl.number.AffixPatternProvider; import com.ibm.icu.impl.number.AffixUtils; import com.ibm.icu.impl.number.PatternStringUtils; import com.ibm.icu.number.NumberFormatter.SignDisplay; -import com.ibm.icu.text.UnicodeSet; /** * @author sffc @@ -206,15 +205,9 @@ public class AffixMatcher implements NumberParseMatcher { } @Override - public UnicodeSet getLeadCodePoints() { - UnicodeSet leadCodePoints = new UnicodeSet(); - if (prefix != null) { - leadCodePoints.addAll(prefix.getLeadCodePoints()); - } - if (suffix != null) { - leadCodePoints.addAll(suffix.getLeadCodePoints()); - } - return leadCodePoints.freeze(); + public boolean smokeTest(StringSegment segment) { + return (prefix != null && prefix.smokeTest(segment)) + || (suffix != null && suffix.smokeTest(segment)); } @Override diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/AnyMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/AnyMatcher.java index 005f8cc9b5f..e5359ead47c 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/AnyMatcher.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/AnyMatcher.java @@ -6,7 +6,6 @@ import java.util.ArrayList; import java.util.List; import com.ibm.icu.impl.StringSegment; -import com.ibm.icu.text.UnicodeSet; /** * Composes a number of matchers, and succeeds if any of the matchers succeed. Always greedily chooses @@ -58,22 +57,18 @@ public class AnyMatcher implements NumberParseMatcher { } @Override - public UnicodeSet getLeadCodePoints() { + public boolean smokeTest(StringSegment segment) { assert frozen; if (matchers == null) { - return UnicodeSet.EMPTY; - } - - if (matchers.size() == 1) { - return matchers.get(0).getLeadCodePoints(); + return false; } - UnicodeSet leadCodePoints = new UnicodeSet(); for (int i = 0; i < matchers.size(); i++) { - NumberParseMatcher matcher = matchers.get(i); - leadCodePoints.addAll(matcher.getLeadCodePoints()); + if (matchers.get(i).smokeTest(segment)) { + return true; + } } - return leadCodePoints.freeze(); + return false; } @Override diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CodePointMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CodePointMatcher.java index afa6f68a661..8e0fc60fb0e 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CodePointMatcher.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CodePointMatcher.java @@ -3,7 +3,6 @@ package com.ibm.icu.impl.number.parse; import com.ibm.icu.impl.StringSegment; -import com.ibm.icu.text.UnicodeSet; /** * Matches a single code point, performing no other logic. @@ -33,8 +32,8 @@ public class CodePointMatcher implements NumberParseMatcher { } @Override - public UnicodeSet getLeadCodePoints() { - return new UnicodeSet().add(cp).freeze(); + public boolean smokeTest(StringSegment segment) { + return segment.startsWith(cp); } @Override diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyCustomMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyCustomMatcher.java index d008a0686cf..019af7504fc 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyCustomMatcher.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyCustomMatcher.java @@ -3,7 +3,6 @@ package com.ibm.icu.impl.number.parse; import com.ibm.icu.impl.StringSegment; -import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.util.Currency; import com.ibm.icu.util.ULocale; @@ -52,11 +51,8 @@ public class CurrencyCustomMatcher implements NumberParseMatcher { } @Override - public UnicodeSet getLeadCodePoints() { - UnicodeSet leadCodePoints = new UnicodeSet(); - ParsingUtils.putLeadCodePoint(currency1, leadCodePoints); - ParsingUtils.putLeadCodePoint(currency2, leadCodePoints); - return leadCodePoints.freeze(); + public boolean smokeTest(StringSegment segment) { + return segment.startsWith(currency1) || segment.startsWith(currency2); } @Override diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyNamesMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyNamesMatcher.java index 0fcadbb9cbc..be2acd42f95 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyNamesMatcher.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyNamesMatcher.java @@ -23,6 +23,8 @@ public class CurrencyNamesMatcher implements NumberParseMatcher { private final TextTrieMap longNameTrie; private final TextTrieMap symbolTrie; + private final UnicodeSet leadCodePoints; + public static CurrencyNamesMatcher getInstance(ULocale locale) { // TODO: Pre-compute some of the more popular locales? return new CurrencyNamesMatcher(locale); @@ -33,6 +35,15 @@ public class CurrencyNamesMatcher implements NumberParseMatcher { // case folding on long-names but not symbols. longNameTrie = Currency.getParsingTrie(locale, Currency.LONG_NAME); symbolTrie = Currency.getParsingTrie(locale, Currency.SYMBOL_NAME); + + // Compute the full set of characters that could be the first in a currency to allow for + // efficient smoke test. + leadCodePoints = new UnicodeSet(); + longNameTrie.putLeadCodePoints(leadCodePoints); + symbolTrie.putLeadCodePoints(leadCodePoints); + // Always apply case mapping closure for currencies + leadCodePoints.closeOver(UnicodeSet.ADD_CASE_MAPPINGS); + leadCodePoints.freeze(); } @Override @@ -55,13 +66,8 @@ public class CurrencyNamesMatcher implements NumberParseMatcher { } @Override - public UnicodeSet getLeadCodePoints() { - UnicodeSet leadCodePoints = new UnicodeSet(); - longNameTrie.putLeadCodePoints(leadCodePoints); - symbolTrie.putLeadCodePoints(leadCodePoints); - // Always apply case mapping closure for currencies - leadCodePoints.closeOver(UnicodeSet.ADD_CASE_MAPPINGS); - return leadCodePoints.freeze(); + public boolean smokeTest(StringSegment segment) { + return segment.startsWith(leadCodePoints); } @Override diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/DecimalMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/DecimalMatcher.java index 9ba745d97d5..5a622611daa 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/DecimalMatcher.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/DecimalMatcher.java @@ -329,21 +329,23 @@ public class DecimalMatcher implements NumberParseMatcher { } @Override - public UnicodeSet getLeadCodePoints() { + public boolean smokeTest(StringSegment segment) { + // The common case uses a static leadSet for efficiency. if (digitStrings == null && leadSet != null) { - return leadSet; + return segment.startsWith(leadSet); } - - UnicodeSet leadCodePoints = new UnicodeSet(); - // Assumption: the sets are all single code points. - leadCodePoints.addAll(UnicodeSetStaticCache.get(Key.DIGITS)); - leadCodePoints.addAll(separatorSet); - if (digitStrings != null) { - for (int i = 0; i < digitStrings.length; i++) { - ParsingUtils.putLeadCodePoint(digitStrings[i], leadCodePoints); + if (segment.startsWith(separatorSet) || UCharacter.isDigit(segment.getCodePoint())) { + return true; + } + if (digitStrings == null) { + return false; + } + for (int i = 0; i < digitStrings.length; i++) { + if (segment.startsWith(digitStrings[i])) { + return true; } } - return leadCodePoints.freeze(); + return false; } @Override diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NanMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NanMatcher.java index f78d0e80d1b..46a6773a553 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NanMatcher.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NanMatcher.java @@ -27,18 +27,6 @@ public class NanMatcher extends SymbolMatcher { super(symbolString, UnicodeSet.EMPTY); } - @Override - public UnicodeSet getLeadCodePoints() { - // Overriding this here to allow use of statically allocated sets - int leadCp = string.codePointAt(0); - UnicodeSet s = UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.NAN_LEAD); - if (s.contains(leadCp)) { - return s; - } else { - return super.getLeadCodePoints(); - } - } - @Override protected boolean isDisabled(ParsedNumber result) { return result.seenNumber(); diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParseMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParseMatcher.java index 9bdc289b166..14e9c7e1d22 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParseMatcher.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParseMatcher.java @@ -3,7 +3,6 @@ package com.ibm.icu.impl.number.parse; import com.ibm.icu.impl.StringSegment; -import com.ibm.icu.text.UnicodeSet; /** * The core interface implemented by all matchers used for number parsing. @@ -45,12 +44,18 @@ public interface NumberParseMatcher { public boolean match(StringSegment segment, ParsedNumber result); /** - * Should return a set representing all possible chars (UTF-16 code units) that could be the first - * char that this matcher can consume. This method is only called during construction phase, and its - * return value is used to skip this matcher unless a segment begins with a char in this set. To make - * this matcher always run, return {@link UnicodeSet#ALL_CODE_POINTS}. + * Performs a fast "smoke check" for whether or not this matcher could possibly match against the + * given string segment. The test should be as fast as possible but also as restrictive as possible. + * For example, matchers can maintain a UnicodeSet of all code points that count possibly start a + * match. Matchers should use the {@link StringSegment#startsWith} method in order to correctly + * handle case folding. + * + * @param segment + * The segment to check against. + * @return true if the matcher might be able to match against this segment; false if it definitely + * will not be able to match. */ - public UnicodeSet getLeadCodePoints(); + public boolean smokeTest(StringSegment segment); /** * Method called at the end of a parse, after all matchers have failed to consume any more chars. diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java index 647cb9bb162..ffd101fa01a 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java @@ -19,7 +19,6 @@ import com.ibm.icu.impl.number.PropertiesAffixPatternProvider; import com.ibm.icu.impl.number.RoundingUtils; import com.ibm.icu.number.NumberFormatter.GroupingStrategy; import com.ibm.icu.text.DecimalFormatSymbols; -import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.util.Currency; import com.ibm.icu.util.CurrencyAmount; import com.ibm.icu.util.ULocale; @@ -250,7 +249,6 @@ public class NumberParserImpl { private final int parseFlags; private final List matchers; - private final List leads; private boolean frozen; /** @@ -261,11 +259,6 @@ public class NumberParserImpl { */ public NumberParserImpl(int parseFlags) { matchers = new ArrayList(); - if (0 != (parseFlags & ParsingUtils.PARSE_FLAG_OPTIMIZE)) { - leads = new ArrayList(); - } else { - leads = null; - } this.parseFlags = parseFlags; frozen = false; } @@ -273,30 +266,11 @@ public class NumberParserImpl { public void addMatcher(NumberParseMatcher matcher) { assert !frozen; this.matchers.add(matcher); - if (leads != null) { - addLeadCodePointsForMatcher(matcher); - } } public void addMatchers(Collection matchers) { assert !frozen; this.matchers.addAll(matchers); - if (leads != null) { - for (NumberParseMatcher matcher : matchers) { - addLeadCodePointsForMatcher(matcher); - } - } - } - - private void addLeadCodePointsForMatcher(NumberParseMatcher matcher) { - UnicodeSet leadCodePoints = matcher.getLeadCodePoints(); - assert leadCodePoints.isFrozen(); - // TODO: Avoid the clone operation here. - if (0 != (parseFlags & ParsingUtils.PARSE_FLAG_IGNORE_CASE)) { - leadCodePoints = leadCodePoints.cloneAsThawed().closeOver(UnicodeSet.ADD_CASE_MAPPINGS) - .freeze(); - } - this.leads.add(leadCodePoints); } public void freeze() { @@ -343,12 +317,11 @@ public class NumberParserImpl { } int initialOffset = segment.getOffset(); - int leadCp = segment.getCodePoint(); for (int i = 0; i < matchers.size(); i++) { - if (leads != null && !leads.get(i).contains(leadCp)) { + NumberParseMatcher matcher = matchers.get(i); + if (!matcher.smokeTest(segment)) { continue; } - NumberParseMatcher matcher = matchers.get(i); matcher.match(segment, result); if (segment.getOffset() != initialOffset) { // In a greedy parse, recurse on only the first match. @@ -377,8 +350,10 @@ public class NumberParserImpl { int initialOffset = segment.getOffset(); for (int i = 0; i < matchers.size(); i++) { - // TODO: Check leadChars here? NumberParseMatcher matcher = matchers.get(i); + if (!matcher.smokeTest(segment)) { + continue; + } // In a non-greedy parse, we attempt all possible matches and pick the best. for (int charsToConsume = 0; charsToConsume < segment.length();) { diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ScientificMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ScientificMatcher.java index a297a50006e..8fdd5b5bd83 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ScientificMatcher.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ScientificMatcher.java @@ -5,7 +5,6 @@ package com.ibm.icu.impl.number.parse; import com.ibm.icu.impl.StringSegment; import com.ibm.icu.impl.number.Grouper; import com.ibm.icu.text.DecimalFormatSymbols; -import com.ibm.icu.text.UnicodeSet; /** * @author sffc @@ -78,14 +77,8 @@ public class ScientificMatcher implements NumberParseMatcher { } @Override - public UnicodeSet getLeadCodePoints() { - int leadCp = exponentSeparatorString.codePointAt(0); - UnicodeSet s = UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.SCIENTIFIC_LEAD); - if (s.contains(leadCp)) { - return s; - } else { - return new UnicodeSet().add(leadCp).freeze(); - } + public boolean smokeTest(StringSegment segment) { + return segment.startsWith(exponentSeparatorString); } @Override diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/SeriesMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/SeriesMatcher.java index 8c8c67f573c..f12a81fd351 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/SeriesMatcher.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/SeriesMatcher.java @@ -6,7 +6,6 @@ import java.util.ArrayList; import java.util.List; import com.ibm.icu.impl.StringSegment; -import com.ibm.icu.text.UnicodeSet; /** * Composes a number of matchers, running one after another. Matches the input string only if all of the @@ -82,15 +81,15 @@ public class SeriesMatcher implements NumberParseMatcher { } @Override - public UnicodeSet getLeadCodePoints() { + public boolean smokeTest(StringSegment segment) { assert frozen; if (matchers == null) { - return UnicodeSet.EMPTY; + return false; } // SeriesMatchers are never allowed to start with a Flexible matcher. assert !(matchers.get(0) instanceof NumberParseMatcher.Flexible); - return matchers.get(0).getLeadCodePoints(); + return matchers.get(0).smokeTest(segment); } @Override diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/SymbolMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/SymbolMatcher.java index 8ca6d92f301..f5b27c1d124 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/SymbolMatcher.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/SymbolMatcher.java @@ -59,16 +59,8 @@ public abstract class SymbolMatcher implements NumberParseMatcher { } @Override - public UnicodeSet getLeadCodePoints() { - if (string.isEmpty()) { - // Assumption: for sets from UnicodeSetStaticCache, uniSet == leadCodePoints. - return uniSet; - } - - UnicodeSet leadCodePoints = new UnicodeSet(); - ParsingUtils.putLeadCodePoints(uniSet, leadCodePoints); - ParsingUtils.putLeadCodePoint(string, leadCodePoints); - return leadCodePoints.freeze(); + public boolean smokeTest(StringSegment segment) { + return segment.startsWith(uniSet) || segment.startsWith(string); } @Override diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java index edc0e99114f..cba2dc93849 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java @@ -49,8 +49,6 @@ public class UnicodeSetStaticCache { // Other DIGITS, - NAN_LEAD, - SCIENTIFIC_LEAD, CWCF, // TODO: Check if this is being used and remove it if not. // Combined Separators with Digits (for lead code points) @@ -112,10 +110,6 @@ public class UnicodeSetStaticCache { unicodeSets.put(Key.INFINITY, new UnicodeSet("[∞]").freeze()); unicodeSets.put(Key.DIGITS, new UnicodeSet("[:digit:]").freeze()); - unicodeSets.put(Key.NAN_LEAD, - new UnicodeSet("[NnТтmeՈոс¤НнчTtsҳ\u975e\u1002\u0e9a\u10d0\u0f68\u0644\u0646]") - .freeze()); - unicodeSets.put(Key.SCIENTIFIC_LEAD, new UnicodeSet("[Ee×·е\u0627]").freeze()); unicodeSets.put(Key.CWCF, new UnicodeSet("[:CWCF:]").freeze()); unicodeSets.put(Key.DIGITS_OR_ALL_SEPARATORS, computeUnion(Key.DIGITS, Key.ALL_SEPARATORS)); diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ValidationMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ValidationMatcher.java index 913dcf83a1c..399e156c6bf 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ValidationMatcher.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ValidationMatcher.java @@ -3,7 +3,6 @@ package com.ibm.icu.impl.number.parse; import com.ibm.icu.impl.StringSegment; -import com.ibm.icu.text.UnicodeSet; /** * A Matcher used only for post-process validation, not for consuming characters at runtime. @@ -16,8 +15,8 @@ public abstract class ValidationMatcher implements NumberParseMatcher { } @Override - public UnicodeSet getLeadCodePoints() { - return UnicodeSet.EMPTY; + public boolean smokeTest(StringSegment segment) { + return false; } } diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/ExhaustiveNumberTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/ExhaustiveNumberTest.java index e2cabc8e4a2..51dbe11ab88 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/ExhaustiveNumberTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/ExhaustiveNumberTest.java @@ -2,6 +2,8 @@ // License & terms of use: http://www.unicode.org/copyright.html#License package com.ibm.icu.dev.test.number; +import static com.ibm.icu.impl.number.parse.UnicodeSetStaticCache.get; + import java.math.BigDecimal; import java.util.Random; @@ -10,8 +12,12 @@ import org.junit.Test; import com.ibm.icu.dev.test.TestFmwk; import com.ibm.icu.impl.number.DecimalQuantity_DualStorageBCD; +import com.ibm.icu.impl.number.parse.UnicodeSetStaticCache.Key; +import com.ibm.icu.lang.UCharacter; import com.ibm.icu.number.NumberFormatter; import com.ibm.icu.number.Rounder; +import com.ibm.icu.text.DecimalFormatSymbols; +import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.util.ULocale; /** @@ -27,6 +33,60 @@ public class ExhaustiveNumberTest extends TestFmwk { org.junit.Assume.assumeTrue(getExhaustiveness() > 5); } + @Test + public void testSetCoverage() { + // Lenient comma/period should be supersets of strict comma/period; + // it also makes the coverage logic cheaper. + assertTrue("COMMA should be superset of STRICT_COMMA", + get(Key.COMMA).containsAll(get(Key.STRICT_COMMA))); + assertTrue("PERIOD should be superset of STRICT_PERIOD", + get(Key.PERIOD).containsAll(get(Key.STRICT_PERIOD))); + + UnicodeSet decimals = get(Key.STRICT_COMMA).cloneAsThawed().addAll(get(Key.STRICT_PERIOD)) + .freeze(); + UnicodeSet grouping = decimals.cloneAsThawed().addAll(get(Key.OTHER_GROUPING_SEPARATORS)) + .freeze(); + UnicodeSet plusSign = get(Key.PLUS_SIGN); + UnicodeSet minusSign = get(Key.MINUS_SIGN); + UnicodeSet percent = get(Key.PERCENT_SIGN); + UnicodeSet permille = get(Key.PERMILLE_SIGN); + UnicodeSet infinity = get(Key.INFINITY); + + for (ULocale locale : ULocale.getAvailableLocales()) { + DecimalFormatSymbols dfs = DecimalFormatSymbols.getInstance(locale); + + assertInSet(locale, decimals, dfs.getDecimalSeparatorString()); + assertInSet(locale, grouping, dfs.getGroupingSeparatorString()); + assertInSet(locale, plusSign, dfs.getPlusSignString()); + assertInSet(locale, minusSign, dfs.getMinusSignString()); + assertInSet(locale, percent, dfs.getPercentString()); + assertInSet(locale, permille, dfs.getPerMillString()); + assertInSet(locale, infinity, dfs.getInfinity()); + } + } + + static void assertInSet(ULocale locale, UnicodeSet set, String str) { + if (str.codePointCount(0, str.length()) != 1) { + // Ignore locale strings with more than one code point (usually a bidi mark) + return; + } + assertInSet(locale, set, str.codePointAt(0)); + } + + static void assertInSet(ULocale locale, UnicodeSet set, int cp) { + // If this test case fails, add the specified code point to the corresponding set in + // UnicodeSetStaticCache.java and numparse_unisets.cpp + assertTrue( + locale + + " U+" + + Integer.toHexString(cp) + + " (" + + UCharacter.toString(cp) + + ") should be in " + + set, + set.contains(cp)); + } + @Test public void unlimitedRoundingBigDecimal() { BigDecimal ten10000 = BigDecimal.valueOf(10).pow(10000); diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberParserTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberParserTest.java index a522a289974..6009be7d154 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberParserTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberParserTest.java @@ -3,6 +3,7 @@ package com.ibm.icu.dev.test.number; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import org.junit.Test; @@ -21,8 +22,6 @@ import com.ibm.icu.impl.number.parse.ParsingUtils; import com.ibm.icu.impl.number.parse.PercentMatcher; import com.ibm.icu.impl.number.parse.PlusSignMatcher; import com.ibm.icu.impl.number.parse.SeriesMatcher; -import com.ibm.icu.impl.number.parse.UnicodeSetStaticCache; -import com.ibm.icu.impl.number.parse.UnicodeSetStaticCache.Key; import com.ibm.icu.text.DecimalFormatSymbols; import com.ibm.icu.util.Currency; import com.ibm.icu.util.ULocale; @@ -196,7 +195,9 @@ public class NumberParserTest { series.addMatcher(IgnorablesMatcher.DEFAULT); series.freeze(); - assertEquals(UnicodeSetStaticCache.get(Key.PLUS_SIGN), series.getLeadCodePoints()); + assertFalse(series.smokeTest(new StringSegment("x", false))); + assertFalse(series.smokeTest(new StringSegment("-", false))); + assertTrue(series.smokeTest(new StringSegment("+", false))); Object[][] cases = new Object[][] { { "", 0, true }, diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/PropertiesTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/PropertiesTest.java index bb557cc970c..4529f6b9164 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/PropertiesTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/PropertiesTest.java @@ -30,9 +30,9 @@ import org.junit.Test; import com.ibm.icu.dev.test.serializable.SerializableTestUtility; import com.ibm.icu.impl.number.DecimalFormatProperties; +import com.ibm.icu.impl.number.DecimalFormatProperties.ParseMode; import com.ibm.icu.impl.number.Padder.PadPosition; import com.ibm.icu.impl.number.PatternStringParser; -import com.ibm.icu.impl.number.parse.NumberParserImpl.ParseMode; import com.ibm.icu.text.CompactDecimalFormat.CompactStyle; import com.ibm.icu.text.CurrencyPluralInfo; import com.ibm.icu.text.MeasureFormat.FormatWidth; diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/UnicodeSetStaticCacheTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/UnicodeSetStaticCacheTest.java index 7aec4f77f17..ca239267bb2 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/UnicodeSetStaticCacheTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/UnicodeSetStaticCacheTest.java @@ -8,82 +8,17 @@ import static org.junit.Assert.assertTrue; import org.junit.Test; import com.ibm.icu.impl.number.parse.UnicodeSetStaticCache.Key; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.text.DecimalFormatSymbols; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.util.ULocale; /** + * This test class is thin; most of it was moved to ExhaustiveNumberTest. * @author sffc - * */ public class UnicodeSetStaticCacheTest { - @Test - public void testSetCoverage() { - // Lenient comma/period should be supersets of strict comma/period; - // it also makes the coverage logic cheaper. - assertTrue("COMMA should be superset of STRICT_COMMA", - get(Key.COMMA).containsAll(get(Key.STRICT_COMMA))); - assertTrue("PERIOD should be superset of STRICT_PERIOD", - get(Key.PERIOD).containsAll(get(Key.STRICT_PERIOD))); - - UnicodeSet decimals = get(Key.STRICT_COMMA).cloneAsThawed().addAll(get(Key.STRICT_PERIOD)) - .freeze(); - UnicodeSet grouping = decimals.cloneAsThawed().addAll(get(Key.OTHER_GROUPING_SEPARATORS)) - .freeze(); - UnicodeSet plusSign = get(Key.PLUS_SIGN); - UnicodeSet minusSign = get(Key.MINUS_SIGN); - UnicodeSet percent = get(Key.PERCENT_SIGN); - UnicodeSet permille = get(Key.PERMILLE_SIGN); - UnicodeSet infinity = get(Key.INFINITY); - UnicodeSet nanLead = get(Key.NAN_LEAD); - UnicodeSet scientificLead = get(Key.SCIENTIFIC_LEAD); - - for (ULocale locale : ULocale.getAvailableLocales()) { - DecimalFormatSymbols dfs = DecimalFormatSymbols.getInstance(locale); - - assertInSet(locale, decimals, dfs.getDecimalSeparatorString()); - assertInSet(locale, grouping, dfs.getGroupingSeparatorString()); - assertInSet(locale, plusSign, dfs.getPlusSignString()); - assertInSet(locale, minusSign, dfs.getMinusSignString()); - assertInSet(locale, percent, dfs.getPercentString()); - assertInSet(locale, permille, dfs.getPerMillString()); - assertInSet(locale, infinity, dfs.getInfinity()); - assertInSet(locale, nanLead, dfs.getNaN().codePointAt(0)); - assertInSet(locale, nanLead, UCharacter.foldCase(dfs.getNaN(), true).codePointAt(0)); - assertInSet(locale, - scientificLead, - UCharacter.foldCase(dfs.getExponentSeparator(), true).codePointAt(0)); - } - } - @Test public void testFrozen() { for (Key key : Key.values()) { assertTrue(get(key).isFrozen()); } } - - static void assertInSet(ULocale locale, UnicodeSet set, String str) { - if (str.codePointCount(0, str.length()) != 1) { - // Ignore locale strings with more than one code point (usually a bidi mark) - return; - } - assertInSet(locale, set, str.codePointAt(0)); - } - - static void assertInSet(ULocale locale, UnicodeSet set, int cp) { - // If this test case fails, add the specified code point to the corresponding set in - // UnicodeSetStaticCache.java and numparse_unisets.cpp - assertTrue( - locale - + " U+" - + Integer.toHexString(cp) - + " (" - + UCharacter.toString(cp) - + ") should be in " - + set, - set.contains(cp)); - } } -- 2.50.1