: fCp(cp) {}
bool CodePointMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode&) const {
- if (segment.matches(fCp)) {
+ if (segment.startsWith(fCp)) {
segment.adjustOffsetByCodePoint();
result.setCharsConsumed(segment);
}
return false;
}
-const UnicodeSet& CodePointMatcher::getLeadCodePoints() {
- if (fLocalLeadCodePoints.isNull()) {
- auto* leadCodePoints = new UnicodeSet();
- leadCodePoints->add(fCp);
- leadCodePoints->freeze();
- fLocalLeadCodePoints.adoptInstead(leadCodePoints);
- }
- return *fLocalLeadCodePoints;
+bool CodePointMatcher::smokeTest(const StringSegment& segment) const {
+ return segment.startsWith(fCp);
}
UnicodeString CodePointMatcher::toString() const {
}
}
-const UnicodeSet& AffixMatcher::getLeadCodePoints() {
- if (fLocalLeadCodePoints.isNull()) {
- auto* leadCodePoints = new UnicodeSet();
- if (fPrefix != nullptr) {
- leadCodePoints->addAll(fPrefix->getLeadCodePoints());
- }
- if (fSuffix != nullptr) {
- leadCodePoints->addAll(fSuffix->getLeadCodePoints());
- }
- leadCodePoints->freeze();
- fLocalLeadCodePoints.adoptInstead(leadCodePoints);
- }
- return *fLocalLeadCodePoints;
+bool AffixMatcher::smokeTest(const StringSegment& segment) const {
+ return (fPrefix != nullptr && fPrefix->smokeTest(segment)) ||
+ (fSuffix != nullptr && fSuffix->smokeTest(segment));
}
void AffixMatcher::postProcess(ParsedNumber& result) const {
bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override;
- const UnicodeSet& getLeadCodePoints() override;
+ bool smokeTest(const StringSegment& segment) const override;
UnicodeString toString() const override;
void postProcess(ParsedNumber& result) const override;
- const UnicodeSet& getLeadCodePoints() override;
+ bool smokeTest(const StringSegment& segment) const override;
int8_t compareTo(const AffixMatcher& rhs) const;
return maybeMore;
}
+bool AnyMatcher::smokeTest(const StringSegment& segment) const { // true if any child matcher's smoke test passes
+ // NOTE: The range-based for loop calls the virtual begin() and end() methods.
+ for (auto& matcher : *this) {
+ if (matcher->smokeTest(segment)) {
+ return true; // one passing child is enough; stop at the first hit
+ }
+ }
+ return false; // no child passed (also the result when the range is empty)
+}
+
void AnyMatcher::postProcess(ParsedNumber& result) const {
// NOTE: The range-based for loop calls the virtual begin() and end() methods.
- for (auto* matcher : *this) {
+ for (auto& matcher : *this) {
matcher->postProcess(result);
}
}
return maybeMore;
}
+bool SeriesMatcher::smokeTest(const StringSegment& segment) const { // delegates to the first matcher in the series
+ // NOTE: The range-based for loop calls the virtual begin() and end() methods.
+ // NOTE: We only want the first element. Use the for loop for boundary checking.
+ for (auto& matcher : *this) {
+ // SeriesMatchers are never allowed to start with a Flexible matcher.
+ U_ASSERT(!matcher->isFlexible());
+ return matcher->smokeTest(segment); // unconditional return: the loop body runs at most once
+ }
+ return false; // reached only when the range is empty
+}
+
void SeriesMatcher::postProcess(ParsedNumber& result) const {
// NOTE: The range-based for loop calls the virtual begin() and end() methods.
for (auto* matcher : *this) {
: fMatchers(std::move(matchers)), fMatchersLen(matchersLen) {
}
-const UnicodeSet& ArraySeriesMatcher::getLeadCodePoints() {
- // SeriesMatchers are never allowed to start with a Flexible matcher.
- U_ASSERT(!fMatchers[0]->isFlexible());
- return fMatchers[0]->getLeadCodePoints();
-}
-
int32_t ArraySeriesMatcher::length() const {
return fMatchersLen;
}
public:
bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override;
+ bool smokeTest(const StringSegment& segment) const override;
+
void postProcess(ParsedNumber& result) const override;
protected:
public:
bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override;
+ bool smokeTest(const StringSegment& segment) const override;
+
void postProcess(ParsedNumber& result) const override;
virtual int32_t length() const = 0;
public:
ArraySeriesMatcher(); // WARNING: Leaves the object in an unusable state
- typedef MaybeStackArray<NumberParseMatcher*, 3> MatcherArray;
+ typedef MaybeStackArray<const NumberParseMatcher*, 3> MatcherArray;
/** The array is std::move'd */
ArraySeriesMatcher(MatcherArray& matchers, int32_t matchersLen);
- const UnicodeSet& getLeadCodePoints() override;
-
UnicodeString toString() const override;
int32_t length() const override;
CurrencyNamesMatcher::CurrencyNamesMatcher(const Locale& locale, UErrorCode& status)
- : fLocaleName(locale.getName(), -1, status) {}
+ : fLocaleName(locale.getName(), -1, status) {
+ uprv_currencyLeads(fLocaleName.data(), fLeadCodePoints, status);
+ // Always apply case mapping closure for currencies
+ fLeadCodePoints.closeOver(USET_ADD_CASE_MAPPINGS);
+ fLeadCodePoints.freeze();
+}
bool CurrencyNamesMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const {
if (result.currencyCode[0] != 0) {
return partialMatch;
}
-const UnicodeSet& CurrencyNamesMatcher::getLeadCodePoints() {
- if (fLocalLeadCodePoints.isNull()) {
- ErrorCode status;
- auto* leadCodePoints = new UnicodeSet();
- uprv_currencyLeads(fLocaleName.data(), *leadCodePoints, status);
- // Always apply case mapping closure for currencies
- leadCodePoints->closeOver(USET_ADD_CASE_MAPPINGS);
- leadCodePoints->freeze();
- fLocalLeadCodePoints.adoptInstead(leadCodePoints);
- }
- return *fLocalLeadCodePoints;
+bool CurrencyNamesMatcher::smokeTest(const StringSegment& segment) const {
+ return segment.startsWith(fLeadCodePoints);
}
UnicodeString CurrencyNamesMatcher::toString() const {
return overlap1 == segment.length() || overlap2 == segment.length();
}
-const UnicodeSet& CurrencyCustomMatcher::getLeadCodePoints() {
- if (fLocalLeadCodePoints.isNull()) {
- auto* leadCodePoints = new UnicodeSet();
- utils::putLeadCodePoint(fCurrency1, leadCodePoints);
- utils::putLeadCodePoint(fCurrency2, leadCodePoints);
- leadCodePoints->freeze();
- fLocalLeadCodePoints.adoptInstead(leadCodePoints);
- }
- return *fLocalLeadCodePoints;
+bool CurrencyCustomMatcher::smokeTest(const StringSegment& segment) const {
+ return segment.startsWith(fCurrency1) || segment.startsWith(fCurrency2);
}
UnicodeString CurrencyCustomMatcher::toString() const {
return *this;
}
-const UnicodeSet& CurrencyAnyMatcher::getLeadCodePoints() {
- if (fLocalLeadCodePoints.isNull()) {
- auto* leadCodePoints = new UnicodeSet();
- leadCodePoints->addAll(fNamesMatcher.getLeadCodePoints());
- leadCodePoints->addAll(fCustomMatcher.getLeadCodePoints());
- leadCodePoints->freeze();
- fLocalLeadCodePoints.adoptInstead(leadCodePoints);
- }
- return *fLocalLeadCodePoints;
-}
-
const NumberParseMatcher* const* CurrencyAnyMatcher::begin() const {
return fMatcherArray;
}
#include "numparse_compositions.h"
#include "charstr.h"
#include "number_currencysymbols.h"
+#include "unicode/uniset.h"
U_NAMESPACE_BEGIN namespace numparse {
namespace impl {
bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override;
- const UnicodeSet& getLeadCodePoints() override;
+ bool smokeTest(const StringSegment& segment) const override;
UnicodeString toString() const override;
// Locale has a non-trivial default constructor.
CharString fLocaleName;
+ UnicodeSet fLeadCodePoints;
};
bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override;
- const UnicodeSet& getLeadCodePoints() override;
+ bool smokeTest(const StringSegment& segment) const override;
UnicodeString toString() const override;
CurrencyAnyMatcher& operator=(CurrencyAnyMatcher&& src) U_NOEXCEPT;
- const UnicodeSet& getLeadCodePoints() override;
-
UnicodeString toString() const override;
protected:
return segment.length() == 0 || hasPartialPrefix;
}
-const UnicodeSet& DecimalMatcher::getLeadCodePoints() {
+bool DecimalMatcher::smokeTest(const StringSegment& segment) const {
+ // The common case uses a static leadSet for efficiency.
if (fLocalDigitStrings.isNull() && leadSet != nullptr) {
- return *leadSet;
+ return segment.startsWith(*leadSet);
}
-
- if (fLocalLeadCodePoints.isNull()) {
- auto* leadCodePoints = new UnicodeSet();
- // Assumption: the sets are all single code points.
- leadCodePoints->addAll(*unisets::get(unisets::DIGITS));
- leadCodePoints->addAll(*separatorSet);
- if (!fLocalDigitStrings.isNull()) {
- for (int i = 0; i < 10; i++) {
- utils::putLeadCodePoint(fLocalDigitStrings[i], leadCodePoints);
- }
+ if (segment.startsWith(*separatorSet) || u_isdigit(segment.getCodePoint())) {
+ return true;
+ }
+ if (fLocalDigitStrings.isNull()) {
+ return false;
+ }
+ for (int i = 0; i < 10; i++) {
+ if (segment.startsWith(fLocalDigitStrings[i])) {
+ return true;
}
- leadCodePoints->freeze();
- fLocalLeadCodePoints.adoptInstead(leadCodePoints);
}
- return *fLocalLeadCodePoints;
+ return false;
}
UnicodeString DecimalMatcher::toString() const {
bool
match(StringSegment& segment, ParsedNumber& result, int8_t exponentSign, UErrorCode& status) const;
- const UnicodeSet& getLeadCodePoints() override;
+ bool smokeTest(const StringSegment& segment) const override;
UnicodeString toString() const override;
NumberParserImpl::createSimpleParser(const Locale& locale, const UnicodeString& patternString,
parse_flags_t parseFlags, UErrorCode& status) {
- LocalPointer<NumberParserImpl> parser(new NumberParserImpl(parseFlags, true));
+ LocalPointer<NumberParserImpl> parser(new NumberParserImpl(parseFlags));
DecimalFormatSymbols symbols(locale, status);
parser->fLocalMatchers.ignorables = {unisets::DEFAULT_IGNORABLES};
}
IgnorablesMatcher ignorables(isStrict ? unisets::DEFAULT_IGNORABLES : unisets::STRICT_IGNORABLES);
- LocalPointer<NumberParserImpl> parser(new NumberParserImpl(parseFlags, status));
+ LocalPointer<NumberParserImpl> parser(new NumberParserImpl(parseFlags));
//////////////////////
/// AFFIX MATCHERS ///
return parser.orphan();
}
-NumberParserImpl::NumberParserImpl(parse_flags_t parseFlags, bool computeLeads)
- : fParseFlags(parseFlags), fComputeLeads(computeLeads) {
+NumberParserImpl::NumberParserImpl(parse_flags_t parseFlags)
+ : fParseFlags(parseFlags) {
}
NumberParserImpl::~NumberParserImpl() {
- if (fComputeLeads) {
- for (int32_t i = 0; i < fNumMatchers; i++) {
- delete (fLeads[i]);
- }
- }
fNumMatchers = 0;
}
void NumberParserImpl::addMatcher(NumberParseMatcher& matcher) {
if (fNumMatchers + 1 > fMatchers.getCapacity()) {
fMatchers.resize(fNumMatchers * 2, fNumMatchers);
- if (fComputeLeads) {
- // The two arrays should grow in tandem:
- U_ASSERT(fNumMatchers >= fLeads.getCapacity());
- fLeads.resize(fNumMatchers * 2, fNumMatchers);
- }
}
-
fMatchers[fNumMatchers] = &matcher;
-
- if (fComputeLeads) {
- addLeadCodePointsForMatcher(matcher);
- }
-
fNumMatchers++;
}
-void NumberParserImpl::addLeadCodePointsForMatcher(NumberParseMatcher& matcher) {
- const UnicodeSet& leadCodePoints = matcher.getLeadCodePoints();
- // TODO: Avoid the clone operation here.
- if (0 != (fParseFlags & PARSE_FLAG_IGNORE_CASE)) {
- auto* copy = dynamic_cast<UnicodeSet*>(leadCodePoints.cloneAsThawed());
- copy->closeOver(USET_ADD_CASE_MAPPINGS);
- copy->freeze();
- fLeads[fNumMatchers] = copy;
- } else {
- // FIXME: new here because we still take ownership
- fLeads[fNumMatchers] = new UnicodeSet(leadCodePoints);
- }
-}
-
void NumberParserImpl::freeze() {
fFrozen = true;
}
}
int initialOffset = segment.getOffset();
- int leadCp = segment.getCodePoint();
for (int32_t i = 0; i < fNumMatchers; i++) {
- if (fComputeLeads && !fLeads[i]->contains(leadCp)) {
+ const NumberParseMatcher* matcher = fMatchers[i];
+ if (!matcher->smokeTest(segment)) {
continue;
}
- const NumberParseMatcher* matcher = fMatchers[i];
matcher->match(segment, result, status);
if (U_FAILURE(status)) {
return;
int initialOffset = segment.getOffset();
for (int32_t i = 0; i < fNumMatchers; i++) {
- // TODO: Check leadChars here?
const NumberParseMatcher* matcher = fMatchers[i];
+ if (!matcher->smokeTest(segment)) {
+ continue;
+ }
// In a non-greedy parse, we attempt all possible matches and pick the best.
for (int32_t charsToConsume = 0; charsToConsume < segment.length();) {
const number::impl::DecimalFormatProperties& properties, const DecimalFormatSymbols& symbols,
bool parseCurrency, bool optimize, UErrorCode& status);
+ /**
+ * Does NOT take ownership of the matcher. The matcher MUST remain valid for the lifespan of the
+ * NumberParserImpl.
+ * @param matcher The matcher to reference.
+ */
void addMatcher(NumberParseMatcher& matcher) override;
void freeze();
int32_t fNumMatchers = 0;
// NOTE: The stack capacity for fMatchers and fLeads should be the same
MaybeStackArray<const NumberParseMatcher*, 10> fMatchers;
- MaybeStackArray<const UnicodeSet*, 10> fLeads;
- bool fComputeLeads;
bool fFrozen = false;
// WARNING: All of these matchers start in an undefined state (default-constructed).
RequireNumberValidator number;
} fLocalValidators;
- NumberParserImpl(parse_flags_t parseFlags, bool computeLeads);
-
- void addLeadCodePointsForMatcher(NumberParseMatcher& matcher);
+ explicit NumberParserImpl(parse_flags_t parseFlags);
void parseGreedyRecursive(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const;
// Allow a sign, and then try to match digits.
int8_t exponentSign = 1;
- if (segment.matches(*unisets::get(unisets::MINUS_SIGN))) {
+ if (segment.startsWith(*unisets::get(unisets::MINUS_SIGN))) {
exponentSign = -1;
segment.adjustOffsetByCodePoint();
- } else if (segment.matches(*unisets::get(unisets::PLUS_SIGN))) {
+ } else if (segment.startsWith(*unisets::get(unisets::PLUS_SIGN))) {
segment.adjustOffsetByCodePoint();
}
return false;
}
-const UnicodeSet& ScientificMatcher::getLeadCodePoints() {
- UChar32 leadCp = fExponentSeparatorString.char32At(0);
- const UnicodeSet* s = unisets::get(unisets::SCIENTIFIC_LEAD);
- if (s->contains(leadCp)) {
- return *s;
- }
-
- if (fLocalLeadCodePoints.isNull()) {
- auto* leadCodePoints = new UnicodeSet();
- leadCodePoints->add(leadCp);
- leadCodePoints->freeze();
- fLocalLeadCodePoints.adoptInstead(leadCodePoints);
- }
- return *fLocalLeadCodePoints;
+bool ScientificMatcher::smokeTest(const StringSegment& segment) const {
+ return segment.startsWith(fExponentSeparatorString);
}
UnicodeString ScientificMatcher::toString() const {
bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override;
- const UnicodeSet& getLeadCodePoints() override;
+ bool smokeTest(const StringSegment& segment) const override;
UnicodeString toString() const override;
}
}
-bool StringSegment::matches(UChar32 otherCp) const {
+bool StringSegment::startsWith(UChar32 otherCp) const {
return codePointsEqual(getCodePoint(), otherCp, fFoldCase);
}
-bool StringSegment::matches(const UnicodeSet& uniset) const {
+bool StringSegment::startsWith(const UnicodeSet& uniset) const {
// TODO: Move UnicodeSet case-folding logic here.
// TODO: Handle string matches here instead of separately.
UChar32 cp = getCodePoint();
return uniset.contains(cp);
}
+bool StringSegment::startsWith(const UnicodeString& other) const { // compares only the first code point of each side
+ if (other.isBogus() || other.length() == 0 || length() == 0) { // bogus or empty input on either side never matches
+ return false;
+ }
+ int cp1 = getCodePoint(); // first code point of this segment
+ int cp2 = other.char32At(0); // first code point of the other string
+ return codePointsEqual(cp1, cp2, fFoldCase); // fFoldCase is forwarded so the comparison follows the segment's case-folding setting
+}
+
int32_t StringSegment::getCommonPrefixLength(const UnicodeString& other) {
return getPrefixLengthInternal(other, fFoldCase);
}
return overlap == segment.length();
}
-const UnicodeSet& SymbolMatcher::getLeadCodePoints() {
- if (fString.isEmpty()) {
- // Assumption: for sets from UnicodeSetStaticCache, uniSet == leadCodePoints.
- return *fUniSet;
- }
-
- if (fLocalLeadCodePoints.isNull()) {
- auto* leadCodePoints = new UnicodeSet();
- utils::putLeadCodePoints(fUniSet, leadCodePoints);
- utils::putLeadCodePoint(fString, leadCodePoints);
- leadCodePoints->freeze();
- fLocalLeadCodePoints.adoptInstead(leadCodePoints);
- }
- return *fLocalLeadCodePoints;
+bool SymbolMatcher::smokeTest(const StringSegment& segment) const {
+ return segment.startsWith(*fUniSet) || segment.startsWith(fString);
}
UnicodeString SymbolMatcher::toString() const {
: SymbolMatcher(dfs.getConstSymbol(DecimalFormatSymbols::kNaNSymbol), unisets::EMPTY) {
}
-const UnicodeSet& NanMatcher::getLeadCodePoints() {
- // Overriding this here to allow use of statically allocated sets
- int leadCp = fString.char32At(0);
- const UnicodeSet* s = unisets::get(unisets::NAN_LEAD);
- if (s->contains(leadCp)) {
- return *s;
- }
-
- return SymbolMatcher::getLeadCodePoints();
-}
-
bool NanMatcher::isDisabled(const ParsedNumber& result) const {
return result.seenNumber();
}
bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override;
- const UnicodeSet& getLeadCodePoints() override;
+ bool smokeTest(const StringSegment& segment) const override;
UnicodeString toString() const override;
NanMatcher(const DecimalFormatSymbols& dfs);
- const UnicodeSet& getLeadCodePoints() override;
-
protected:
bool isDisabled(const ParsedNumber& result) const override;
FLAG_PERCENT = 0x0002,
FLAG_PERMILLE = 0x0004,
FLAG_HAS_EXPONENT = 0x0008,
- FLAG_HAS_DEFAULT_CURRENCY = 0x0010,
+ // FLAG_HAS_DEFAULT_CURRENCY = 0x0010, // no longer used
FLAG_HAS_DECIMAL_SEPARATOR = 0x0020,
FLAG_NAN = 0x0040,
FLAG_INFINITY = 0x0080,
PARSE_FLAG_USE_FULL_AFFIXES = 0x0100,
PARSE_FLAG_EXACT_AFFIX = 0x0200,
PARSE_FLAG_PLUS_SIGN_ALLOWED = 0x0400,
+ PARSE_FLAG_OPTIMIZE = 0x0800,
};
* <p>
* This method will perform case folding if case folding is enabled for the parser.
*/
- bool matches(UChar32 otherCp) const;
+ bool startsWith(UChar32 otherCp) const;
/**
* Returns true if the first code point of this StringSegment is in the given UnicodeSet.
*/
- bool matches(const UnicodeSet& uniset) const;
+ bool startsWith(const UnicodeSet& uniset) const;
+
+ /**
+ * Returns true if there is at least one code point of overlap between this StringSegment and the
+ * given UnicodeString.
+ */
+ bool startsWith(const UnicodeString& other) const;
/**
* Returns the length of the prefix shared by this StringSegment and the given CharSequence. For
virtual bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const = 0;
/**
- * Should return a set representing all possible chars (UTF-16 code units) that could be the first
- * char that this matcher can consume. This method is only called during construction phase, and its
- * return value is used to skip this matcher unless a segment begins with a char in this set. To make
- * this matcher always run, return {@link UnicodeSet#ALL_CODE_POINTS}.
+ * Performs a fast "smoke check" for whether or not this matcher could possibly match against the
+ * given string segment. The test should be as fast as possible but also as restrictive as possible.
+ * For example, matchers can maintain a UnicodeSet of all code points that could possibly start a
+ * match. Matchers should use the {@link StringSegment#startsWith} method in order to correctly
+ * handle case folding.
*
- * The returned UnicodeSet does not need adoption and is guaranteed to be alive for as long as the
- * object that returned it.
- *
- * This method is NOT thread-safe.
+ * @param segment
+ * The segment to check against.
+ * @return true if the matcher might be able to match against this segment; false if it definitely
+ * will not be able to match.
*/
- virtual const UnicodeSet& getLeadCodePoints() = 0;
+ virtual bool smokeTest(const StringSegment& segment) const = 0;
/**
* Method called at the end of a parse, after all matchers have failed to consume any more chars.
protected:
// No construction except by subclasses!
NumberParseMatcher() = default;
-
- // Optional ownership of the leadCodePoints set
- LocalPointer<const UnicodeSet> fLocalLeadCodePoints;
};
gUnicodeSets[INFINITY_KEY] = new UnicodeSet(u"[∞]", status);
gUnicodeSets[DIGITS] = new UnicodeSet(u"[:digit:]", status);
- gUnicodeSets[NAN_LEAD] = new UnicodeSet(
- u"[NnТтmeՈոс¤НнчTtsҳ\u975e\u1002\u0e9a\u10d0\u0f68\u0644\u0646]", status);
- gUnicodeSets[SCIENTIFIC_LEAD] = new UnicodeSet(u"[Ee×·е\u0627]", status);
gUnicodeSets[CWCF] = new UnicodeSet(u"[:CWCF:]", status);
gUnicodeSets[DIGITS_OR_ALL_SEPARATORS] = computeUnion(DIGITS, ALL_SEPARATORS);
// Other
DIGITS,
- NAN_LEAD,
- SCIENTIFIC_LEAD,
CWCF,
// Combined Separators with Digits (for lead code points)
return false;
}
- const UnicodeSet& getLeadCodePoints() U_OVERRIDE {
+ bool smokeTest(const StringSegment&) const U_OVERRIDE {
// No-op
- return *unisets::get(unisets::EMPTY);
+ return false;
}
- virtual void postProcess(ParsedNumber& result) const U_OVERRIDE = 0;
+ void postProcess(ParsedNumber& result) const U_OVERRIDE = 0;
};
matchers[4] = &m4;
ArraySeriesMatcher series(matchers, 5);
- assertEquals(
- "Lead set should be equal to lead set of lead matcher",
- *unisets::get(unisets::PLUS_SIGN),
- series.getLeadCodePoints());
+ assertFalse("", series.smokeTest(StringSegment(u"x", false)));
+ assertFalse("", series.smokeTest(StringSegment(u"-", false)));
+ assertTrue("", series.smokeTest(StringSegment(u"+", false)));
static const struct TestCase {
const char16_t* input;
const UnicodeSet &percent = *get(unisets::PERCENT_SIGN);
const UnicodeSet &permille = *get(unisets::PERMILLE_SIGN);
const UnicodeSet &infinity = *get(unisets::INFINITY_KEY);
- const UnicodeSet &nanLead = *get(unisets::NAN_LEAD);
- const UnicodeSet &scientificLead = *get(unisets::SCIENTIFIC_LEAD);
int32_t localeCount;
const Locale* allAvailableLocales = Locale::getAvailableLocales(localeCount);
ASSERT_IN_SET(percent, dfs.getConstSymbol(DecimalFormatSymbols::kPercentSymbol));
ASSERT_IN_SET(permille, dfs.getConstSymbol(DecimalFormatSymbols::kPerMillSymbol));
ASSERT_IN_SET(infinity, dfs.getConstSymbol(DecimalFormatSymbols::kInfinitySymbol));
- ASSERT_IN_SET(nanLead, dfs.getConstSymbol(DecimalFormatSymbols::kNaNSymbol).char32At(0));
- ASSERT_IN_SET(nanLead,
- u_foldCase(dfs.getConstSymbol(DecimalFormatSymbols::kNaNSymbol).char32At(0), 0));
- ASSERT_IN_SET(scientificLead,
- u_foldCase(dfs.getConstSymbol(DecimalFormatSymbols::kExponentialSymbol).char32At(0), 0));
}
}
return uniset.contains(cp);
}
+ /**
+ * Returns true if there is at least one code point of overlap between this StringSegment and the
+ * given CharSequence; that is, if their first code points are equal according to
+ * codePointsEqual with the foldCase flag. Only the first code point of each sequence is
+ * examined. Null-safe: returns false if other is null or empty, or if this segment is empty.
+ *
+ * @param other
+ * The sequence whose first code point is compared; may be null.
+ * @return true if the leading code points match; false otherwise.
+ */
+ public boolean startsWith(CharSequence other) {
+ if (other == null || other.length() == 0 || length() == 0) {
+ return false;
+ }
+ int cp1 = Character.codePointAt(this, 0);
+ int cp2 = Character.codePointAt(other, 0);
+ return codePointsEqual(cp1, cp2, foldCase);
+ }
+
/**
* Returns the length of the prefix shared by this StringSegment and the given CharSequence. For
* example, if this string segment is "aab", and the char sequence is "aac", this method returns 2,
import com.ibm.icu.impl.number.AffixUtils;
import com.ibm.icu.impl.number.PatternStringUtils;
import com.ibm.icu.number.NumberFormatter.SignDisplay;
-import com.ibm.icu.text.UnicodeSet;
/**
* @author sffc
}
@Override
- public UnicodeSet getLeadCodePoints() {
- UnicodeSet leadCodePoints = new UnicodeSet();
- if (prefix != null) {
- leadCodePoints.addAll(prefix.getLeadCodePoints());
- }
- if (suffix != null) {
- leadCodePoints.addAll(suffix.getLeadCodePoints());
- }
- return leadCodePoints.freeze();
+ public boolean smokeTest(StringSegment segment) {
+ return (prefix != null && prefix.smokeTest(segment))
+ || (suffix != null && suffix.smokeTest(segment));
}
@Override
import java.util.List;
import com.ibm.icu.impl.StringSegment;
-import com.ibm.icu.text.UnicodeSet;
/**
* Composes a number of matchers, and succeeds if any of the matchers succeed. Always greedily chooses
}
@Override
- public UnicodeSet getLeadCodePoints() {
+ public boolean smokeTest(StringSegment segment) {
assert frozen;
if (matchers == null) {
- return UnicodeSet.EMPTY;
- }
-
- if (matchers.size() == 1) {
- return matchers.get(0).getLeadCodePoints();
+ return false;
}
- UnicodeSet leadCodePoints = new UnicodeSet();
for (int i = 0; i < matchers.size(); i++) {
- NumberParseMatcher matcher = matchers.get(i);
- leadCodePoints.addAll(matcher.getLeadCodePoints());
+ if (matchers.get(i).smokeTest(segment)) {
+ return true;
+ }
}
- return leadCodePoints.freeze();
+ return false;
}
@Override
package com.ibm.icu.impl.number.parse;
import com.ibm.icu.impl.StringSegment;
-import com.ibm.icu.text.UnicodeSet;
/**
* Matches a single code point, performing no other logic.
}
@Override
- public UnicodeSet getLeadCodePoints() {
- return new UnicodeSet().add(cp).freeze();
+ public boolean smokeTest(StringSegment segment) {
+ return segment.startsWith(cp);
}
@Override
package com.ibm.icu.impl.number.parse;
import com.ibm.icu.impl.StringSegment;
-import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.Currency;
import com.ibm.icu.util.ULocale;
}
@Override
- public UnicodeSet getLeadCodePoints() {
- UnicodeSet leadCodePoints = new UnicodeSet();
- ParsingUtils.putLeadCodePoint(currency1, leadCodePoints);
- ParsingUtils.putLeadCodePoint(currency2, leadCodePoints);
- return leadCodePoints.freeze();
+ public boolean smokeTest(StringSegment segment) {
+ return segment.startsWith(currency1) || segment.startsWith(currency2);
}
@Override
private final TextTrieMap<CurrencyStringInfo> longNameTrie;
private final TextTrieMap<CurrencyStringInfo> symbolTrie;
+ private final UnicodeSet leadCodePoints;
+
public static CurrencyNamesMatcher getInstance(ULocale locale) {
// TODO: Pre-compute some of the more popular locales?
return new CurrencyNamesMatcher(locale);
// case folding on long-names but not symbols.
longNameTrie = Currency.getParsingTrie(locale, Currency.LONG_NAME);
symbolTrie = Currency.getParsingTrie(locale, Currency.SYMBOL_NAME);
+
+ // Compute the full set of characters that could be the first in a currency to allow for
+ // efficient smoke test.
+ leadCodePoints = new UnicodeSet();
+ longNameTrie.putLeadCodePoints(leadCodePoints);
+ symbolTrie.putLeadCodePoints(leadCodePoints);
+ // Always apply case mapping closure for currencies
+ leadCodePoints.closeOver(UnicodeSet.ADD_CASE_MAPPINGS);
+ leadCodePoints.freeze();
}
@Override
}
@Override
- public UnicodeSet getLeadCodePoints() {
- UnicodeSet leadCodePoints = new UnicodeSet();
- longNameTrie.putLeadCodePoints(leadCodePoints);
- symbolTrie.putLeadCodePoints(leadCodePoints);
- // Always apply case mapping closure for currencies
- leadCodePoints.closeOver(UnicodeSet.ADD_CASE_MAPPINGS);
- return leadCodePoints.freeze();
+ public boolean smokeTest(StringSegment segment) {
+ return segment.startsWith(leadCodePoints);
}
@Override
}
@Override
- public UnicodeSet getLeadCodePoints() {
+ public boolean smokeTest(StringSegment segment) {
+ // The common case uses a static leadSet for efficiency.
if (digitStrings == null && leadSet != null) {
- return leadSet;
+ return segment.startsWith(leadSet);
}
-
- UnicodeSet leadCodePoints = new UnicodeSet();
- // Assumption: the sets are all single code points.
- leadCodePoints.addAll(UnicodeSetStaticCache.get(Key.DIGITS));
- leadCodePoints.addAll(separatorSet);
- if (digitStrings != null) {
- for (int i = 0; i < digitStrings.length; i++) {
- ParsingUtils.putLeadCodePoint(digitStrings[i], leadCodePoints);
+ if (segment.startsWith(separatorSet) || UCharacter.isDigit(segment.getCodePoint())) {
+ return true;
+ }
+ if (digitStrings == null) {
+ return false;
+ }
+ for (int i = 0; i < digitStrings.length; i++) {
+ if (segment.startsWith(digitStrings[i])) {
+ return true;
}
}
- return leadCodePoints.freeze();
+ return false;
}
@Override
super(symbolString, UnicodeSet.EMPTY);
}
- @Override
- public UnicodeSet getLeadCodePoints() {
- // Overriding this here to allow use of statically allocated sets
- int leadCp = string.codePointAt(0);
- UnicodeSet s = UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.NAN_LEAD);
- if (s.contains(leadCp)) {
- return s;
- } else {
- return super.getLeadCodePoints();
- }
- }
-
@Override
protected boolean isDisabled(ParsedNumber result) {
return result.seenNumber();
package com.ibm.icu.impl.number.parse;
import com.ibm.icu.impl.StringSegment;
-import com.ibm.icu.text.UnicodeSet;
/**
* The core interface implemented by all matchers used for number parsing.
public boolean match(StringSegment segment, ParsedNumber result);
/**
- * Should return a set representing all possible chars (UTF-16 code units) that could be the first
- * char that this matcher can consume. This method is only called during construction phase, and its
- * return value is used to skip this matcher unless a segment begins with a char in this set. To make
- * this matcher always run, return {@link UnicodeSet#ALL_CODE_POINTS}.
+ * Performs a fast "smoke check" for whether or not this matcher could possibly match against the
+ * given string segment. The test should be as fast as possible but also as restrictive as possible.
+ * For example, matchers can maintain a UnicodeSet of all code points that could possibly start a
+ * match. Matchers should use the {@link StringSegment#startsWith} method in order to correctly
+ * handle case folding.
+ *
+ * @param segment
+ * The segment to check against.
+ * @return true if the matcher might be able to match against this segment; false if it definitely
+ * will not be able to match.
*/
- public UnicodeSet getLeadCodePoints();
+ public boolean smokeTest(StringSegment segment);
/**
* Method called at the end of a parse, after all matchers have failed to consume any more chars.
import com.ibm.icu.impl.number.RoundingUtils;
import com.ibm.icu.number.NumberFormatter.GroupingStrategy;
import com.ibm.icu.text.DecimalFormatSymbols;
-import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.Currency;
import com.ibm.icu.util.CurrencyAmount;
import com.ibm.icu.util.ULocale;
private final int parseFlags;
private final List<NumberParseMatcher> matchers;
- private final List<UnicodeSet> leads;
private boolean frozen;
/**
*/
public NumberParserImpl(int parseFlags) {
matchers = new ArrayList<NumberParseMatcher>();
- if (0 != (parseFlags & ParsingUtils.PARSE_FLAG_OPTIMIZE)) {
- leads = new ArrayList<UnicodeSet>();
- } else {
- leads = null;
- }
this.parseFlags = parseFlags;
frozen = false;
}
public void addMatcher(NumberParseMatcher matcher) {
assert !frozen;
this.matchers.add(matcher);
- if (leads != null) {
- addLeadCodePointsForMatcher(matcher);
- }
}
public void addMatchers(Collection<? extends NumberParseMatcher> matchers) {
assert !frozen;
this.matchers.addAll(matchers);
- if (leads != null) {
- for (NumberParseMatcher matcher : matchers) {
- addLeadCodePointsForMatcher(matcher);
- }
- }
- }
-
- private void addLeadCodePointsForMatcher(NumberParseMatcher matcher) {
- UnicodeSet leadCodePoints = matcher.getLeadCodePoints();
- assert leadCodePoints.isFrozen();
- // TODO: Avoid the clone operation here.
- if (0 != (parseFlags & ParsingUtils.PARSE_FLAG_IGNORE_CASE)) {
- leadCodePoints = leadCodePoints.cloneAsThawed().closeOver(UnicodeSet.ADD_CASE_MAPPINGS)
- .freeze();
- }
- this.leads.add(leadCodePoints);
}
public void freeze() {
}
int initialOffset = segment.getOffset();
- int leadCp = segment.getCodePoint();
for (int i = 0; i < matchers.size(); i++) {
- if (leads != null && !leads.get(i).contains(leadCp)) {
+ NumberParseMatcher matcher = matchers.get(i);
+ if (!matcher.smokeTest(segment)) {
continue;
}
- NumberParseMatcher matcher = matchers.get(i);
matcher.match(segment, result);
if (segment.getOffset() != initialOffset) {
// In a greedy parse, recurse on only the first match.
int initialOffset = segment.getOffset();
for (int i = 0; i < matchers.size(); i++) {
- // TODO: Check leadChars here?
NumberParseMatcher matcher = matchers.get(i);
+ if (!matcher.smokeTest(segment)) {
+ continue;
+ }
// In a non-greedy parse, we attempt all possible matches and pick the best.
for (int charsToConsume = 0; charsToConsume < segment.length();) {
import com.ibm.icu.impl.StringSegment;
import com.ibm.icu.impl.number.Grouper;
import com.ibm.icu.text.DecimalFormatSymbols;
-import com.ibm.icu.text.UnicodeSet;
/**
* @author sffc
}
@Override
- public UnicodeSet getLeadCodePoints() {
- int leadCp = exponentSeparatorString.codePointAt(0);
- UnicodeSet s = UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.SCIENTIFIC_LEAD);
- if (s.contains(leadCp)) {
- return s;
- } else {
- return new UnicodeSet().add(leadCp).freeze();
- }
+ public boolean smokeTest(StringSegment segment) { // Cheap pre-check: the parser loop skips match() when this returns false.
+ return segment.startsWith(exponentSeparatorString); // Passes when the segment starts with the exponent separator string.
}
@Override
import java.util.List;
import com.ibm.icu.impl.StringSegment;
-import com.ibm.icu.text.UnicodeSet;
/**
* Composes a number of matchers, running one after another. Matches the input string only if all of the
}
@Override
- public UnicodeSet getLeadCodePoints() {
+ public boolean smokeTest(StringSegment segment) { // Pre-check for the whole series, answered by its first matcher.
assert frozen;
if (matchers == null) {
- return UnicodeSet.EMPTY;
+ return false; // No matcher list is present, so report that the segment cannot match.
}
// SeriesMatchers are never allowed to start with a Flexible matcher.
assert !(matchers.get(0) instanceof NumberParseMatcher.Flexible);
- return matchers.get(0).getLeadCodePoints();
+ return matchers.get(0).smokeTest(segment); // Delegate to the matcher at index 0 of the series.
}
@Override
}
@Override
- public UnicodeSet getLeadCodePoints() {
- if (string.isEmpty()) {
- // Assumption: for sets from UnicodeSetStaticCache, uniSet == leadCodePoints.
- return uniSet;
- }
-
- UnicodeSet leadCodePoints = new UnicodeSet();
- ParsingUtils.putLeadCodePoints(uniSet, leadCodePoints);
- ParsingUtils.putLeadCodePoint(string, leadCodePoints);
- return leadCodePoints.freeze();
+ public boolean smokeTest(StringSegment segment) { // Pre-check that replaces the lead-code-point set built by the removed method.
+ return segment.startsWith(uniSet) || segment.startsWith(string); // Passes on either the symbol set or the literal symbol string.
}
@Override
// Other
DIGITS,
- NAN_LEAD,
- SCIENTIFIC_LEAD,
CWCF, // TODO: Check if this is being used and remove it if not.
// Combined Separators with Digits (for lead code points)
unicodeSets.put(Key.INFINITY, new UnicodeSet("[∞]").freeze());
unicodeSets.put(Key.DIGITS, new UnicodeSet("[:digit:]").freeze());
- unicodeSets.put(Key.NAN_LEAD,
- new UnicodeSet("[NnТтmeՈոс¤НнчTtsҳ\u975e\u1002\u0e9a\u10d0\u0f68\u0644\u0646]")
- .freeze());
- unicodeSets.put(Key.SCIENTIFIC_LEAD, new UnicodeSet("[Ee×·е\u0627]").freeze());
unicodeSets.put(Key.CWCF, new UnicodeSet("[:CWCF:]").freeze());
unicodeSets.put(Key.DIGITS_OR_ALL_SEPARATORS, computeUnion(Key.DIGITS, Key.ALL_SEPARATORS));
package com.ibm.icu.impl.number.parse;
import com.ibm.icu.impl.StringSegment;
-import com.ibm.icu.text.UnicodeSet;
/**
* A Matcher used only for post-process validation, not for consuming characters at runtime.
}
@Override
- public UnicodeSet getLeadCodePoints() {
- return UnicodeSet.EMPTY;
+ public boolean smokeTest(StringSegment segment) { // Pre-check for a matcher that is used only for post-process validation.
+ return false; // Always fails, mirroring the empty lead-code-point set returned by the removed method.
}
}
// License & terms of use: http://www.unicode.org/copyright.html#License
package com.ibm.icu.dev.test.number;
+import static com.ibm.icu.impl.number.parse.UnicodeSetStaticCache.get;
+
import java.math.BigDecimal;
import java.util.Random;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.impl.number.DecimalQuantity_DualStorageBCD;
+import com.ibm.icu.impl.number.parse.UnicodeSetStaticCache.Key;
+import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.number.NumberFormatter;
import com.ibm.icu.number.Rounder;
+import com.ibm.icu.text.DecimalFormatSymbols;
+import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;
/**
org.junit.Assume.assumeTrue(getExhaustiveness() > 5);
}
+ @Test
+ public void testSetCoverage() {
+ // Lenient comma/period should be supersets of strict comma/period;
+ // it also makes the coverage logic cheaper.
+ assertTrue("COMMA should be superset of STRICT_COMMA",
+ get(Key.COMMA).containsAll(get(Key.STRICT_COMMA)));
+ assertTrue("PERIOD should be superset of STRICT_PERIOD",
+ get(Key.PERIOD).containsAll(get(Key.STRICT_PERIOD)));
+
+ UnicodeSet decimals = get(Key.STRICT_COMMA).cloneAsThawed().addAll(get(Key.STRICT_PERIOD))
+ .freeze();
+ UnicodeSet grouping = decimals.cloneAsThawed().addAll(get(Key.OTHER_GROUPING_SEPARATORS))
+ .freeze();
+ UnicodeSet plusSign = get(Key.PLUS_SIGN);
+ UnicodeSet minusSign = get(Key.MINUS_SIGN);
+ UnicodeSet percent = get(Key.PERCENT_SIGN);
+ UnicodeSet permille = get(Key.PERMILLE_SIGN);
+ UnicodeSet infinity = get(Key.INFINITY);
+
+ for (ULocale locale : ULocale.getAvailableLocales()) {
+ DecimalFormatSymbols dfs = DecimalFormatSymbols.getInstance(locale);
+
+ assertInSet(locale, decimals, dfs.getDecimalSeparatorString());
+ assertInSet(locale, grouping, dfs.getGroupingSeparatorString());
+ assertInSet(locale, plusSign, dfs.getPlusSignString());
+ assertInSet(locale, minusSign, dfs.getMinusSignString());
+ assertInSet(locale, percent, dfs.getPercentString());
+ assertInSet(locale, permille, dfs.getPerMillString());
+ assertInSet(locale, infinity, dfs.getInfinity());
+ }
+ }
+
+    /**
+     * Asserts that the sole code point of {@code str} is contained in {@code set}.
+     * Strings that do not consist of exactly one code point are skipped without any assertion.
+     */
+    static void assertInSet(ULocale locale, UnicodeSet set, String str) {
+        // Locale strings with more than one code point (usually a bidi mark) are ignored.
+        boolean isSingleCodePoint = str.codePointCount(0, str.length()) == 1;
+        if (isSingleCodePoint) {
+            assertInSet(locale, set, str.codePointAt(0));
+        }
+    }
+
+    /**
+     * Asserts that {@code set} contains the code point {@code cp}; the failure message names
+     * the locale, the code point in hex, the character itself and the set.
+     */
+    static void assertInSet(ULocale locale, UnicodeSet set, int cp) {
+        // If this test case fails, add the specified code point to the corresponding set in
+        // UnicodeSetStaticCache.java and numparse_unisets.cpp
+        String hex = Integer.toHexString(cp);
+        String message = locale + " U+" + hex + " (" + UCharacter.toString(cp) + ") should be in " + set;
+        assertTrue(message, set.contains(cp));
+    }
+
@Test
public void unlimitedRoundingBigDecimal() {
BigDecimal ten10000 = BigDecimal.valueOf(10).pow(10000);
package com.ibm.icu.dev.test.number;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import org.junit.Test;
import com.ibm.icu.impl.number.parse.PercentMatcher;
import com.ibm.icu.impl.number.parse.PlusSignMatcher;
import com.ibm.icu.impl.number.parse.SeriesMatcher;
-import com.ibm.icu.impl.number.parse.UnicodeSetStaticCache;
-import com.ibm.icu.impl.number.parse.UnicodeSetStaticCache.Key;
import com.ibm.icu.text.DecimalFormatSymbols;
import com.ibm.icu.util.Currency;
import com.ibm.icu.util.ULocale;
series.addMatcher(IgnorablesMatcher.DEFAULT);
series.freeze();
- assertEquals(UnicodeSetStaticCache.get(Key.PLUS_SIGN), series.getLeadCodePoints());
+ assertFalse(series.smokeTest(new StringSegment("x", false)));
+ assertFalse(series.smokeTest(new StringSegment("-", false)));
+ assertTrue(series.smokeTest(new StringSegment("+", false)));
Object[][] cases = new Object[][] {
{ "", 0, true },
import com.ibm.icu.dev.test.serializable.SerializableTestUtility;
import com.ibm.icu.impl.number.DecimalFormatProperties;
+import com.ibm.icu.impl.number.DecimalFormatProperties.ParseMode;
import com.ibm.icu.impl.number.Padder.PadPosition;
import com.ibm.icu.impl.number.PatternStringParser;
-import com.ibm.icu.impl.number.parse.NumberParserImpl.ParseMode;
import com.ibm.icu.text.CompactDecimalFormat.CompactStyle;
import com.ibm.icu.text.CurrencyPluralInfo;
import com.ibm.icu.text.MeasureFormat.FormatWidth;
import org.junit.Test;
import com.ibm.icu.impl.number.parse.UnicodeSetStaticCache.Key;
-import com.ibm.icu.lang.UCharacter;
-import com.ibm.icu.text.DecimalFormatSymbols;
-import com.ibm.icu.text.UnicodeSet;
-import com.ibm.icu.util.ULocale;
/**
+ * This test class is thin; most of it was moved to ExhaustiveNumberTest.
* @author sffc
- *
*/
public class UnicodeSetStaticCacheTest {
- @Test
- public void testSetCoverage() {
- // Lenient comma/period should be supersets of strict comma/period;
- // it also makes the coverage logic cheaper.
- assertTrue("COMMA should be superset of STRICT_COMMA",
- get(Key.COMMA).containsAll(get(Key.STRICT_COMMA)));
- assertTrue("PERIOD should be superset of STRICT_PERIOD",
- get(Key.PERIOD).containsAll(get(Key.STRICT_PERIOD)));
-
- UnicodeSet decimals = get(Key.STRICT_COMMA).cloneAsThawed().addAll(get(Key.STRICT_PERIOD))
- .freeze();
- UnicodeSet grouping = decimals.cloneAsThawed().addAll(get(Key.OTHER_GROUPING_SEPARATORS))
- .freeze();
- UnicodeSet plusSign = get(Key.PLUS_SIGN);
- UnicodeSet minusSign = get(Key.MINUS_SIGN);
- UnicodeSet percent = get(Key.PERCENT_SIGN);
- UnicodeSet permille = get(Key.PERMILLE_SIGN);
- UnicodeSet infinity = get(Key.INFINITY);
- UnicodeSet nanLead = get(Key.NAN_LEAD);
- UnicodeSet scientificLead = get(Key.SCIENTIFIC_LEAD);
-
- for (ULocale locale : ULocale.getAvailableLocales()) {
- DecimalFormatSymbols dfs = DecimalFormatSymbols.getInstance(locale);
-
- assertInSet(locale, decimals, dfs.getDecimalSeparatorString());
- assertInSet(locale, grouping, dfs.getGroupingSeparatorString());
- assertInSet(locale, plusSign, dfs.getPlusSignString());
- assertInSet(locale, minusSign, dfs.getMinusSignString());
- assertInSet(locale, percent, dfs.getPercentString());
- assertInSet(locale, permille, dfs.getPerMillString());
- assertInSet(locale, infinity, dfs.getInfinity());
- assertInSet(locale, nanLead, dfs.getNaN().codePointAt(0));
- assertInSet(locale, nanLead, UCharacter.foldCase(dfs.getNaN(), true).codePointAt(0));
- assertInSet(locale,
- scientificLead,
- UCharacter.foldCase(dfs.getExponentSeparator(), true).codePointAt(0));
- }
- }
-
@Test
public void testFrozen() {
for (Key key : Key.values()) {
assertTrue(get(key).isFrozen());
}
}
-
- static void assertInSet(ULocale locale, UnicodeSet set, String str) {
- if (str.codePointCount(0, str.length()) != 1) {
- // Ignore locale strings with more than one code point (usually a bidi mark)
- return;
- }
- assertInSet(locale, set, str.codePointAt(0));
- }
-
- static void assertInSet(ULocale locale, UnicodeSet set, int cp) {
- // If this test case fails, add the specified code point to the corresponding set in
- // UnicodeSetStaticCache.java and numparse_unisets.cpp
- assertTrue(
- locale
- + " U+"
- + Integer.toHexString(cp)
- + " ("
- + UCharacter.toString(cp)
- + ") should be in "
- + set,
- set.contains(cp));
- }
}