return leadCodePoints.freeze();
}
- @Override
- public boolean matchesEmpty() {
- // This is a stub implementation.
- throw new AssertionError();
- }
-
@Override
public void postProcess(ParsedNumber result) {
// Check to see if our affix is the one that was matched. If so, set the flags in the result.
@Override
public void consumeToken(int typeOrCp) {
// This is called by AffixUtils.iterateWithConsumer() for each token.
+
+ // Add an ignorables matcher between tokens except between two literals, and don't put two
+ // ignorables matchers in a row.
+ if (ignorables != null
+ && length() > 0
+ && (lastTypeOrCp < 0 || !ignorables.getSet().contains(lastTypeOrCp))) {
+ addMatcher(ignorables);
+ }
+
if (typeOrCp < 0) {
- // Don't add more than two ignorables matchers in a row
- if (ignorables != null
- && (lastTypeOrCp < 0 || !ignorables.getSet().contains(lastTypeOrCp))) {
- addMatcher(ignorables);
- }
+ // Case 1: the token is a symbol.
switch (typeOrCp) {
case AffixUtils.TYPE_MINUS_SIGN:
addMatcher(factory.minusSign());
default:
throw new AssertionError();
}
+
} else if (ignorables != null && ignorables.getSet().contains(typeOrCp)) {
- // Don't add more than two ignorables matchers in a row
- if (lastTypeOrCp < 0 || !ignorables.getSet().contains(lastTypeOrCp)) {
- addMatcher(ignorables);
- }
+ // Case 2: the token is an ignorable literal.
+ // No action necessary: the ignorables matcher has already been added.
+
} else {
- // Start of a literal: add ignorables matcher if the previous token was a symbol
- if (ignorables != null && lastTypeOrCp < 0) {
- addMatcher(ignorables);
- }
+ // Case 3: the token is a non-ignorable literal.
addMatcher(CodePointMatcher.getInstance(typeOrCp));
}
lastTypeOrCp = typeOrCp;
import com.ibm.icu.text.UnicodeSet;
/**
- * Composes a number of matchers, and succeeds if any of the matchers succeed.
+ * Composes a number of matchers, and succeeds if any of the matchers succeed. Always greedily chooses
+ * the first matcher in the list to succeed.
*
* @author sffc
* @see SeriesMatcher
return false;
}
- // TODO: Give a nice way to reset ParsedNumber to avoid the copy here.
- ParsedNumber backup = new ParsedNumber();
- backup.copyFrom(result);
-
int initialOffset = segment.getOffset();
boolean maybeMore = false;
for (int i = 0; i < matchers.size(); i++) {
NumberParseMatcher matcher = matchers.get(i);
maybeMore = maybeMore || matcher.match(segment, result);
if (segment.getOffset() != initialOffset) {
- // Match succeeded. Return true here to be safe.
- // TODO: Better would be to run each matcher and return true only if at least one of the
- // matchers returned true.
- return true;
+ // Match succeeded.
+ // NOTE: Except for a couple edge cases, if a matcher accepted string A, then it will
+ // accept any string starting with A. Therefore, there is no possibility that matchers
+ // later in the list may be evaluated on longer strings, and we can exit the loop here.
+ break;
}
}
return UnicodeSet.EMPTY;
}
+ if (matchers.size() == 1) {
+ return matchers.get(0).getLeadCodePoints();
+ }
+
UnicodeSet leadCodePoints = new UnicodeSet();
for (int i = 0; i < matchers.size(); i++) {
NumberParseMatcher matcher = matchers.get(i);
return leadCodePoints.freeze();
}
- @Override
- public boolean matchesEmpty() {
- assert frozen;
- if (matchers == null) {
- return true;
- }
-
- for (int i = 0; i < matchers.size(); i++) {
- NumberParseMatcher matcher = matchers.get(i);
- if (matcher.matchesEmpty()) {
- return true;
- }
- }
- return false;
- }
-
@Override
public void postProcess(ParsedNumber result) {
assert frozen;
@Override
public String toString() {
- return "<SeriesMatcher " + matchers + ">";
+ return "<AnyMatcher " + matchers + ">";
}
}
return new UnicodeSet().add(cp).freeze();
}
- @Override
- public boolean matchesEmpty() {
- return false;
- }
-
@Override
public void postProcess(ParsedNumber result) {
// No-op
return leadCodePoints.freeze();
}
- @Override
- public boolean matchesEmpty() {
- return false;
- }
-
@Override
public void postProcess(ParsedNumber result) {
// No-op
return leadCodePoints.freeze();
}
- @Override
- public boolean matchesEmpty() {
- return false;
- }
-
@Override
public void postProcess(ParsedNumber result) {
// No-op
return leadCodePoints.freeze();
}
- @Override
- public boolean matchesEmpty() {
- return false;
- }
-
@Override
public void postProcess(ParsedNumber result) {
// No-op
* @author sffc
*
*/
-public class IgnorablesMatcher extends RangeMatcher {
+public class IgnorablesMatcher extends SymbolMatcher implements NumberParseMatcher.Flexible {
public static final IgnorablesMatcher DEFAULT = new IgnorablesMatcher(
UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.DEFAULT_IGNORABLES));
}
private IgnorablesMatcher(UnicodeSet ignorables) {
- super(ignorables);
- }
-
- @Override
- public UnicodeSet getLeadCodePoints() {
- if (this == DEFAULT) {
- return UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.DEFAULT_IGNORABLES);
- } else if (this == STRICT) {
- return UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.STRICT_IGNORABLES);
- } else {
- return super.getLeadCodePoints();
- }
+ super("", ignorables);
}
@Override
import com.ibm.icu.text.UnicodeSet;
/**
- * @author sffc
+ * Given a string, there should NOT be more than one way to consume the string with the same matcher
+ * applied multiple times. If there is, the non-greedy parsing algorithm will be unhappy and may enter an
+ * exponential-time loop. For example, consider the "A Matcher" that accepts "any number of As". Given
+ * the string "AAAA", there are 2^N = 8 ways to apply the A Matcher to this string: you could have the A
+ * Matcher apply 4 times to each character; you could have it apply just once to all the characters; you
+ * could have it apply to the first 2 characters and the second 2 characters; and so on. A better version
+ * of the "A Matcher" would be for it to accept exactly one A, and allow the algorithm to run it
+ * repeatedly to consume a string of multiple As. The A Matcher can implement the Flexible interface
+ * below to signal that it can be applied multiple times in a row.
*
+ * @author sffc
*/
public interface NumberParseMatcher {
+
+ /**
+ * Matchers can implement the Flexible interface to indicate that they are optional and can be run
+ * repeatedly. Used by SeriesMatcher, primarily in the context of IgnorablesMatcher.
+ */
+ public interface Flexible {
+ }
+
/**
* Runs this matcher starting at the beginning of the given StringSegment. If this matcher finds
* something interesting in the StringSegment, it should update the offset of the StringSegment
*/
public UnicodeSet getLeadCodePoints();
- /**
- * Whether this matcher is well-defined for the empty string. Matchers that are looking for specific
- * symbols should return false here. Matchers that are looking for any number of copies of a certain
- * code point or string, like RangeMatcher and IgnorablesMatcher, should return true.
- *
- * @return Whether this matcher can accept the empty string.
- */
- public boolean matchesEmpty();
-
/**
* Method called at the end of a parse, after all matchers have failed to consume any more chars.
* Allows a matcher to make final modifications to the result given the knowledge that no more
public class NumberParserImpl {
@Deprecated
public static NumberParserImpl createParserFromPattern(
- ULocale locale, String pattern, boolean strictGrouping) {
+ ULocale locale,
+ String pattern,
+ boolean strictGrouping) {
// Temporary frontend for testing.
int parseFlags = ParsingUtils.PARSE_FLAG_IGNORE_CASE
parser.addMatcher(InfinityMatcher.getInstance(symbols));
String padString = properties.getPadString();
if (padString != null && !ignorables.getSet().contains(padString)) {
- parser.addMatcher(new PaddingMatcher(padString));
+ parser.addMatcher(PaddingMatcher.getInstance(padString));
}
parser.addMatcher(ignorables);
parser.addMatcher(DecimalMatcher.getInstance(symbols, grouper, parseFlags));
int initialOffset = segment.getOffset();
for (int i = 0; i < matchers.size(); i++) {
NumberParseMatcher matcher = matchers.get(i);
+
// In a non-greedy parse, we attempt all possible matches and pick the best.
for (int charsToConsume = 0; charsToConsume < segment.length();) {
charsToConsume += Character.charCount(Character.codePointAt(segment, charsToConsume));
* @author sffc
*
*/
-public class PaddingMatcher extends RangeMatcher {
+public class PaddingMatcher extends SymbolMatcher implements NumberParseMatcher.Flexible {
- /**
- * @param uniSet
- */
- protected PaddingMatcher(String padString) {
- super(new UnicodeSet().add(padString).freeze());
+ public static PaddingMatcher getInstance(String padString) {
+ return new PaddingMatcher(padString);
+ }
+
+ private PaddingMatcher(String symbolString) {
+ super(symbolString, UnicodeSet.EMPTY);
}
@Override
@Override
public String toString() {
- return "<PaddingMatcher " + uniSet + ">";
+ return "<PaddingMatcher>";
}
}
+++ /dev/null
-// © 2017 and later: Unicode, Inc. and others.
-// License & terms of use: http://www.unicode.org/copyright.html#License
-package com.ibm.icu.impl.number.parse;
-
-import com.ibm.icu.text.UnicodeSet;
-
-/**
- * @author sffc
- *
- */
-public abstract class RangeMatcher implements NumberParseMatcher {
- protected final UnicodeSet uniSet;
-
- protected RangeMatcher(UnicodeSet uniSet) {
- this.uniSet = uniSet;
- }
-
- public UnicodeSet getSet() {
- return uniSet;
- }
-
- @Override
- public boolean match(StringSegment segment, ParsedNumber result) {
- // Smoke test first; this matcher might be disabled.
- if (isDisabled(result)) {
- return false;
- }
-
- while (segment.length() > 0) {
- int cp = segment.getCodePoint();
- if (cp != -1 && uniSet.contains(cp)) {
- segment.adjustOffset(Character.charCount(cp));
- accept(segment, result);
- continue;
- }
-
- // If we get here, the code point didn't match the uniSet.
- return false;
- }
-
- // If we get here, we consumed the entire string segment.
- return true;
- }
-
- @Override
- public UnicodeSet getLeadCodePoints() {
- UnicodeSet leadCodePoints = new UnicodeSet();
- ParsingUtils.putLeadCodePoints(uniSet, leadCodePoints);
- return leadCodePoints.freeze();
- }
-
- @Override
- public boolean matchesEmpty() {
- return true;
- }
-
- @Override
- public void postProcess(ParsedNumber result) {
- // No-op
- }
-
- protected abstract boolean isDisabled(ParsedNumber result);
-
- protected abstract void accept(StringSegment segment, ParsedNumber result);
-
-}
}
}
- @Override
- public boolean matchesEmpty() {
- return false;
- }
-
@Override
public void postProcess(ParsedNumber result) {
// No-op
frozen = true;
}
+ public int length() {
+ return matchers == null ? 0 : matchers.size();
+ }
+
@Override
public boolean match(StringSegment segment, ParsedNumber result) {
assert frozen;
int initialOffset = segment.getOffset();
boolean maybeMore = true;
- for (int i = 0; i < matchers.size(); i++) {
+ for (int i = 0; i < matchers.size();) {
NumberParseMatcher matcher = matchers.get(i);
int matcherOffset = segment.getOffset();
if (segment.length() != 0) {
// Nothing for this matcher to match; ask for more.
maybeMore = true;
}
- if (segment.getOffset() == matcherOffset && !matcher.matchesEmpty()) {
- // Match failed.
+
+ boolean success = (segment.getOffset() != matcherOffset);
+ boolean isFlexible = matcher instanceof NumberParseMatcher.Flexible;
+ if (success && isFlexible) {
+ // Match succeeded, and this is a flexible matcher. Re-run it.
+ } else if (success) {
+ // Match succeeded, and this is NOT a flexible matcher. Proceed to the next matcher.
+ i++;
+ } else if (isFlexible) {
+ // Match failed, and this is a flexible matcher. Try again with the next matcher.
+ i++;
+ } else {
+ // Match failed, and this is NOT a flexible matcher. Exit.
segment.setOffset(initialOffset);
result.copyFrom(backup);
return maybeMore;
return UnicodeSet.EMPTY;
}
- if (!matchers.get(0).matchesEmpty()) {
- return matchers.get(0).getLeadCodePoints();
- }
-
- UnicodeSet leadCodePoints = new UnicodeSet();
- for (int i = 0; i < matchers.size(); i++) {
- NumberParseMatcher matcher = matchers.get(i);
- leadCodePoints.addAll(matcher.getLeadCodePoints());
- if (!matcher.matchesEmpty()) {
- break;
- }
- }
- return leadCodePoints.freeze();
- }
-
- @Override
- public boolean matchesEmpty() {
- assert frozen;
- if (matchers == null) {
- return true;
- }
-
- for (int i = 0; i < matchers.size(); i++) {
- NumberParseMatcher matcher = matchers.get(i);
- if (!matcher.matchesEmpty()) {
- return false;
- }
- }
- return true;
+ // SeriesMatchers are never allowed to start with a Flexible matcher.
+ assert !(matchers.get(0) instanceof NumberParseMatcher.Flexible);
+ return matchers.get(0).getLeadCodePoints();
}
@Override
uniSet = UnicodeSetStaticCache.get(key);
}
+ public UnicodeSet getSet() {
+ return uniSet;
+ }
+
@Override
public boolean match(StringSegment segment, ParsedNumber result) {
// Smoke test first; this matcher might be disabled.
@Override
public UnicodeSet getLeadCodePoints() {
- if (string == null || string.isEmpty()) {
+ if (string.isEmpty()) {
// Assumption: for sets from UnicodeSetStaticCache, uniSet == leadCodePoints.
return uniSet;
}
return leadCodePoints.freeze();
}
- @Override
- public boolean matchesEmpty() {
- return false;
- }
-
@Override
public void postProcess(ParsedNumber result) {
// No-op
return false;
}
- @Override
- public boolean matchesEmpty() {
- return false;
- }
-
@Override
public UnicodeSet getLeadCodePoints() {
return UnicodeSet.EMPTY;
y g h56 -56 JK
// S stops parsing after the 'i' for these and returns -56
// C stops before the 'i' and gets 56
-// P does not allow ignorables between the 'j' and the 'k'
56ijk -56 CJK
56i jk -56 CJK
-56ij k -56 CJKP
-56ijk -56 CJKP
+56ij k -56 CJK
+56ijk -56 CJK
56ijk -56 CJK
-56i jk -56 CJKP
+56i jk -56 CJK
56i jk -56 CJK
// S and C get 56 (accepts ' ' gs grouping); J and K get null
5 6 fail CSP
ParsePosition ppos = new ParsePosition(0);
Number result = df.parse("42\u200E%\u200E ", ppos);
assertEquals("Should parse as percentage", new BigDecimal("0.42"), result);
- // TODO: The following line breaks in ICU 61.
- //assertEquals("Should consume the trailing bidi since it is in the symbol", 5, ppos.getIndex());
+ assertEquals("Should consume the trailing bidi since it is in the symbol", 5, ppos.getIndex());
ppos.setIndex(0);
result = df.parse("-42a\u200E ", ppos);
assertEquals("Should not parse as percent", new Long(-42), result);
package com.ibm.icu.dev.test.number;
import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
{ 3, "📺1.23", "📺0;📻0", 6, 1.23 },
{ 3, "📻1.23", "📺0;📻0", 6, -1.23 },
{ 3, ".00", "0", 3, 0.0 },
+ { 3, " 0", "a0", 31, 0.0}, // should not hang
{ 3, "0", "0", 1, 0.0 } };
for (Object[] cas : cases) {
public void testSeriesMatcher() {
DecimalFormatSymbols symbols = DecimalFormatSymbols.getInstance(ULocale.ENGLISH);
SeriesMatcher series = new SeriesMatcher();
- series.addMatcher(IgnorablesMatcher.DEFAULT);
series.addMatcher(PlusSignMatcher.getInstance(symbols));
series.addMatcher(MinusSignMatcher.getInstance(symbols));
series.addMatcher(IgnorablesMatcher.DEFAULT);
series.addMatcher(IgnorablesMatcher.DEFAULT);
series.freeze();
- assertEquals(UnicodeSetStaticCache.get(Key.DEFAULT_IGNORABLES).cloneAsThawed()
- .addAll(UnicodeSetStaticCache.get(Key.PLUS_SIGN)), series.getLeadCodePoints());
- assertFalse(series.matchesEmpty());
+ assertEquals(UnicodeSetStaticCache.get(Key.PLUS_SIGN), series.getLeadCodePoints());
Object[][] cases = new Object[][] {
{ "", 0, true },
- { " ", 0, true },
+ { " ", 0, false },
{ "$", 0, false },
{ "+", 0, true },
- { " +", 0, true },
- { " + ", 0, false },
+ { " +", 0, false },
{ "+-", 0, true },
{ "+ -", 0, false },
{ "+- ", 0, true },
{ "+- $", 0, false },
{ "+-%", 3, true },
- { " +- % ", 9, true },
+ { " +- % ", 0, false },
+ { "+- % ", 7, true },
{ "+-%$", 3, false } };
for (Object[] cas : cases) {
String input = (String) cas[0];