private Node _root = new Node();
boolean _ignoreCase;
+ /**
+ * Mutable holder for the extra results of a trie lookup, filled in by get() and by the trie walk.
+ */
+ public static class Output {
+ // Length of the match, copied from the longest-match handler at the end of get().
+ public int matchLength;
+ // Set to true when the walk ran out of input text while a node still had children or
+ // unmatched text left to compare; never reset to false by the lookup code shown here.
+ public boolean partialMatch;
+ }
+
/**
* Constructs a TextTrieMap object.
*
return get(text, start, null);
}
+ /**
+ * Looks up the longest match for the text, starting at the given offset.
+ *
+ * @param text the text to search
+ * @param start index in the text at which matching begins
+ * @param output receives the match length and the partial-match flag; may be null, in which
+ * case nothing is reported back to the caller
+ * @return the matches collected by the longest-match handler
+ */
- public Iterator<V> get(CharSequence text, int start, int[] matchLen) {
+ public Iterator<V> get(CharSequence text, int start, Output output) {
LongestMatchHandler<V> handler = new LongestMatchHandler<V>();
- find(text, start, handler);
- if (matchLen != null && matchLen.length > 0) {
- matchLen[0] = handler.getMatchLength();
+ // The trie walk writes output.partialMatch unconditionally, and the three-argument get()
+ // passes null here, so walk with a scratch Output to avoid a NullPointerException.
+ Output walkOutput = (output != null) ? output : new Output();
+ find(text, start, handler, walkOutput);
+ if (output != null) {
+ output.matchLength = handler.getMatchLength();
}
return handler.getMatches();
}
public void find(CharSequence text, ResultHandler<V> handler) {
- find(text, 0, handler);
+ find(text, 0, handler, new Output());
}
public void find(CharSequence text, int offset, ResultHandler<V> handler) {
+ find(text, offset, handler, new Output());
+ }
+
+ private void find(CharSequence text, int offset, ResultHandler<V> handler, Output output) {
CharIterator chitr = new CharIterator(text, offset, _ignoreCase);
- find(_root, chitr, handler);
+ find(_root, chitr, handler, output);
}
- private synchronized void find(Node node, CharIterator chitr, ResultHandler<V> handler) {
+ private synchronized void find(Node node, CharIterator chitr, ResultHandler<V> handler, Output output) {
Iterator<V> values = node.values();
if (values != null) {
if (!handler.handlePrefixMatch(chitr.processedLength(), values)) {
}
}
- Node nextMatch = node.findMatch(chitr);
+ Node nextMatch = node.findMatch(chitr, output);
if (nextMatch != null) {
- find(nextMatch, chitr, handler);
+ find(nextMatch, chitr, handler, output);
}
}
add(toCharArray(buf), 0, value);
}
- public Node findMatch(CharIterator chitr) {
+ public Node findMatch(CharIterator chitr, Output output) {
if (_children == null) {
return null;
}
if (!chitr.hasNext()) {
+ output.partialMatch = true;
return null;
}
Node match = null;
break;
}
if (ch == child._text[0]) {
- if (child.matchFollowing(chitr)) {
+ if (child.matchFollowing(chitr, output)) {
match = child;
}
break;
litr.add(new Node(subArray(text, offset), addValue(null, value), null));
}
- private boolean matchFollowing(CharIterator chitr) {
+ private boolean matchFollowing(CharIterator chitr, Output output) {
boolean matched = true;
int idx = 1;
while (idx < _text.length) {
if(!chitr.hasNext()) {
+ output.partialMatch = true;
matched = false;
break;
}
* convenience method {@link #needsPlurals()}.
*/
public void setSymbols(DecimalFormatSymbols symbols, Currency currency, UnitWidth unitWidth, PluralRules rules) {
- assert (rules != null) == needsPlurals();
+ //assert (rules != null) == needsPlurals();
this.symbols = symbols;
this.currency = currency;
this.unitWidth = unitWidth;
--- /dev/null
+// © 2017 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package com.ibm.icu.impl.number.parse;
+
+import java.util.Comparator;
+import java.util.Set;
+import java.util.TreeSet;
+
+import com.ibm.icu.impl.StandardPlural;
+import com.ibm.icu.impl.number.MutablePatternModifier;
+import com.ibm.icu.impl.number.NumberStringBuilder;
+
+/**
+ * Matcher for one concrete prefix/suffix pair of a number pattern. The prefix is only attempted before a
+ * quantity has been parsed and the suffix only afterwards; the flags are applied in {@link #postProcess} when
+ * both affixes recorded in the result are this matcher's own.
+ *
+ * @author sffc
+ */
+public class AffixMatcher implements NumberParseMatcher {
+    private final String prefix;
+    private final String suffix;
+    private final int flags;
+
+    /**
+     * Comparator for two AffixMatcher instances which prioritizes longer prefixes followed by longer suffixes,
+     * ensuring that the longest prefix/suffix pair is always chosen.
+     *
+     * <p>
+     * Ties are broken by comparing the prefix text, the suffix text and finally the flags. Those are exactly
+     * the fields used by {@link #equals}, so the comparator returns zero only for equal matchers and is
+     * antisymmetric for all inputs, which a hash-code based tie-break cannot guarantee when hashes collide.
+     */
+    public static final Comparator<AffixMatcher> COMPARATOR = new Comparator<AffixMatcher>() {
+        @Override
+        public int compare(AffixMatcher o1, AffixMatcher o2) {
+            if (o1.prefix.length() != o2.prefix.length()) {
+                return o1.prefix.length() > o2.prefix.length() ? -1 : 1;
+            }
+            if (o1.suffix.length() != o2.suffix.length()) {
+                return o1.suffix.length() > o2.suffix.length() ? -1 : 1;
+            }
+            // Same lengths: fall back to a total order over the fields that define equality.
+            int cmp = o1.prefix.compareTo(o2.prefix);
+            if (cmp != 0) {
+                return cmp;
+            }
+            cmp = o1.suffix.compareTo(o2.suffix);
+            if (cmp != 0) {
+                return cmp;
+            }
+            if (o1.flags == o2.flags) {
+                return 0;
+            }
+            return o1.flags < o2.flags ? -1 : 1;
+        }
+    };
+
+    /**
+     * Creates multiple AffixMatchers, enough to cover the requirements for the given pattern modifier, appending
+     * them in order to the NumberParserImpl.
+     *
+     * @param patternModifier
+     *            The modifier to query; its number properties are changed by this method.
+     * @param flags
+     *            Flags for the generated matchers; FLAG_NEGATIVE is added for the negative variants.
+     * @param output
+     *            The parser that receives the matchers, in {@link #COMPARATOR} order.
+     */
+    public static void generateFromPatternModifier(
+            MutablePatternModifier patternModifier,
+            int flags,
+            NumberParserImpl output) {
+
+        // Store the matchers in a TreeSet to ensure both uniqueness and order.
+        Set<AffixMatcher> matchers = new TreeSet<AffixMatcher>(COMPARATOR);
+
+        // Construct one matcher per isNegative/plural combination. Most of the time, plurals aren't needed, so
+        // only two matchers will be created, one for positive and one for negative.
+        NumberStringBuilder nsb = new NumberStringBuilder();
+        boolean isNegative = false;
+        while (true) {
+            if (isNegative) {
+                flags |= ParsedNumber.FLAG_NEGATIVE;
+            }
+
+            if (patternModifier.needsPlurals()) {
+                for (StandardPlural plural : StandardPlural.VALUES) {
+                    patternModifier.setNumberProperties(isNegative, plural);
+                    matchers.add(getInstance(patternModifier, flags, nsb));
+                }
+            } else {
+                patternModifier.setNumberProperties(isNegative, null);
+                matchers.add(getInstance(patternModifier, flags, nsb));
+            }
+
+            // First pass is positive, second pass is negative, then stop.
+            if (isNegative) {
+                break;
+            } else {
+                isNegative = true;
+            }
+        }
+
+        for (AffixMatcher matcher : matchers) {
+            output.addMatcher(matcher);
+        }
+    }
+
+    /**
+     * Constructs an AffixMatcher from the given MutablePatternModifier and flags. The NumberStringBuilder is used
+     * as a temporary object only; it is cleared before use.
+     */
+    private static AffixMatcher getInstance(
+            MutablePatternModifier patternModifier,
+            int flags,
+            NumberStringBuilder nsb) {
+        // TODO: Make this more efficient (avoid the substrings and things)
+        nsb.clear();
+        patternModifier.apply(nsb, 0, 0);
+        int prefixLength = patternModifier.getPrefixLength();
+        String full = nsb.toString();
+        // Everything before prefixLength is the prefix; the remainder is the suffix.
+        String prefix = full.substring(0, prefixLength);
+        String suffix = full.substring(prefixLength);
+        return new AffixMatcher(prefix, suffix, flags);
+    }
+
+    private AffixMatcher(String prefix, String suffix, int flags) {
+        this.prefix = prefix;
+        this.suffix = suffix;
+        this.flags = flags;
+    }
+
+    /**
+     * Tries to consume this matcher's prefix (when no quantity has been parsed yet) or suffix (afterwards) from
+     * the start of the segment.
+     *
+     * @return true only when the whole segment is a proper beginning of the affix, i.e. more input could still
+     *         complete the match; false otherwise, including after a successful full match.
+     */
+    @Override
+    public boolean match(StringSegment segment, ParsedNumber result) {
+        if (result.quantity == null) {
+            // Prefix
+            if (result.prefix != null || prefix.length() == 0) {
+                return false;
+            }
+            int overlap = segment.getCommonPrefixLength(prefix);
+            if (overlap == prefix.length()) {
+                result.prefix = prefix;
+                segment.adjustOffset(overlap);
+                result.setCharsConsumed(segment);
+                return false;
+            } else if (overlap == segment.length()) {
+                return true;
+            }
+
+        } else {
+            // Suffix; only eligible when the prefix recorded in the result is this matcher's prefix.
+            if (result.suffix != null || suffix.length() == 0 || !prefix.equals(orEmpty(result.prefix))) {
+                return false;
+            }
+            int overlap = segment.getCommonPrefixLength(suffix);
+            if (overlap == suffix.length()) {
+                result.suffix = suffix;
+                segment.adjustOffset(overlap);
+                result.setCharsConsumed(segment);
+                return false;
+            } else if (overlap == segment.length()) {
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    @Override
+    public void postProcess(ParsedNumber result) {
+        // Check to see if our affix is the one that was matched. If so, set the flags in the result.
+        if (prefix.equals(orEmpty(result.prefix)) && suffix.equals(orEmpty(result.suffix))) {
+            result.flags |= flags;
+        }
+    }
+
+    /**
+     * Returns the input string, or "" if input is null.
+     */
+    static String orEmpty(String str) {
+        return str == null ? "" : str;
+    }
+
+    /**
+     * Returns the sum of prefix and suffix length in the ParsedNumber; a null affix counts as length zero.
+     */
+    public static int affixLength(ParsedNumber o2) {
+        return orEmpty(o2.prefix).length() + orEmpty(o2.suffix).length();
+    }
+
+    @Override
+    public boolean equals(Object _other) {
+        if (!(_other instanceof AffixMatcher)) {
+            return false;
+        }
+        AffixMatcher other = (AffixMatcher) _other;
+        return prefix.equals(other.prefix) && suffix.equals(other.suffix) && flags == other.flags;
+    }
+
+    @Override
+    public int hashCode() {
+        return prefix.hashCode() ^ suffix.hashCode() ^ flags;
+    }
+
+    @Override
+    public String toString() {
+        return "<AffixMatcher \"" + prefix + "\" \"" + suffix + "\">";
+    }
+}
--- /dev/null
+// © 2017 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package com.ibm.icu.impl.number.parse;
+
+import java.util.Iterator;
+
+import com.ibm.icu.impl.TextTrieMap;
+import com.ibm.icu.util.Currency;
+import com.ibm.icu.util.Currency.CurrencyStringInfo;
+import com.ibm.icu.util.ULocale;
+
+/**
+ * Matcher that recognizes a currency at the start of the segment, trying the locale's long-name trie first and
+ * its symbol trie second.
+ *
+ * @author sffc
+ */
+public class CurrencyMatcher implements NumberParseMatcher {
+
+    private final TextTrieMap<CurrencyStringInfo> longNameTrie;
+    private final TextTrieMap<CurrencyStringInfo> symbolTrie;
+
+    /**
+     * Loads the long-name and symbol parsing tries for the given locale.
+     */
+    public CurrencyMatcher(ULocale locale) {
+        longNameTrie = Currency.getParsingTrie(locale, Currency.LONG_NAME);
+        symbolTrie = Currency.getParsingTrie(locale, Currency.SYMBOL_NAME);
+    }
+
+    /**
+     * Stores the ISO code of the first matching currency in the result and advances the segment past it.
+     *
+     * @return false if a currency was already recorded; otherwise the partial-match flag reported by the trie
+     *         lookups.
+     */
+    @Override
+    public boolean match(StringSegment segment, ParsedNumber result) {
+        if (result.currencyCode == null) {
+            // One Output is shared by both lookups, so the partial-match flag covers either trie.
+            TextTrieMap.Output lookup = new TextTrieMap.Output();
+            Iterator<CurrencyStringInfo> found = longNameTrie.get(segment, 0, lookup);
+            if (found == null) {
+                found = symbolTrie.get(segment, 0, lookup);
+            }
+            if (found != null) {
+                CurrencyStringInfo first = found.next();
+                result.currencyCode = first.getISOCode();
+                segment.adjustOffset(lookup.matchLength);
+                result.setCharsConsumed(segment);
+            }
+            return lookup.partialMatch;
+        }
+        // A currency has already been consumed.
+        return false;
+    }
+
+    @Override
+    public void postProcess(ParsedNumber result) {
+        // Nothing to finalize.
+    }
+
+    @Override
+    public String toString() {
+        return "<CurrencyMatcher>";
+    }
+}
--- /dev/null
+// © 2017 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package com.ibm.icu.impl.number.parse;
+
+import java.math.RoundingMode;
+
+import com.ibm.icu.impl.number.DecimalQuantity_DualStorageBCD;
+import com.ibm.icu.impl.number.RoundingUtils;
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.text.DecimalFormatSymbols;
+import com.ibm.icu.text.UnicodeSet;
+
+/**
+ * Matcher that consumes a run of digits, optionally interleaved with grouping and decimal separators. In
+ * scientific mode the digits are accumulated into the result's scientificAdjustment instead of its quantity.
+ *
+ * @author sffc
+ */
+public class DecimalMatcher implements NumberParseMatcher {
+
+    /**
+     * Creates a matcher for the main number (mantissa), using the digit strings of the given symbols.
+     */
+    public static DecimalMatcher getInstance(DecimalFormatSymbols symbols) {
+        // TODO(sffc): The separator sets are hard-coded here rather than taken from the symbols.
+        return new DecimalMatcher(symbols.getDigitStrings(),
+                new UnicodeSet("[,]").freeze(),
+                new UnicodeSet("[.]").freeze(),
+                false);
+    }
+
+    /**
+     * Creates a matcher for the exponent of a number in scientific notation.
+     */
+    public static DecimalMatcher getExponentInstance(DecimalFormatSymbols symbols) {
+        return new DecimalMatcher(symbols.getDigitStrings(),
+                new UnicodeSet("[,]").freeze(),
+                new UnicodeSet("[.]").freeze(),
+                true);
+    }
+
+    private final String[] digitStrings;
+    private final UnicodeSet groupingUniSet;
+    private final UnicodeSet decimalUniSet;
+    // Union of the grouping and decimal separator sets.
+    private final UnicodeSet separatorSet;
+    public boolean requireGroupingMatch = false;
+    // Expected digit counts per group, checked only when requireGroupingMatch is set.
+    private final int grouping1 = 3;
+    private final int grouping2 = 3;
+    private final boolean isScientific;
+
+    private DecimalMatcher(
+            String[] digitStrings,
+            UnicodeSet groupingUniSet,
+            UnicodeSet decimalUniSet,
+            boolean isScientific) {
+        this.digitStrings = digitStrings;
+        this.groupingUniSet = groupingUniSet;
+        this.decimalUniSet = decimalUniSet;
+        separatorSet = groupingUniSet.cloneAsThawed().addAll(decimalUniSet).freeze();
+        this.isScientific = isScientific;
+    }
+
+    /**
+     * Consumes digits and separators from the start of the segment.
+     *
+     * @return true when the segment was exhausted, when the remaining segment is a proper beginning of a digit
+     *         string, or when it is a single leading surrogate; false otherwise.
+     */
+    @Override
+    public boolean match(StringSegment segment, ParsedNumber result) {
+        if (result.quantity != null && !isScientific) {
+            // A number has already been consumed.
+            return false;
+        }
+
+        int currGroup = 0;
+        int separator = -1;
+        int lastSeparatorOffset = segment.getOffset();
+        boolean hasPartialPrefix = false;
+        boolean seenBothSeparators = false;
+        while (segment.length() > 0) {
+            hasPartialPrefix = false;
+
+            // Attempt to match a digit.
+            byte digit = -1;
+
+            // Try by code point digit value.
+            int cp = segment.getCodePoint();
+            if (UCharacter.isDigit(cp)) {
+                segment.adjustOffset(Character.charCount(cp));
+                digit = (byte) UCharacter.digit(cp);
+            }
+
+            // Try by digit string.
+            if (digit == -1) {
+                for (int i = 0; i < digitStrings.length; i++) {
+                    String str = digitStrings[i];
+                    int overlap = segment.getCommonPrefixLength(str);
+                    if (overlap == str.length()) {
+                        segment.adjustOffset(str.length());
+                        digit = (byte) i;
+                        // Stop here: the offset has moved, so further comparisons would look at the text
+                        // after this digit and could consume a second digit while overwriting the first.
+                        break;
+                    } else if (overlap == segment.length()) {
+                        hasPartialPrefix = true;
+                    }
+                }
+            }
+
+            // If found, save it in the DecimalQuantity or scientific adjustment.
+            if (digit >= 0) {
+                if (isScientific) {
+                    result.scientificAdjustment = digit + result.scientificAdjustment * 10;
+                } else {
+                    if (result.quantity == null) {
+                        result.quantity = new DecimalQuantity_DualStorageBCD();
+                    }
+                    result.quantity.appendDigit(digit, 0, true);
+                }
+                result.setCharsConsumed(segment);
+                currGroup++;
+                continue;
+            }
+
+            // Attempt to match a separator.
+            if (!seenBothSeparators && cp != -1 && separatorSet.contains(cp)) {
+                if (separator == -1) {
+                    // First separator; could be either grouping or decimal.
+                    separator = cp;
+                    if (requireGroupingMatch && currGroup == 0) {
+                        break;
+                    }
+                } else if (separator == cp && groupingUniSet.contains(cp)) {
+                    // Second or later grouping separator.
+                    if (requireGroupingMatch && currGroup != grouping2) {
+                        break;
+                    }
+                } else if (separator != cp && decimalUniSet.contains(cp)) {
+                    // Decimal separator.
+                    if (requireGroupingMatch && currGroup != grouping1) {
+                        break;
+                    }
+                    seenBothSeparators = true;
+                } else {
+                    // Invalid separator.
+                    break;
+                }
+                currGroup = 0;
+                lastSeparatorOffset = segment.getOffset();
+                segment.adjustOffset(Character.charCount(cp));
+                continue;
+            }
+
+            break;
+        }
+
+        // A separator can be consumed without any digit having been seen (for example the input "."), in which
+        // case no quantity exists yet and there is nothing to rescale.
+        boolean hasQuantity = result.quantity != null;
+        if (seenBothSeparators || (separator != -1 && decimalUniSet.contains(separator))) {
+            // The digits after the last separator are fraction digits.
+            if (hasQuantity) {
+                result.quantity.adjustMagnitude(-currGroup);
+            }
+        } else if (requireGroupingMatch && separator != -1 && groupingUniSet.contains(separator)
+                && currGroup != grouping1) {
+            // The final group has the wrong size: drop it and rewind to the separator before it.
+            if (hasQuantity) {
+                result.quantity.adjustMagnitude(-currGroup);
+                result.quantity.roundToMagnitude(0, RoundingUtils.mathContextUnlimited(RoundingMode.FLOOR));
+            }
+            segment.setOffset(lastSeparatorOffset);
+        }
+
+        return segment.length() == 0 || hasPartialPrefix || segment.isLeadingSurrogate();
+    }
+
+    @Override
+    public void postProcess(ParsedNumber result) {
+        // No-op
+    }
+
+    @Override
+    public String toString() {
+        return "<MantissaMatcher>";
+    }
+}
--- /dev/null
+// © 2017 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package com.ibm.icu.impl.number.parse;
+
+/**
+ * Static helpers shared by the number parse matchers.
+ *
+ * @author sffc
+ */
+public class MatcherUtils {
+    /**
+     * Returns whether the given int is a valid Unicode code point that is not a surrogate code unit.
+     */
+    public static boolean isValidCodePoint(int cp) {
+        if (!Character.isValidCodePoint(cp)) {
+            return false;
+        }
+        if (Character.isSupplementaryCodePoint(cp)) {
+            return true;
+        }
+        // Not supplementary, so the value fits in a char and the cast is lossless.
+        return !Character.isSurrogate((char) cp);
+    }
+
+}
--- /dev/null
+// © 2017 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package com.ibm.icu.impl.number.parse;
+
+import com.ibm.icu.text.UnicodeSet;
+
+/**
+ * Symbol matcher for the minus sign; a match sets FLAG_NEGATIVE on the result.
+ *
+ * @author sffc
+ */
+public class MinusSignMatcher extends SymbolMatcher {
+
+    public MinusSignMatcher() {
+        super("-", new UnicodeSet("[-_]"));
+    }
+
+    /**
+     * The matcher is switched off once the result is already marked negative.
+     */
+    @Override
+    protected boolean isDisabled(ParsedNumber result) {
+        int negativeBit = result.flags & ParsedNumber.FLAG_NEGATIVE;
+        return negativeBit != 0;
+    }
+
+    @Override
+    protected void accept(ParsedNumber result) {
+        result.flags = result.flags | ParsedNumber.FLAG_NEGATIVE;
+    }
+
+    @Override
+    public String toString() {
+        return "<MinusSignMatcher>";
+    }
+}
--- /dev/null
+// © 2017 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package com.ibm.icu.impl.number.parse;
+
+/**
+ * A single step of the number parser: an object that can try to consume something from the start of a
+ * StringSegment and record what it found in a ParsedNumber.
+ *
+ * @author sffc
+ *
+ */
+public interface NumberParseMatcher {
+ /**
+ * Runs this matcher starting at the beginning of the given StringSegment. If this matcher finds something
+ * interesting in the StringSegment, it should update the offset of the StringSegment corresponding to how many
+ * chars were matched.
+ *
+ * @param segment
+ * The StringSegment to match against. Matches always start at the beginning of the segment. The segment
+ * is guaranteed to contain at least one char.
+ * @param result
+ * The data structure to store results if the match succeeds.
+ * @return Whether this matcher thinks there may be more interesting chars beyond the end of the string segment.
+ */
+ public boolean match(StringSegment segment, ParsedNumber result);
+
+ /**
+ * Method called at the end of a parse, after all matchers have failed to consume any more chars. Allows a matcher
+ * to make final modifications to the result given the knowledge that no more matches are possible.
+ *
+ * @param result
+ * The data structure to store results.
+ */
+ public void postProcess(ParsedNumber result);
+}
--- /dev/null
+// © 2017 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package com.ibm.icu.impl.number.parse;
+
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+
+import com.ibm.icu.impl.number.AffixPatternProvider;
+import com.ibm.icu.impl.number.AffixUtils;
+import com.ibm.icu.impl.number.MutablePatternModifier;
+import com.ibm.icu.impl.number.PatternStringParser;
+import com.ibm.icu.number.NumberFormatter.SignDisplay;
+import com.ibm.icu.number.NumberFormatter.UnitWidth;
+import com.ibm.icu.text.DecimalFormatSymbols;
+import com.ibm.icu.util.Currency;
+import com.ibm.icu.util.ULocale;
+
+/**
+ * Primary number parsing implementation class. Holds an ordered list of matchers and applies them to the input
+ * either greedily (first matcher that consumes anything wins) or exhaustively (best result by comparator).
+ *
+ * @author sffc
+ *
+ */
+public class NumberParserImpl {
+ /**
+ * Builds a frozen parser for the given pattern string. The locale (English) and currency (USD) are hard-coded
+ * in this method. The installed comparator ranks results by the number of chars consumed.
+ */
+ public static NumberParserImpl createParserFromPattern(String pattern) {
+ NumberParserImpl parser = new NumberParserImpl();
+ ULocale locale = ULocale.ENGLISH;
+ DecimalFormatSymbols symbols = DecimalFormatSymbols.getInstance(locale);
+
+ MutablePatternModifier mod = new MutablePatternModifier(false);
+ AffixPatternProvider provider = PatternStringParser.parseToPatternInfo(pattern);
+ mod.setPatternInfo(provider);
+ mod.setPatternAttributes(SignDisplay.AUTO, false);
+ mod.setSymbols(symbols,
+ Currency.getInstance("USD"),
+ UnitWidth.FULL_NAME,
+ null);
+ // Affix matchers carry the percent/permille flags when the pattern contains those symbols.
+ int flags = 0;
+ if (provider.containsSymbolType(AffixUtils.TYPE_PERCENT)) {
+ flags |= ParsedNumber.FLAG_PERCENT;
+ }
+ if (provider.containsSymbolType(AffixUtils.TYPE_PERMILLE)) {
+ flags |= ParsedNumber.FLAG_PERMILLE;
+ }
+ AffixMatcher.generateFromPatternModifier(mod, flags, parser);
+
+ // Matchers are tried in the order they are added here.
+ parser.addMatcher(DecimalMatcher.getInstance(symbols));
+ parser.addMatcher(WhitespaceMatcher.getInstance());
+ parser.addMatcher(new MinusSignMatcher());
+ parser.addMatcher(new ScientificMatcher(symbols));
+ parser.addMatcher(new CurrencyMatcher(locale));
+
+ parser.setComparator(new Comparator<ParsedNumber>() {
+ @Override
+ public int compare(ParsedNumber o1, ParsedNumber o2) {
+ return o1.charsConsumed - o2.charsConsumed;
+ }
+ });
+ parser.freeze();
+ return parser;
+ }
+
+ private final List<NumberParseMatcher> matchers;
+ // Used only by the non-greedy parse to decide whether a candidate replaces the current result.
+ private Comparator<ParsedNumber> comparator;
+ // Checked by the assertion in parse(); the mutators below do not check it.
+ private boolean frozen;
+
+ public NumberParserImpl() {
+ matchers = new ArrayList<NumberParseMatcher>();
+ frozen = false;
+ }
+
+ public void addMatcher(NumberParseMatcher matcher) {
+ matchers.add(matcher);
+ }
+
+ public void setComparator(Comparator<ParsedNumber> comparator) {
+ this.comparator = comparator;
+ }
+
+ public void freeze() {
+ frozen = true;
+ }
+
+ /**
+ * Parses the input into the given result, then gives every matcher a chance to post-process the result.
+ *
+ * @param input
+ * The string to parse.
+ * @param greedy
+ * If true, use the greedy strategy; otherwise try all matchers and keep the best candidate.
+ * @param result
+ * The object that receives the parse result.
+ */
+ public void parse(String input, boolean greedy, ParsedNumber result) {
+ assert frozen;
+ StringSegment segment = new StringSegment(input);
+ if (greedy) {
+ parseGreedyRecursive(segment, result);
+ } else {
+ parseLongestRecursive(segment, result);
+ }
+ for (NumberParseMatcher matcher : matchers) {
+ matcher.postProcess(result);
+ }
+ }
+
+ private void parseGreedyRecursive(StringSegment segment, ParsedNumber result) {
+ // Base Case
+ if (segment.length() == 0) {
+ return;
+ }
+
+ int initialOffset = segment.getOffset();
+ for (int i = 0; i < matchers.size(); i++) {
+ NumberParseMatcher matcher = matchers.get(i);
+ // The return value ("maybe more") is not needed here: the matcher sees the whole remaining segment.
+ matcher.match(segment, result);
+ if (segment.getOffset() != initialOffset) {
+ // In a greedy parse, recurse on only the first match.
+ parseGreedyRecursive(segment, result);
+ // The following line resets the offset so that the StringSegment stays the same across the function
+ // call boundary. Since we recurse only once, this line is not strictly necessary.
+ segment.setOffset(initialOffset);
+ return;
+ }
+ }
+
+ // NOTE: If we get here, the greedy parse completed without consuming the entire string.
+ }
+
+ private void parseLongestRecursive(StringSegment segment, ParsedNumber result) {
+ // Base Case
+ if (segment.length() == 0) {
+ return;
+ }
+
+ // TODO: Give a nice way for the matcher to reset the ParsedNumber?
+ // "initial" is the state on entry; "candidate" is reset to it before every attempt.
+ ParsedNumber initial = new ParsedNumber();
+ initial.copyFrom(result);
+ ParsedNumber candidate = new ParsedNumber();
+
+ int initialOffset = segment.getOffset();
+ for (int i = 0; i < matchers.size(); i++) {
+ NumberParseMatcher matcher = matchers.get(i);
+ // In a non-greedy parse, we attempt all possible matches and pick the best.
+ for (int charsToConsume = 1; charsToConsume <= segment.length(); charsToConsume++) {
+ candidate.copyFrom(initial);
+
+ // Run the matcher on a segment of the current length.
+ segment.setLength(charsToConsume);
+ boolean maybeMore = matcher.match(segment, candidate);
+ segment.resetLength();
+
+ // If the entire segment was consumed, recurse.
+ if (segment.getOffset() - initialOffset == charsToConsume) {
+ parseLongestRecursive(segment, candidate);
+ // Keep the candidate only if the comparator ranks it strictly above the current result.
+ if (comparator.compare(candidate, result) > 0) {
+ result.copyFrom(candidate);
+ }
+ }
+
+ // Since the segment can be re-used, reset the offset.
+ // This does not have an effect if the matcher did not consume any chars.
+ segment.setOffset(initialOffset);
+
+ // Unless the matcher wants to see the next char, continue to the next matcher.
+ if (!maybeMore) {
+ break;
+ }
+ }
+ }
+ }
+
+ @Override
+ public String toString() {
+ return "<NumberParserImpl matchers=" + matchers.toString() + ">";
+ }
+}
--- /dev/null
+// © 2017 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package com.ibm.icu.impl.number.parse;
+
+import com.ibm.icu.impl.number.DecimalQuantity;
+import com.ibm.icu.impl.number.DecimalQuantity_DualStorageBCD;
+
+/**
+ * Mutable accumulator for the state of a parse: the digits seen so far, the affixes and currency that were
+ * matched, and flags such as the sign.
+ *
+ * @author sffc
+ */
+public class ParsedNumber {
+
+    public DecimalQuantity_DualStorageBCD quantity = null;
+    public int charsConsumed = 0;
+    public int flags = 0;
+    public String prefix = null;
+    public String suffix = null;
+    public int scientificAdjustment = 0;
+    public String currencyCode = null;
+
+    public static final int FLAG_NEGATIVE = 0x0001;
+    public static final int FLAG_PERCENT = 0x0002;
+    public static final int FLAG_PERMILLE = 0x0004;
+
+    /**
+     * Overwrites every field of this object with the corresponding field of the other object.
+     *
+     * <p>
+     * The quantity is mutable, so it is copied rather than shared; otherwise a matcher that modifies the
+     * quantity of one ParsedNumber would silently change every snapshot taken from it.
+     *
+     * @param other
+     *            The object to copy from.
+     */
+    public void copyFrom(ParsedNumber other) {
+        // NOTE(review): assumes createCopy() on a DecimalQuantity_DualStorageBCD returns the same concrete
+        // type - confirm against DecimalQuantity_DualStorageBCD.
+        quantity = (other.quantity == null)
+                ? null
+                : (DecimalQuantity_DualStorageBCD) other.quantity.createCopy();
+        charsConsumed = other.charsConsumed;
+        flags = other.flags;
+        prefix = other.prefix;
+        suffix = other.suffix;
+        scientificAdjustment = other.scientificAdjustment;
+        currencyCode = other.currencyCode;
+    }
+
+    /**
+     * Records the segment's current offset as the number of chars consumed so far.
+     */
+    public void setCharsConsumed(StringSegment segment) {
+        charsConsumed = segment.getOffset();
+    }
+
+    /**
+     * Returns the parsed value as a double, applying the scientific adjustment and the negative flag. The
+     * quantity must be non-null, i.e. at least one digit must have been parsed.
+     */
+    public double getDouble() {
+        // Work on a copy so that reading the value does not change the stored quantity.
+        DecimalQuantity copy = quantity.createCopy();
+        copy.adjustMagnitude(scientificAdjustment);
+        double d = copy.toDouble();
+        if (0 != (flags & FLAG_NEGATIVE)) {
+            d = -d;
+        }
+        return d;
+    }
+}
--- /dev/null
+// © 2017 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package com.ibm.icu.impl.number.parse;
+
+import com.ibm.icu.text.DecimalFormatSymbols;
+
+/**
+ * Matcher for the exponent part of scientific notation: the exponent separator string followed by digits, which
+ * are handled by a DecimalMatcher in exponent mode.
+ *
+ * @author sffc
+ */
+public class ScientificMatcher implements NumberParseMatcher {
+
+    private final String exponentSeparatorString;
+    private final DecimalMatcher exponentMatcher;
+
+    public ScientificMatcher(DecimalFormatSymbols symbols) {
+        exponentSeparatorString = symbols.getExponentSeparator();
+        exponentMatcher = DecimalMatcher.getExponentInstance(symbols);
+    }
+
+    /**
+     * Consumes the exponent separator plus following digits, provided a mantissa has already been parsed.
+     *
+     * @return for a full separator match, whatever the exponent digit matcher returned; otherwise true exactly
+     *         when the whole segment is a beginning of the separator.
+     */
+    @Override
+    public boolean match(StringSegment segment, ParsedNumber result) {
+        // Scientific notation is only meaningful after the mantissa.
+        if (result.quantity == null) {
+            return false;
+        }
+
+        int separatorLength = exponentSeparatorString.length();
+        int shared = segment.getCommonPrefixLength(exponentSeparatorString);
+        if (shared != separatorLength) {
+            // Either a partial separator match (the segment ran out first) or no match at all.
+            return shared == segment.length();
+        }
+
+        // Full separator match; hand the rest to the digit matcher.
+        segment.adjustOffset(shared);
+        int offsetBeforeDigits = segment.getOffset();
+        boolean maybeMore = exponentMatcher.match(segment, result);
+        boolean sawNoDigits = segment.getOffset() == offsetBeforeDigits;
+        if (sawNoDigits) {
+            // Give the separator back, since it was not followed by an exponent.
+            segment.adjustOffset(-shared);
+        }
+        return maybeMore;
+    }
+
+    @Override
+    public void postProcess(ParsedNumber result) {
+        // Nothing to finalize.
+    }
+
+    @Override
+    public String toString() {
+        return "<ScientificMatcher " + exponentSeparatorString + ">";
+    }
+}
--- /dev/null
+// © 2017 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package com.ibm.icu.impl.number.parse;
+
+/**
+ * A mutable class allowing for a String with a variable offset and length. The charAt, length, and subSequence
+ * methods all operate relative to the current offset into the String.
+ *
+ * @author sffc
+ */
+public class StringSegment implements CharSequence {
+    private final String str;
+    // Absolute indices into str; the segment covers [start, end).
+    private int start;
+    private int end;
+
+    /**
+     * Creates a segment that initially covers the whole string.
+     */
+    public StringSegment(String str) {
+        this.str = str;
+        this.start = 0;
+        this.end = str.length();
+    }
+
+    /**
+     * Returns the absolute index in the underlying string at which this segment starts.
+     */
+    public int getOffset() {
+        return start;
+    }
+
+    /**
+     * Moves the start of the segment to the given absolute index; the end is unchanged.
+     */
+    public void setOffset(int start) {
+        assert start <= end;
+        this.start = start;
+    }
+
+    /**
+     * Moves the start of the segment by the given number of chars, which may be negative.
+     */
+    public void adjustOffset(int delta) {
+        assert start + delta >= 0;
+        assert start + delta <= end;
+        start += delta;
+    }
+
+    /**
+     * Sets the end of the segment so that it covers the given number of chars from the current start.
+     */
+    public void setLength(int length) {
+        assert length >= 0;
+        assert start + length <= str.length();
+        end = start + length;
+    }
+
+    /**
+     * Extends the segment to the end of the underlying string.
+     */
+    public void resetLength() {
+        end = str.length();
+    }
+
+    @Override
+    public int length() {
+        return end - start;
+    }
+
+    @Override
+    public char charAt(int index) {
+        return str.charAt(index + start);
+    }
+
+    @Override
+    public CharSequence subSequence(int start, int end) {
+        throw new AssertionError(); // Never used
+        // Possible implementation:
+        // return str.subSequence(start + this.start, end + this.start);
+    }
+
+    /**
+     * Returns the first code point in the string segment, or -1 if the string starts with an invalid code point,
+     * i.e. a surrogate that is not part of a well-formed pair lying inside the segment.
+     */
+    public int getCodePoint() {
+        assert start < end;
+        char lead = str.charAt(start);
+        if (Character.isHighSurrogate(lead) && start + 1 < end) {
+            char trail = str.charAt(start + 1);
+            // Only a high surrogate followed by a low surrogate forms a code point; combining it with any
+            // other char would produce a meaningless value.
+            if (Character.isLowSurrogate(trail)) {
+                return Character.toCodePoint(lead, trail);
+            }
+            return -1;
+        } else if (Character.isSurrogate(lead)) {
+            return -1;
+        } else {
+            return lead;
+        }
+    }
+
+    /**
+     * Returns whether the segment is one char in length, and that the char is a leading surrogate.
+     */
+    public boolean isLeadingSurrogate() {
+        return (end - start == 1) && Character.isHighSurrogate(str.charAt(start));
+    }
+
+    /**
+     * Returns the length of the prefix shared by this StringSegment and the given CharSequence. For example, if
+     * this string segment is "aab", and the char sequence is "aac", this method returns 2, since the first 2
+     * characters are the same.
+     */
+    public int getCommonPrefixLength(CharSequence other) {
+        int limit = Math.min(length(), other.length());
+        int offset = 0;
+        for (; offset < limit; offset++) {
+            if (charAt(offset) != other.charAt(offset)) {
+                break;
+            }
+        }
+        return offset;
+    }
+
+    /**
+     * Returns the whole underlying string with the current segment enclosed in square brackets.
+     */
+    @Override
+    public String toString() {
+        return str.substring(0, start) + "[" + str.substring(start, end) + "]" + str.substring(end);
+    }
+}
--- /dev/null
+// © 2017 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package com.ibm.icu.impl.number.parse;
+
+import com.ibm.icu.text.UnicodeSet;
+
+/**
+ * Base class for matchers that recognize a single symbol, given either as one code point from a set or as a
+ * literal string. Subclasses decide when the matcher is disabled and what accepting the symbol means.
+ *
+ * @author sffc
+ */
+public abstract class SymbolMatcher implements NumberParseMatcher {
+    private final String string;
+    private final UnicodeSet uniSet;
+
+    protected SymbolMatcher(String symbolString, UnicodeSet symbolUniSet) {
+        string = symbolString;
+        uniSet = symbolUniSet;
+    }
+
+    /**
+     * Consumes the symbol from the start of the segment, trying the code point set before the literal string.
+     *
+     * @return false when disabled or after a successful match; otherwise true exactly when the whole segment
+     *         is a beginning of the symbol string.
+     */
+    @Override
+    public boolean match(StringSegment segment, ParsedNumber result) {
+        // Cheap check first; the subclass may have switched this matcher off.
+        if (isDisabled(result)) {
+            return false;
+        }
+
+        // Single code point from the set.
+        int first = segment.getCodePoint();
+        boolean inSet = first != -1 && uniSet.contains(first);
+        if (inSet) {
+            accept(result);
+            segment.adjustOffset(Character.charCount(first));
+            return false;
+        }
+
+        // Literal symbol string.
+        int shared = segment.getCommonPrefixLength(string);
+        if (shared != string.length()) {
+            return shared == segment.length();
+        }
+        accept(result);
+        segment.adjustOffset(string.length());
+        return false;
+    }
+
+    @Override
+    public void postProcess(ParsedNumber result) {
+        // Nothing to finalize.
+    }
+
+    /** Returns whether this matcher should be skipped for the given result. */
+    protected abstract boolean isDisabled(ParsedNumber result);
+
+    /** Records in the result that the symbol was matched. */
+    protected abstract void accept(ParsedNumber result);
+}
--- /dev/null
+// © 2017 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package com.ibm.icu.impl.number.parse;
+
+import com.ibm.icu.text.UnicodeSet;
+
+/**
+ * Singleton matcher that skips a run of horizontal whitespace at the start of the segment.
+ *
+ * @author sffc
+ */
+public class WhitespaceMatcher implements NumberParseMatcher {
+
+    // This set was decided after discussion with icu-design@. See ticket #13309.
+    // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property).
+    private static final UnicodeSet UNISET_WHITESPACE = new UnicodeSet("[[:Zs:][\\u0009]]").freeze();
+
+    private static final WhitespaceMatcher INSTANCE = new WhitespaceMatcher();
+
+    public static WhitespaceMatcher getInstance() {
+        return INSTANCE;
+    }
+
+    private WhitespaceMatcher() {
+    }
+
+    /**
+     * Advances the segment past leading whitespace code points.
+     *
+     * @return true when the segment was exhausted or what remains is a single leading surrogate.
+     */
+    @Override
+    public boolean match(StringSegment segment, ParsedNumber result) {
+        while (segment.length() > 0) {
+            int next = segment.getCodePoint();
+            boolean isWhitespace = next != -1 && UNISET_WHITESPACE.contains(next);
+            if (!isWhitespace) {
+                break;
+            }
+            // Only the offset moves; charsConsumed in the result is deliberately left alone.
+            segment.adjustOffset(Character.charCount(next));
+        }
+        if (segment.length() == 0) {
+            return true;
+        }
+        return segment.isLeadingSurrogate();
+    }
+
+    @Override
+    public void postProcess(ParsedNumber result) {
+        // Nothing to finalize.
+    }
+
+    @Override
+    public String toString() {
+        return "<WhitespaceMatcher>";
+    }
+}
}
}
- int[] matchLen = new int[] {0};
- Iterator<String> itr = ZONE_ID_TRIE.get(text, pos.getIndex(), matchLen);
+ TextTrieMap.Output trieOutput = new TextTrieMap.Output();
+ Iterator<String> itr = ZONE_ID_TRIE.get(text, pos.getIndex(), trieOutput);
if (itr != null) {
resolvedID = itr.next();
- pos.setIndex(pos.getIndex() + matchLen[0]);
+ pos.setIndex(pos.getIndex() + trieOutput.matchLength);
} else {
// TODO
// We many need to handle rule based custom zone ID (See ZoneMeta.parseCustomID),
}
}
- int[] matchLen = new int[] {0};
- Iterator<String> itr = SHORT_ZONE_ID_TRIE.get(text, pos.getIndex(), matchLen);
+ TextTrieMap.Output trieOutput = new TextTrieMap.Output();
+ Iterator<String> itr = SHORT_ZONE_ID_TRIE.get(text, pos.getIndex(), trieOutput);
if (itr != null) {
resolvedID = itr.next();
- pos.setIndex(pos.getIndex() + matchLen[0]);
+ pos.setIndex(pos.getIndex() + trieOutput.matchLength);
} else {
pos.setErrorIndex(pos.getIndex());
}
return isoResult;
}
+    /**
+     * Selects one of the currency parsing tries built for a locale.
+     *
+     * @param locale the locale handed to {@code getCurrencyTrieVec}.
+     * @param type {@code Currency.LONG_NAME} selects the first trie in the list; any other
+     *        value selects the second.
+     * @return the selected trie.
+     */
+    public static TextTrieMap<CurrencyStringInfo> getParsingTrie(ULocale locale, int type) {
+        List<TextTrieMap<CurrencyStringInfo>> tries = getCurrencyTrieVec(locale);
+        int index = (type == Currency.LONG_NAME) ? 0 : 1;
+        return tries.get(index);
+    }
+
/**
* @internal
* @deprecated This API is ICU internal only.
--- /dev/null
+// © 2017 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package com.ibm.icu.dev.test.number;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+
+import org.junit.Test;
+
+import com.ibm.icu.impl.number.parse.NumberParserImpl;
+import com.ibm.icu.impl.number.parse.ParsedNumber;
+
+/**
+ * Table-driven tests for {@link NumberParserImpl}: each row is parsed through the greedy
+ * and/or the slow code path and checked for the parsed value and the number of chars consumed.
+ *
+ * @author sffc
+ */
+public class NumberParserTest {
+    @Test
+    public void testBasic() {
+        Object[][] cases = new Object[][] {
+                // Fields:
+                // a) Flags:
+                // --- Bit 0x01 => Test greedy implementation
+                // --- Bit 0x02 => Test slow implementation
+                // b) Input string
+                // c) Pattern
+                // d) Expected chars consumed
+                // e) Expected double result
+                { 3, "51423", "0", 5, 51423. },
+                { 3, "51423x", "0", 5, 51423. },
+                { 3, " 51423", "0", 6, 51423. },
+                { 3, "51423 ", "0", 5, 51423. },
+                { 3, "𝟱𝟭𝟰𝟮𝟯", "0", 10, 51423. },
+                { 3, "𝟱𝟭𝟰𝟮𝟯x", "0", 10, 51423. },
+                { 3, " 𝟱𝟭𝟰𝟮𝟯", "0", 11, 51423. },
+                { 3, "𝟱𝟭𝟰𝟮𝟯 ", "0", 10, 51423. },
+                { 3, "𝟱𝟭,𝟰𝟮𝟯", "0", 11, 51423. },
+                { 3, "𝟳𝟴,𝟵𝟱𝟭,𝟰𝟮𝟯", "0", 18, 78951423. },
+                { 3, "𝟳𝟴,𝟵𝟱𝟭.𝟰𝟮𝟯", "0", 18, 78951.423 },
+                { 3, "𝟳𝟴,𝟬𝟬𝟬", "0", 11, 78000. },
+                { 3, "𝟳𝟴,𝟬𝟬𝟬.𝟬𝟬𝟬", "0", 18, 78000. },
+                { 3, "𝟳𝟴,𝟬𝟬𝟬.𝟬𝟮𝟯", "0", 18, 78000.023 },
+                { 3, "𝟳𝟴.𝟬𝟬𝟬.𝟬𝟮𝟯", "0", 11, 78. },
+                { 3, "-𝟱𝟭𝟰𝟮𝟯", "0", 11, -51423. },
+                { 3, "-𝟱𝟭𝟰𝟮𝟯-", "0", 11, -51423. },
+                { 3, "a51423US dollars", "a0¤¤¤", 16, 51423. },
+                { 3, "a 51423 US dollars", "a0¤¤¤", 18, 51423. },
+                { 3, "a 𝟱𝟭𝟰𝟮𝟯 b", "a0b", 14, 51423. },
+                { 3, "-a 𝟱𝟭𝟰𝟮𝟯 b", "a0b", 15, -51423. },
+                { 3, "a -𝟱𝟭𝟰𝟮𝟯 b", "a0b", 15, -51423. },
+                { 1, "a40b", "a0'0b'", 3, 40. }, // greedy code path thinks "40" is the number
+                { 2, "a40b", "a0'0b'", 4, 4. }, // slow code path finds the suffix "0b"
+                { 3, "𝟱.𝟭𝟰𝟮E𝟯", "0", 12, 5142. },
+                { 3, "5,142.50 Canadian dollars", "0", 25, 5142.5 },
+                { 3, "0", "0", 1, 0.0 } };
+
+        for (Object[] testCase : cases) {
+            int flags = (Integer) testCase[0];
+            String input = (String) testCase[1];
+            String pattern = (String) testCase[2];
+            int expectedCharsConsumed = (Integer) testCase[3];
+            double expectedDouble = (Double) testCase[4];
+            NumberParserImpl parser = NumberParserImpl.createParserFromPattern(pattern);
+            String message = "Input <" + input + "> Parser " + parser;
+
+            // Bit 0x01: greedy code path.
+            if ((flags & 0x01) != 0) {
+                assertParse(parser, input, true, message, expectedDouble, expectedCharsConsumed);
+            }
+
+            // Bit 0x02: slow code path.
+            if ((flags & 0x02) != 0) {
+                assertParse(parser, input, false, message, expectedDouble, expectedCharsConsumed);
+            }
+        }
+    }
+
+    /**
+     * Parses {@code input} into a fresh {@link ParsedNumber} and checks that a quantity was
+     * produced, that it equals {@code expectedDouble} exactly, and that the expected number
+     * of chars was consumed.
+     *
+     * @param greedy true for the greedy code path, false for the slow one.
+     */
+    private static void assertParse(
+            NumberParserImpl parser,
+            String input,
+            boolean greedy,
+            String message,
+            double expectedDouble,
+            int expectedCharsConsumed) {
+        ParsedNumber parsed = new ParsedNumber();
+        parser.parse(input, greedy, parsed);
+        assertNotNull(message, parsed.quantity);
+        assertEquals(message, expectedDouble, parsed.getDouble(), 0.0);
+        assertEquals(message, expectedCharsConsumed, parsed.charsConsumed);
+    }
+}
--- /dev/null
+// © 2017 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package com.ibm.icu.dev.test.number;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import org.junit.Test;
+
+import com.ibm.icu.impl.number.parse.StringSegment;
+
+/**
+ * Unit tests for the offset, length, code point and common-prefix operations of
+ * {@link StringSegment}. All offsets and lengths asserted here are counted in UTF-16
+ * code units, not code points.
+ *
+ * @author sffc
+ *
+ */
+public class StringSegmentTest {
+ static final String SAMPLE_STRING = "📻 radio 📻"; // 11 UTF-16 units: U+1F4FB (2) + ' ' + "radio" (5) + ' ' + U+1F4FB (2)
+
+ @Test
+ public void testOffset() {
+ StringSegment segment = new StringSegment(SAMPLE_STRING);
+ assertEquals(0, segment.getOffset());
+ segment.adjustOffset(3); // relative move from 0
+ assertEquals(3, segment.getOffset());
+ segment.adjustOffset(2); // relative again: 3 + 2
+ assertEquals(5, segment.getOffset());
+ segment.setOffset(4); // absolute position
+ assertEquals(4, segment.getOffset());
+ }
+
+ @Test
+ public void testLength() {
+ StringSegment segment = new StringSegment(SAMPLE_STRING);
+ assertEquals(11, segment.length());
+ segment.adjustOffset(3);
+ assertEquals(8, segment.length()); // remaining units shrink as the offset advances: 11 - 3
+ segment.setLength(4); // expected to cap the segment at 4 units from the current offset
+ assertEquals(4, segment.length());
+ segment.setOffset(5);
+ assertEquals(2, segment.length()); // the end is expected to stay at 3 + 4 = 7, so 7 - 5 remain
+ segment.resetLength();
+ assertEquals(6, segment.length()); // end restored to the full string: 11 - 5
+ }
+
+ @Test
+ public void testCharAt() {
+ StringSegment segment = new StringSegment(SAMPLE_STRING);
+ assertCharSequenceEquals(SAMPLE_STRING, segment);
+ segment.adjustOffset(3); // skip the emoji pair and the space
+ assertCharSequenceEquals("radio 📻", segment);
+ segment.setLength(5);
+ assertCharSequenceEquals("radio", segment);
+ }
+
+ @Test
+ public void testGetCodePoint() {
+ StringSegment segment = new StringSegment(SAMPLE_STRING);
+ assertEquals(0x1F4FB, segment.getCodePoint());
+ segment.setLength(1); // only the lead surrogate is visible
+ assertEquals(-1, segment.getCodePoint());
+ segment.resetLength();
+ segment.adjustOffset(1); // now starting on the trail surrogate
+ assertEquals(-1, segment.getCodePoint());
+ segment.adjustOffset(1); // now at the space after the emoji
+ assertEquals(0x20, segment.getCodePoint());
+ }
+
+ @Test
+ public void testIsLeadingSurrogate() {
+ StringSegment segment = new StringSegment(SAMPLE_STRING);
+ assertFalse(segment.isLeadingSurrogate()); // the whole surrogate pair is visible
+ segment.setLength(1); // segment is just the lead surrogate
+ assertTrue(segment.isLeadingSurrogate());
+ segment.adjustOffset(1);
+ segment.setLength(1);
+ assertFalse(segment.isLeadingSurrogate()); // trail, not lead
+ }
+
+ @Test
+ public void testCommonPrefixLength() {
+ StringSegment segment = new StringSegment(SAMPLE_STRING);
+ assertEquals(11, segment.getCommonPrefixLength(SAMPLE_STRING));
+ assertEquals(4, segment.getCommonPrefixLength("📻 r")); // counted in UTF-16 units: 2 + 1 + 1
+ assertEquals(3, segment.getCommonPrefixLength("📻 x"));
+ assertEquals(0, segment.getCommonPrefixLength("x"));
+ assertEquals(0, segment.getCommonPrefixLength(""));
+ segment.adjustOffset(3); // segment now starts at "radio"
+ assertEquals(5, segment.getCommonPrefixLength("radio"));
+ assertEquals(2, segment.getCommonPrefixLength("rafio"));
+ assertEquals(0, segment.getCommonPrefixLength("fadio"));
+ assertEquals(0, segment.getCommonPrefixLength(""));
+ segment.setLength(3); // segment is now "rad"
+ assertEquals(3, segment.getCommonPrefixLength("radio")); // bounded by the segment length
+ assertEquals(2, segment.getCommonPrefixLength("rafio"));
+ assertEquals(0, segment.getCommonPrefixLength("fadio"));
+ assertEquals(0, segment.getCommonPrefixLength(""));
+ segment.resetLength();
+ segment.setOffset(11); // end of string
+ assertEquals(0, segment.getCommonPrefixLength("foo"));
+ }
+
+ private static void assertCharSequenceEquals(CharSequence a, CharSequence b) { // compares lengths first, then every char by index
+ assertEquals(a.length(), b.length());
+ for (int i = 0; i < a.length(); i++) {
+ assertEquals(a.charAt(i), b.charAt(i));
+ }
+ }
+}