}
public void find(CharSequence text, ResultHandler<V> handler) {
- find(text, 0, handler, new Output());
+ find(text, 0, handler, null);
}
public void find(CharSequence text, int offset, ResultHandler<V> handler) {
- find(text, offset, handler, new Output());
+ find(text, offset, handler, null);
}
private void find(CharSequence text, int offset, ResultHandler<V> handler, Output output) {
}
}
- public void putLeadChars(UnicodeSet output) {
- _root.putLeadChars(output);
+ public void putLeadCodePoints(UnicodeSet output) {
+ _root.putLeadCodePoints(output);
}
/**
return null;
}
if (!chitr.hasNext()) {
- output.partialMatch = true;
+ if (output != null) {
+ output.partialMatch = true;
+ }
return null;
}
Node match = null;
return match;
}
- public void putLeadChars(UnicodeSet output) {
+ public void putLeadCodePoints(UnicodeSet output) {
if (_children == null) {
return;
}
for (Node child : _children) {
- output.add(child._text[0]);
+ char c0 = child._text[0];
+ if (!UCharacter.isHighSurrogate(c0)) {
+ output.add(c0);
+ } else if (child.charCount() >= 2) {
+ output.add(Character.codePointAt(child._text, 0));
+ } else if (child._children != null) {
+ // Construct all possible code points from grandchildren.
+ for (Node grandchild : child._children) {
+ char c1 = grandchild._text[0];
+ int cp = Character.toCodePoint(c0, c1);
+ output.add(cp);
+ }
+ }
}
}
int idx = 1;
while (idx < _text.length) {
if(!chitr.hasNext()) {
- output.partialMatch = true;
+ if (output != null) {
+ output.partialMatch = true;
+ }
matched = false;
break;
}
AffixPatternProvider patternInfo,
NumberParserImpl output,
IgnorablesMatcher ignorables,
- boolean includeUnpaired) {
+ int parseFlags) {
// Lazy-initialize the StringBuilder.
StringBuilder sb = null;
ArrayList<AffixMatcher> matchers = new ArrayList<AffixMatcher>(6);
sb = getCleanAffix(patternInfo, AffixPatternProvider.FLAG_POS_PREFIX, ignorables.getSet(), sb);
- String posPrefix = toStringOrEmpty(sb);
+ String posPrefix = ParsingUtils.maybeFold(toStringOrEmpty(sb), parseFlags);
sb = getCleanAffix(patternInfo, AffixPatternProvider.FLAG_POS_SUFFIX, ignorables.getSet(), sb);
- String posSuffix = toStringOrEmpty(sb);
+ String posSuffix = ParsingUtils.maybeFold(toStringOrEmpty(sb), parseFlags);
+
+ boolean includeUnpaired = 0 != (parseFlags & ParsingUtils.PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES);
if (!posPrefix.isEmpty() || !posSuffix.isEmpty()) {
matchers.add(getInstance(posPrefix, posSuffix, 0));
if (patternInfo.hasNegativeSubpattern()) {
sb = getCleanAffix(patternInfo, AffixPatternProvider.FLAG_NEG_PREFIX, ignorables.getSet(), sb);
- String negPrefix = toStringOrEmpty(sb);
+ String negPrefix = ParsingUtils.maybeFold(toStringOrEmpty(sb), parseFlags);
sb = getCleanAffix(patternInfo, AffixPatternProvider.FLAG_NEG_SUFFIX, ignorables.getSet(), sb);
- String negSuffix = toStringOrEmpty(sb);
+ String negSuffix = ParsingUtils.maybeFold(toStringOrEmpty(sb), parseFlags);
if (negPrefix.equals(posPrefix) && negSuffix.equals(posSuffix)) {
// No-op: favor the positive AffixMatcher
}
private AffixMatcher(String prefix, String suffix, int flags) {
+ assert prefix != null;
+ assert suffix != null;
this.prefix = prefix;
this.suffix = suffix;
this.flags = flags;
}
@Override
- public UnicodeSet getLeadChars(boolean ignoreCase) {
- UnicodeSet leadChars = new UnicodeSet();
- ParsingUtils.putLeadingChar(prefix, leadChars, ignoreCase);
- ParsingUtils.putLeadingChar(suffix, leadChars, ignoreCase);
- return leadChars.freeze();
+ public UnicodeSet getLeadCodePoints() {
+ UnicodeSet leadCodePoints = new UnicodeSet();
+ ParsingUtils.putLeadCodePoint(prefix, leadCodePoints);
+ ParsingUtils.putLeadCodePoint(suffix, leadCodePoints);
+ return leadCodePoints.freeze();
}
@Override
private final String currency1;
private final String currency2;
- public static NumberParseMatcher getInstance(Currency currency, ULocale loc) {
- return new CurrencyMatcher(currency, loc);
+ public static NumberParseMatcher getInstance(Currency currency, ULocale loc, int setupFlags) {
+ return new CurrencyMatcher(currency.getSubtype(),
+ ParsingUtils.maybeFold(currency.getSymbol(loc), setupFlags),
+ ParsingUtils.maybeFold(currency.getCurrencyCode(), setupFlags));
}
- private CurrencyMatcher(Currency currency, ULocale loc) {
- isoCode = currency.getSubtype();
- currency1 = currency.getSymbol(loc);
- currency2 = currency.getCurrencyCode();
+ private CurrencyMatcher(String isoCode, String currency1, String currency2) {
+ this.isoCode = isoCode;
+ this.currency1 = currency1;
+ this.currency2 = currency2;
}
@Override
}
@Override
- public UnicodeSet getLeadChars(boolean ignoreCase) {
- UnicodeSet leadChars = new UnicodeSet();
- ParsingUtils.putLeadingChar(currency1, leadChars, ignoreCase);
- ParsingUtils.putLeadingChar(currency2, leadChars, ignoreCase);
- return leadChars.freeze();
+ public UnicodeSet getLeadCodePoints() {
+ UnicodeSet leadCodePoints = new UnicodeSet();
+ ParsingUtils.putLeadCodePoint(currency1, leadCodePoints);
+ ParsingUtils.putLeadCodePoint(currency2, leadCodePoints);
+ return leadCodePoints.freeze();
}
@Override
}
private CurrencyTrieMatcher(ULocale locale) {
+ // TODO: Currency trie does not currently have an option for case folding. It defaults to use
+ // case folding on long-names but not symbols.
longNameTrie = Currency.getParsingTrie(locale, Currency.LONG_NAME);
symbolTrie = Currency.getParsingTrie(locale, Currency.SYMBOL_NAME);
}
}
@Override
- public UnicodeSet getLeadChars(boolean ignoreCase) {
- UnicodeSet leadChars = new UnicodeSet();
- longNameTrie.putLeadChars(leadChars);
- symbolTrie.putLeadChars(leadChars);
- return leadChars.freeze();
+ public UnicodeSet getLeadCodePoints() {
+ UnicodeSet leadCodePoints = new UnicodeSet();
+ longNameTrie.putLeadCodePoints(leadCodePoints);
+ symbolTrie.putLeadCodePoints(leadCodePoints);
+ return leadCodePoints.freeze();
}
@Override
import com.ibm.icu.impl.number.DecimalQuantity_DualStorageBCD;
import com.ibm.icu.impl.number.parse.UnicodeSetStaticCache.Key;
import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.number.Grouper;
import com.ibm.icu.text.DecimalFormatSymbols;
import com.ibm.icu.text.UnicodeSet;
*/
public class DecimalMatcher implements NumberParseMatcher {
- public boolean requireGroupingMatch = false;
- public boolean decimalEnabled = true;
- public boolean groupingEnabled = true;
- public int grouping1 = 3;
- public int grouping2 = 3;
- public boolean integerOnly = false;
- public boolean isScientific = false;
+ private final boolean requireGroupingMatch;
+ private final boolean groupingDisabled;
+ private final int grouping1;
+ private final int grouping2;
+ private final boolean integerOnly;
+ private final boolean isScientific;
- private UnicodeSet groupingUniSet = null;
- private UnicodeSet decimalUniSet = null;
- private UnicodeSet separatorSet = null;
- private UnicodeSet separatorLeadChars = null;
- private String[] digitStrings = null;
- private boolean frozen;
+ // Assumption: these sets all consist of single code points. If this assumption needs to be broken,
+ // fix getLeadCodePoints() as well as matching logic. Be careful of the performance impact.
+ private final UnicodeSet groupingUniSet;
+ private final UnicodeSet decimalUniSet;
+ private final UnicodeSet separatorSet;
+ private final UnicodeSet leadSet;
+ private final String[] digitStrings;
- public DecimalMatcher() {
- frozen = false;
+ public static DecimalMatcher getInstance(
+ DecimalFormatSymbols symbols,
+ Grouper grouper,
+ int parseFlags) {
+ // TODO: Cache popular instances?
+ return new DecimalMatcher(symbols, grouper, parseFlags);
}
- public void freeze(DecimalFormatSymbols symbols, boolean monetarySeparators, boolean isStrict) {
- assert !frozen;
- frozen = true;
-
- String groupingSeparator = monetarySeparators ? symbols.getMonetaryGroupingSeparatorString()
- : symbols.getGroupingSeparatorString();
- String decimalSeparator = monetarySeparators ? symbols.getMonetaryDecimalSeparatorString()
- : symbols.getDecimalSeparatorString();
+ private DecimalMatcher(DecimalFormatSymbols symbols, Grouper grouper, int parseFlags) {
Key groupingKey, decimalKey;
+ String groupingSeparator, decimalSeparator;
+ if (0 != (parseFlags & ParsingUtils.PARSE_FLAG_MONETARY_SEPARATORS)) {
+ groupingSeparator = symbols.getMonetaryGroupingSeparatorString();
+ decimalSeparator = symbols.getMonetaryDecimalSeparatorString();
+ } else {
+ groupingSeparator = symbols.getGroupingSeparatorString();
+ decimalSeparator = symbols.getDecimalSeparatorString();
+ }
// Attempt to find values in the static cache
- if (isStrict) {
- decimalKey = UnicodeSetStaticCache.chooseFrom(decimalSeparator, Key.STRICT_COMMA, Key.STRICT_PERIOD);
+ if (0 != (parseFlags & ParsingUtils.PARSE_FLAG_STRICT_SEPARATORS)) {
+ decimalKey = UnicodeSetStaticCache
+ .chooseFrom(decimalSeparator, Key.STRICT_COMMA, Key.STRICT_PERIOD);
if (decimalKey == Key.STRICT_COMMA) {
// Decimal is comma; grouping should be period or custom
- groupingKey = UnicodeSetStaticCache.chooseFrom(groupingSeparator, Key.STRICT_PERIOD_OR_OTHER);
+ groupingKey = UnicodeSetStaticCache.chooseFrom(groupingSeparator,
+ Key.STRICT_PERIOD_OR_OTHER);
} else if (decimalKey == Key.STRICT_PERIOD) {
// Decimal is period; grouping should be comma or custom
- groupingKey = UnicodeSetStaticCache.chooseFrom(groupingSeparator, Key.STRICT_COMMA_OR_OTHER);
+ groupingKey = UnicodeSetStaticCache.chooseFrom(groupingSeparator,
+ Key.STRICT_COMMA_OR_OTHER);
} else {
// Decimal is custom; grouping can be either comma or period or custom
- groupingKey = UnicodeSetStaticCache
- .chooseFrom(groupingSeparator, Key.STRICT_COMMA_OR_OTHER, Key.STRICT_PERIOD_OR_OTHER);
+ groupingKey = UnicodeSetStaticCache.chooseFrom(groupingSeparator,
+ Key.STRICT_COMMA_OR_OTHER,
+ Key.STRICT_PERIOD_OR_OTHER);
}
} else {
decimalKey = UnicodeSetStaticCache.chooseFrom(decimalSeparator, Key.COMMA, Key.PERIOD);
}
// Get the sets from the static cache if they were found
+ UnicodeSet _groupingUniSet = null, _decimalUniSet = null, _separatorSet = null, _leadSet = null;
if (groupingKey != null && decimalKey != null) {
- groupingUniSet = UnicodeSetStaticCache.get(groupingKey);
- decimalUniSet = UnicodeSetStaticCache.get(decimalKey);
+ _groupingUniSet = UnicodeSetStaticCache.get(groupingKey);
+ _decimalUniSet = UnicodeSetStaticCache.get(decimalKey);
Key separatorKey = UnicodeSetStaticCache.unionOf(groupingKey, decimalKey);
if (separatorKey != null) {
- separatorSet = UnicodeSetStaticCache.get(separatorKey);
- separatorLeadChars = UnicodeSetStaticCache.getLeadChars(separatorKey);
+ _separatorSet = UnicodeSetStaticCache.get(separatorKey);
+ Key leadKey = UnicodeSetStaticCache.unionOf(Key.DIGITS, separatorKey);
+ if (leadKey != null) {
+ _leadSet = UnicodeSetStaticCache.get(leadKey);
+ }
}
} else if (groupingKey != null) {
- groupingUniSet = UnicodeSetStaticCache.get(groupingKey);
+ _groupingUniSet = UnicodeSetStaticCache.get(groupingKey);
} else if (decimalKey != null) {
- decimalUniSet = UnicodeSetStaticCache.get(decimalKey);
+ _decimalUniSet = UnicodeSetStaticCache.get(decimalKey);
}
- // Resolve fallbacks if we don't have sets from the static cache
- if (groupingUniSet == null) {
- groupingUniSet = new UnicodeSet().add(groupingSeparator).freeze();
- }
- if (decimalUniSet == null) {
- decimalUniSet = new UnicodeSet().add(decimalSeparator).freeze();
- }
- if (separatorSet == null) {
- separatorSet = new UnicodeSet().addAll(groupingUniSet).addAll(decimalUniSet).freeze();
- }
+ // Finish resolving fallbacks
+ groupingUniSet = _groupingUniSet != null ? _groupingUniSet
+ : new UnicodeSet().add(groupingSeparator.codePointAt(0)).freeze();
+ decimalUniSet = _decimalUniSet != null ? _decimalUniSet
+ : new UnicodeSet().add(decimalSeparator.codePointAt(0)).freeze();
+ separatorSet = _separatorSet != null ? _separatorSet
+ : new UnicodeSet().addAll(groupingUniSet).addAll(decimalUniSet).freeze();
+ leadSet = _leadSet; // null if not available
int cpZero = symbols.getCodePointZero();
if (cpZero == -1 || !UCharacter.isDigit(cpZero) || UCharacter.digit(cpZero) != 0) {
- digitStrings = symbols.getDigitStrings();
+ digitStrings = symbols.getDigitStringsLocal();
+ } else {
+ digitStrings = null;
}
+
+ requireGroupingMatch = 0 != (parseFlags & ParsingUtils.PARSE_FLAG_STRICT_GROUPING_SIZE);
+ groupingDisabled = 0 != (parseFlags & ParsingUtils.PARSE_FLAG_GROUPING_DISABLED);
+ grouping1 = grouper.getPrimary();
+ grouping2 = grouper.getSecondary();
+ integerOnly = 0 != (parseFlags & ParsingUtils.PARSE_FLAG_INTEGER_ONLY);
+ isScientific = 0 != (parseFlags & ParsingUtils.PARSE_FLAG_DECIMAL_SCIENTIFIC);
}
@Override
}
public boolean match(StringSegment segment, ParsedNumber result, boolean negativeExponent) {
- assert frozen;
if (result.seenNumber() && !isScientific) {
// A number has already been consumed.
return false;
if (separator == -1) {
// First separator; could be either grouping or decimal.
separator = cp;
- if (groupingEnabled && requireGroupingMatch && groupingUniSet.contains(cp)
+ if (!groupingDisabled
+ && requireGroupingMatch
+ && groupingUniSet.contains(cp)
&& (currGroup == 0 || currGroup > grouping2)) {
break;
}
- } else if (groupingEnabled && separator == cp && groupingUniSet.contains(cp)) {
+ } else if (!groupingDisabled && separator == cp && groupingUniSet.contains(cp)) {
// Second or later grouping separator.
if (requireGroupingMatch && currGroup != grouping2) {
break;
}
- } else if (groupingEnabled && separator != cp && decimalUniSet.contains(cp)) {
+ } else if (!groupingDisabled && separator != cp && decimalUniSet.contains(cp)) {
// Decimal separator after a grouping separator.
if (requireGroupingMatch && currGroup != grouping1) {
break;
result.quantity.truncate();
segment.setOffset(lastSeparatorOffset);
}
- } else if (separator != -1 && !groupingEnabled) {
+ } else if (separator != -1 && groupingDisabled) {
// The final separator was a grouping separator, but we aren't accepting grouping.
// Reset the offset to immediately before that grouping separator.
result.quantity.adjustMagnitude(-currGroup);
result.quantity.truncate();
segment.setOffset(lastSeparatorOffset);
- } else if (separator != -1 && requireGroupingMatch && groupingUniSet.contains(separator)
+ } else if (separator != -1
+ && requireGroupingMatch
+ && groupingUniSet.contains(separator)
&& currGroup != grouping1) {
// The final separator was a grouping separator, and we have a mismatched grouping size.
// Reset the offset to the beginning of the number.
// segment.setOffset(initialOffset);
}
- return segment.length() == 0 || hasPartialPrefix || segment.isLeadingSurrogate();
+ return segment.length() == 0 || hasPartialPrefix;
}
@Override
- public UnicodeSet getLeadChars(boolean ignoreCase) {
- UnicodeSet leadChars = new UnicodeSet();
- leadChars.addAll(UnicodeSetStaticCache.getLeadChars(Key.DIGITS));
+ public UnicodeSet getLeadCodePoints() {
+ if (digitStrings == null && leadSet != null) {
+ return leadSet;
+ }
+
+ UnicodeSet leadCodePoints = new UnicodeSet();
+ // Assumption: the sets are all single code points.
+ leadCodePoints.addAll(UnicodeSetStaticCache.get(Key.DIGITS));
+ leadCodePoints.addAll(separatorSet);
if (digitStrings != null) {
for (int i = 0; i < digitStrings.length; i++) {
- ParsingUtils.putLeadingChar(digitStrings[i], leadChars, ignoreCase);
+ ParsingUtils.putLeadCodePoint(digitStrings[i], leadCodePoints);
}
}
- if (separatorLeadChars != null) {
- leadChars.addAll(separatorLeadChars);
- } else {
- ParsingUtils.putLeadSurrogates(separatorSet, leadChars);
- }
- return leadChars.freeze();
+ return leadCodePoints.freeze();
}
@Override
}
@Override
- public UnicodeSet getLeadChars(boolean ignoreCase) {
+ public UnicodeSet getLeadCodePoints() {
if (this == DEFAULT) {
- return UnicodeSetStaticCache.getLeadChars(UnicodeSetStaticCache.Key.DEFAULT_IGNORABLES);
+ return UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.DEFAULT_IGNORABLES);
} else if (this == STRICT) {
- return UnicodeSetStaticCache.getLeadChars(UnicodeSetStaticCache.Key.STRICT_IGNORABLES);
+ return UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.STRICT_IGNORABLES);
} else {
- return super.getLeadChars(ignoreCase);
+ return super.getLeadCodePoints();
}
}
+++ /dev/null
-// © 2017 and later: Unicode, Inc. and others.
-// License & terms of use: http://www.unicode.org/copyright.html#License
-package com.ibm.icu.impl.number.parse;
-
-/**
- * @author sffc
- *
- */
-public class MatcherUtils {
- public static boolean isValidCodePoint(int cp) {
- return Character.isValidCodePoint(cp)
- && (Character.isSupplementaryCodePoint(cp) || !Character.isSurrogate((char) cp));
- }
-
-}
// License & terms of use: http://www.unicode.org/copyright.html#License
package com.ibm.icu.impl.number.parse;
+import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.DecimalFormatSymbols;
import com.ibm.icu.text.UnicodeSet;
*/
public class NanMatcher extends SymbolMatcher {
- private static final NanMatcher DEFAULT = new NanMatcher();
+ private static final NanMatcher DEFAULT = new NanMatcher("NaN");
+ private static final NanMatcher DEFAULT_FOLDED = new NanMatcher(UCharacter.foldCase("NaN", true));
- public static NanMatcher getInstance(DecimalFormatSymbols symbols) {
- String symbolString = symbols.getNaN();
+ public static NanMatcher getInstance(DecimalFormatSymbols symbols, int parseFlags) {
+ String symbolString = ParsingUtils.maybeFold(symbols.getNaN(), parseFlags);
if (DEFAULT.string.equals(symbolString)) {
return DEFAULT;
+ } else if (DEFAULT_FOLDED.string.equals(symbolString)) {
+ return DEFAULT_FOLDED;
} else {
return new NanMatcher(symbolString);
}
}
private NanMatcher(String symbolString) {
- super(symbolString, DEFAULT.uniSet);
+ super(symbolString, UnicodeSet.EMPTY);
}
- private NanMatcher() {
- super("NaN", UnicodeSet.EMPTY);
+ @Override
+ public UnicodeSet getLeadCodePoints() {
+ // Overriding this here to allow use of statically allocated sets
+ if (this == DEFAULT) {
+ return UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.CAPITAL_N);
+ } else if (this == DEFAULT_FOLDED) {
+ return UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.FOLDED_N);
+ } else {
+ return super.getLeadCodePoints();
+ }
}
@Override
* this matcher unless a segment begins with a char in this set. To make this matcher always run, return
* {@link UnicodeSet#ALL_CODE_POINTS}.
*/
- public UnicodeSet getLeadChars(boolean ignoreCase);
+ public UnicodeSet getLeadCodePoints();
/**
* Method called at the end of a parse, after all matchers have failed to consume any more chars. Allows a matcher
import com.ibm.icu.impl.number.DecimalFormatProperties;
import com.ibm.icu.impl.number.Parse.ParseMode;
import com.ibm.icu.impl.number.PatternStringParser;
+import com.ibm.icu.impl.number.PatternStringParser.ParsedPatternInfo;
import com.ibm.icu.impl.number.PropertiesAffixPatternProvider;
import com.ibm.icu.impl.number.RoundingUtils;
+import com.ibm.icu.number.Grouper;
import com.ibm.icu.text.DecimalFormatSymbols;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.Currency;
public static NumberParserImpl createParserFromPattern(String pattern, boolean strictGrouping) {
// Temporary frontend for testing.
- NumberParserImpl parser = new NumberParserImpl(true, true);
+ int parseFlags = ParsingUtils.PARSE_FLAG_IGNORE_CASE
+ | ParsingUtils.PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES;
+ if (strictGrouping) {
+ parseFlags |= ParsingUtils.PARSE_FLAG_STRICT_GROUPING_SIZE;
+ }
+
+ NumberParserImpl parser = new NumberParserImpl(parseFlags, true);
ULocale locale = new ULocale("en_IN");
DecimalFormatSymbols symbols = DecimalFormatSymbols.getInstance(locale);
IgnorablesMatcher ignorables = IgnorablesMatcher.DEFAULT;
- AffixPatternProvider patternInfo = PatternStringParser.parseToPatternInfo(pattern);
- AffixMatcher.generateFromAffixPatternProvider(patternInfo, parser, ignorables, true);
+ ParsedPatternInfo patternInfo = PatternStringParser.parseToPatternInfo(pattern);
+ AffixMatcher.generateFromAffixPatternProvider(patternInfo, parser, ignorables, parseFlags);
+
+ Grouper grouper = Grouper.defaults().withLocaleData(patternInfo);
parser.addMatcher(ignorables);
- DecimalMatcher decimalMatcher = new DecimalMatcher();
- decimalMatcher.requireGroupingMatch = strictGrouping;
- decimalMatcher.grouping1 = 3;
- decimalMatcher.grouping2 = 2;
- decimalMatcher.freeze(symbols, false, false);
- parser.addMatcher(decimalMatcher);
+ parser.addMatcher(DecimalMatcher.getInstance(symbols, grouper, parseFlags));
parser.addMatcher(MinusSignMatcher.getInstance(symbols));
- parser.addMatcher(new ScientificMatcher(symbols));
+ parser.addMatcher(ScientificMatcher.getInstance(symbols, grouper, parseFlags));
parser.addMatcher(CurrencyTrieMatcher.getInstance(locale));
parser.addMatcher(new RequireNumberMatcher());
currency = Currency.getInstance(result.currencyCode);
} else {
assert 0 != (result.flags & ParsedNumber.FLAG_HAS_DEFAULT_CURRENCY);
- currency = CustomSymbolCurrency.resolve(properties.getCurrency(), symbols.getULocale(), symbols);
+ currency = CustomSymbolCurrency
+ .resolve(properties.getCurrency(), symbols.getULocale(), symbols);
}
return new CurrencyAmount(result.getNumber(), currency);
} else {
DecimalFormatSymbols symbols,
boolean parseCurrency,
boolean optimize) {
- NumberParserImpl parser = new NumberParserImpl(!properties.getParseCaseSensitive(), optimize);
+
ULocale locale = symbols.getULocale();
+ AffixPatternProvider patternInfo = new PropertiesAffixPatternProvider(properties);
Currency currency = CustomSymbolCurrency.resolve(properties.getCurrency(), locale, symbols);
boolean isStrict = properties.getParseMode() == ParseMode.STRICT;
- IgnorablesMatcher ignorables = isStrict ? IgnorablesMatcher.STRICT : IgnorablesMatcher.DEFAULT;
-
boolean decimalSeparatorRequired = properties.getDecimalPatternMatchRequired()
- ? (properties.getDecimalSeparatorAlwaysShown() || properties.getMaximumFractionDigits() != 0)
+ ? (properties.getDecimalSeparatorAlwaysShown()
+ || properties.getMaximumFractionDigits() != 0)
: false;
+ Grouper grouper = Grouper.defaults().withProperties(properties);
+ int parseFlags = 0;
+ if (!properties.getParseCaseSensitive()) {
+ parseFlags |= ParsingUtils.PARSE_FLAG_IGNORE_CASE;
+ }
+ if (properties.getParseIntegerOnly()) {
+ parseFlags |= ParsingUtils.PARSE_FLAG_INTEGER_ONLY;
+ }
+ if (isStrict) {
+ parseFlags |= ParsingUtils.PARSE_FLAG_STRICT_GROUPING_SIZE;
+ } else {
+ parseFlags |= ParsingUtils.PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES;
+ }
+ if (grouper.getPrimary() == -1) {
+ parseFlags |= ParsingUtils.PARSE_FLAG_GROUPING_DISABLED;
+ }
+ if (parseCurrency || patternInfo.hasCurrencySign()) {
+ parseFlags |= ParsingUtils.PARSE_FLAG_MONETARY_SEPARATORS;
+ }
+ IgnorablesMatcher ignorables = isStrict ? IgnorablesMatcher.STRICT : IgnorablesMatcher.DEFAULT;
+
+ NumberParserImpl parser = new NumberParserImpl(parseFlags, optimize);
//////////////////////
/// AFFIX MATCHERS ///
//////////////////////
// Set up a pattern modifier with mostly defaults to generate AffixMatchers.
- AffixPatternProvider patternInfo = new PropertiesAffixPatternProvider(properties);
- AffixMatcher.generateFromAffixPatternProvider(patternInfo, parser, ignorables, !isStrict);
+ AffixMatcher.generateFromAffixPatternProvider(patternInfo, parser, ignorables, parseFlags);
////////////////////////
/// CURRENCY MATCHER ///
if (parseCurrency || patternInfo.hasCurrencySign()) {
parser.addMatcher(CurrencyTrieMatcher.getInstance(locale));
- parser.addMatcher(CurrencyMatcher.getInstance(currency, locale));
+ parser.addMatcher(CurrencyMatcher.getInstance(currency, locale, parseFlags));
}
///////////////////////////////
/// OTHER STANDARD MATCHERS ///
///////////////////////////////
- if (!isStrict || patternInfo.containsSymbolType(AffixUtils.TYPE_PLUS_SIGN) || properties.getSignAlwaysShown()) {
+ if (!isStrict
+ || patternInfo.containsSymbolType(AffixUtils.TYPE_PLUS_SIGN)
+ || properties.getSignAlwaysShown()) {
parser.addMatcher(PlusSignMatcher.getInstance(symbols));
}
parser.addMatcher(MinusSignMatcher.getInstance(symbols));
- parser.addMatcher(NanMatcher.getInstance(symbols));
+ parser.addMatcher(NanMatcher.getInstance(symbols, parseFlags));
parser.addMatcher(PercentMatcher.getInstance(symbols));
parser.addMatcher(PermilleMatcher.getInstance(symbols));
parser.addMatcher(InfinityMatcher.getInstance(symbols));
parser.addMatcher(new PaddingMatcher(padString));
}
parser.addMatcher(ignorables);
- DecimalMatcher decimalMatcher = new DecimalMatcher();
- decimalMatcher.requireGroupingMatch = isStrict;
- decimalMatcher.groupingEnabled = properties.getGroupingSize() > 0;
- decimalMatcher.decimalEnabled = properties.getDecimalPatternMatchRequired() ? decimalSeparatorRequired : true;
- decimalMatcher.grouping1 = properties.getGroupingSize();
- decimalMatcher.grouping2 = properties.getSecondaryGroupingSize();
- decimalMatcher.integerOnly = properties.getParseIntegerOnly();
- decimalMatcher.freeze(symbols, parseCurrency || patternInfo.hasCurrencySign(), isStrict);
- parser.addMatcher(decimalMatcher);
+ parser.addMatcher(DecimalMatcher.getInstance(symbols, grouper, parseFlags));
if (!properties.getParseNoExponent()) {
- parser.addMatcher(new ScientificMatcher(symbols));
+ parser.addMatcher(ScientificMatcher.getInstance(symbols, grouper, parseFlags));
}
//////////////////
return parser;
}
- private final boolean ignoreCase;
+ private final int parseFlags;
private final List<NumberParseMatcher> matchers;
- private final List<UnicodeSet> leadCharses;
+ private final List<UnicodeSet> leadCodePointses;
private Comparator<ParsedNumber> comparator;
private boolean frozen;
* Creates a new, empty parser.
*
- * @param ignoreCase
- * If true, perform case-folding. This parameter needs to go into the constructor because its value is
- * used during the construction of the matcher chain.
+ * @param parseFlags
+ * A bitmask of ParsingUtils.PARSE_FLAG_* constants. If PARSE_FLAG_IGNORE_CASE is set,
+ * case-folding is performed. This parameter needs to go into the constructor because
+ * its value is used during the construction of the matcher chain.
* @param optimize
- * If true, compute "lead chars" UnicodeSets for the matchers. This reduces parsing runtime but increases
- * construction runtime. If the parser is going to be used only once or twice, set this to false; if it
- * is going to be used hundreds of times, set it to true.
+ * If true, compute "lead code points" UnicodeSets for the matchers. This reduces parsing
+ * runtime but increases construction runtime. If the parser is going to be used only once
+ * or twice, set this to false; if it is going to be used hundreds of times, set it to
+ * true.
*/
- public NumberParserImpl(boolean ignoreCase, boolean optimize) {
+ public NumberParserImpl(int parseFlags, boolean optimize) {
matchers = new ArrayList<NumberParseMatcher>();
if (optimize) {
- leadCharses = new ArrayList<UnicodeSet>();
+ leadCodePointses = new ArrayList<UnicodeSet>();
} else {
- leadCharses = null;
+ leadCodePointses = null;
}
comparator = ParsedNumber.COMPARATOR; // default value
- this.ignoreCase = ignoreCase;
+ this.parseFlags = parseFlags;
frozen = false;
}
public void addMatcher(NumberParseMatcher matcher) {
assert !frozen;
this.matchers.add(matcher);
- if (leadCharses != null) {
- UnicodeSet leadChars = matcher.getLeadChars(ignoreCase);
- assert leadChars.isFrozen();
- this.leadCharses.add(leadChars);
+ if (leadCodePointses != null) {
+ UnicodeSet leadCodePoints = matcher.getLeadCodePoints();
+ assert leadCodePoints.isFrozen();
+ this.leadCodePointses.add(leadCodePoints);
}
}
public void addMatchers(Collection<? extends NumberParseMatcher> matchers) {
assert !frozen;
this.matchers.addAll(matchers);
- if (leadCharses != null) {
+ if (leadCodePointses != null) {
for (NumberParseMatcher matcher : matchers) {
- UnicodeSet leadChars = matcher.getLeadChars(ignoreCase);
- assert leadChars.isFrozen();
- this.leadCharses.add(leadChars);
+ UnicodeSet leadCodePoints = matcher.getLeadCodePoints();
+ assert leadCodePoints.isFrozen();
+ this.leadCodePointses.add(leadCodePoints);
}
}
}
* Primary entrypoint to parsing code path.
*
* @param input
- * The string to parse. This is a String, not CharSequence, to enforce assumptions about immutability
- * (CharSequences are not guaranteed to be immutable).
+ * The string to parse. This is a String, not CharSequence, to enforce assumptions about
+ * immutability (CharSequences are not guaranteed to be immutable).
* @param start
* The index into the string at which to start parsing.
* @param greedy
*/
public void parse(String input, int start, boolean greedy, ParsedNumber result) {
assert frozen;
- StringSegment segment = new StringSegment(input, ignoreCase);
+ StringSegment segment = new StringSegment(ParsingUtils.maybeFold(input, parseFlags));
segment.adjustOffset(start);
if (greedy) {
parseGreedyRecursive(segment, result);
}
int initialOffset = segment.getOffset();
- char leadChar = leadCharses == null ? 0
- : ignoreCase ? ParsingUtils.getCaseFoldedLeadingChar(segment) : segment.charAt(0);
+ int leadCp = segment.getCodePoint();
for (int i = 0; i < matchers.size(); i++) {
- if (leadCharses != null && !leadCharses.get(i).contains(leadChar)) {
+ if (leadCodePointses != null && !leadCodePointses.get(i).contains(leadCp)) {
continue;
}
NumberParseMatcher matcher = matchers.get(i);
if (segment.getOffset() != initialOffset) {
// In a greedy parse, recurse on only the first match.
parseGreedyRecursive(segment, result);
- // The following line resets the offset so that the StringSegment says the same across the function
+ // The following line resets the offset so that the StringSegment stays the same across
+ // the function
// call boundary. Since we recurse only once, this line is not strictly necessary.
segment.setOffset(initialOffset);
return;
for (int i = 0; i < matchers.size(); i++) {
NumberParseMatcher matcher = matchers.get(i);
// In a non-greedy parse, we attempt all possible matches and pick the best.
- for (int charsToConsume = 1; charsToConsume <= segment.length(); charsToConsume++) {
- candidate.copyFrom(initial);
+ for (int charsToConsume = 0; charsToConsume < segment.length();) {
+ charsToConsume += Character.charCount(Character.codePointAt(segment, charsToConsume));
// Run the matcher on a segment of the current length.
+ candidate.copyFrom(initial);
segment.setLength(charsToConsume);
boolean maybeMore = matcher.match(segment, candidate);
segment.resetLength();
package com.ibm.icu.impl.number.parse;
import com.ibm.icu.lang.UCharacter;
-import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSet.EntryRange;
*/
public class ParsingUtils {
- /**
- * Adds all chars and lead surrogates from input into output.
- */
- public static void putLeadSurrogates(UnicodeSet input, UnicodeSet output) {
- if (input.isEmpty()) {
- return;
- }
+ public static final int PARSE_FLAG_IGNORE_CASE = 0x0001;
+ public static final int PARSE_FLAG_MONETARY_SEPARATORS = 0x0002;
+ public static final int PARSE_FLAG_STRICT_SEPARATORS = 0x0004;
+ public static final int PARSE_FLAG_STRICT_GROUPING_SIZE = 0x0008;
+ public static final int PARSE_FLAG_INTEGER_ONLY = 0x0010;
+ public static final int PARSE_FLAG_GROUPING_DISABLED = 0x0020;
+ public static final int PARSE_FLAG_DECIMAL_SCIENTIFIC = 0x0040;
+ public static final int PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES = 0x0080;
+
+ public static void putLeadCodePoints(UnicodeSet input, UnicodeSet output) {
for (EntryRange range : input.ranges()) {
- if (range.codepointEnd <= 0xFFFF) {
- // All BMP chars
- output.add(range.codepoint, range.codepointEnd);
- } else {
- // Need to get the lead surrogates
- // TODO: Make this more efficient?
- if (range.codepoint <= 0xFFFF) {
- output.add(range.codepoint, 0xFFFF);
- }
- for (int cp = Math.max(0x10000, range.codepoint); cp <= range.codepointEnd; cp++) {
- output.add(UTF16.getLeadSurrogate(cp));
- }
- }
+ output.add(range.codepoint, range.codepointEnd);
+ }
+ for (String str : input.strings()) {
+ output.add(str.codePointAt(0));
}
}
- /**
- * Adds the first char of the given string to leadChars, performing case-folding if necessary.
- */
- public static void putLeadingChar(String str, UnicodeSet leadChars, boolean ignoreCase) {
- if (str.isEmpty()) {
- return;
- }
- if (ignoreCase) {
- leadChars.add(getCaseFoldedLeadingChar(str));
- } else {
- leadChars.add(str.charAt(0));
+ public static void putLeadCodePoint(String input, UnicodeSet output) {
+ if (!input.isEmpty()) {
+ output.add(input.codePointAt(0));
}
}
- public static char getCaseFoldedLeadingChar(CharSequence str) {
- int cp = UCharacter.foldCase(Character.codePointAt(str, 0), true);
- if (cp <= 0xFFFF) {
- return (char) cp;
+ private static final UnicodeSet LETTERS = new UnicodeSet("[:letter:]").freeze();
+
+ /**
+ * Case-folds the string if IGNORE_CASE flag is set; otherwise, returns the same string.
+ */
+ public static String maybeFold(String input, int parseFlags) {
+ if (0 != (parseFlags & PARSE_FLAG_IGNORE_CASE) && LETTERS.containsSome(input)) {
+ return UCharacter.foldCase(input, true);
} else {
- return UTF16.getLeadSurrogate(cp);
+ return input;
}
}
}
// If we get here, the code point didn't match the uniSet.
- return segment.isLeadingSurrogate();
+ return false;
}
// If we get here, we consumed the entire string segment.
}
@Override
- public UnicodeSet getLeadChars(boolean ignoreCase) {
- UnicodeSet leadChars = new UnicodeSet();
- ParsingUtils.putLeadSurrogates(uniSet, leadChars);
- return leadChars.freeze();
+ public UnicodeSet getLeadCodePoints() { // Returns a frozen set holding the lead code points of uniSet.
+ UnicodeSet leadCodePoints = new UnicodeSet();
+ ParsingUtils.putLeadCodePoints(uniSet, leadCodePoints);
+ return leadCodePoints.freeze(); // A fresh set is built and frozen on every call.
}
@Override
// License & terms of use: http://www.unicode.org/copyright.html#License
package com.ibm.icu.impl.number.parse;
+import com.ibm.icu.number.Grouper;
import com.ibm.icu.text.DecimalFormatSymbols;
import com.ibm.icu.text.UnicodeSet;
private final String exponentSeparatorString;
private final DecimalMatcher exponentMatcher;
- public ScientificMatcher(DecimalFormatSymbols symbols) {
- exponentSeparatorString = symbols.getExponentSeparator();
- exponentMatcher = new DecimalMatcher();
- exponentMatcher.isScientific = true;
- exponentMatcher.groupingEnabled = false;
- exponentMatcher.decimalEnabled = false;
- exponentMatcher.freeze(symbols, false, false);
+ public static ScientificMatcher getInstance(
+ DecimalFormatSymbols symbols,
+ Grouper grouper,
+ int parseFlags) { // Factory method; currently it always constructs a new matcher.
+ // TODO: Static-initialize most common instances?
+ return new ScientificMatcher(symbols, grouper, parseFlags);
+ }
+
+ private ScientificMatcher(DecimalFormatSymbols symbols, Grouper grouper, int parseFlags) {
+ exponentSeparatorString = ParsingUtils.maybeFold(symbols.getExponentSeparator(), parseFlags); // Folded once here, depending on parseFlags.
+ exponentMatcher = DecimalMatcher.getInstance(symbols,
+ grouper,
+ ParsingUtils.PARSE_FLAG_DECIMAL_SCIENTIFIC | ParsingUtils.PARSE_FLAG_INTEGER_ONLY); // NOTE(review): only these two fixed flags are passed; the caller's parseFlags are not forwarded - confirm this is intended.
}
@Override
}
@Override
- public UnicodeSet getLeadChars(boolean ignoreCase) {
- UnicodeSet leadChars = new UnicodeSet();
- ParsingUtils.putLeadingChar(exponentSeparatorString, leadChars, ignoreCase);
- return leadChars.freeze();
+ public UnicodeSet getLeadCodePoints() { // Returns a set containing the first code point of the exponent separator.
+ int cp = exponentSeparatorString.codePointAt(0); // NOTE(review): throws IndexOutOfBoundsException if the separator is empty - presumably it never is; confirm.
+ if (cp == 'E') {
+ return UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.CAPITAL_E); // Reuses the statically cached set instead of allocating.
+ } else if (cp == 'e') {
+ return UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.FOLDED_E); // Reuses the statically cached set instead of allocating.
+ } else {
+ return new UnicodeSet().add(cp).freeze(); // Any other separator gets its own frozen single-element set.
+ }
}
@Override
// License & terms of use: http://www.unicode.org/copyright.html#License
package com.ibm.icu.impl.number.parse;
-import com.ibm.icu.lang.UCharacter;
-
/**
- * A mutable class allowing for a String with a variable offset and length. The charAt, length, and subSequence methods
- * all operate relative to the fixed offset into the String.
+ * A mutable class allowing for a String with a variable offset and length. The charAt, length, and
+ * subSequence methods all operate relative to the fixed offset into the String.
*
* @author sffc
*/
private final String str;
private int start;
private int end;
- private final boolean ignoreCase;
- public StringSegment(String str, boolean ignoreCase) {
+ public StringSegment(String str) { // Creates a segment that initially spans the whole of str.
this.str = str;
this.start = 0;
this.end = str.length();
- this.ignoreCase = ignoreCase;
}
public int getOffset() {
}
/**
- * Returns the first code point in the string segment, or -1 if the string starts with an invalid code point.
+ * Returns the first code point in the string segment, or -1 if the string starts with an invalid
+ * code point.
*/
public int getCodePoint() {
assert start < end;
}
/**
- * Returns whether the segment is one char in length, and that the char is a leading surrogate.
- */
- public boolean isLeadingSurrogate() {
- return (end - start == 1) && Character.isHighSurrogate(str.charAt(start));
- }
-
- /**
- * Returns the length of the prefix shared by this StringSegment and the given CharSequence. For example, if this
- * string segment is "aab", and the char sequence is "aac", this method returns 2, since the first 2 characters are
- * the same.
+ * Returns the length of the prefix shared by this StringSegment and the given CharSequence. For
+ * example, if this string segment is "aab", and the char sequence is "aac", this method returns 2,
+ * since the first 2 characters are the same.
+ *
+ * The comparison is exact and proceeds one char (UTF-16 code unit) at a time: no case folding is
+ * applied, and the returned length is measured in chars, so it may fall between the two halves
+ * of a surrogate pair.
+ *
+ * @param other the sequence to compare against the start of this segment.
+ * @return the number of leading chars that are equal in both sequences.
*/
public int getCommonPrefixLength(CharSequence other) {
int offset = 0;
for (; offset < Math.min(length(), other.length());) {
- if (ignoreCase) {
- // NOTE: Character.codePointAt() returns the leading surrogate if it is the only char left in the
- // string. UCharacter.foldCase() will simply return the same integer since it is not a valid code point.
- int cp1 = Character.codePointAt(this, offset);
- int cp2 = Character.codePointAt(other, offset);
- if (cp1 != cp2 && UCharacter.foldCase(cp1, true) != UCharacter.foldCase(cp2, true)) {
- break;
- }
- offset += Character.charCount(cp1);
- } else {
- // Case folding is not necessary. Use a slightly faster code path comparing chars with chars.
- if (charAt(offset) != other.charAt(offset)) {
- break;
- }
- offset++;
+ if (charAt(offset) != other.charAt(offset)) {
+ break;
}
+ offset++;
}
return offset;
}
public abstract class SymbolMatcher implements NumberParseMatcher {
protected final String string;
protected final UnicodeSet uniSet;
- protected final UnicodeSet leadChars;
// TODO: Implement this class using only UnicodeSet and not String?
// How to deal with case folding?
protected SymbolMatcher(String symbolString, UnicodeSet symbolUniSet) { // Matcher backed by both a literal string and a set.
string = symbolString;
uniSet = symbolUniSet;
- leadChars = null;
}
protected SymbolMatcher(UnicodeSetStaticCache.Key key) { // Matcher backed only by a statically cached set.
string = ""; // No literal string; getLeadCodePoints() returns uniSet directly in this case.
uniSet = UnicodeSetStaticCache.get(key);
- leadChars = UnicodeSetStaticCache.getLeadChars(key);
}
@Override
}
if (string.isEmpty()) {
- return segment.isLeadingSurrogate();
+ return false;
}
int overlap = segment.getCommonPrefixLength(string);
if (overlap == string.length()) {
accept(segment, result);
return false;
}
- return overlap == segment.length() || segment.isLeadingSurrogate();
+ return overlap == segment.length();
}
@Override
- public UnicodeSet getLeadChars(boolean ignoreCase) {
- if (leadChars != null) {
- return leadChars;
+ public UnicodeSet getLeadCodePoints() { // Returns the lead code points of uniSet plus the first code point of string.
+ if (string == null || string.isEmpty()) {
+ // Assumption: for sets from UnicodeSetStaticCache, uniSet == leadCodePoints.
+ return uniSet; // Returned as-is, without copying.
}
- UnicodeSet leadChars = new UnicodeSet();
- ParsingUtils.putLeadSurrogates(uniSet, leadChars);
- ParsingUtils.putLeadingChar(string, leadChars, ignoreCase);
- return leadChars.freeze();
+ UnicodeSet leadCodePoints = new UnicodeSet();
+ ParsingUtils.putLeadCodePoints(uniSet, leadCodePoints);
+ ParsingUtils.putLeadCodePoint(string, leadCodePoints); // Also include the first code point of the literal string.
+ return leadCodePoints.freeze(); // A fresh frozen set is built on every call that reaches this path.
}
@Override
import com.ibm.icu.text.UnicodeSet;
/**
- * @author sffc
+ * This class statically initializes UnicodeSets useful for number parsing. Microbenchmarks show this to
+ * bring a very sizeable performance boost.
+ *
+ * IMPORTANT ASSUMPTION: All of the sets contain code points (no strings) and, apart from the
+ * explicitly upper-case CAPITAL_N and CAPITAL_E sets, they are all case-folded.
+ * If this assumption were ever broken, logic in classes such as SymbolMatcher would need to be updated
+ * in order to return well-formed sets upon calls to getLeadCodePoints().
*
+ * @author sffc
*/
public class UnicodeSetStaticCache {
public static enum Key {
// Other
DIGITS,
+ CAPITAL_N,
+ FOLDED_N,
+ CAPITAL_E,
+ FOLDED_E,
+
+ // Combined Separators with Digits (for lead code points)
+ DIGITS_OR_COMMA_OR_OTHER,
+ DIGITS_OR_PERIOD_OR_OTHER,
+ DIGITS_OR_COMMA_OR_PERIOD_OR_OTHER,
+ DIGITS_OR_STRICT_COMMA_OR_OTHER,
+ DIGITS_OR_STRICT_PERIOD_OR_OTHER,
+ DIGITS_OR_STRICT_COMMA_OR_PERIOD_OR_OTHER,
};
private static final Map<Key, UnicodeSet> unicodeSets = new EnumMap<Key, UnicodeSet>(Key.class);
- private static final Map<Key, UnicodeSet> leadCharsSets = new EnumMap<Key, UnicodeSet>(Key.class);
public static UnicodeSet get(Key key) { // Looks up the set stored for key in the static unicodeSets map.
return unicodeSets.get(key);
}
- public static UnicodeSet getLeadChars(Key key) {
- return leadCharsSets.get(key);
- }
-
public static Key chooseFrom(String str, Key key1) { // Returns key1 if its set contains str, otherwise null.
return get(key1).contains(str) ? key1 : null;
}
// Strict 1'234.567
return Key.STRICT_PERIOD_OR_OTHER;
+ } else if (key1 == Key.COMMA_OR_OTHER && key2 == Key.DIGITS) {
+ return Key.DIGITS_OR_COMMA_OR_OTHER;
+
+ } else if (key1 == Key.PERIOD_OR_OTHER && key2 == Key.DIGITS) {
+ return Key.DIGITS_OR_PERIOD_OR_OTHER;
+
+ } else if (key1 == Key.COMMA_OR_PERIOD_OR_OTHER && key2 == Key.DIGITS) {
+ return Key.DIGITS_OR_COMMA_OR_PERIOD_OR_OTHER;
+
+ } else if (key1 == Key.STRICT_COMMA_OR_OTHER && key2 == Key.DIGITS) {
+ return Key.DIGITS_OR_STRICT_COMMA_OR_OTHER;
+
+ } else if (key1 == Key.STRICT_PERIOD_OR_OTHER && key2 == Key.DIGITS) {
+ return Key.DIGITS_OR_STRICT_PERIOD_OR_OTHER;
+
+ } else if (key1 == Key.STRICT_COMMA_OR_PERIOD_OR_OTHER && key2 == Key.DIGITS) {
+ return Key.DIGITS_OR_STRICT_COMMA_OR_PERIOD_OR_OTHER;
}
return null;
unicodeSets.put(Key.PERIOD_OR_OTHER, computeUnion(Key.PERIOD, Key.OTHER_GROUPING_SEPARATORS));
unicodeSets.put(Key.COMMA_OR_PERIOD_OR_OTHER,
computeUnion(Key.COMMA, Key.PERIOD, Key.OTHER_GROUPING_SEPARATORS));
- unicodeSets.put(Key.STRICT_COMMA_OR_OTHER, computeUnion(Key.STRICT_COMMA, Key.OTHER_GROUPING_SEPARATORS));
- unicodeSets.put(Key.STRICT_PERIOD_OR_OTHER, computeUnion(Key.STRICT_PERIOD, Key.OTHER_GROUPING_SEPARATORS));
+ unicodeSets.put(Key.STRICT_COMMA_OR_OTHER,
+ computeUnion(Key.STRICT_COMMA, Key.OTHER_GROUPING_SEPARATORS));
+ unicodeSets.put(Key.STRICT_PERIOD_OR_OTHER,
+ computeUnion(Key.STRICT_PERIOD, Key.OTHER_GROUPING_SEPARATORS));
unicodeSets.put(Key.STRICT_COMMA_OR_PERIOD_OR_OTHER,
computeUnion(Key.STRICT_COMMA, Key.STRICT_PERIOD, Key.OTHER_GROUPING_SEPARATORS));
unicodeSets.put(Key.INFINITY, new UnicodeSet("[โ]").freeze());
unicodeSets.put(Key.DIGITS, new UnicodeSet("[:digit:]").freeze());
-
- for (Key key : Key.values()) {
- UnicodeSet leadChars = new UnicodeSet();
- ParsingUtils.putLeadSurrogates(get(key), leadChars);
- leadCharsSets.put(key, leadChars.freeze());
- }
+ unicodeSets.put(Key.CAPITAL_N, new UnicodeSet("[N]").freeze());
+ unicodeSets.put(Key.FOLDED_N, new UnicodeSet("[n]").freeze());
+ unicodeSets.put(Key.CAPITAL_E, new UnicodeSet("[E]").freeze());
+ unicodeSets.put(Key.FOLDED_E, new UnicodeSet("[e]").freeze());
+
+ unicodeSets.put(Key.DIGITS_OR_COMMA_OR_OTHER, computeUnion(Key.DIGITS, Key.COMMA_OR_OTHER));
+ unicodeSets.put(Key.DIGITS_OR_PERIOD_OR_OTHER, computeUnion(Key.DIGITS, Key.PERIOD_OR_OTHER));
+ unicodeSets.put(Key.DIGITS_OR_COMMA_OR_PERIOD_OR_OTHER,
+ computeUnion(Key.DIGITS, Key.COMMA_OR_PERIOD_OR_OTHER));
+ unicodeSets.put(Key.DIGITS_OR_STRICT_COMMA_OR_OTHER,
+ computeUnion(Key.DIGITS, Key.STRICT_COMMA_OR_OTHER));
+ unicodeSets.put(Key.DIGITS_OR_STRICT_PERIOD_OR_OTHER,
+ computeUnion(Key.DIGITS, Key.STRICT_PERIOD_OR_OTHER));
+ unicodeSets.put(Key.DIGITS_OR_STRICT_COMMA_OR_PERIOD_OR_OTHER,
+ computeUnion(Key.DIGITS, Key.STRICT_COMMA_OR_PERIOD_OR_OTHER));
}
}
}
@Override
- public UnicodeSet getLeadChars(boolean ignoreCase) {
+ public UnicodeSet getLeadCodePoints() { // This matcher reports no lead code points.
return UnicodeSet.EMPTY;
}
// License & terms of use: http://www.unicode.org/copyright.html#License
package com.ibm.icu.number;
+import com.ibm.icu.impl.number.DecimalFormatProperties;
import com.ibm.icu.impl.number.DecimalQuantity;
import com.ibm.icu.impl.number.PatternStringParser.ParsedPatternInfo;
}
}
- Grouper withLocaleData(ParsedPatternInfo patternInfo) {
+ /**
+ * Resolves this Grouper against the given properties. If this instance's primary grouping size
+ * is anything other than -2, this instance is returned unchanged; otherwise a Grouper is obtained
+ * from getInstance using sizes read from the properties.
+ *
+ * @param properties source of the grouping size, secondary grouping size, and minimum grouping
+ * digits.
+ * @return this instance, or the result of getInstance for the derived sizes.
+ * @internal
+ * @deprecated This API is ICU internal only.
+ */
+ @Deprecated
+ public Grouper withProperties(DecimalFormatProperties properties) {
+ if (grouping1 != -2) {
+ return this;
+ }
+ byte grouping1 = (byte) properties.getGroupingSize(); // Local variable; shadows the field of the same name from here on.
+ byte grouping2 = (byte) properties.getSecondaryGroupingSize(); // Local variable; shadows the field of the same name.
+ int minGrouping = properties.getMinimumGroupingDigits();
+ grouping1 = grouping1 > 0 ? grouping1 : grouping2 > 0 ? grouping2 : -1; // A non-positive primary falls back to a positive secondary, else -1.
+ grouping2 = grouping2 > 0 ? grouping2 : grouping1; // A non-positive secondary falls back to the resolved primary.
+ // TODO: Is it important to handle minGrouping > 2?
+ return getInstance(grouping1, grouping2, minGrouping == 2);
+ }
+
+ /**
+ * @internal
+ * @deprecated This API is ICU internal only.
+ */
+ @Deprecated
+ public Grouper withLocaleData(ParsedPatternInfo patternInfo) {
if (grouping1 != -2) {
return this;
}
&& (position % grouping2) == 0
&& value.getUpperDisplayMagnitude() - grouping1 + 1 >= (min2 ? 2 : 1);
}
+
+ /**
+ * Returns the value of this Grouper's grouping1 field.
+ *
+ * @internal
+ * @deprecated This API is ICU internal only.
+ */
+ @Deprecated
+ public byte getPrimary() {
+ return grouping1;
+ }
+
+ /**
+ * Returns the value of this Grouper's grouping2 field.
+ *
+ * @internal
+ * @deprecated This API is ICU internal only.
+ */
+ @Deprecated
+ public byte getSecondary() {
+ return grouping2;
+ }
}
\ No newline at end of file
// GROUPING STRATEGY //
///////////////////////
- int grouping1 = properties.getGroupingSize();
- int grouping2 = properties.getSecondaryGroupingSize();
- int minGrouping = properties.getMinimumGroupingDigits();
- assert grouping1 >= -2; // value of -2 means to forward no grouping information
- grouping1 = grouping1 > 0 ? grouping1 : grouping2 > 0 ? grouping2 : grouping1;
- grouping2 = grouping2 > 0 ? grouping2 : grouping1;
- // TODO: Is it important to handle minGrouping > 2?
- macros.grouper = Grouper.getInstance((byte) grouping1, (byte) grouping2, minGrouping == 2);
+ macros.grouper = Grouper.defaults().withProperties(properties);
/////////////
// PADDING //
{ 3, "๐ฑ๐ญ๐ฐ๐ฎ๐ฏx", "0", 10, 51423. },
{ 3, " ๐ฑ๐ญ๐ฐ๐ฎ๐ฏ", "0", 11, 51423. },
{ 3, "๐ฑ๐ญ๐ฐ๐ฎ๐ฏ ", "0", 10, 51423. },
- { 7, "๐ฑ๐ญ,๐ฐ๐ฎ๐ฏ", "0", 11, 51423. },
- { 7, "๐ณ,๐ด๐ต,๐ฑ๐ญ,๐ฐ๐ฎ๐ฏ", "0", 19, 78951423. },
- { 4, "๐ณ๐ด,๐ต๐ฑ๐ญ,๐ฐ๐ฎ๐ฏ", "0", 11, 78951. },
- { 7, "๐ณ๐ด,๐ต๐ฑ๐ญ.๐ฐ๐ฎ๐ฏ", "0", 18, 78951.423 },
- { 7, "๐ณ๐ด,๐ฌ๐ฌ๐ฌ", "0", 11, 78000. },
- { 7, "๐ณ๐ด,๐ฌ๐ฌ๐ฌ.๐ฌ๐ฌ๐ฌ", "0", 18, 78000. },
- { 7, "๐ณ๐ด,๐ฌ๐ฌ๐ฌ.๐ฌ๐ฎ๐ฏ", "0", 18, 78000.023 },
- { 7, "๐ณ๐ด.๐ฌ๐ฌ๐ฌ.๐ฌ๐ฎ๐ฏ", "0", 11, 78. },
+ { 7, "๐ฑ๐ญ,๐ฐ๐ฎ๐ฏ", "#,##,##0", 11, 51423. },
+ { 7, "๐ณ,๐ด๐ต,๐ฑ๐ญ,๐ฐ๐ฎ๐ฏ", "#,##,##0", 19, 78951423. },
+ { 4, "๐ณ๐ด,๐ต๐ฑ๐ญ,๐ฐ๐ฎ๐ฏ", "#,##,##0", 11, 78951. },
+ { 7, "๐ณ๐ด,๐ต๐ฑ๐ญ.๐ฐ๐ฎ๐ฏ", "#,##,##0", 18, 78951.423 },
+ { 7, "๐ณ๐ด,๐ฌ๐ฌ๐ฌ", "#,##,##0", 11, 78000. },
+ { 7, "๐ณ๐ด,๐ฌ๐ฌ๐ฌ.๐ฌ๐ฌ๐ฌ", "#,##,##0", 18, 78000. },
+ { 7, "๐ณ๐ด,๐ฌ๐ฌ๐ฌ.๐ฌ๐ฎ๐ฏ", "#,##,##0", 18, 78000.023 },
+ { 7, "๐ณ๐ด.๐ฌ๐ฌ๐ฌ.๐ฌ๐ฎ๐ฏ", "#,##,##0", 11, 78. },
{ 3, "-๐ฑ๐ญ๐ฐ๐ฎ๐ฏ", "0", 11, -51423. },
{ 3, "-๐ฑ๐ญ๐ฐ๐ฎ๐ฏ-", "0", 11, -51423. },
{ 3, "a51423US dollars", "a0ยคยคยค", 16, 51423. },
{ 3, "๐ฑ.๐ญ๐ฐ๐ฎE๐ฏ", "0", 12, 5142. },
{ 3, "๐ฑ.๐ญ๐ฐ๐ฎE-๐ฏ", "0", 13, 0.005142 },
{ 3, "๐ฑ.๐ญ๐ฐ๐ฎe-๐ฏ", "0", 13, 0.005142 },
- { 3, "5,142.50 Canadian dollars", "0", 25, 5142.5 },
+ { 7, "5,142.50 Canadian dollars", "#,##,##0", 25, 5142.5 },
// { 3, "a$ b5", "a ยค b0", 6, 5.0 }, // TODO: Does not work
- { 7, ".00", "0", 3, 0.0 },
+ { 3, "๐บ1.23", "๐บ0;๐ป0", 6, 1.23 },
+ { 3, "๐ป1.23", "๐บ0;๐ป0", 6, -1.23 },
+ { 3, ".00", "0", 3, 0.0 },
{ 3, "0", "0", 1, 0.0 } };
for (Object[] cas : cases) {
package com.ibm.icu.dev.test.number;
import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
import org.junit.Test;
@Test
public void testOffset() {
- StringSegment segment = new StringSegment(SAMPLE_STRING, false);
+ StringSegment segment = new StringSegment(SAMPLE_STRING);
assertEquals(0, segment.getOffset());
segment.adjustOffset(3);
assertEquals(3, segment.getOffset());
@Test
public void testLength() {
- StringSegment segment = new StringSegment(SAMPLE_STRING, false);
+ StringSegment segment = new StringSegment(SAMPLE_STRING);
assertEquals(11, segment.length());
segment.adjustOffset(3);
assertEquals(8, segment.length());
@Test
public void testCharAt() {
- StringSegment segment = new StringSegment(SAMPLE_STRING, false);
+ StringSegment segment = new StringSegment(SAMPLE_STRING);
assertCharSequenceEquals(SAMPLE_STRING, segment);
segment.adjustOffset(3);
assertCharSequenceEquals("radio ๐ป", segment);
@Test
public void testGetCodePoint() {
- StringSegment segment = new StringSegment(SAMPLE_STRING, false);
+ StringSegment segment = new StringSegment(SAMPLE_STRING);
assertEquals(0x1F4FB, segment.getCodePoint());
segment.setLength(1);
assertEquals(-1, segment.getCodePoint());
assertEquals(0x20, segment.getCodePoint());
}
- @Test
- public void testIsLeadingSurrogate() {
- StringSegment segment = new StringSegment(SAMPLE_STRING, false);
- assertFalse(segment.isLeadingSurrogate());
- segment.setLength(1);
- assertTrue(segment.isLeadingSurrogate());
- segment.adjustOffset(1);
- segment.setLength(1);
- assertFalse(segment.isLeadingSurrogate()); // trail, not lead
- }
-
@Test
public void testCommonPrefixLength() {
- StringSegment segment = new StringSegment(SAMPLE_STRING, false);
+ StringSegment segment = new StringSegment(SAMPLE_STRING);
assertEquals(11, segment.getCommonPrefixLength(SAMPLE_STRING));
assertEquals(4, segment.getCommonPrefixLength("๐ป r"));
assertEquals(3, segment.getCommonPrefixLength("๐ป x"));
assertEquals(0, segment.getCommonPrefixLength("foo"));
}
- @Test
- public void testIgnoreCase() {
- StringSegment segment = new StringSegment(SAMPLE_STRING, true);
- assertEquals(11, segment.getCommonPrefixLength(SAMPLE_STRING));
- assertEquals(0, segment.getCommonPrefixLength("x"));
- segment.setOffset(3);
- assertEquals(5, segment.getCommonPrefixLength("RAdiO"));
- }
-
private static void assertCharSequenceEquals(CharSequence a, CharSequence b) {
assertEquals(a.length(), b.length());
for (int i = 0; i < a.length(); i++) {
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.impl.TextTrieMap;
+import com.ibm.icu.text.UnicodeSet;
@RunWith(JUnit4.class)
public class TextTrieMapTest extends TestFmwk {
private static final Integer SUP2 = new Integer(9);
private static final Integer SUP3 = new Integer(10);
private static final Integer SUP4 = new Integer(11);
+ private static final Integer SUP5 = new Integer(12);
private static final Integer FOO = new Integer(-1);
private static final Integer BAR = new Integer(-2);
{"L๐บ1", SUP2}, // L, 0xD83D, 0xDCFA, 1
{"L๐ป", SUP3}, // L, 0xD83D, 0xDCFB
{"L๐", SUP4}, // L, 0xD83C, 0xDCCF
+ {"๐บ", SUP5}, // 0xD83D, 0xDCFA
+ {"๐ป", SUP5}, // 0xD83D, 0xDCFB
+ {"๐", SUP5}, // 0xD83C, 0xDCCF
};
private static final Object[][] TESTCASES = {
checkParse(map, test, expecteds, true);
}
+ logln("Test for partial match");
+ for (Object[] cas : TESTDATA) {
+ String str = (String) cas[0];
+ for (int i = 0; i < str.length() - 1; i++) {
+ TextTrieMap.Output output = new TextTrieMap.Output();
+ map.get(str.substring(0, i), 0, output);
+ assertTrue("Partial string means partial match", output.partialMatch);
+ }
+ String bad = str + "x";
+ TextTrieMap.Output output = new TextTrieMap.Output();
+ map.get(bad, 0, output);
+ assertFalse("No partial match on bad string", output.partialMatch);
+ }
+ TextTrieMap.Output output = new TextTrieMap.Output();
+ map.get("Sunday", 0, output);
+ assertFalse("No partial match on string with no continuation", output.partialMatch);
+
+ logln("Test for LeadCodePoints");
+ // Note: The ๐บ and ๐ป have the same lead surrogate
+ UnicodeSet expectedLeadCodePoints = new UnicodeSet("[SMTWFL๐บ๐ป๐]");
+ UnicodeSet actualLeadCodePoints = new UnicodeSet();
+ map.putLeadCodePoints(actualLeadCodePoints);
+ assertEquals("leadCodePoints", expectedLeadCodePoints, actualLeadCodePoints);
+
// Add duplicated entry
map.put("Sunday", FOO);
// Add duplicated entry with different casing
checkParse(map, test, expecteds, false);
}
+ logln("Test for partial match");
+ for (Object[] cas : TESTDATA) {
+ String str = (String) cas[0];
+ for (int i = 0; i < str.length() - 1; i++) {
+ TextTrieMap.Output output = new TextTrieMap.Output();
+ map.get(str.substring(0, i), 0, output);
+ assertTrue("Partial string means partial match", output.partialMatch);
+ }
+ String bad = str + "x";
+ TextTrieMap.Output output = new TextTrieMap.Output();
+ map.get(bad, 0, output);
+ assertFalse("No partial match on bad string", output.partialMatch);
+ }
+ TextTrieMap.Output output = new TextTrieMap.Output();
+ map.get("Sunday", 0, output);
+ assertFalse("No partial match on string with no continuation", output.partialMatch);
+
+ logln("Test for LeadCodePoints");
+ UnicodeSet expectedLeadCodePoints = new UnicodeSet("[smtwfl๐บ๐ป๐]");
+ UnicodeSet actualLeadCodePoints = new UnicodeSet();
+ map.putLeadCodePoints(actualLeadCodePoints);
+ assertEquals("leadCodePoints", expectedLeadCodePoints, actualLeadCodePoints);
+
// Add duplicated entry
map.put("Sunday", FOO);
// Add duplicated entry with different casing