From: Andy Heninger Date: Wed, 14 Feb 2018 01:31:35 +0000 (+0000) Subject: ICU-13569 RBBI state table optimizations, ICU4J, work in progress, plus branch refresh. X-Git-Tag: release-61-rc~98^2~4 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=ff3ebb8c326fe67851aa442cdf52dc5ed017d51e;p=icu ICU-13569 RBBI state table optimizations, ICU4J, work in progress, plus branch refresh. X-SVN-Rev: 40914 --- diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/LocaleDisplayNamesImpl.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/LocaleDisplayNamesImpl.java index 2b5076cff81..54d7293c4de 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/LocaleDisplayNamesImpl.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/LocaleDisplayNamesImpl.java @@ -91,8 +91,7 @@ public class LocaleDisplayNamesImpl extends LocaleDisplayNames { CaseMap.toTitle().wholeString().noLowercase(); private static String toTitleWholeStringNoLowercase(ULocale locale, String s) { - return TO_TITLE_WHOLE_STRING_NO_LOWERCASE.apply( - locale.toLocale(), null, s, new StringBuilder(), null).toString(); + return TO_TITLE_WHOLE_STRING_NO_LOWERCASE.apply(locale.toLocale(), null, s); } public static LocaleDisplayNames getInstance(ULocale locale, DialectHandling dialectHandling) { diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/Grouper.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/Grouper.java index fee5564de60..6e18907dbbc 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/Grouper.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/Grouper.java @@ -37,7 +37,7 @@ public class Grouper { return GROUPER_AUTO; case ON_ALIGNED: return GROUPER_ON_ALIGNED; - case WESTERN: + case THOUSANDS: return GROUPER_WESTERN; default: throw new AssertionError(); @@ -63,9 +63,9 @@ public class Grouper { return GROUPER_WESTERN; } else if (grouping1 == 3 && grouping2 == 2 && minGrouping == 1) { return GROUPER_INDIC; - } else if (grouping1 == 3 && 
grouping2 == 3 && minGrouping == 1) { + } else if (grouping1 == 3 && grouping2 == 3 && minGrouping == 2) { return GROUPER_WESTERN_MIN2; - } else if (grouping1 == 3 && grouping2 == 2 && minGrouping == 1) { + } else if (grouping1 == 3 && grouping2 == 2 && minGrouping == 2) { return GROUPER_INDIC_MIN2; } else { return new Grouper(grouping1, grouping2, minGrouping); diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/RoundingUtils.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/RoundingUtils.java index 9098d8c6aa2..b9a3cdb6da1 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/RoundingUtils.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/RoundingUtils.java @@ -22,7 +22,7 @@ public class RoundingUtils { * The maximum number of fraction places, integer numerals, or significant digits. TODO: This does * not feel like the best home for this value. */ - public static final int MAX_INT_FRAC_SIG = 100; + public static final int MAX_INT_FRAC_SIG = 999; /** * Converts a rounding mode and metadata about the quantity being rounded to a boolean determining diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/AffixPatternMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/AffixPatternMatcher.java index ee041f64ab2..43d3888579a 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/AffixPatternMatcher.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/AffixPatternMatcher.java @@ -35,7 +35,6 @@ public class AffixPatternMatcher extends SeriesMatcher implements AffixUtils.Tok return null; } - affixPattern = ParsingUtils.maybeFold(affixPattern, parseFlags); AffixPatternMatcher series = new AffixPatternMatcher(affixPattern); series.factory = factory; series.ignorables = (0 != (parseFlags & ParsingUtils.PARSE_FLAG_EXACT_AFFIX)) ? 
null diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CodePointMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CodePointMatcher.java index 385e73a5d89..8a0b7b9beb4 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CodePointMatcher.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CodePointMatcher.java @@ -24,8 +24,8 @@ public class CodePointMatcher implements NumberParseMatcher { @Override public boolean match(StringSegment segment, ParsedNumber result) { - if (segment.getCodePoint() == cp) { - segment.adjustOffset(Character.charCount(cp)); + if (segment.matches(cp)) { + segment.adjustOffsetByCodePoint(); result.setCharsConsumed(segment); } return false; diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyMatcher.java index e760a0142b8..d81c2e9f81a 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyMatcher.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyMatcher.java @@ -15,10 +15,10 @@ public class CurrencyMatcher implements NumberParseMatcher { private final String currency1; private final String currency2; - public static CurrencyMatcher getInstance(Currency currency, ULocale loc, int setupFlags) { + public static CurrencyMatcher getInstance(Currency currency, ULocale loc) { return new CurrencyMatcher(currency.getSubtype(), - ParsingUtils.maybeFold(currency.getSymbol(loc), setupFlags), - ParsingUtils.maybeFold(currency.getCurrencyCode(), setupFlags)); + currency.getSymbol(loc), + currency.getCurrencyCode()); } private CurrencyMatcher(String isoCode, String currency1, String currency2) { diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/MatcherFactory.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/MatcherFactory.java index a1e36758693..d5640d4aadb 100644 --- 
a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/MatcherFactory.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/MatcherFactory.java @@ -15,7 +15,6 @@ public class MatcherFactory { DecimalFormatSymbols symbols; IgnorablesMatcher ignorables; ULocale locale; - int parseFlags; public MinusSignMatcher minusSign(boolean allowTrailing) { return MinusSignMatcher.getInstance(symbols, allowTrailing); @@ -35,7 +34,7 @@ public class MatcherFactory { public AnyMatcher currency() { AnyMatcher any = new AnyMatcher(); - any.addMatcher(CurrencyMatcher.getInstance(currency, locale, parseFlags)); + any.addMatcher(CurrencyMatcher.getInstance(currency, locale)); any.addMatcher(CurrencyTrieMatcher.getInstance(locale)); any.freeze(); return any; diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NanMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NanMatcher.java index 7664e1e72b2..c5b01255e98 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NanMatcher.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NanMatcher.java @@ -2,7 +2,6 @@ // License & terms of use: http://www.unicode.org/copyright.html#License package com.ibm.icu.impl.number.parse; -import com.ibm.icu.lang.UCharacter; import com.ibm.icu.text.DecimalFormatSymbols; import com.ibm.icu.text.UnicodeSet; @@ -13,14 +12,11 @@ import com.ibm.icu.text.UnicodeSet; public class NanMatcher extends SymbolMatcher { private static final NanMatcher DEFAULT = new NanMatcher("NaN"); - private static final NanMatcher DEFAULT_FOLDED = new NanMatcher(UCharacter.foldCase("NaN", true)); public static NanMatcher getInstance(DecimalFormatSymbols symbols, int parseFlags) { - String symbolString = ParsingUtils.maybeFold(symbols.getNaN(), parseFlags); + String symbolString = symbols.getNaN(); if (DEFAULT.string.equals(symbolString)) { return DEFAULT; - } else if (DEFAULT_FOLDED.string.equals(symbolString)) { - return DEFAULT_FOLDED; } else { 
return new NanMatcher(symbolString); } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java index ff59ca052cc..3db8b782aec 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java @@ -31,6 +31,30 @@ import com.ibm.icu.util.ULocale; */ public class NumberParserImpl { + @Deprecated + public static NumberParserImpl removeMeWhenMerged(ULocale locale, String pattern, int parseFlags) { + NumberParserImpl parser = new NumberParserImpl(parseFlags); + DecimalFormatSymbols symbols = DecimalFormatSymbols.getInstance(locale); + IgnorablesMatcher ignorables = IgnorablesMatcher.DEFAULT; + + MatcherFactory factory = new MatcherFactory(); + factory.currency = Currency.getInstance("USD"); + factory.symbols = symbols; + factory.ignorables = ignorables; + factory.locale = locale; + + ParsedPatternInfo patternInfo = PatternStringParser.parseToPatternInfo(pattern); + AffixMatcher.newGenerate(patternInfo, parser, factory, ignorables, parseFlags); + + Grouper grouper = Grouper.forStrategy(GroupingStrategy.AUTO).withLocaleData(locale, patternInfo); + parser.addMatcher(DecimalMatcher.getInstance(symbols, grouper, parseFlags)); + parser.addMatcher(CurrencyTrieMatcher.getInstance(locale)); + parser.addMatcher(NanMatcher.getInstance(symbols, parseFlags)); + + parser.freeze(); + return parser; + } + // TODO: Find a better place for this enum. /** Controls the set of rules for parsing a string. */ public static enum ParseMode { @@ -74,12 +98,13 @@ public class NumberParserImpl { // Temporary frontend for testing. 
int parseFlags = ParsingUtils.PARSE_FLAG_IGNORE_CASE - | ParsingUtils.PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES; + | ParsingUtils.PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES + | ParsingUtils.PARSE_FLAG_OPTIMIZE; if (strictGrouping) { parseFlags |= ParsingUtils.PARSE_FLAG_STRICT_GROUPING_SIZE; } - NumberParserImpl parser = new NumberParserImpl(parseFlags, true); + NumberParserImpl parser = new NumberParserImpl(parseFlags); DecimalFormatSymbols symbols = DecimalFormatSymbols.getInstance(locale); IgnorablesMatcher ignorables = IgnorablesMatcher.DEFAULT; @@ -88,7 +113,6 @@ public class NumberParserImpl { factory.symbols = symbols; factory.ignorables = ignorables; factory.locale = locale; - factory.parseFlags = parseFlags; ParsedPatternInfo patternInfo = PatternStringParser.parseToPatternInfo(pattern); AffixMatcher.newGenerate(patternInfo, parser, factory, ignorables, parseFlags); @@ -99,7 +123,7 @@ public class NumberParserImpl { parser.addMatcher(DecimalMatcher.getInstance(symbols, grouper, parseFlags)); parser.addMatcher(MinusSignMatcher.getInstance(symbols, false)); parser.addMatcher(NanMatcher.getInstance(symbols, parseFlags)); - parser.addMatcher(ScientificMatcher.getInstance(symbols, grouper, parseFlags)); + parser.addMatcher(ScientificMatcher.getInstance(symbols, grouper)); parser.addMatcher(CurrencyTrieMatcher.getInstance(locale)); parser.addMatcher(new RequireNumberMatcher()); @@ -193,16 +217,18 @@ public class NumberParserImpl { if (parseCurrency || patternInfo.hasCurrencySign()) { parseFlags |= ParsingUtils.PARSE_FLAG_MONETARY_SEPARATORS; } + if (optimize) { + parseFlags |= ParsingUtils.PARSE_FLAG_OPTIMIZE; + } IgnorablesMatcher ignorables = isStrict ? 
IgnorablesMatcher.STRICT : IgnorablesMatcher.DEFAULT; - NumberParserImpl parser = new NumberParserImpl(parseFlags, optimize); + NumberParserImpl parser = new NumberParserImpl(parseFlags); MatcherFactory factory = new MatcherFactory(); factory.currency = currency; factory.symbols = symbols; factory.ignorables = ignorables; factory.locale = locale; - factory.parseFlags = parseFlags; ////////////////////// /// AFFIX MATCHERS /// @@ -216,7 +242,7 @@ public class NumberParserImpl { //////////////////////// if (parseCurrency || patternInfo.hasCurrencySign()) { - parser.addMatcher(CurrencyMatcher.getInstance(currency, locale, parseFlags)); + parser.addMatcher(CurrencyMatcher.getInstance(currency, locale)); parser.addMatcher(CurrencyTrieMatcher.getInstance(locale)); } @@ -239,7 +265,7 @@ public class NumberParserImpl { parser.addMatcher(ignorables); parser.addMatcher(DecimalMatcher.getInstance(symbols, grouper, parseFlags)); if (!properties.getParseNoExponent()) { - parser.addMatcher(ScientificMatcher.getInstance(symbols, grouper, parseFlags)); + parser.addMatcher(ScientificMatcher.getInstance(symbols, grouper)); } ////////////////// @@ -281,18 +307,12 @@ public class NumberParserImpl { /** * Creates a new, empty parser. * - * @param ignoreCase - * If true, perform case-folding. This parameter needs to go into the constructor because - * its value is used during the construction of the matcher chain. - * @param optimize - * If true, compute "lead chars" UnicodeSets for the matchers. This reduces parsing - * runtime but increases construction runtime. If the parser is going to be used only once - * or twice, set this to false; if it is going to be used hundreds of times, set it to - * true. + * @param parseFlags + * The parser settings defined in the PARSE_FLAG_* fields. 
*/ - public NumberParserImpl(int parseFlags, boolean optimize) { + public NumberParserImpl(int parseFlags) { matchers = new ArrayList(); - if (optimize) { + if (0 != (parseFlags & ParsingUtils.PARSE_FLAG_OPTIMIZE)) { leadCodePointses = new ArrayList(); } else { leadCodePointses = null; @@ -306,9 +326,7 @@ public class NumberParserImpl { assert !frozen; this.matchers.add(matcher); if (leadCodePointses != null) { - UnicodeSet leadCodePoints = matcher.getLeadCodePoints(); - assert leadCodePoints.isFrozen(); - this.leadCodePointses.add(leadCodePoints); + addLeadCodePointsForMatcher(matcher); } } @@ -317,13 +335,22 @@ public class NumberParserImpl { this.matchers.addAll(matchers); if (leadCodePointses != null) { for (NumberParseMatcher matcher : matchers) { - UnicodeSet leadCodePoints = matcher.getLeadCodePoints(); - assert leadCodePoints.isFrozen(); - this.leadCodePointses.add(leadCodePoints); + addLeadCodePointsForMatcher(matcher); } } } + private void addLeadCodePointsForMatcher(NumberParseMatcher matcher) { + UnicodeSet leadCodePoints = matcher.getLeadCodePoints(); + assert leadCodePoints.isFrozen(); + // TODO: Avoid the clone operation here. 
+ if (0 != (parseFlags & ParsingUtils.PARSE_FLAG_IGNORE_CASE)) { + leadCodePoints = leadCodePoints.cloneAsThawed().closeOver(UnicodeSet.ADD_CASE_MAPPINGS) + .freeze(); + } + this.leadCodePointses.add(leadCodePoints); + } + public void setComparator(Comparator comparator) { assert !frozen; this.comparator = comparator; @@ -353,7 +380,7 @@ public class NumberParserImpl { public void parse(String input, int start, boolean greedy, ParsedNumber result) { assert frozen; assert start >= 0 && start < input.length(); - StringSegment segment = new StringSegment(ParsingUtils.maybeFold(input, parseFlags)); + StringSegment segment = new StringSegment(input, parseFlags); segment.adjustOffset(start); if (greedy) { parseGreedyRecursive(segment, result); diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ParsingUtils.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ParsingUtils.java index c4a0005c0e7..4d17cd618af 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ParsingUtils.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ParsingUtils.java @@ -2,7 +2,6 @@ // License & terms of use: http://www.unicode.org/copyright.html#License package com.ibm.icu.impl.number.parse; -import com.ibm.icu.lang.UCharacter; import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.text.UnicodeSet.EntryRange; @@ -23,6 +22,7 @@ public class ParsingUtils { public static final int PARSE_FLAG_EXACT_AFFIX = 0x0200; public static final int PARSE_FLAG_PLUS_SIGN_ALLOWED = 0x0400; public static final int PARSE_FLAG_FRACTION_GROUPING_DISABLED = 0x0800; + public static final int PARSE_FLAG_OPTIMIZE = 0x1000; public static void putLeadCodePoints(UnicodeSet input, UnicodeSet output) { for (EntryRange range : input.ranges()) { @@ -39,16 +39,4 @@ public class ParsingUtils { } } - /** - * Case-folds the string if IGNORE_CASE flag is set; otherwise, returns the same string. 
- */ - public static String maybeFold(String input, int parseFlags) { - UnicodeSet cwcf = UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.CWCF); - if (0 != (parseFlags & PARSE_FLAG_IGNORE_CASE) && cwcf.containsSome(input)) { - return UCharacter.foldCase(input, true); - } else { - return input; - } - } - } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ScientificMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ScientificMatcher.java index a6c053af7ea..329ee12ba6f 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ScientificMatcher.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ScientificMatcher.java @@ -15,16 +15,13 @@ public class ScientificMatcher implements NumberParseMatcher { private final String exponentSeparatorString; private final DecimalMatcher exponentMatcher; - public static ScientificMatcher getInstance( - DecimalFormatSymbols symbols, - Grouper grouper, - int parseFlags) { + public static ScientificMatcher getInstance(DecimalFormatSymbols symbols, Grouper grouper) { // TODO: Static-initialize most common instances? 
- return new ScientificMatcher(symbols, grouper, parseFlags); + return new ScientificMatcher(symbols, grouper); } - private ScientificMatcher(DecimalFormatSymbols symbols, Grouper grouper, int parseFlags) { - exponentSeparatorString = ParsingUtils.maybeFold(symbols.getExponentSeparator(), parseFlags); + private ScientificMatcher(DecimalFormatSymbols symbols, Grouper grouper) { + exponentSeparatorString = symbols.getExponentSeparator(); exponentMatcher = DecimalMatcher.getInstance(symbols, grouper, ParsingUtils.PARSE_FLAG_DECIMAL_SCIENTIFIC | ParsingUtils.PARSE_FLAG_INTEGER_ONLY); @@ -47,19 +44,14 @@ public class ScientificMatcher implements NumberParseMatcher { if (segment.length() == 0) { return true; } - int leadCp = segment.getCodePoint(); - if (leadCp == -1) { - // Partial code point match - return true; - } // Allow a sign, and then try to match digits. boolean minusSign = false; - if (UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.MINUS_SIGN).contains(leadCp)) { + if (segment.matches(UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.MINUS_SIGN))) { minusSign = true; - segment.adjustOffset(Character.charCount(leadCp)); - } else if (UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.PLUS_SIGN).contains(leadCp)) { - segment.adjustOffset(Character.charCount(leadCp)); + segment.adjustOffsetByCodePoint(); + } else if (segment.matches(UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.PLUS_SIGN))) { + segment.adjustOffsetByCodePoint(); } int digitsOffset = segment.getOffset(); diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/StringSegment.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/StringSegment.java index 6b92df6e368..eb2b27bc8ee 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/StringSegment.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/StringSegment.java @@ -2,6 +2,9 @@ // License & terms of use: http://www.unicode.org/copyright.html#License package 
com.ibm.icu.impl.number.parse; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.UnicodeSet; + /** * A mutable class allowing for a String with a variable offset and length. The charAt, length, and * subSequence methods all operate relative to the fixed offset into the String. @@ -12,11 +15,13 @@ public class StringSegment implements CharSequence { private final String str; private int start; private int end; + private boolean foldCase; - public StringSegment(String str) { + public StringSegment(String str, int parseFlags) { this.str = str; this.start = 0; this.end = str.length(); + this.foldCase = 0 != (parseFlags & ParsingUtils.PARSE_FLAG_IGNORE_CASE); } public int getOffset() { @@ -42,6 +47,13 @@ public class StringSegment implements CharSequence { start += delta; } + /** + * Adjusts the offset by the width of the current code point, either 1 or 2 chars. + */ + public void adjustOffsetByCodePoint() { + start += Character.charCount(getCodePoint()); + } + public void setLength(int length) { assert length >= 0; assert start + length <= str.length(); @@ -72,28 +84,73 @@ public class StringSegment implements CharSequence { /** * Returns the first code point in the string segment, or -1 if the string starts with an invalid * code point. + * + *

+ * Important: Most of the time, you should use {@link #matches}, which handles case + * folding logic, instead of this method. */ public int getCodePoint() { assert start < end; char lead = str.charAt(start); - if (Character.isHighSurrogate(lead) && start + 1 < end) { - return Character.toCodePoint(lead, str.charAt(start + 1)); - } else if (Character.isSurrogate(lead)) { - return -1; - } else { - return lead; + char trail; + if (Character.isHighSurrogate(lead) + && start + 1 < end + && Character.isLowSurrogate(trail = str.charAt(start + 1))) { + return Character.toCodePoint(lead, trail); } + return lead; + } + + /** + * Returns true if the first code point of this StringSegment equals the given code point. + * + *

+ * This method will perform case folding if case folding is enabled for the parser. + */ + public boolean matches(int otherCp) { + return codePointsEqual(getCodePoint(), otherCp, foldCase); + } + + /** + * Returns true if the first code point of this StringSegment is in the given UnicodeSet. + */ + public boolean matches(UnicodeSet uniset) { + // TODO: Move UnicodeSet case-folding logic here. + // TODO: Handle string matches here instead of separately. + int cp = getCodePoint(); + if (cp == -1) { + return false; + } + return uniset.contains(cp); } /** * Returns the length of the prefix shared by this StringSegment and the given CharSequence. For * example, if this string segment is "aab", and the char sequence is "aac", this method returns 2, * since the first 2 characters are the same. + * + *

+ * This method will perform case folding if case folding is enabled for the parser. */ public int getCommonPrefixLength(CharSequence other) { + return getPrefixLengthInternal(other, foldCase); + } + + /** + * Like {@link #getCommonPrefixLength}, but never performs case folding, even if case folding is + * enabled for the parser. + */ + public int getCaseSensitivePrefixLength(CharSequence other) { + return getPrefixLengthInternal(other, false); + } + + private int getPrefixLengthInternal(CharSequence other, boolean foldCase) { int offset = 0; for (; offset < Math.min(length(), other.length());) { - if (charAt(offset) != other.charAt(offset)) { + // TODO: case-fold code points, not chars + char c1 = charAt(offset); + char c2 = other.charAt(offset); + if (!codePointsEqual(c1, c2, foldCase)) { break; } offset++; @@ -101,6 +158,30 @@ public class StringSegment implements CharSequence { return offset; } + // /** + // * Case-folds the string if IGNORE_CASE flag is set; otherwise, returns the same string. 
+ // */ + // public static String maybeFold(String input, int parseFlags) { + // UnicodeSet cwcf = UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.CWCF); + // if (0 != (parseFlags & ParsingUtils.PARSE_FLAG_IGNORE_CASE) && cwcf.containsSome(input)) { + // return UCharacter.foldCase(input, true); + // } else { + // return input; + // } + // } + + private static final boolean codePointsEqual(int cp1, int cp2, boolean foldCase) { + if (cp1 == cp2) { + return true; + } + if (!foldCase) { + return false; + } + cp1 = UCharacter.foldCase(cp1, true); + cp2 = UCharacter.foldCase(cp2, true); + return cp1 == cp2; + } + @Override public String toString() { return str.substring(0, start) + "[" + str.substring(start, end) + "]" + str.substring(end); diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/SymbolMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/SymbolMatcher.java index e31841e6872..bf15d726b7a 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/SymbolMatcher.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/SymbolMatcher.java @@ -47,9 +47,8 @@ public abstract class SymbolMatcher implements NumberParseMatcher { } } - int cp = segment.getCodePoint(); - if (cp != -1 && uniSet.contains(cp)) { - segment.adjustOffset(Character.charCount(cp)); + if (segment.matches(uniSet)) { + segment.adjustOffsetByCodePoint(); accept(segment, result); return false; } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java index bf0593e1230..d458f07de35 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java @@ -51,7 +51,7 @@ public class UnicodeSetStaticCache { DIGITS, NAN_LEAD, SCIENTIFIC_LEAD, - CWCF, + CWCF, // TODO: Check if this is being used and 
remove it if not. // Combined Separators with Digits (for lead code points) DIGITS_OR_ALL_SEPARATORS, diff --git a/icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java b/icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java index be1545a1451..9580c9fe10f 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java @@ -5123,37 +5123,6 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection return CaseMapImpl.toTitle(getCaseLocale(locale), options, titleIter, str); } - /** - * Return a string with just the first word titlecased, for menus and UI, etc. This does not affect most of the string, - * and sometimes has no effect at all; the original string is returned whenever casing - * would not be appropriate for the first word (such as for CJK characters or initial numbers). - * Initial non-letters are skipped in order to find the character to change. - * Characters past the first affected are left untouched: see also TITLECASE_NO_LOWERCASE. - *

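<p>Examples:
- * <table border='1'><tr><th>Source</th><th>Result</th><th>Locale</th></tr>
- * <tr><td>anglo-American locale</td><td>Anglo-American locale</td><td></td></tr>
- * <tr><td>“contact us”</td><td>“Contact us”</td><td></td></tr>
- * <tr><td>49ers win!</td><td>49ers win!</td><td></td></tr>
- * <tr><td>丰(abc)</td><td>丰(abc)</td><td></td></tr>
- * <tr><td>«ijs»</td><td>«Ijs»</td><td></td></tr>
- * <tr><td>«ijs»</td><td>«IJs»</td><td>nl-BE</td></tr>
- * <tr><td>«ijs»</td><td>«İjs»</td><td>tr-DE</td></tr>
- * </table>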
- * @param locale the locale for accessing exceptional behavior (eg for tr). - * @param str the source string to change - * @return the modified string, or the original if no modifications were necessary. - * @internal - * @deprecated ICU internal only - */ - @Deprecated - public static String toTitleFirst(ULocale locale, String str) { - // TODO: Remove this function. Inline it where it is called in CLDR. - return TO_TITLE_WHOLE_STRING_NO_LOWERCASE.apply(locale.toLocale(), null, str); - } - - private static final com.ibm.icu.text.CaseMap.Title TO_TITLE_WHOLE_STRING_NO_LOWERCASE = - com.ibm.icu.text.CaseMap.toTitle().wholeString().noLowercase(); - /** * {@icu}

<p>Returns the titlecase version of the argument string. * <p>
Position for titlecasing is determined by the argument break diff --git a/icu4j/main/classes/core/src/com/ibm/icu/number/FormattedNumber.java b/icu4j/main/classes/core/src/com/ibm/icu/number/FormattedNumber.java index cec506f6397..b267fd2be6a 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/number/FormattedNumber.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/number/FormattedNumber.java @@ -15,8 +15,8 @@ import com.ibm.icu.text.PluralRules.IFixedDecimal; import com.ibm.icu.util.ICUUncheckedIOException; /** - * The result of a number formatting operation. This class allows the result to be exported in several data types, - * including a String, an AttributedCharacterIterator, and a BigDecimal. + * The result of a number formatting operation. This class allows the result to be exported in several + * data types, including a String, an AttributedCharacterIterator, and a BigDecimal. * * @draft ICU 60 * @provisional This API might change or be removed in a future release. @@ -47,12 +47,12 @@ public class FormattedNumber { } /** - * Append the formatted number to an Appendable, such as a StringBuilder. This may be slightly more efficient than - * creating a String. + * Append the formatted number to an Appendable, such as a StringBuilder. This may be slightly more + * efficient than creating a String. * *

- * If an IOException occurs when appending to the Appendable, an unchecked {@link ICUUncheckedIOException} is thrown - * instead. + * If an IOException occurs when appending to the Appendable, an unchecked + * {@link ICUUncheckedIOException} is thrown instead. * * @param appendable * The Appendable to which to append the formatted number string. @@ -73,16 +73,18 @@ public class FormattedNumber { } /** - * Determine the start and end indices of the first occurrence of the given field in the output string. - * This allows you to determine the locations of the integer part, fraction part, and sign. + * Determine the start and end indices of the first occurrence of the given field in the + * output string. This allows you to determine the locations of the integer part, fraction part, and + * sign. * *

- * If multiple different field attributes are needed, this method can be called repeatedly, or if all field - * attributes are needed, consider using getFieldIterator(). + * If multiple different field attributes are needed, this method can be called repeatedly, or if + * all field attributes are needed, consider using getFieldIterator(). * *

- * If a field occurs multiple times in an output string, such as a grouping separator, this method will only ever - * return the first occurrence. Use getFieldIterator() to access all occurrences of an attribute. + * If a field occurs multiple times in an output string, such as a grouping separator, this method + * will only ever return the first occurrence. Use getFieldIterator() to access all occurrences of an + * attribute. * * @param fieldPosition * The FieldPosition to populate with the start and end indices of the desired field. @@ -106,13 +108,15 @@ public class FormattedNumber { } /** - * Export the formatted number as an AttributedCharacterIterator. This allows you to determine which characters in - * the output string correspond to which fields, such as the integer part, fraction part, and sign. + * Export the formatted number as an AttributedCharacterIterator. This allows you to determine which + * characters in the output string correspond to which fields, such as the integer part, + * fraction part, and sign. * *

* If information on only one field is needed, consider using populateFieldPosition() instead. * - * @return An AttributedCharacterIterator, containing information on the field attributes of the number string. + * @return An AttributedCharacterIterator, containing information on the field attributes of the + * number string. * @draft ICU 60 * @provisional This API might change or be removed in a future release. * @see com.ibm.icu.text.NumberFormat.Field @@ -124,8 +128,9 @@ public class FormattedNumber { } /** - * Export the formatted number as a BigDecimal. This endpoint is useful for obtaining the exact number being printed - * after scaling and rounding have been applied by the number formatting pipeline. + * Export the formatted number as a BigDecimal. This endpoint is useful for obtaining the exact + * number being printed after scaling and rounding have been applied by the number formatting + * pipeline. * * @return A BigDecimal representation of the formatted number. * @draft ICU 60 @@ -138,31 +143,29 @@ public class FormattedNumber { /** * @internal - * @deprecated This API is ICU internal only. + * @deprecated This API is ICU internal only. Use {@link #populateFieldPosition} or + * {@link #getFieldIterator} for similar functionality. */ @Deprecated public String getPrefix() { NumberStringBuilder temp = new NumberStringBuilder(); - int length = micros.modOuter.apply(temp, 0, 0); - length += micros.modMiddle.apply(temp, 0, length); - /* length += */ micros.modInner.apply(temp, 0, length); - int prefixLength = micros.modOuter.getPrefixLength() + micros.modMiddle.getPrefixLength() - + micros.modInner.getPrefixLength(); + // #13453: DecimalFormat wants the affixes from the pattern only (modMiddle). + micros.modMiddle.apply(temp, 0, 0); + int prefixLength = micros.modMiddle.getPrefixLength(); return temp.subSequence(0, prefixLength).toString(); } /** * @internal - * @deprecated This API is ICU internal only. + * @deprecated This API is ICU internal only. 
Use {@link #populateFieldPosition} or + * {@link #getFieldIterator} for similar functionality. */ @Deprecated public String getSuffix() { NumberStringBuilder temp = new NumberStringBuilder(); - int length = micros.modOuter.apply(temp, 0, 0); - length += micros.modMiddle.apply(temp, 0, length); - length += micros.modInner.apply(temp, 0, length); - int prefixLength = micros.modOuter.getPrefixLength() + micros.modMiddle.getPrefixLength() - + micros.modInner.getPrefixLength(); + // #13453: DecimalFormat wants the affixes from the pattern only (modMiddle). + int length = micros.modMiddle.apply(temp, 0, 0); + int prefixLength = micros.modMiddle.getPrefixLength(); return temp.subSequence(prefixLength, length).toString(); } @@ -185,7 +188,9 @@ public class FormattedNumber { public int hashCode() { // NumberStringBuilder and BigDecimal are mutable, so we can't call // #equals() or #hashCode() on them directly. - return Arrays.hashCode(nsb.toCharArray()) ^ Arrays.hashCode(nsb.toFieldArray()) ^ fq.toBigDecimal().hashCode(); + return Arrays.hashCode(nsb.toCharArray()) + ^ Arrays.hashCode(nsb.toFieldArray()) + ^ fq.toBigDecimal().hashCode(); } /** @@ -206,7 +211,7 @@ public class FormattedNumber { // #equals() or #hashCode() on them directly. 
FormattedNumber _other = (FormattedNumber) other; return Arrays.equals(nsb.toCharArray(), _other.nsb.toCharArray()) - ^ Arrays.equals(nsb.toFieldArray(), _other.nsb.toFieldArray()) - ^ fq.toBigDecimal().equals(_other.fq.toBigDecimal()); + && Arrays.equals(nsb.toFieldArray(), _other.nsb.toFieldArray()) + && fq.toBigDecimal().equals(_other.fq.toBigDecimal()); } } \ No newline at end of file diff --git a/icu4j/main/classes/core/src/com/ibm/icu/number/NumberFormatter.java b/icu4j/main/classes/core/src/com/ibm/icu/number/NumberFormatter.java index d66cb2afdec..dd7651bd820 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/number/NumberFormatter.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/number/NumberFormatter.java @@ -171,7 +171,7 @@ public final class NumberFormatter { *

  • MIN2: 1234 and 12,34,567 *
  • AUTO: 1,234 and 12,34,567 *
  • ON_ALIGNED: 1,234 and 12,34,567 - *
  • WESTERN: 1,234 and 1,234,567 + *
  • THOUSANDS: 1,234 and 1,234,567 * * *

    @@ -259,7 +259,7 @@ public final class NumberFormatter { * @provisional This API might change or be removed in a future release. * @see NumberFormatter */ - WESTERN + THOUSANDS } /** diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/MeasureFormat.java b/icu4j/main/classes/core/src/com/ibm/icu/text/MeasureFormat.java index 834d6c51ec9..0d950cbc0a3 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/MeasureFormat.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/MeasureFormat.java @@ -1044,7 +1044,7 @@ public class MeasureFormat extends UFormat { case TIME_UNIT_FORMAT: return createTimeUnitFormat(); case CURRENCY_FORMAT: - return new CurrencyFormat(locale); + return MeasureFormat.getCurrencyFormat(locale); default: throw new InvalidObjectException("Unknown subclass: " + subClass); } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIDataWrapper.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIDataWrapper.java index 977ba6f579f..cf39ca0c4ec 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIDataWrapper.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIDataWrapper.java @@ -18,17 +18,19 @@ import com.ibm.icu.impl.ICUBinary.Authenticate; import com.ibm.icu.impl.Trie2; /** -*

    Internal class used for Rule Based Break Iterators

    +*

    Internal class used for Rule Based Break Iterators.

    *

    This class provides access to the compiled break rule data, as * it is stored in a .brk file. +* Not intended for public use; declared public for testing purposes only. +* @internal */ -final class RBBIDataWrapper { +public final class RBBIDataWrapper { // // These fields are the ready-to-use compiled rule data, as // read from the file. // - RBBIDataHeader fHeader; - short fFTable[]; + public RBBIDataHeader fHeader; + public short fFTable[]; short fRTable[]; short fSFTable[]; short fSRTable[]; @@ -78,11 +80,16 @@ final class RBBIDataWrapper { // Index offsets to the fields in a state table row. // Corresponds to struct RBBIStateTableRow in the C version. // - final static int ACCEPTING = 0; - final static int LOOKAHEAD = 1; - final static int TAGIDX = 2; - final static int RESERVED = 3; - final static int NEXTSTATES = 4; + /** @internal */ + public final static int ACCEPTING = 0; + /** @internal */ + public final static int LOOKAHEAD = 1; + /** @internal */ + public final static int TAGIDX = 2; + /** @internal */ + public final static int RESERVED = 3; + /** @internal */ + public final static int NEXTSTATES = 4; // Index offsets to header fields of a state table // struct RBBIStateTable {... in the C version. @@ -101,13 +108,15 @@ final class RBBIDataWrapper { /** * Data Header. A struct-like class with the fields from the RBBI data file header. + * Not intended for public use, declared public for testing purposes only. + * @internal */ - final static class RBBIDataHeader { + public final static class RBBIDataHeader { int fMagic; // == 0xbla0 byte[] fFormatVersion; // For ICU 3.4 and later. int fLength; // Total length in bytes of this RBBI Data, // including all sections, not just the header. - int fCatCount; // Number of character categories. + public int fCatCount; // Number of character categories. // // Offsets and sizes of each of the subsections within the RBBI data. 
@@ -139,9 +148,9 @@ final class RBBIDataWrapper { /** * RBBI State Table Indexing Function. Given a state number, return the * array index of the start of the state table row for that state. - * + * @internal */ - int getRowIndex(int state){ + public int getRowIndex(int state){ return ROW_DATA + state * (fHeader.fCatCount + 4); } @@ -311,17 +320,17 @@ final class RBBIDataWrapper { return This; } - ///CLOVER:OFF - // Getters for fields from the state table header - // - private int getStateTableNumStates(short table[]) { + /** + * Getters for fields from the state table header + * @internal + */ + public int getStateTableNumStates(short table[]) { if (isBigEndian) { return (table[NUMSTATES] << 16) | (table[NUMSTATES+1] & 0xffff); } else { return (table[NUMSTATES+1] << 16) | (table[NUMSTATES] & 0xffff); } } - ///CLOVER:ON int getStateTableFlags(short table[]) { // This works for up to 15 flags bits. diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java index c3bdaa23b72..87ea903bd25 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java @@ -342,10 +342,10 @@ class RBBIRuleBuilder { // // UnicodeSet processing. // Munge the Unicode Sets to create a set of character categories. - // Generate the mapping tables (TRIE) from input 32-bit characters to + // Generate the mapping tables (TRIE) from input code points to // the character categories. // - builder.fSetBuilder.build(); + builder.fSetBuilder.buildRanges(); // // Generate the DFA state transition table. @@ -363,10 +363,34 @@ class RBBIRuleBuilder { builder.fForwardTables.printRuleStatusTable(); } + builder.optimizeTables(); + builder.fSetBuilder.buildTrie(); // // Package up the compiled data, writing it to an output stream // in the serialization format. This is the same as the ICU4C runtime format. 
// builder.flattenData(os); } + + static class ClassPair { + int left = 3; + int right = 0; + } + + void optimizeTables() { + ClassPair duplPair = new ClassPair(); + + while (fForwardTables.findDuplCharClassFrom(duplPair)) { + fSetBuilder.mergeCategories(duplPair); + fForwardTables.removeColumn(duplPair.right); + fReverseTables.removeColumn(duplPair.right); + fSafeFwdTables.removeColumn(duplPair.right); + fSafeRevTables.removeColumn(duplPair.right); + } + + fForwardTables.removeDuplicateStates(); + fReverseTables.removeDuplicateStates(); + fSafeFwdTables.removeDuplicateStates(); + fSafeRevTables.removeDuplicateStates(); + } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBISetBuilder.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBISetBuilder.java index 44352cb9674..9f5a8a50a2c 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBISetBuilder.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBISetBuilder.java @@ -112,7 +112,7 @@ class RBBISetBuilder { } } if (setName.equals("dictionary")) { - this.fNum |= 0x4000; + this.fNum |= DICT_BIT; break; } } @@ -138,6 +138,8 @@ class RBBISetBuilder { boolean fSawBOF; + static final int DICT_BIT = 0x4000; + //------------------------------------------------------------------------ // @@ -156,7 +158,7 @@ class RBBISetBuilder { // from the Unicode Sets. // //------------------------------------------------------------------------ - void build() { + void buildRanges() { RangeDescriptor rlRange; if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("usets")>=0) {printSets();} @@ -280,6 +282,15 @@ class RBBISetBuilder { if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("rgroup")>=0) {printRangeGroups();} if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("esets")>=0) {printSets();} + } + + + /** + * Build the Trie table for mapping UChar32 values to the corresponding + * range group number. 
+ */ + void buildTrie() { + RangeDescriptor rlRange; fTrie = new Trie2Writable(0, // Initial value for all code points. 0); // Error value for out-of-range input. @@ -294,7 +305,20 @@ class RBBISetBuilder { } } - + void mergeCategories(int left, int right) { + assert(left >= 1); + assert(right > left); + for (RangeDescriptor rd = fRangeList; rd != null; rd = rd.fNext) { + int rangeNum = rd.fNum & ~DICT_BIT; + int rangeDict = rd.fNum & DICT_BIT; + if (rangeNum == right) { + rd.fNum = left | rangeDict; + } else if (rangeNum > right) { + rd.fNum--; + } + } + --fGroupCount; + } //----------------------------------------------------------------------------------- // // getTrieSize() Return the size that will be required to serialize the Trie. @@ -457,7 +481,7 @@ class RBBISetBuilder { if (groupNum<10) {System.out.print(" ");} System.out.print(groupNum + " "); - if ((rlRange.fNum & 0x4000) != 0) { System.out.print(" ");} + if ((rlRange.fNum & DICT_BIT) != 0) { System.out.print(" ");} for (i=0; i duplState) { + newVal = existingVal - 1; + } + sd.fDtran.setElementAt(newVal, col); + } + if (sd.fAccepting == duplState) { + sd.fAccepting = keepState; + } else if (sd.fAccepting > duplState) { + sd.fAccepting--; + } + if (sd.fLookAhead == duplState) { + sd.fLookAhead = keepState; + } else if (sd.fLookAhead > duplState) { + sd.fLookAhead--; + } + } +} + + +/* + * RemoveDuplicateStates + */ +void removeDuplicateStates() { + int firstState = 3; + int duplicateState = 0; + while (findDuplicateState(firstState, duplicateState)) { + // printf("Removing duplicate states (%d, %d)\n", firstState, duplicateState); + removeState(firstState, duplicateState); + } +} + //----------------------------------------------------------------------------- // diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java index b39aad503db..8064ab36e73 100644 --- 
a/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java @@ -222,9 +222,11 @@ public class RuleBasedBreakIterator extends BreakIterator { private CharacterIterator fText = new java.text.StringCharacterIterator(""); /** - * The rule data for this BreakIterator instance. Package private. + * The rule data for this BreakIterator instance. + * Not intended for public use. Declared public for testing purposes only. + * @internal */ - RBBIDataWrapper fRData; + public RBBIDataWrapper fRData; /** * The iteration state - current position, rule status for the current position, diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/TimeUnitFormat.java b/icu4j/main/classes/core/src/com/ibm/icu/text/TimeUnitFormat.java index f8fff0b15f3..cd0ef2dd6aa 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/TimeUnitFormat.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/TimeUnitFormat.java @@ -84,19 +84,12 @@ public class TimeUnitFormat extends MeasureFormat { private static final long serialVersionUID = -3707773153184971529L; - // These fields are supposed to be the same as the fields in mf. They - // are here for serialization backward compatibility and to support parsing. + // Unlike MeasureFormat, this class is mutable and allows a new NumberFormat to be set after + // initialization. Keep a second copy of NumberFormat and use it instead of the one from the parent. private NumberFormat format; private ULocale locale; private int style; - // We use this field in lieu of the super class because the super class - // is immutable while this class is mutable. The contents of the super class - // is an empty shell. Every public method of the super class is overridden to - // delegate to this field. Each time this object mutates, it replaces this field with - // a new immutable instance. 
-// private transient MeasureFormat mf; - private transient Map> timeUnitToCountToPatterns; private transient PluralRules pluralRules; private transient boolean isReady; diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/calendar/CalendarRegressionTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/calendar/CalendarRegressionTest.java index 062ac0bd7a4..4371b672039 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/calendar/CalendarRegressionTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/calendar/CalendarRegressionTest.java @@ -2521,5 +2521,25 @@ public class CalendarRegressionTest extends com.ibm.icu.dev.test.TestFmwk { } } } - } + + @Test + public void TestIslamicCalOverflow() { + String localeID = "ar@calendar=islamic-civil"; + Calendar cal = Calendar.getInstance(new ULocale(localeID)); + int maxMonth = cal.getMaximum(Calendar.MONTH); + int maxDayOfMonth = cal.getMaximum(Calendar.DATE); + int jd, year, month, dayOfMonth; + for (jd = 73530872; jd <= 73530876; jd++) { // year 202002, int32_t overflow if jd >= 73530874 + cal.clear(); + cal.set(Calendar.JULIAN_DAY, jd); + year = cal.get(Calendar.YEAR); + month = cal.get(Calendar.MONTH); + dayOfMonth = cal.get(Calendar.DATE); + if (month > maxMonth || dayOfMonth > maxDayOfMonth) { + errln("Error: localeID " + localeID + ", julianDay " + jd + "; got year " + year + "; maxMonth " + maxMonth + + ", got month " + month + "; maxDayOfMonth " + maxDayOfMonth + ", got dayOfMonth " + dayOfMonth); + } + } + } +} //eof diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/IntlTestDecimalFormatSymbols.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/IntlTestDecimalFormatSymbols.java index 1f89eb7e69c..724247a5854 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/IntlTestDecimalFormatSymbols.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/IntlTestDecimalFormatSymbols.java @@ -311,6 +311,36 @@ public class 
IntlTestDecimalFormatSymbols extends TestFmwk errln("ERROR: Char digits should be Latin digits"); } + // Check on copy + DecimalFormatSymbols copy = (DecimalFormatSymbols) symbols.clone(); + if (!Arrays.equals(copy.getDigitStrings(), osmanyaDigitStrings)) { + errln("ERROR: Osmanya digits (supplementary) should be set"); + } + if (Character.codePointAt(osmanyaDigitStrings[0], 0) != copy.getCodePointZero()) { + errln("ERROR: Code point zero be Osmanya code point zero"); + } + if (defZero != copy.getZeroDigit()) { + errln("ERROR: Zero digit should be 0"); + } + if (!Arrays.equals(copy.getDigits(), defDigits)) { + errln("ERROR: Char digits should be Latin digits"); + } + + // Check on resource bundle + DecimalFormatSymbols fromData = DecimalFormatSymbols.getInstance(new ULocale("en@numbers=osma")); + if (!Arrays.equals(fromData.getDigitStrings(), osmanyaDigitStrings)) { + errln("ERROR: Osmanya digits (supplementary) should be set"); + } + if (Character.codePointAt(osmanyaDigitStrings[0], 0) != fromData.getCodePointZero()) { + errln("ERROR: Code point zero be Osmanya code point zero"); + } + if (defZero != fromData.getZeroDigit()) { + errln("ERROR: Zero digit should be 0"); + } + if (!Arrays.equals(fromData.getDigits(), defDigits)) { + errln("ERROR: Char digits should be Latin digits"); + } + symbols.setDigitStrings(differentDigitStrings); if (!Arrays.equals(symbols.getDigitStrings(), differentDigitStrings)) { errln("ERROR: Different digits should be set"); diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/MeasureUnitTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/MeasureUnitTest.java index 7320845580c..e5e81bb5d26 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/MeasureUnitTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/MeasureUnitTest.java @@ -16,6 +16,7 @@ import java.io.ObjectOutputStream; import java.io.Serializable; import java.lang.reflect.Field; import java.text.FieldPosition; +import 
java.text.ParseException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -42,6 +43,7 @@ import com.ibm.icu.text.MeasureFormat; import com.ibm.icu.text.MeasureFormat.FormatWidth; import com.ibm.icu.text.NumberFormat; import com.ibm.icu.util.Currency; +import com.ibm.icu.util.CurrencyAmount; import com.ibm.icu.util.Measure; import com.ibm.icu.util.MeasureUnit; import com.ibm.icu.util.NoUnit; @@ -1925,6 +1927,15 @@ public class MeasureUnitTest extends TestFmwk { assertEquals("getCurrencyFormat ULocale/Locale", mfu, mfj); } + @Test + public void testCurrencyFormatParseIsoCode() throws ParseException { + MeasureFormat mf = MeasureFormat.getCurrencyFormat(ULocale.ENGLISH); + CurrencyAmount result = (CurrencyAmount) mf.parseObject("GTQ 34.56"); + assertEquals("Parse should succeed", result.getNumber().doubleValue(), 34.56, 0.0); + assertEquals("Should parse ISO code GTQ even though the currency is USD", + "GTQ", result.getCurrency().getCurrencyCode()); + } + @Test public void testDoubleZero() { ULocale en = new ULocale("en"); diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java index 092fd04028a..4697abd5da4 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java @@ -868,7 +868,7 @@ public class NumberFormatTest extends TestFmwk { new ParseCurrencyItem( "en_GB", "euros4", "4,00\u00A0\u20AC", 6,400, "EUR" ), new ParseCurrencyItem( "en_GB", "euros6", "6\u00A0\u20AC", 3, 6, "EUR" ), new ParseCurrencyItem( "en_GB", "euros8", "\u20AC8", 2, 8, "EUR" ), - new ParseCurrencyItem( "en_GB", "dollars4", "US$4", 0, 0, "USD" ), + new ParseCurrencyItem( "en_GB", "dollars4", "US$4", 4, 4, "USD" ), new ParseCurrencyItem( "fr_FR", "euros4", "4,00\u00A0\u20AC", 6, 4, "EUR" ), new ParseCurrencyItem( "fr_FR", "euros6", 
"6\u00A0\u20AC", 3, 6, "EUR" ), @@ -2018,7 +2018,6 @@ public class NumberFormatTest extends TestFmwk { }; @SuppressWarnings("resource") // InputStream is will be closed by the ResourceReader. - @Ignore("TODO: http://bugs.icu-project.org/trac/ticket/13571") @Test public void TestCases() { String caseFileName = "NumberFormatTestCases.txt"; @@ -5331,6 +5330,23 @@ public class NumberFormatTest extends TestFmwk { assertEquals("Grouping should be off", false, df.isGroupingUsed()); } + @Test + public void Test13453_AffixContent() { + DecimalFormat df = (DecimalFormat) DecimalFormat.getScientificInstance(); + assertEquals("Scientific should NOT be included", "", df.getPositiveSuffix()); + + df = CompactDecimalFormat.getInstance(ULocale.ENGLISH, CompactDecimalFormat.CompactStyle.SHORT); + assertEquals("Compact should NOT be included", "", df.getPositiveSuffix()); + + df = (DecimalFormat) DecimalFormat.getInstance(NumberFormat.ISOCURRENCYSTYLE); + df.setCurrency(Currency.getInstance("GBP")); + assertEquals("ISO currency SHOULD be included", "GBP", df.getPositivePrefix()); + + df = (DecimalFormat) DecimalFormat.getInstance(NumberFormat.PLURALCURRENCYSTYLE); + df.setCurrency(Currency.getInstance("GBP")); + assertEquals("Plural name SHOULD be included", " British pounds", df.getPositiveSuffix()); + } + @Test public void Test11035_FormatCurrencyAmount() { double amount = 12345.67; diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberFormatterApiTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberFormatterApiTest.java index 0bfca024c13..fe01e1e3ec9 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberFormatterApiTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberFormatterApiTest.java @@ -25,7 +25,6 @@ import com.ibm.icu.impl.number.Padder; import com.ibm.icu.impl.number.Padder.PadPosition; import com.ibm.icu.impl.number.PatternStringParser; import com.ibm.icu.number.CompactNotation; -import 
com.ibm.icu.number.FormattedNumber; import com.ibm.icu.number.FractionRounder; import com.ibm.icu.number.IntegerWidth; import com.ibm.icu.number.LocalizedNumberFormatter; @@ -1176,6 +1175,21 @@ public class NumberFormatterApiTest { "8.765", "0"); + assertFormatDescendingBig( + "Indic locale with THOUSANDS grouping", + "", + NumberFormatter.with().grouping(GroupingStrategy.THOUSANDS), + new ULocale("en-IN"), + "87,650,000", + "8,765,000", + "876,500", + "87,650", + "8,765", + "876.5", + "87.65", + "8.765", + "0"); + // NOTE: Hungarian is interesting because it has minimumGroupingDigits=4 in locale data // If this test breaks due to data changes, find another locale that has minimumGroupingDigits. assertFormatDescendingBig( @@ -1860,29 +1874,6 @@ public class NumberFormatterApiTest { assertNotEquals(NumberFormatter.with().locale(ULocale.ENGLISH), NumberFormatter.with().locale(Locale.FRENCH)); } - @Test - public void getPrefixSuffix() { - Object[][] cases = { - { NumberFormatter.withLocale(ULocale.ENGLISH).unit(GBP).unitWidth(UnitWidth.ISO_CODE), "GBP", "", - "-GBP", "" }, - { NumberFormatter.withLocale(ULocale.ENGLISH).unit(GBP).unitWidth(UnitWidth.FULL_NAME), "", - " British pounds", "-", " British pounds" } }; - - for (Object[] cas : cases) { - LocalizedNumberFormatter f = (LocalizedNumberFormatter) cas[0]; - String posPrefix = (String) cas[1]; - String posSuffix = (String) cas[2]; - String negPrefix = (String) cas[3]; - String negSuffix = (String) cas[4]; - FormattedNumber positive = f.format(1); - FormattedNumber negative = f.format(-1); - assertEquals(posPrefix, positive.getPrefix()); - assertEquals(posSuffix, positive.getSuffix()); - assertEquals(negPrefix, negative.getPrefix()); - assertEquals(negSuffix, negative.getSuffix()); - } - } - @Test public void plurals() { // TODO: Expand this test. 
@@ -1921,12 +1912,12 @@ public class NumberFormatterApiTest { Rounder.class.getDeclaredMethod("minMaxFraction", Integer.TYPE, Integer.TYPE), Rounder.class.getDeclaredMethod("minMaxDigits", Integer.TYPE, Integer.TYPE), }; - final int EXPECTED_MAX_INT_FRAC_SIG = 100; - final String expectedSubstring0 = "between 0 and 100 (inclusive)"; - final String expectedSubstring1 = "between 1 and 100 (inclusive)"; - final String expectedSubstringN1 = "between -1 and 100 (inclusive)"; + final int EXPECTED_MAX_INT_FRAC_SIG = 999; + final String expectedSubstring0 = "between 0 and 999 (inclusive)"; + final String expectedSubstring1 = "between 1 and 999 (inclusive)"; + final String expectedSubstringN1 = "between -1 and 999 (inclusive)"; - // We require that the upper bounds all be 100 inclusive. + // We require that the upper bounds all be 999 inclusive. // The lower bound may be either -1, 0, or 1. Set methodsWithLowerBound1 = new HashSet(); methodsWithLowerBound1.add("fixedDigits"); @@ -1936,6 +1927,12 @@ public class NumberFormatterApiTest { methodsWithLowerBound1.add("withMinDigits"); methodsWithLowerBound1.add("withMaxDigits"); methodsWithLowerBound1.add("withMinExponentDigits"); + // Methods with lower bound 0: + // fixedFraction + // minFraction + // maxFraction + // minMaxFraction + // zeroFillTo Set methodsWithLowerBoundN1 = new HashSet(); methodsWithLowerBoundN1.add("truncateAt"); diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberParserTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberParserTest.java index 4e69a762581..70513ce5f1a 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberParserTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberParserTest.java @@ -13,6 +13,7 @@ import com.ibm.icu.impl.number.parse.IgnorablesMatcher; import com.ibm.icu.impl.number.parse.MinusSignMatcher; import com.ibm.icu.impl.number.parse.NumberParserImpl; import com.ibm.icu.impl.number.parse.ParsedNumber; 
+import com.ibm.icu.impl.number.parse.ParsingUtils; import com.ibm.icu.impl.number.parse.PercentMatcher; import com.ibm.icu.impl.number.parse.PlusSignMatcher; import com.ibm.icu.impl.number.parse.SeriesMatcher; @@ -191,7 +192,7 @@ public class NumberParserTest { int expectedOffset = (Integer) cas[1]; boolean expectedMaybeMore = (Boolean) cas[2]; - StringSegment segment = new StringSegment(input); + StringSegment segment = new StringSegment(input, 0); ParsedNumber result = new ParsedNumber(); boolean actualMaybeMore = series.match(segment, result); int actualOffset = segment.getOffset(); @@ -215,4 +216,39 @@ public class NumberParserTest { result.getNumber().doubleValue(), 0.0); } + + @Test + public void testCaseFolding() { + Object[][] cases = new Object[][] { + // pattern, input string, case sensitive chars, case insensitive chars + { "0", "JP¥3456", 7, 7 }, + { "0", "jp¥3456", 0, 0 }, // not to be accepted, even in case insensitive mode + { "A0", "A5", 2, 2 }, + { "A0", "a5", 0, 2 }, + { "0", "NaN", 3, 3 }, + { "0", "nan", 0, 3 } }; + for (Object[] cas : cases) { + String patternString = (String) cas[0]; + String inputString = (String) cas[1]; + int expectedCaseSensitiveChars = (Integer) cas[2]; + int expectedCaseFoldingChars = (Integer) cas[3]; + + NumberParserImpl caseSensitiveParser = NumberParserImpl + .removeMeWhenMerged(ULocale.ENGLISH, patternString, ParsingUtils.PARSE_FLAG_OPTIMIZE); + ParsedNumber result = new ParsedNumber(); + caseSensitiveParser.parse(inputString, true, result); + assertEquals("Case-Sensitive: " + inputString + " on " + patternString, + expectedCaseSensitiveChars, + result.charEnd); + + NumberParserImpl caseFoldingParser = NumberParserImpl.removeMeWhenMerged(ULocale.ENGLISH, + patternString, + ParsingUtils.PARSE_FLAG_IGNORE_CASE | ParsingUtils.PARSE_FLAG_OPTIMIZE); + result = new ParsedNumber(); + caseFoldingParser.parse(inputString, true, result); + assertEquals("Folded: " + inputString + " on " + patternString, + 
expectedCaseFoldingChars, + result.charEnd); + } + } } diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/StringSegmentTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/StringSegmentTest.java index 016fa581c98..833df6537e9 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/StringSegmentTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/StringSegmentTest.java @@ -17,7 +17,7 @@ public class StringSegmentTest { @Test public void testOffset() { - StringSegment segment = new StringSegment(SAMPLE_STRING); + StringSegment segment = new StringSegment(SAMPLE_STRING, 0); assertEquals(0, segment.getOffset()); segment.adjustOffset(3); assertEquals(3, segment.getOffset()); @@ -29,7 +29,7 @@ public class StringSegmentTest { @Test public void testLength() { - StringSegment segment = new StringSegment(SAMPLE_STRING); + StringSegment segment = new StringSegment(SAMPLE_STRING, 0); assertEquals(11, segment.length()); segment.adjustOffset(3); assertEquals(8, segment.length()); @@ -43,7 +43,7 @@ public class StringSegmentTest { @Test public void testCharAt() { - StringSegment segment = new StringSegment(SAMPLE_STRING); + StringSegment segment = new StringSegment(SAMPLE_STRING, 0); assertCharSequenceEquals(SAMPLE_STRING, segment); segment.adjustOffset(3); assertCharSequenceEquals("radio 📻", segment); @@ -53,20 +53,20 @@ public class StringSegmentTest { @Test public void testGetCodePoint() { - StringSegment segment = new StringSegment(SAMPLE_STRING); + StringSegment segment = new StringSegment(SAMPLE_STRING, 0); assertEquals(0x1F4FB, segment.getCodePoint()); segment.setLength(1); - assertEquals(-1, segment.getCodePoint()); + assertEquals(0xD83D, segment.getCodePoint()); segment.resetLength(); segment.adjustOffset(1); - assertEquals(-1, segment.getCodePoint()); + assertEquals(0xDCFB, segment.getCodePoint()); segment.adjustOffset(1); assertEquals(0x20, segment.getCodePoint()); } @Test public void testCommonPrefixLength() { 
- StringSegment segment = new StringSegment(SAMPLE_STRING); + StringSegment segment = new StringSegment(SAMPLE_STRING, 0); assertEquals(11, segment.getCommonPrefixLength(SAMPLE_STRING)); assertEquals(4, segment.getCommonPrefixLength("📻 r")); assertEquals(3, segment.getCommonPrefixLength("📻 x")); diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java index e7995ed3af7..e0aff62172a 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java @@ -19,6 +19,7 @@ package com.ibm.icu.dev.test.rbbi; import java.text.CharacterIterator; import java.util.ArrayList; import java.util.List; +import java.util.Locale; import org.junit.Test; import org.junit.runner.RunWith; @@ -26,6 +27,7 @@ import org.junit.runners.JUnit4; import com.ibm.icu.dev.test.TestFmwk; import com.ibm.icu.text.BreakIterator; +import com.ibm.icu.text.RBBIDataWrapper; import com.ibm.icu.text.RuleBasedBreakIterator; import com.ibm.icu.util.ULocale; @@ -562,4 +564,62 @@ public class RBBITest extends TestFmwk { String rtRules = bi.toString(); // getRules() in C++ assertEquals("Break Iterator rule stripping test", "!!forward; $x = [ab#]; '#' '?'; ", rtRules); } + + @Test + public void TestTableRedundancies() { + RuleBasedBreakIterator bi = (RuleBasedBreakIterator)BreakIterator.getLineInstance(Locale.ENGLISH); + String rules = bi.toString(); + bi = new RuleBasedBreakIterator(rules); + // Build a break iterator from source rules. + // Want to check the rule builder in Java, not the pre-built rules that are imported from ICU4C. 
+ RBBIDataWrapper dw = bi.fRData; + short[] fwtbl = dw.fFTable; + int numCharClasses = dw.fHeader.fCatCount; + + // Check for duplicate columns (character categories) + List columns = new ArrayList(); + for (int column=0; column rows = new ArrayList(); + for (int r=0; r= -1); + s.append(fwtbl[row + RBBIDataWrapper.ACCEPTING]); + s.append(fwtbl[row + RBBIDataWrapper.LOOKAHEAD]); + s.append(fwtbl[row + RBBIDataWrapper.TAGIDX]); + for (int column=0; column