]> granicus.if.org Git - icu/commitdiff
ICU-13513 Improving parser creation time via better static initialization. Finishing...
authorShane Carr <shane@unicode.org>
Sat, 16 Dec 2017 10:04:40 +0000 (10:04 +0000)
committerShane Carr <shane@unicode.org>
Sat, 16 Dec 2017 10:04:40 +0000 (10:04 +0000)
X-SVN-Rev: 40742

12 files changed:
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/DecimalMatcher.java
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/IgnorablesMatcher.java
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/MinusSignMatcher.java
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NanMatcher.java
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/PercentMatcher.java
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/PermilleMatcher.java
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/PlusSignMatcher.java
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/RangeMatcher.java
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/SeparatorSetUtils.java [deleted file]
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/SymbolMatcher.java
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java [new file with mode: 0644]

index 12ff1e9640e6f534c014290f28aa869a98367057..0c2c38ad5894589b4ff8cc78e56a5ee9a3c1610e 100644 (file)
@@ -21,10 +21,11 @@ public class DecimalMatcher implements NumberParseMatcher {
     public boolean integerOnly = false;
     public boolean isScientific = false;
 
-    private UnicodeSet groupingUniSet;
-    private UnicodeSet decimalUniSet;
-    private UnicodeSet separatorSet;
-    private String[] digitStrings;
+    private UnicodeSet groupingUniSet = null;
+    private UnicodeSet decimalUniSet = null;
+    private UnicodeSet separatorSet = null;
+    private UnicodeSet separatorLeadChars = null;
+    private String[] digitStrings = null;
     private boolean frozen;
 
     public DecimalMatcher() {
@@ -35,10 +36,59 @@ public class DecimalMatcher implements NumberParseMatcher {
         assert !frozen;
         frozen = true;
 
-        groupingUniSet = SeparatorSetUtils.getGroupingUnicodeSet(symbols, isStrict);
-        decimalUniSet = SeparatorSetUtils.getDecimalUnicodeSet(symbols, isStrict);
-        separatorSet = SeparatorSetUtils.unionUnicodeSets(groupingUniSet, decimalUniSet);
-        digitStrings = symbols.getDigitStringsLocal();
+        String groupingSeparator = symbols.getGroupingSeparatorString();
+        String decimalSeparator = symbols.getDecimalSeparatorString();
+        UnicodeSetStaticCache.Key groupingKey, decimalKey;
+
+        // Attempt to find values in the static cache
+        if (isStrict) {
+            groupingKey = UnicodeSetStaticCache.chooseFrom(groupingSeparator,
+                    UnicodeSetStaticCache.Key.OTHER_GROUPING_SEPARATORS,
+                    UnicodeSetStaticCache.Key.STRICT_COMMA_OR_OTHER,
+                    UnicodeSetStaticCache.Key.STRICT_PERIOD_OR_OTHER);
+            decimalKey = UnicodeSetStaticCache.chooseFrom(decimalSeparator,
+                    UnicodeSetStaticCache.Key.STRICT_COMMA,
+                    UnicodeSetStaticCache.Key.STRICT_PERIOD);
+        } else {
+            groupingKey = UnicodeSetStaticCache.chooseFrom(groupingSeparator,
+                    UnicodeSetStaticCache.Key.OTHER_GROUPING_SEPARATORS,
+                    UnicodeSetStaticCache.Key.COMMA_OR_OTHER,
+                    UnicodeSetStaticCache.Key.PERIOD_OR_OTHER);
+            decimalKey = UnicodeSetStaticCache.chooseFrom(decimalSeparator,
+                    UnicodeSetStaticCache.Key.COMMA,
+                    UnicodeSetStaticCache.Key.PERIOD);
+        }
+
+        // Get the sets from the static cache if they were found
+        if (groupingKey != null && decimalKey != null) {
+            groupingUniSet = UnicodeSetStaticCache.get(groupingKey);
+            decimalUniSet = UnicodeSetStaticCache.get(decimalKey);
+            UnicodeSetStaticCache.Key separatorKey = UnicodeSetStaticCache.unionOf(groupingKey, decimalKey);
+            if (separatorKey != null) {
+                separatorSet = UnicodeSetStaticCache.get(separatorKey);
+                separatorLeadChars = UnicodeSetStaticCache.getLeadChars(separatorKey);
+            }
+        } else if (groupingKey != null) {
+            groupingUniSet = UnicodeSetStaticCache.get(groupingKey);
+        } else if (decimalKey != null) {
+            decimalUniSet = UnicodeSetStaticCache.get(decimalKey);
+        }
+
+        // Resolve fallbacks if we don't have sets from the static cache
+        if (groupingUniSet == null) {
+            groupingUniSet = new UnicodeSet().add(groupingSeparator).freeze();
+        }
+        if (decimalUniSet == null) {
+            decimalUniSet = new UnicodeSet().add(decimalSeparator).freeze();
+        }
+        if (separatorSet == null) {
+            separatorSet = new UnicodeSet().addAll(groupingUniSet).addAll(decimalUniSet).freeze();
+        }
+
+        int cpZero = symbols.getCodePointZero();
+        if (cpZero == -1 || !UCharacter.isDigit(cpZero) || UCharacter.digit(cpZero) != 0) {
+            digitStrings = symbols.getDigitStrings();
+        }
     }
 
     @Override
@@ -74,7 +124,7 @@ public class DecimalMatcher implements NumberParseMatcher {
             }
 
             // Try by digit string.
-            if (digit == -1) {
+            if (digit == -1 && digitStrings != null) {
                 for (int i = 0; i < digitStrings.length; i++) {
                     String str = digitStrings[i];
                     int overlap = segment.getCommonPrefixLength(str);
@@ -190,16 +240,20 @@ public class DecimalMatcher implements NumberParseMatcher {
         return segment.length() == 0 || hasPartialPrefix || segment.isLeadingSurrogate();
     }
 
-    private static final UnicodeSet UNISET_DIGITS = new UnicodeSet("[:digit:]");
-
     @Override
     public UnicodeSet getLeadChars(boolean ignoreCase) {
         UnicodeSet leadChars = new UnicodeSet();
-        ParsingUtils.putLeadSurrogates(UNISET_DIGITS, leadChars);
-        for (int i = 0; i < digitStrings.length; i++) {
-            ParsingUtils.putLeadingChar(digitStrings[i], leadChars, ignoreCase);
+        leadChars.addAll(UnicodeSetStaticCache.getLeadChars(UnicodeSetStaticCache.Key.DIGITS));
+        if (digitStrings != null) {
+            for (int i = 0; i < digitStrings.length; i++) {
+                ParsingUtils.putLeadingChar(digitStrings[i], leadChars, ignoreCase);
+            }
+        }
+        if (separatorLeadChars != null) {
+            leadChars.addAll(separatorLeadChars);
+        } else {
+            ParsingUtils.putLeadSurrogates(separatorSet, leadChars);
         }
-        ParsingUtils.putLeadSurrogates(separatorSet, leadChars);
         return leadChars.freeze();
     }
 
index 0e008b4120670931994ddc8ced68a97f719b9c11..610572dea8cf7dfb12519b4c37de361a5551d720 100644 (file)
@@ -10,37 +10,32 @@ import com.ibm.icu.text.UnicodeSet;
  */
 public class IgnorablesMatcher extends RangeMatcher {
 
-    // BiDi characters are skipped over and ignored at any point in the string, even in strict mode.
-    static final UnicodeSet UNISET_BIDI = new UnicodeSet("[[\\u200E\\u200F\\u061C]]").freeze();
+    public static final IgnorablesMatcher DEFAULT = new IgnorablesMatcher(
+            UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.DEFAULT_IGNORABLES));
 
-    // This set was decided after discussion with icu-design@. See ticket #13309.
-    // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property).
-    static final UnicodeSet UNISET_WHITESPACE = new UnicodeSet("[[:Zs:][\\u0009]]").freeze();
-
-    /** The default set of ignorables. */
-    static final UnicodeSet DEFAULT_UNISET = UNISET_BIDI.cloneAsThawed().addAll(UNISET_WHITESPACE).freeze();
-
-    /** The default set of ignorables for strict mode. */
-    static final UnicodeSet STRICT_UNISET = UNISET_BIDI;
-
-    private static final IgnorablesMatcher DEFAULT_INSTANCE = new IgnorablesMatcher(DEFAULT_UNISET);
-    private static final IgnorablesMatcher STRICT_INSTANCE = new IgnorablesMatcher(STRICT_UNISET);
+    public static final IgnorablesMatcher STRICT = new IgnorablesMatcher(
+            UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.STRICT_IGNORABLES));
 
     public static IgnorablesMatcher getInstance(UnicodeSet ignorables) {
         assert ignorables.isFrozen();
-        if (ignorables == DEFAULT_UNISET || ignorables.equals(DEFAULT_UNISET)) {
-            return DEFAULT_INSTANCE;
-        } else if (ignorables == STRICT_UNISET || ignorables.equals(STRICT_UNISET)) {
-            return STRICT_INSTANCE;
-        } else {
-            return new IgnorablesMatcher(ignorables);
-        }
+        return new IgnorablesMatcher(ignorables);
     }
 
     private IgnorablesMatcher(UnicodeSet ignorables) {
         super(ignorables);
     }
 
+    @Override
+    public UnicodeSet getLeadChars(boolean ignoreCase) {
+        if (this == DEFAULT) {
+            return UnicodeSetStaticCache.getLeadChars(UnicodeSetStaticCache.Key.DEFAULT_IGNORABLES);
+        } else if (this == STRICT) {
+            return UnicodeSetStaticCache.getLeadChars(UnicodeSetStaticCache.Key.STRICT_IGNORABLES);
+        } else {
+            return super.getLeadChars(ignoreCase);
+        }
+    }
+
     @Override
     protected boolean isDisabled(ParsedNumber result) {
         return false;
index e41ffc8c5c46b8e06956815755cfe0cb475ecab8..7d266d73e050e8f0624a1713171cc7ccd4603a62 100644 (file)
@@ -2,6 +2,7 @@
 // License & terms of use: http://www.unicode.org/copyright.html#License
 package com.ibm.icu.impl.number.parse;
 
+import com.ibm.icu.text.DecimalFormatSymbols;
 import com.ibm.icu.text.UnicodeSet;
 
 /**
@@ -10,9 +11,23 @@ import com.ibm.icu.text.UnicodeSet;
  */
 public class MinusSignMatcher extends SymbolMatcher {
 
-    public MinusSignMatcher() {
-        // FIXME
-        super("-", new UnicodeSet("[-_]"));
+    private static final MinusSignMatcher DEFAULT = new MinusSignMatcher();
+
+    public static MinusSignMatcher getInstance(DecimalFormatSymbols symbols) {
+        String symbolString = symbols.getMinusSignString();
+        if (DEFAULT.uniSet.contains(symbolString)) {
+            return DEFAULT;
+        } else {
+            return new MinusSignMatcher(symbolString);
+        }
+    }
+
+    private MinusSignMatcher(String symbolString) {
+        super(symbolString, UnicodeSet.EMPTY);
+    }
+
+    private MinusSignMatcher() {
+        super(UnicodeSetStaticCache.Key.MINUS_SIGN);
     }
 
     @Override
index 6e03beb65dc7aadafc5c4171e370d3156982c93f..40911fa86f6a7a18acc6677c536fa6287b92863b 100644 (file)
@@ -11,8 +11,23 @@ import com.ibm.icu.text.UnicodeSet;
  */
 public class NanMatcher extends SymbolMatcher {
 
-    public NanMatcher(DecimalFormatSymbols symbols) {
-        super(symbols.getNaN(), UnicodeSet.EMPTY);
+    private static final NanMatcher DEFAULT = new NanMatcher();
+
+    public static NanMatcher getInstance(DecimalFormatSymbols symbols) {
+        String symbolString = symbols.getNaN();
+        if (DEFAULT.string.equals(symbolString)) {
+            return DEFAULT;
+        } else {
+            return new NanMatcher(symbolString);
+        }
+    }
+
+    private NanMatcher(String symbolString) {
+        super(symbolString, UnicodeSet.EMPTY);
+    }
+
+    private NanMatcher() {
+        super("NaN", UnicodeSet.EMPTY);
     }
 
     @Override
index f115e6eb95fc0e2c3c417671a6449c5499455d1d..2aecf2182b7b9420fddf06a937847fec4d162913 100644 (file)
@@ -39,14 +39,14 @@ public class NumberParserImpl {
         AffixPatternProvider patternInfo = PatternStringParser.parseToPatternInfo(pattern);
         AffixMatcher.generateFromAffixPatternProvider(patternInfo, parser, new UnicodeSet(), true);
 
-        parser.addMatcher(IgnorablesMatcher.getInstance(IgnorablesMatcher.DEFAULT_UNISET));
+        parser.addMatcher(IgnorablesMatcher.DEFAULT);
         DecimalMatcher decimalMatcher = new DecimalMatcher();
         decimalMatcher.requireGroupingMatch = strictGrouping;
         decimalMatcher.grouping1 = 3;
         decimalMatcher.grouping2 = 2;
         decimalMatcher.freeze(symbols, false);
         parser.addMatcher(decimalMatcher);
-        parser.addMatcher(new MinusSignMatcher());
+        parser.addMatcher(MinusSignMatcher.getInstance(symbols));
         parser.addMatcher(new ScientificMatcher(symbols));
         parser.addMatcher(new CurrencyMatcher(locale));
         parser.addMatcher(new RequireNumberMatcher());
@@ -109,7 +109,7 @@ public class NumberParserImpl {
         ULocale locale = symbols.getULocale();
         Currency currency = CustomSymbolCurrency.resolve(properties.getCurrency(), locale, symbols);
         boolean isStrict = properties.getParseMode() == ParseMode.STRICT;
-        UnicodeSet ignorables = isStrict ? IgnorablesMatcher.STRICT_UNISET : IgnorablesMatcher.DEFAULT_UNISET;
+        IgnorablesMatcher ignorables = isStrict ? IgnorablesMatcher.STRICT : IgnorablesMatcher.DEFAULT;
 
         boolean decimalSeparatorRequired = properties.getDecimalPatternMatchRequired()
                 ? (properties.getDecimalSeparatorAlwaysShown() || properties.getMaximumFractionDigits() != 0)
@@ -121,7 +121,7 @@ public class NumberParserImpl {
 
         // Set up a pattern modifier with mostly defaults to generate AffixMatchers.
         AffixPatternProvider patternInfo = new PropertiesAffixPatternProvider(properties);
-        AffixMatcher.generateFromAffixPatternProvider(patternInfo, parser, ignorables, !isStrict);
+        AffixMatcher.generateFromAffixPatternProvider(patternInfo, parser, ignorables.getSet(), !isStrict);
 
         ////////////////////////
         /// CURRENCY MATCHER ///
@@ -135,16 +135,14 @@ public class NumberParserImpl {
         /// OTHER STANDARD MATCHERS ///
         ///////////////////////////////
 
-        if (!isStrict) {
-            parser.addMatcher(IgnorablesMatcher.getInstance(ignorables));
+        parser.addMatcher(ignorables);
+        if (!isStrict || patternInfo.containsSymbolType(AffixUtils.TYPE_PLUS_SIGN) || properties.getSignAlwaysShown()) {
+            parser.addMatcher(PlusSignMatcher.getInstance(symbols));
         }
-        if (!isStrict || patternInfo.containsSymbolType(AffixUtils.TYPE_PLUS_SIGN)) {
-            parser.addMatcher(new PlusSignMatcher());
-        }
-        parser.addMatcher(new MinusSignMatcher());
-        parser.addMatcher(new NanMatcher(symbols));
-        parser.addMatcher(new PercentMatcher());
-        parser.addMatcher(new PermilleMatcher());
+        parser.addMatcher(MinusSignMatcher.getInstance(symbols));
+        parser.addMatcher(NanMatcher.getInstance(symbols));
+        parser.addMatcher(PercentMatcher.getInstance(symbols));
+        parser.addMatcher(PermilleMatcher.getInstance(symbols));
         DecimalMatcher decimalMatcher = new DecimalMatcher();
         decimalMatcher.requireGroupingMatch = isStrict;
         decimalMatcher.groupingEnabled = properties.getGroupingSize() > 0;
index 5dca1c4861725a41a827ad02769659c517ac8060..f97839cac44811f2c952698f7b49878482d4588d 100644 (file)
@@ -2,6 +2,7 @@
 // License & terms of use: http://www.unicode.org/copyright.html#License
 package com.ibm.icu.impl.number.parse;
 
+import com.ibm.icu.text.DecimalFormatSymbols;
 import com.ibm.icu.text.UnicodeSet;
 
 /**
@@ -10,9 +11,23 @@ import com.ibm.icu.text.UnicodeSet;
  */
 public class PercentMatcher extends SymbolMatcher {
 
-    public PercentMatcher() {
-        // FIXME
-        super("%", new UnicodeSet("[%]"));
+    private static final PercentMatcher DEFAULT = new PercentMatcher();
+
+    public static PercentMatcher getInstance(DecimalFormatSymbols symbols) {
+        String symbolString = symbols.getPercentString();
+        if (DEFAULT.uniSet.contains(symbolString)) {
+            return DEFAULT;
+        } else {
+            return new PercentMatcher(symbolString);
+        }
+    }
+
+    private PercentMatcher(String symbolString) {
+        super(symbolString, UnicodeSet.EMPTY);
+    }
+
+    private PercentMatcher() {
+        super(UnicodeSetStaticCache.Key.PERCENT_SIGN);
     }
 
     @Override
index f8ea624078e511f51c2fa9543331c555a34dca85..a03946aa49461e5977b2b79a39ee65866ac3ade3 100644 (file)
@@ -2,6 +2,7 @@
 // License & terms of use: http://www.unicode.org/copyright.html#License
 package com.ibm.icu.impl.number.parse;
 
+import com.ibm.icu.text.DecimalFormatSymbols;
 import com.ibm.icu.text.UnicodeSet;
 
 /**
@@ -10,9 +11,23 @@ import com.ibm.icu.text.UnicodeSet;
  */
 public class PermilleMatcher extends SymbolMatcher {
 
-    public PermilleMatcher() {
-        // FIXME
-        super("‰", new UnicodeSet("[‰]"));
+    private static final PermilleMatcher DEFAULT = new PermilleMatcher();
+
+    public static PermilleMatcher getInstance(DecimalFormatSymbols symbols) {
+        String symbolString = symbols.getPerMillString();
+        if (DEFAULT.uniSet.contains(symbolString)) {
+            return DEFAULT;
+        } else {
+            return new PermilleMatcher(symbolString);
+        }
+    }
+
+    private PermilleMatcher(String symbolString) {
+        super(symbolString, UnicodeSet.EMPTY);
+    }
+
+    private PermilleMatcher() {
+        super(UnicodeSetStaticCache.Key.PERMILLE_SIGN);
     }
 
     @Override
index 254c779bdc6be1c70e0f9fd22d1c3bad445ad23d..d902009f5b9ab303b85d15c520f91794ef816d49 100644 (file)
@@ -2,6 +2,7 @@
 // License & terms of use: http://www.unicode.org/copyright.html#License
 package com.ibm.icu.impl.number.parse;
 
+import com.ibm.icu.text.DecimalFormatSymbols;
 import com.ibm.icu.text.UnicodeSet;
 
 /**
@@ -10,9 +11,23 @@ import com.ibm.icu.text.UnicodeSet;
  */
 public class PlusSignMatcher extends SymbolMatcher {
 
-    public PlusSignMatcher() {
-        // FIXME
-        super("+", new UnicodeSet("[+]"));
+    private static final PlusSignMatcher DEFAULT = new PlusSignMatcher();
+
+    public static PlusSignMatcher getInstance(DecimalFormatSymbols symbols) {
+        String symbolString = symbols.getPlusSignString();
+        if (DEFAULT.uniSet.contains(symbolString)) {
+            return DEFAULT;
+        } else {
+            return new PlusSignMatcher(symbolString);
+        }
+    }
+
+    private PlusSignMatcher(String symbolString) {
+        super(symbolString, UnicodeSet.EMPTY);
+    }
+
+    private PlusSignMatcher() {
+        super(UnicodeSetStaticCache.Key.PLUS_SIGN);
     }
 
     @Override
index 512e6cf0a7fbdb6b94d9e507d5e7f79b736b101d..8451059359ba0c632a10cfeb5a0de5e6cb512056 100644 (file)
@@ -15,6 +15,10 @@ public abstract class RangeMatcher implements NumberParseMatcher {
         this.uniSet = uniSet;
     }
 
+    public UnicodeSet getSet() {
+        return uniSet;
+    }
+
     @Override
     public boolean match(StringSegment segment, ParsedNumber result) {
         // Smoke test first; this matcher might be disabled.
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/SeparatorSetUtils.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/SeparatorSetUtils.java
deleted file mode 100644 (file)
index 16b2be1..0000000
+++ /dev/null
@@ -1,109 +0,0 @@
-// © 2017 and later: Unicode, Inc. and others.
-// License & terms of use: http://www.unicode.org/copyright.html#License
-package com.ibm.icu.impl.number.parse;
-
-import com.ibm.icu.text.DecimalFormatSymbols;
-import com.ibm.icu.text.UnicodeSet;
-
-/**
- * @author sffc
- *
- */
-public class SeparatorSetUtils {
-
-    // TODO: Re-generate these sets from the database. They probably haven't been updated in a while.
-
-    static final UnicodeSet COMMA_LIKE = new UnicodeSet(
-            "[,\\u060C\\u066B\\u3001\\uFE10\\uFE11\\uFE50\\uFE51\\uFF0C\\uFF64]").freeze();
-
-    static final UnicodeSet STRICT_COMMA_LIKE = new UnicodeSet("[,\\u066B\\uFE10\\uFE50\\uFF0C]").freeze();
-
-    static final UnicodeSet PERIOD_LIKE = new UnicodeSet("[.\\u2024\\u3002\\uFE12\\uFE52\\uFF0E\\uFF61]").freeze();
-
-    static final UnicodeSet STRICT_PERIOD_LIKE = new UnicodeSet("[.\\u2024\\uFE52\\uFF0E\\uFF61]").freeze();
-
-    static final UnicodeSet OTHER_GROUPING_SEPARATORS = new UnicodeSet(
-            "[\\ '\\u00A0\\u066C\\u2000-\\u200A\\u2018\\u2019\\u202F\\u205F\\u3000\\uFF07]").freeze();
-
-    static final UnicodeSet COMMA_OR_PERIOD_LIKE = new UnicodeSet().addAll(COMMA_LIKE).addAll(PERIOD_LIKE).freeze();
-
-    static final UnicodeSet STRICT_COMMA_OR_PERIOD_LIKE = new UnicodeSet().addAll(STRICT_COMMA_LIKE)
-            .addAll(STRICT_PERIOD_LIKE).freeze();
-
-    static final UnicodeSet COMMA_LIKE_OR_OTHER = new UnicodeSet().addAll(COMMA_LIKE).addAll(OTHER_GROUPING_SEPARATORS)
-            .freeze();
-
-    static final UnicodeSet STRICT_COMMA_LIKE_OR_OTHER = new UnicodeSet().addAll(STRICT_COMMA_LIKE)
-            .addAll(OTHER_GROUPING_SEPARATORS).freeze();
-
-    static final UnicodeSet PERIOD_LIKE_OR_OTHER = new UnicodeSet().addAll(PERIOD_LIKE)
-            .addAll(OTHER_GROUPING_SEPARATORS).freeze();
-
-    static final UnicodeSet STRICT_PERIOD_LIKE_OR_OTHER = new UnicodeSet().addAll(STRICT_PERIOD_LIKE)
-            .addAll(OTHER_GROUPING_SEPARATORS).freeze();
-
-    static final UnicodeSet COMMA_OR_PERIOD_LIKE_OR_OTHER = new UnicodeSet().addAll(COMMA_LIKE).addAll(PERIOD_LIKE)
-            .addAll(OTHER_GROUPING_SEPARATORS).freeze();
-
-    static final UnicodeSet STRICT_COMMA_OR_PERIOD_LIKE_OR_OTHER = new UnicodeSet().addAll(STRICT_COMMA_LIKE)
-            .addAll(STRICT_PERIOD_LIKE).addAll(OTHER_GROUPING_SEPARATORS).freeze();
-
-    public static UnicodeSet getGroupingUnicodeSet(DecimalFormatSymbols symbols, boolean isStrict) {
-        if (isStrict) {
-            return chooseUnicodeSet(symbols.getGroupingSeparatorString(),
-                    STRICT_COMMA_LIKE_OR_OTHER,
-                    STRICT_PERIOD_LIKE_OR_OTHER,
-                    OTHER_GROUPING_SEPARATORS);
-        } else {
-            return chooseUnicodeSet(symbols.getGroupingSeparatorString(),
-                    COMMA_LIKE_OR_OTHER,
-                    PERIOD_LIKE_OR_OTHER,
-                    OTHER_GROUPING_SEPARATORS);
-        }
-    }
-
-    public static UnicodeSet getDecimalUnicodeSet(DecimalFormatSymbols symbols, boolean isStrict) {
-        if (isStrict) {
-            return chooseUnicodeSet(symbols.getDecimalSeparatorString(), STRICT_COMMA_LIKE, STRICT_PERIOD_LIKE);
-        } else {
-            return chooseUnicodeSet(symbols.getDecimalSeparatorString(), COMMA_LIKE, PERIOD_LIKE);
-        }
-    }
-
-    private static UnicodeSet chooseUnicodeSet(String str, UnicodeSet set1) {
-        return set1.contains(str) ? set1 : new UnicodeSet().add(str).freeze();
-    }
-
-    private static UnicodeSet chooseUnicodeSet(String str, UnicodeSet set1, UnicodeSet set2) {
-        return set1.contains(str) ? set1 : chooseUnicodeSet(str, set2);
-    }
-
-    private static UnicodeSet chooseUnicodeSet(String str, UnicodeSet set1, UnicodeSet set2, UnicodeSet set3) {
-        return set1.contains(str) ? set1 : chooseUnicodeSet(str, set2, set3);
-    }
-
-    public static UnicodeSet unionUnicodeSets(UnicodeSet set1, UnicodeSet set2) {
-        // Note: == operators should be okay here since non-static UnicodeSets happen only in fallback cases.
-        if (set1 == UnicodeSet.EMPTY && set2 == UnicodeSet.EMPTY) {
-            return UnicodeSet.EMPTY;
-        } else if (set1 == COMMA_LIKE_OR_OTHER && set2 == PERIOD_LIKE_OR_OTHER) {
-            return COMMA_OR_PERIOD_LIKE_OR_OTHER;
-        } else if (set1 == PERIOD_LIKE_OR_OTHER && set2 == COMMA_LIKE_OR_OTHER) {
-            return COMMA_OR_PERIOD_LIKE_OR_OTHER;
-        } else if (set1 == STRICT_COMMA_LIKE_OR_OTHER && set2 == STRICT_PERIOD_LIKE_OR_OTHER) {
-            return STRICT_COMMA_OR_PERIOD_LIKE_OR_OTHER;
-        } else if (set1 == STRICT_PERIOD_LIKE_OR_OTHER && set2 == STRICT_COMMA_LIKE_OR_OTHER) {
-            return STRICT_COMMA_OR_PERIOD_LIKE_OR_OTHER;
-        } else if (set1 == COMMA_LIKE && set2 == PERIOD_LIKE) {
-            return COMMA_OR_PERIOD_LIKE;
-        } else if (set1 == PERIOD_LIKE && set2 == COMMA_LIKE) {
-            return COMMA_OR_PERIOD_LIKE;
-        } else if (set1 == STRICT_COMMA_LIKE && set2 == STRICT_PERIOD_LIKE) {
-            return STRICT_COMMA_OR_PERIOD_LIKE;
-        } else if (set1 == STRICT_PERIOD_LIKE && set2 == STRICT_COMMA_LIKE) {
-            return STRICT_COMMA_OR_PERIOD_LIKE;
-        } else {
-            return set1.cloneAsThawed().addAll(set2).freeze();
-        }
-    }
-}
index 5f5f4f111f82d4eed040773f1471057faedcbfa2..11af03339d19ea96a74fdce349cdaf68cf27c94c 100644 (file)
@@ -11,10 +11,21 @@ import com.ibm.icu.text.UnicodeSet;
 public abstract class SymbolMatcher implements NumberParseMatcher {
     protected final String string;
     protected final UnicodeSet uniSet;
+    protected final UnicodeSet leadChars;
+
+    // TODO: Implement this class using only UnicodeSet and not String?
+    // How to deal with case folding?
 
     protected SymbolMatcher(String symbolString, UnicodeSet symbolUniSet) {
         string = symbolString;
         uniSet = symbolUniSet;
+        leadChars = null;
+    }
+
+    protected SymbolMatcher(UnicodeSetStaticCache.Key key) {
+        string = "";
+        uniSet = UnicodeSetStaticCache.get(key);
+        leadChars = UnicodeSetStaticCache.getLeadChars(key);
     }
 
     @Override
@@ -30,6 +41,10 @@ public abstract class SymbolMatcher implements NumberParseMatcher {
             accept(segment, result);
             return false;
         }
+
+        if (string.isEmpty()) {
+            return segment.isLeadingSurrogate();
+        }
         int overlap = segment.getCommonPrefixLength(string);
         if (overlap == string.length()) {
             segment.adjustOffset(string.length());
@@ -41,6 +56,10 @@ public abstract class SymbolMatcher implements NumberParseMatcher {
 
     @Override
     public UnicodeSet getLeadChars(boolean ignoreCase) {
+        if (leadChars != null) {
+            return leadChars;
+        }
+
         UnicodeSet leadChars = new UnicodeSet();
         ParsingUtils.putLeadSurrogates(uniSet, leadChars);
         ParsingUtils.putLeadingChar(string, leadChars, ignoreCase);
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java
new file mode 100644 (file)
index 0000000..28ab775
--- /dev/null
@@ -0,0 +1,256 @@
+// © 2017 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package com.ibm.icu.impl.number.parse;
+
+import java.util.EnumMap;
+import java.util.Map;
+
+import com.ibm.icu.text.UnicodeSet;
+
+/**
+ * Static cache of the frozen {@link UnicodeSet}s used by the number parsing matchers, indexed by
+ * {@link Key}. Every set is built exactly once, in this class's static initializer, so that creating a
+ * parser does not need to construct these sets again. A second table stores, for each key, a "lead
+ * chars" set derived from the main set by {@link ParsingUtils#putLeadSurrogates}.
+ *
+ * @author sffc
+ */
+public class UnicodeSetStaticCache {
+
+    /** Identifies one cached set. The static initializer fills in an entry for every constant. */
+    public static enum Key {
+        // Ignorables
+        BIDI,
+        WHITESPACE,
+        DEFAULT_IGNORABLES,
+        STRICT_IGNORABLES,
+
+        // Separators
+        COMMA,
+        PERIOD,
+        OTHER_GROUPING_SEPARATORS,
+        COMMA_OR_OTHER,
+        PERIOD_OR_OTHER,
+        COMMA_OR_PERIOD_OR_OTHER,
+        STRICT_COMMA,
+        STRICT_PERIOD,
+        STRICT_COMMA_OR_OTHER,
+        STRICT_PERIOD_OR_OTHER,
+        STRICT_COMMA_OR_PERIOD_OR_OTHER,
+
+        // Symbols
+        // TODO: NaN?
+        MINUS_SIGN,
+        PLUS_SIGN,
+        PERCENT_SIGN,
+        PERMILLE_SIGN,
+        INFINITY,
+
+        // Other
+        DIGITS,
+    }
+
+    /** The cached sets; populated by the static initializer at the bottom of this class. */
+    private static final Map<Key, UnicodeSet> unicodeSets = new EnumMap<Key, UnicodeSet>(Key.class);
+
+    /** For each key, the lead-chars set computed from the matching entry of {@link #unicodeSets}. */
+    private static final Map<Key, UnicodeSet> leadCharsSets = new EnumMap<Key, UnicodeSet>(Key.class);
+
+    /**
+     * Returns the cached set for the given key.
+     *
+     * @param key the set to look up
+     * @return the frozen set registered for {@code key}
+     */
+    public static UnicodeSet get(Key key) {
+        return unicodeSets.get(key);
+    }
+
+    /**
+     * Returns the precomputed lead-chars set for the given key.
+     *
+     * @param key the set whose lead chars are wanted
+     * @return the frozen lead-chars set registered for {@code key}
+     */
+    public static UnicodeSet getLeadChars(Key key) {
+        return leadCharsSets.get(key);
+    }
+
+    /**
+     * Returns {@code key1} if its set contains {@code str}, otherwise {@code null}.
+     *
+     * @param str the string (for example a locale's separator symbol) to look for
+     * @param key1 the only candidate
+     * @return {@code key1}, or {@code null} if its set does not contain {@code str}
+     */
+    public static Key chooseFrom(String str, Key key1) {
+        return get(key1).contains(str) ? key1 : null;
+    }
+
+    /**
+     * Returns the first of the candidate keys, in argument order, whose set contains {@code str}.
+     *
+     * @param str the string to look for
+     * @param key1 the first candidate
+     * @param key2 the second candidate
+     * @return the first matching key, or {@code null} if no candidate's set contains {@code str}
+     */
+    public static Key chooseFrom(String str, Key key1, Key key2) {
+        return get(key1).contains(str) ? key1 : chooseFrom(str, key2);
+    }
+
+    /**
+     * Returns the first of the candidate keys, in argument order, whose set contains {@code str}.
+     *
+     * @param str the string to look for
+     * @param key1 the first candidate
+     * @param key2 the second candidate
+     * @param key3 the third candidate
+     * @return the first matching key, or {@code null} if no candidate's set contains {@code str}
+     */
+    public static Key chooseFrom(String str, Key key1, Key key2, Key key3) {
+        return get(key1).contains(str) ? key1 : chooseFrom(str, key2, key3);
+    }
+
+    /**
+     * Finds the key of a cached set that equals the union of the sets of the two given keys. Only
+     * pairings of one decimal-separator key with one grouping-separator key are known (see
+     * {@link #lookUpUnion}); the two keys may be passed in either order.
+     *
+     * @param key1 one of the two sets to combine
+     * @param key2 the other set to combine
+     * @return the key of the precomputed union, or {@code null} if there is none, in which case the
+     *         caller has to build the union itself
+     */
+    public static Key unionOf(Key key1, Key key2) {
+        // Try both argument orders instead of sorting the keys by ordinal(): the STRICT_* decimal keys
+        // are declared after OTHER_GROUPING_SEPARATORS, so an ordinal sort always moved them into second
+        // place and the STRICT_COMMA/STRICT_PERIOD + OTHER_GROUPING_SEPARATORS pairs could never match.
+        Key union = lookUpUnion(key1, key2);
+        return union != null ? union : lookUpUnion(key2, key1);
+    }
+
+    /** Looks up the precomputed union of a decimal-separator key and a grouping-separator key. */
+    private static Key lookUpUnion(Key decimalKey, Key groupingKey) {
+        switch (decimalKey) {
+        case COMMA:
+            if (groupingKey == Key.PERIOD_OR_OTHER) {
+                // 1.234,567
+                return Key.COMMA_OR_PERIOD_OR_OTHER;
+            } else if (groupingKey == Key.OTHER_GROUPING_SEPARATORS) {
+                // 1'234,567
+                return Key.COMMA_OR_OTHER;
+            }
+            break;
+
+        case PERIOD:
+            if (groupingKey == Key.COMMA_OR_OTHER) {
+                // 1,234.567
+                return Key.COMMA_OR_PERIOD_OR_OTHER;
+            } else if (groupingKey == Key.OTHER_GROUPING_SEPARATORS) {
+                // 1'234.567
+                return Key.PERIOD_OR_OTHER;
+            }
+            break;
+
+        case STRICT_COMMA:
+            if (groupingKey == Key.STRICT_PERIOD_OR_OTHER) {
+                // Strict 1.234,567
+                return Key.STRICT_COMMA_OR_PERIOD_OR_OTHER;
+            } else if (groupingKey == Key.OTHER_GROUPING_SEPARATORS) {
+                // Strict 1'234,567
+                return Key.STRICT_COMMA_OR_OTHER;
+            }
+            break;
+
+        case STRICT_PERIOD:
+            if (groupingKey == Key.STRICT_COMMA_OR_OTHER) {
+                // Strict 1,234.567
+                return Key.STRICT_COMMA_OR_PERIOD_OR_OTHER;
+            } else if (groupingKey == Key.OTHER_GROUPING_SEPARATORS) {
+                // Strict 1'234.567
+                return Key.STRICT_PERIOD_OR_OTHER;
+            }
+            break;
+
+        default:
+            break;
+        }
+        return null;
+    }
+
+    /** Builds the frozen union of the two already-cached sets {@code k1} and {@code k2}. */
+    private static UnicodeSet computeUnion(Key k1, Key k2) {
+        return new UnicodeSet().addAll(get(k1)).addAll(get(k2)).freeze();
+    }
+
+    /** Builds the frozen union of the three already-cached sets {@code k1}, {@code k2} and {@code k3}. */
+    private static UnicodeSet computeUnion(Key k1, Key k2, Key k3) {
+        return new UnicodeSet().addAll(get(k1)).addAll(get(k2)).addAll(get(k3)).freeze();
+    }
+
+    // Populates both tables. Union entries are computed from entries stored earlier in this block, so the
+    // order of the put() calls matters.
+    static {
+        // BiDi characters are skipped over and ignored at any point in the string, even in strict mode.
+        unicodeSets.put(Key.BIDI, new UnicodeSet("[[\\u200E\\u200F\\u061C]]").freeze());
+
+        // This set was decided after discussion with icu-design@. See ticket #13309.
+        // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property).
+        unicodeSets.put(Key.WHITESPACE, new UnicodeSet("[[:Zs:][\\u0009]]").freeze());
+
+        unicodeSets.put(Key.DEFAULT_IGNORABLES, computeUnion(Key.BIDI, Key.WHITESPACE));
+        unicodeSets.put(Key.STRICT_IGNORABLES, get(Key.BIDI));
+
+        // TODO: Re-generate these sets from the UCD. They probably haven't been updated in a while.
+        unicodeSets.put(Key.COMMA,
+                new UnicodeSet("[,\\u060C\\u066B\\u3001\\uFE10\\uFE11\\uFE50\\uFE51\\uFF0C\\uFF64]").freeze());
+        unicodeSets.put(Key.STRICT_COMMA, new UnicodeSet("[,\\u066B\\uFE10\\uFE50\\uFF0C]").freeze());
+        unicodeSets.put(Key.PERIOD, new UnicodeSet("[.\\u2024\\u3002\\uFE12\\uFE52\\uFF0E\\uFF61]").freeze());
+        unicodeSets.put(Key.STRICT_PERIOD, new UnicodeSet("[.\\u2024\\uFE52\\uFF0E\\uFF61]").freeze());
+        unicodeSets.put(Key.OTHER_GROUPING_SEPARATORS,
+                new UnicodeSet("[\\ '\\u00A0\\u066C\\u2000-\\u200A\\u2018\\u2019\\u202F\\u205F\\u3000\\uFF07]")
+                        .freeze());
+
+        unicodeSets.put(Key.COMMA_OR_OTHER, computeUnion(Key.COMMA, Key.OTHER_GROUPING_SEPARATORS));
+        unicodeSets.put(Key.PERIOD_OR_OTHER, computeUnion(Key.PERIOD, Key.OTHER_GROUPING_SEPARATORS));
+        unicodeSets.put(Key.COMMA_OR_PERIOD_OR_OTHER,
+                computeUnion(Key.COMMA, Key.PERIOD, Key.OTHER_GROUPING_SEPARATORS));
+        unicodeSets.put(Key.STRICT_COMMA_OR_OTHER, computeUnion(Key.STRICT_COMMA, Key.OTHER_GROUPING_SEPARATORS));
+        unicodeSets.put(Key.STRICT_PERIOD_OR_OTHER, computeUnion(Key.STRICT_PERIOD, Key.OTHER_GROUPING_SEPARATORS));
+        unicodeSets.put(Key.STRICT_COMMA_OR_PERIOD_OR_OTHER,
+                computeUnion(Key.STRICT_COMMA, Key.STRICT_PERIOD, Key.OTHER_GROUPING_SEPARATORS));
+
+        // UnicodeSet(int...) takes inclusive start/end pairs, hence each code point is listed twice.
+        unicodeSets.put(Key.MINUS_SIGN,
+                new UnicodeSet(0x002D, 0x002D,
+                        0x207B, 0x207B,
+                        0x208B, 0x208B,
+                        0x2212, 0x2212,
+                        0x2796, 0x2796,
+                        0xFE63, 0xFE63,
+                        0xFF0D, 0xFF0D).freeze());
+        unicodeSets.put(Key.PLUS_SIGN,
+                new UnicodeSet(0x002B, 0x002B,
+                        0x207A, 0x207A,
+                        0x208A, 0x208A,
+                        0x2795, 0x2795,
+                        0xFB29, 0xFB29,
+                        0xFE62, 0xFE62,
+                        0xFF0B, 0xFF0B).freeze());
+
+        // TODO: Fill in the next three sets.
+        unicodeSets.put(Key.PERCENT_SIGN, new UnicodeSet("[%٪]").freeze());
+        unicodeSets.put(Key.PERMILLE_SIGN, new UnicodeSet("[‰؉]").freeze());
+        unicodeSets.put(Key.INFINITY, new UnicodeSet("[∞]").freeze());
+
+        unicodeSets.put(Key.DIGITS, new UnicodeSet("[:digit:]").freeze());
+
+        // Derive the lead-chars table last, once every key has its main set.
+        for (Key key : Key.values()) {
+            UnicodeSet leadChars = new UnicodeSet();
+            ParsingUtils.putLeadSurrogates(get(key), leadChars);
+            leadCharsSets.put(key, leadChars.freeze());
+        }
+    }
+}