ICU-13513 Improving parser creation time via better static initialization. Finishing...

author Shane Carr <shane@unicode.org>
Sat, 16 Dec 2017 10:04:40 +0000 (10:04 +0000)
committer Shane Carr <shane@unicode.org>
Sat, 16 Dec 2017 10:04:40 +0000 (10:04 +0000)
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/DecimalMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/DecimalMatcher.java

index 12ff1e9640e6f534c014290f28aa869a98367057..0c2c38ad5894589b4ff8cc78e56a5ee9a3c1610e 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/DecimalMatcher.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/DecimalMatcher.java
@@ -21,10 +21,11 @@ public class DecimalMatcher implements NumberParseMatcher {
      public boolean integerOnly = false;
      public boolean isScientific = false;
  
-    private UnicodeSet groupingUniSet;
-    private UnicodeSet decimalUniSet;
-    private UnicodeSet separatorSet;
-    private String[] digitStrings;
+    private UnicodeSet groupingUniSet = null;
+    private UnicodeSet decimalUniSet = null;
+    private UnicodeSet separatorSet = null;
+    private UnicodeSet separatorLeadChars = null;
+    private String[] digitStrings = null;
      private boolean frozen;
  
      public DecimalMatcher() {
@@ -35,10 +36,59 @@ public class DecimalMatcher implements NumberParseMatcher {
          assert !frozen;
          frozen = true;
  
-        groupingUniSet = SeparatorSetUtils.getGroupingUnicodeSet(symbols, isStrict);
-        decimalUniSet = SeparatorSetUtils.getDecimalUnicodeSet(symbols, isStrict);
-        separatorSet = SeparatorSetUtils.unionUnicodeSets(groupingUniSet, decimalUniSet);
-        digitStrings = symbols.getDigitStringsLocal();
+        String groupingSeparator = symbols.getGroupingSeparatorString();
+        String decimalSeparator = symbols.getDecimalSeparatorString();
+        UnicodeSetStaticCache.Key groupingKey, decimalKey;
+
+        // Attempt to find values in the static cache
+        if (isStrict) {
+            groupingKey = UnicodeSetStaticCache.chooseFrom(groupingSeparator,
+                    UnicodeSetStaticCache.Key.OTHER_GROUPING_SEPARATORS,
+                    UnicodeSetStaticCache.Key.STRICT_COMMA_OR_OTHER,
+                    UnicodeSetStaticCache.Key.STRICT_PERIOD_OR_OTHER);
+            decimalKey = UnicodeSetStaticCache.chooseFrom(decimalSeparator,
+                    UnicodeSetStaticCache.Key.STRICT_COMMA,
+                    UnicodeSetStaticCache.Key.STRICT_PERIOD);
+        } else {
+            groupingKey = UnicodeSetStaticCache.chooseFrom(groupingSeparator,
+                    UnicodeSetStaticCache.Key.OTHER_GROUPING_SEPARATORS,
+                    UnicodeSetStaticCache.Key.COMMA_OR_OTHER,
+                    UnicodeSetStaticCache.Key.PERIOD_OR_OTHER);
+            decimalKey = UnicodeSetStaticCache.chooseFrom(decimalSeparator,
+                    UnicodeSetStaticCache.Key.COMMA,
+                    UnicodeSetStaticCache.Key.PERIOD);
+        }
+
+        // Get the sets from the static cache if they were found
+        if (groupingKey != null && decimalKey != null) {
+            groupingUniSet = UnicodeSetStaticCache.get(groupingKey);
+            decimalUniSet = UnicodeSetStaticCache.get(decimalKey);
+            UnicodeSetStaticCache.Key separatorKey = UnicodeSetStaticCache.unionOf(groupingKey, decimalKey);
+            if (separatorKey != null) {
+                separatorSet = UnicodeSetStaticCache.get(separatorKey);
+                separatorLeadChars = UnicodeSetStaticCache.getLeadChars(separatorKey);
+            }
+        } else if (groupingKey != null) {
+            groupingUniSet = UnicodeSetStaticCache.get(groupingKey);
+        } else if (decimalKey != null) {
+            decimalUniSet = UnicodeSetStaticCache.get(decimalKey);
+        }
+
+        // Resolve fallbacks if we don't have sets from the static cache
+        if (groupingUniSet == null) {
+            groupingUniSet = new UnicodeSet().add(groupingSeparator).freeze();
+        }
+        if (decimalUniSet == null) {
+            decimalUniSet = new UnicodeSet().add(decimalSeparator).freeze();
+        }
+        if (separatorSet == null) {
+            separatorSet = new UnicodeSet().addAll(groupingUniSet).addAll(decimalUniSet).freeze();
+        }
+
+        int cpZero = symbols.getCodePointZero();
+        if (cpZero == -1 || !UCharacter.isDigit(cpZero) || UCharacter.digit(cpZero) != 0) {
+            digitStrings = symbols.getDigitStrings();
+        }
      }
  
      @Override
@@ -74,7 +124,7 @@ public class DecimalMatcher implements NumberParseMatcher {
              }
  
              // Try by digit string.
-            if (digit == -1) {
+            if (digit == -1 && digitStrings != null) {
                  for (int i = 0; i < digitStrings.length; i++) {
                      String str = digitStrings[i];
                      int overlap = segment.getCommonPrefixLength(str);
@@ -190,16 +240,20 @@ public class DecimalMatcher implements NumberParseMatcher {
          return segment.length() == 0 || hasPartialPrefix || segment.isLeadingSurrogate();
      }
  
-    private static final UnicodeSet UNISET_DIGITS = new UnicodeSet("[:digit:]");
-
      @Override
      public UnicodeSet getLeadChars(boolean ignoreCase) {
          UnicodeSet leadChars = new UnicodeSet();
-        ParsingUtils.putLeadSurrogates(UNISET_DIGITS, leadChars);
-        for (int i = 0; i < digitStrings.length; i++) {
-            ParsingUtils.putLeadingChar(digitStrings[i], leadChars, ignoreCase);
+        leadChars.addAll(UnicodeSetStaticCache.getLeadChars(UnicodeSetStaticCache.Key.DIGITS));
+        if (digitStrings != null) {
+            for (int i = 0; i < digitStrings.length; i++) {
+                ParsingUtils.putLeadingChar(digitStrings[i], leadChars, ignoreCase);
+            }
+        }
+        if (separatorLeadChars != null) {
+            leadChars.addAll(separatorLeadChars);
+        } else {
+            ParsingUtils.putLeadSurrogates(separatorSet, leadChars);
          }
-        ParsingUtils.putLeadSurrogates(separatorSet, leadChars);
          return leadChars.freeze();
      }
  
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/IgnorablesMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/IgnorablesMatcher.java

index 0e008b4120670931994ddc8ced68a97f719b9c11..610572dea8cf7dfb12519b4c37de361a5551d720 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/IgnorablesMatcher.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/IgnorablesMatcher.java
@@ -10,37 +10,32 @@ import com.ibm.icu.text.UnicodeSet;
   */
  public class IgnorablesMatcher extends RangeMatcher {
  
-    // BiDi characters are skipped over and ignored at any point in the string, even in strict mode.
-    static final UnicodeSet UNISET_BIDI = new UnicodeSet("[[\\u200E\\u200F\\u061C]]").freeze();
+    public static final IgnorablesMatcher DEFAULT = new IgnorablesMatcher(
+            UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.DEFAULT_IGNORABLES));
  
-    // This set was decided after discussion with icu-design@. See ticket #13309.
-    // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property).
-    static final UnicodeSet UNISET_WHITESPACE = new UnicodeSet("[[:Zs:][\\u0009]]").freeze();
-
-    /** The default set of ignorables. */
-    static final UnicodeSet DEFAULT_UNISET = UNISET_BIDI.cloneAsThawed().addAll(UNISET_WHITESPACE).freeze();
-
-    /** The default set of ignorables for strict mode. */
-    static final UnicodeSet STRICT_UNISET = UNISET_BIDI;
-
-    private static final IgnorablesMatcher DEFAULT_INSTANCE = new IgnorablesMatcher(DEFAULT_UNISET);
-    private static final IgnorablesMatcher STRICT_INSTANCE = new IgnorablesMatcher(STRICT_UNISET);
+    public static final IgnorablesMatcher STRICT = new IgnorablesMatcher(
+            UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.STRICT_IGNORABLES));
  
      public static IgnorablesMatcher getInstance(UnicodeSet ignorables) {
          assert ignorables.isFrozen();
-        if (ignorables == DEFAULT_UNISET || ignorables.equals(DEFAULT_UNISET)) {
-            return DEFAULT_INSTANCE;
-        } else if (ignorables == STRICT_UNISET || ignorables.equals(STRICT_UNISET)) {
-            return STRICT_INSTANCE;
-        } else {
-            return new IgnorablesMatcher(ignorables);
-        }
+        return new IgnorablesMatcher(ignorables);
      }
  
      private IgnorablesMatcher(UnicodeSet ignorables) {
          super(ignorables);
      }
  
+    @Override
+    public UnicodeSet getLeadChars(boolean ignoreCase) {
+        if (this == DEFAULT) {
+            return UnicodeSetStaticCache.getLeadChars(UnicodeSetStaticCache.Key.DEFAULT_IGNORABLES);
+        } else if (this == STRICT) {
+            return UnicodeSetStaticCache.getLeadChars(UnicodeSetStaticCache.Key.STRICT_IGNORABLES);
+        } else {
+            return super.getLeadChars(ignoreCase);
+        }
+    }
+
      @Override
      protected boolean isDisabled(ParsedNumber result) {
          return false;
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/MinusSignMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/MinusSignMatcher.java

index e41ffc8c5c46b8e06956815755cfe0cb475ecab8..7d266d73e050e8f0624a1713171cc7ccd4603a62 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/MinusSignMatcher.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/MinusSignMatcher.java
@@ -2,6 +2,7 @@
  // License & terms of use: http://www.unicode.org/copyright.html#License
  package com.ibm.icu.impl.number.parse;
  
+import com.ibm.icu.text.DecimalFormatSymbols;
  import com.ibm.icu.text.UnicodeSet;
  
  /**
@@ -10,9 +11,23 @@ import com.ibm.icu.text.UnicodeSet;
   */
  public class MinusSignMatcher extends SymbolMatcher {
  
-    public MinusSignMatcher() {
-        // FIXME
-        super("-", new UnicodeSet("[-_]"));
+    private static final MinusSignMatcher DEFAULT = new MinusSignMatcher();
+
+    public static MinusSignMatcher getInstance(DecimalFormatSymbols symbols) {
+        String symbolString = symbols.getMinusSignString();
+        if (DEFAULT.uniSet.contains(symbolString)) {
+            return DEFAULT;
+        } else {
+            return new MinusSignMatcher(symbolString);
+        }
+    }
+
+    private MinusSignMatcher(String symbolString) {
+        super(symbolString, UnicodeSet.EMPTY);
+    }
+
+    private MinusSignMatcher() {
+        super(UnicodeSetStaticCache.Key.MINUS_SIGN);
      }
  
      @Override
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NanMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NanMatcher.java

index 6e03beb65dc7aadafc5c4171e370d3156982c93f..40911fa86f6a7a18acc6677c536fa6287b92863b 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NanMatcher.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NanMatcher.java
@@ -11,8 +11,23 @@ import com.ibm.icu.text.UnicodeSet;
   */
  public class NanMatcher extends SymbolMatcher {
  
-    public NanMatcher(DecimalFormatSymbols symbols) {
-        super(symbols.getNaN(), UnicodeSet.EMPTY);
+    private static final NanMatcher DEFAULT = new NanMatcher();
+
+    public static NanMatcher getInstance(DecimalFormatSymbols symbols) {
+        String symbolString = symbols.getNaN();
+        if (DEFAULT.string.equals(symbolString)) {
+            return DEFAULT;
+        } else {
+            return new NanMatcher(symbolString);
+        }
+    }
+
+    private NanMatcher(String symbolString) {
+        super(symbolString, UnicodeSet.EMPTY);
+    }
+
+    private NanMatcher() {
+        super("NaN", UnicodeSet.EMPTY);
      }
  
      @Override
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java

index f115e6eb95fc0e2c3c417671a6449c5499455d1d..2aecf2182b7b9420fddf06a937847fec4d162913 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java
@@ -39,14 +39,14 @@ public class NumberParserImpl {
          AffixPatternProvider patternInfo = PatternStringParser.parseToPatternInfo(pattern);
          AffixMatcher.generateFromAffixPatternProvider(patternInfo, parser, new UnicodeSet(), true);
  
-        parser.addMatcher(IgnorablesMatcher.getInstance(IgnorablesMatcher.DEFAULT_UNISET));
+        parser.addMatcher(IgnorablesMatcher.DEFAULT);
          DecimalMatcher decimalMatcher = new DecimalMatcher();
          decimalMatcher.requireGroupingMatch = strictGrouping;
          decimalMatcher.grouping1 = 3;
          decimalMatcher.grouping2 = 2;
          decimalMatcher.freeze(symbols, false);
          parser.addMatcher(decimalMatcher);
-        parser.addMatcher(new MinusSignMatcher());
+        parser.addMatcher(MinusSignMatcher.getInstance(symbols));
          parser.addMatcher(new ScientificMatcher(symbols));
          parser.addMatcher(new CurrencyMatcher(locale));
          parser.addMatcher(new RequireNumberMatcher());
@@ -109,7 +109,7 @@ public class NumberParserImpl {
          ULocale locale = symbols.getULocale();
          Currency currency = CustomSymbolCurrency.resolve(properties.getCurrency(), locale, symbols);
          boolean isStrict = properties.getParseMode() == ParseMode.STRICT;
-        UnicodeSet ignorables = isStrict ? IgnorablesMatcher.STRICT_UNISET : IgnorablesMatcher.DEFAULT_UNISET;
+        IgnorablesMatcher ignorables = isStrict ? IgnorablesMatcher.STRICT : IgnorablesMatcher.DEFAULT;
  
          boolean decimalSeparatorRequired = properties.getDecimalPatternMatchRequired()
                  ? (properties.getDecimalSeparatorAlwaysShown() || properties.getMaximumFractionDigits() != 0)
@@ -121,7 +121,7 @@ public class NumberParserImpl {
  
          // Set up a pattern modifier with mostly defaults to generate AffixMatchers.
          AffixPatternProvider patternInfo = new PropertiesAffixPatternProvider(properties);
-        AffixMatcher.generateFromAffixPatternProvider(patternInfo, parser, ignorables, !isStrict);
+        AffixMatcher.generateFromAffixPatternProvider(patternInfo, parser, ignorables.getSet(), !isStrict);
  
          ////////////////////////
          /// CURRENCY MATCHER ///
@@ -135,16 +135,14 @@ public class NumberParserImpl {
          /// OTHER STANDARD MATCHERS ///
          ///////////////////////////////
  
-        if (!isStrict) {
-            parser.addMatcher(IgnorablesMatcher.getInstance(ignorables));
+        parser.addMatcher(ignorables);
+        if (!isStrict || patternInfo.containsSymbolType(AffixUtils.TYPE_PLUS_SIGN) || properties.getSignAlwaysShown()) {
+            parser.addMatcher(PlusSignMatcher.getInstance(symbols));
          }
-        if (!isStrict || patternInfo.containsSymbolType(AffixUtils.TYPE_PLUS_SIGN)) {
-            parser.addMatcher(new PlusSignMatcher());
-        }
-        parser.addMatcher(new MinusSignMatcher());
-        parser.addMatcher(new NanMatcher(symbols));
-        parser.addMatcher(new PercentMatcher());
-        parser.addMatcher(new PermilleMatcher());
+        parser.addMatcher(MinusSignMatcher.getInstance(symbols));
+        parser.addMatcher(NanMatcher.getInstance(symbols));
+        parser.addMatcher(PercentMatcher.getInstance(symbols));
+        parser.addMatcher(PermilleMatcher.getInstance(symbols));
          DecimalMatcher decimalMatcher = new DecimalMatcher();
          decimalMatcher.requireGroupingMatch = isStrict;
          decimalMatcher.groupingEnabled = properties.getGroupingSize() > 0;
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/PercentMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/PercentMatcher.java

index 5dca1c4861725a41a827ad02769659c517ac8060..f97839cac44811f2c952698f7b49878482d4588d 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/PercentMatcher.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/PercentMatcher.java
@@ -2,6 +2,7 @@
  // License & terms of use: http://www.unicode.org/copyright.html#License
  package com.ibm.icu.impl.number.parse;
  
+import com.ibm.icu.text.DecimalFormatSymbols;
  import com.ibm.icu.text.UnicodeSet;
  
  /**
@@ -10,9 +11,23 @@ import com.ibm.icu.text.UnicodeSet;
   */
  public class PercentMatcher extends SymbolMatcher {
  
-    public PercentMatcher() {
-        // FIXME
-        super("%", new UnicodeSet("[%]"));
+    private static final PercentMatcher DEFAULT = new PercentMatcher();
+
+    public static PercentMatcher getInstance(DecimalFormatSymbols symbols) {
+        String symbolString = symbols.getPercentString();
+        if (DEFAULT.uniSet.contains(symbolString)) {
+            return DEFAULT;
+        } else {
+            return new PercentMatcher(symbolString);
+        }
+    }
+
+    private PercentMatcher(String symbolString) {
+        super(symbolString, UnicodeSet.EMPTY);
+    }
+
+    private PercentMatcher() {
+        super(UnicodeSetStaticCache.Key.PERCENT_SIGN);
      }
  
      @Override
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/PermilleMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/PermilleMatcher.java

index f8ea624078e511f51c2fa9543331c555a34dca85..a03946aa49461e5977b2b79a39ee65866ac3ade3 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/PermilleMatcher.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/PermilleMatcher.java
@@ -2,6 +2,7 @@
  // License & terms of use: http://www.unicode.org/copyright.html#License
  package com.ibm.icu.impl.number.parse;
  
+import com.ibm.icu.text.DecimalFormatSymbols;
  import com.ibm.icu.text.UnicodeSet;
  
  /**
@@ -10,9 +11,23 @@ import com.ibm.icu.text.UnicodeSet;
   */
  public class PermilleMatcher extends SymbolMatcher {
  
-    public PermilleMatcher() {
-        // FIXME
-        super("‰", new UnicodeSet("[‰]"));
+    private static final PermilleMatcher DEFAULT = new PermilleMatcher();
+
+    public static PermilleMatcher getInstance(DecimalFormatSymbols symbols) {
+        String symbolString = symbols.getPerMillString();
+        if (DEFAULT.uniSet.contains(symbolString)) {
+            return DEFAULT;
+        } else {
+            return new PermilleMatcher(symbolString);
+        }
+    }
+
+    private PermilleMatcher(String symbolString) {
+        super(symbolString, UnicodeSet.EMPTY);
+    }
+
+    private PermilleMatcher() {
+        super(UnicodeSetStaticCache.Key.PERMILLE_SIGN);
      }
  
      @Override
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/PlusSignMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/PlusSignMatcher.java

index 254c779bdc6be1c70e0f9fd22d1c3bad445ad23d..d902009f5b9ab303b85d15c520f91794ef816d49 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/PlusSignMatcher.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/PlusSignMatcher.java
@@ -2,6 +2,7 @@
  // License & terms of use: http://www.unicode.org/copyright.html#License
  package com.ibm.icu.impl.number.parse;
  
+import com.ibm.icu.text.DecimalFormatSymbols;
  import com.ibm.icu.text.UnicodeSet;
  
  /**
@@ -10,9 +11,23 @@ import com.ibm.icu.text.UnicodeSet;
   */
  public class PlusSignMatcher extends SymbolMatcher {
  
-    public PlusSignMatcher() {
-        // FIXME
-        super("+", new UnicodeSet("[+]"));
+    private static final PlusSignMatcher DEFAULT = new PlusSignMatcher();
+
+    public static PlusSignMatcher getInstance(DecimalFormatSymbols symbols) {
+        String symbolString = symbols.getPlusSignString();
+        if (DEFAULT.uniSet.contains(symbolString)) {
+            return DEFAULT;
+        } else {
+            return new PlusSignMatcher(symbolString);
+        }
+    }
+
+    private PlusSignMatcher(String symbolString) {
+        super(symbolString, UnicodeSet.EMPTY);
+    }
+
+    private PlusSignMatcher() {
+        super(UnicodeSetStaticCache.Key.PLUS_SIGN);
      }
  
      @Override
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/RangeMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/RangeMatcher.java

index 512e6cf0a7fbdb6b94d9e507d5e7f79b736b101d..8451059359ba0c632a10cfeb5a0de5e6cb512056 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/RangeMatcher.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/RangeMatcher.java
@@ -15,6 +15,10 @@ public abstract class RangeMatcher implements NumberParseMatcher {
          this.uniSet = uniSet;
      }
  
+    public UnicodeSet getSet() {
+        return uniSet;
+    }
+
      @Override
      public boolean match(StringSegment segment, ParsedNumber result) {
          // Smoke test first; this matcher might be disabled.
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/SeparatorSetUtils.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/SeparatorSetUtils.java

deleted file mode 100644 (file)

index 16b2be1..0000000
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/SeparatorSetUtils.java
+++ /dev/null
@@ -1,109 +0,0 @@
-// © 2017 and later: Unicode, Inc. and others.
-// License & terms of use: http://www.unicode.org/copyright.html#License
-package com.ibm.icu.impl.number.parse;
-
-import com.ibm.icu.text.DecimalFormatSymbols;
-import com.ibm.icu.text.UnicodeSet;
-
-/**
- * @author sffc
- *
- */
-public class SeparatorSetUtils {
-
-    // TODO: Re-generate these sets from the database. They probably haven't been updated in a while.
-
-    static final UnicodeSet COMMA_LIKE = new UnicodeSet(
-            "[,\\u060C\\u066B\\u3001\\uFE10\\uFE11\\uFE50\\uFE51\\uFF0C\\uFF64]").freeze();
-
-    static final UnicodeSet STRICT_COMMA_LIKE = new UnicodeSet("[,\\u066B\\uFE10\\uFE50\\uFF0C]").freeze();
-
-    static final UnicodeSet PERIOD_LIKE = new UnicodeSet("[.\\u2024\\u3002\\uFE12\\uFE52\\uFF0E\\uFF61]").freeze();
-
-    static final UnicodeSet STRICT_PERIOD_LIKE = new UnicodeSet("[.\\u2024\\uFE52\\uFF0E\\uFF61]").freeze();
-
-    static final UnicodeSet OTHER_GROUPING_SEPARATORS = new UnicodeSet(
-            "[\\ '\\u00A0\\u066C\\u2000-\\u200A\\u2018\\u2019\\u202F\\u205F\\u3000\\uFF07]").freeze();
-
-    static final UnicodeSet COMMA_OR_PERIOD_LIKE = new UnicodeSet().addAll(COMMA_LIKE).addAll(PERIOD_LIKE).freeze();
-
-    static final UnicodeSet STRICT_COMMA_OR_PERIOD_LIKE = new UnicodeSet().addAll(STRICT_COMMA_LIKE)
-            .addAll(STRICT_PERIOD_LIKE).freeze();
-
-    static final UnicodeSet COMMA_LIKE_OR_OTHER = new UnicodeSet().addAll(COMMA_LIKE).addAll(OTHER_GROUPING_SEPARATORS)
-            .freeze();
-
-    static final UnicodeSet STRICT_COMMA_LIKE_OR_OTHER = new UnicodeSet().addAll(STRICT_COMMA_LIKE)
-            .addAll(OTHER_GROUPING_SEPARATORS).freeze();
-
-    static final UnicodeSet PERIOD_LIKE_OR_OTHER = new UnicodeSet().addAll(PERIOD_LIKE)
-            .addAll(OTHER_GROUPING_SEPARATORS).freeze();
-
-    static final UnicodeSet STRICT_PERIOD_LIKE_OR_OTHER = new UnicodeSet().addAll(STRICT_PERIOD_LIKE)
-            .addAll(OTHER_GROUPING_SEPARATORS).freeze();
-
-    static final UnicodeSet COMMA_OR_PERIOD_LIKE_OR_OTHER = new UnicodeSet().addAll(COMMA_LIKE).addAll(PERIOD_LIKE)
-            .addAll(OTHER_GROUPING_SEPARATORS).freeze();
-
-    static final UnicodeSet STRICT_COMMA_OR_PERIOD_LIKE_OR_OTHER = new UnicodeSet().addAll(STRICT_COMMA_LIKE)
-            .addAll(STRICT_PERIOD_LIKE).addAll(OTHER_GROUPING_SEPARATORS).freeze();
-
-    public static UnicodeSet getGroupingUnicodeSet(DecimalFormatSymbols symbols, boolean isStrict) {
-        if (isStrict) {
-            return chooseUnicodeSet(symbols.getGroupingSeparatorString(),
-                    STRICT_COMMA_LIKE_OR_OTHER,
-                    STRICT_PERIOD_LIKE_OR_OTHER,
-                    OTHER_GROUPING_SEPARATORS);
-        } else {
-            return chooseUnicodeSet(symbols.getGroupingSeparatorString(),
-                    COMMA_LIKE_OR_OTHER,
-                    PERIOD_LIKE_OR_OTHER,
-                    OTHER_GROUPING_SEPARATORS);
-        }
-    }
-
-    public static UnicodeSet getDecimalUnicodeSet(DecimalFormatSymbols symbols, boolean isStrict) {
-        if (isStrict) {
-            return chooseUnicodeSet(symbols.getDecimalSeparatorString(), STRICT_COMMA_LIKE, STRICT_PERIOD_LIKE);
-        } else {
-            return chooseUnicodeSet(symbols.getDecimalSeparatorString(), COMMA_LIKE, PERIOD_LIKE);
-        }
-    }
-
-    private static UnicodeSet chooseUnicodeSet(String str, UnicodeSet set1) {
-        return set1.contains(str) ? set1 : new UnicodeSet().add(str).freeze();
-    }
-
-    private static UnicodeSet chooseUnicodeSet(String str, UnicodeSet set1, UnicodeSet set2) {
-        return set1.contains(str) ? set1 : chooseUnicodeSet(str, set2);
-    }
-
-    private static UnicodeSet chooseUnicodeSet(String str, UnicodeSet set1, UnicodeSet set2, UnicodeSet set3) {
-        return set1.contains(str) ? set1 : chooseUnicodeSet(str, set2, set3);
-    }
-
-    public static UnicodeSet unionUnicodeSets(UnicodeSet set1, UnicodeSet set2) {
-        // Note: == operators should be okay here since non-static UnicodeSets happen only in fallback cases.
-        if (set1 == UnicodeSet.EMPTY && set2 == UnicodeSet.EMPTY) {
-            return UnicodeSet.EMPTY;
-        } else if (set1 == COMMA_LIKE_OR_OTHER && set2 == PERIOD_LIKE_OR_OTHER) {
-            return COMMA_OR_PERIOD_LIKE_OR_OTHER;
-        } else if (set1 == PERIOD_LIKE_OR_OTHER && set2 == COMMA_LIKE_OR_OTHER) {
-            return COMMA_OR_PERIOD_LIKE_OR_OTHER;
-        } else if (set1 == STRICT_COMMA_LIKE_OR_OTHER && set2 == STRICT_PERIOD_LIKE_OR_OTHER) {
-            return STRICT_COMMA_OR_PERIOD_LIKE_OR_OTHER;
-        } else if (set1 == STRICT_PERIOD_LIKE_OR_OTHER && set2 == STRICT_COMMA_LIKE_OR_OTHER) {
-            return STRICT_COMMA_OR_PERIOD_LIKE_OR_OTHER;
-        } else if (set1 == COMMA_LIKE && set2 == PERIOD_LIKE) {
-            return COMMA_OR_PERIOD_LIKE;
-        } else if (set1 == PERIOD_LIKE && set2 == COMMA_LIKE) {
-            return COMMA_OR_PERIOD_LIKE;
-        } else if (set1 == STRICT_COMMA_LIKE && set2 == STRICT_PERIOD_LIKE) {
-            return STRICT_COMMA_OR_PERIOD_LIKE;
-        } else if (set1 == STRICT_PERIOD_LIKE && set2 == STRICT_COMMA_LIKE) {
-            return STRICT_COMMA_OR_PERIOD_LIKE;
-        } else {
-            return set1.cloneAsThawed().addAll(set2).freeze();
-        }
-    }
-}
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/SymbolMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/SymbolMatcher.java

index 5f5f4f111f82d4eed040773f1471057faedcbfa2..11af03339d19ea96a74fdce349cdaf68cf27c94c 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/SymbolMatcher.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/SymbolMatcher.java
@@ -11,10 +11,21 @@ import com.ibm.icu.text.UnicodeSet;
  public abstract class SymbolMatcher implements NumberParseMatcher {
      protected final String string;
      protected final UnicodeSet uniSet;
+    protected final UnicodeSet leadChars;
+
+    // TODO: Implement this class using only UnicodeSet and not String?
+    // How to deal with case folding?
  
      protected SymbolMatcher(String symbolString, UnicodeSet symbolUniSet) {
          string = symbolString;
          uniSet = symbolUniSet;
+        leadChars = null;
+    }
+
+    protected SymbolMatcher(UnicodeSetStaticCache.Key key) {
+        string = "";
+        uniSet = UnicodeSetStaticCache.get(key);
+        leadChars = UnicodeSetStaticCache.getLeadChars(key);
      }
  
      @Override
@@ -30,6 +41,10 @@ public abstract class SymbolMatcher implements NumberParseMatcher {
              accept(segment, result);
              return false;
          }
+
+        if (string.isEmpty()) {
+            return segment.isLeadingSurrogate();
+        }
          int overlap = segment.getCommonPrefixLength(string);
          if (overlap == string.length()) {
              segment.adjustOffset(string.length());
@@ -41,6 +56,10 @@ public abstract class SymbolMatcher implements NumberParseMatcher {
  
      @Override
      public UnicodeSet getLeadChars(boolean ignoreCase) {
+        if (leadChars != null) {
+            return leadChars;
+        }
+
          UnicodeSet leadChars = new UnicodeSet();
          ParsingUtils.putLeadSurrogates(uniSet, leadChars);
          ParsingUtils.putLeadingChar(string, leadChars, ignoreCase);
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java

new file mode 100644 (file)

index 0000000..28ab775
--- /dev/null
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java
@@ -0,0 +1,255 @@
+// © 2017 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package com.ibm.icu.impl.number.parse;
+
+import java.util.EnumMap;
+import java.util.Map;
+
+import com.ibm.icu.text.UnicodeSet;
+
+/**
+ * Static cache of frozen {@link UnicodeSet} instances for the number parser. Every set is built
+ * once, in this class's static initializer, and is looked up by {@link Key}. For each key a
+ * second frozen set, filled in by {@code ParsingUtils.putLeadSurrogates}, is available through
+ * {@link #getLeadChars}.
+ *
+ * @author sffc
+ */
+public class UnicodeSetStaticCache {
+    /** Names of the cached sets. */
+    public static enum Key {
+        // Ignorables
+        BIDI,
+        WHITESPACE,
+        DEFAULT_IGNORABLES,
+        STRICT_IGNORABLES,
+
+        // Separators
+        COMMA,
+        PERIOD,
+        OTHER_GROUPING_SEPARATORS,
+        COMMA_OR_OTHER,
+        PERIOD_OR_OTHER,
+        COMMA_OR_PERIOD_OR_OTHER,
+        STRICT_COMMA,
+        STRICT_PERIOD,
+        STRICT_COMMA_OR_OTHER,
+        STRICT_PERIOD_OR_OTHER,
+        STRICT_COMMA_OR_PERIOD_OR_OTHER,
+
+        // Symbols
+        // TODO: NaN?
+        MINUS_SIGN,
+        PLUS_SIGN,
+        PERCENT_SIGN,
+        PERMILLE_SIGN,
+        INFINITY,
+
+        // Other
+        DIGITS,
+    };
+
+    /** Sets by key; filled in by the static initializer at the end of this class. */
+    private static final Map<Key, UnicodeSet> unicodeSets = new EnumMap<Key, UnicodeSet>(Key.class);
+
+    /** Lead-character sets by key; filled in by the static initializer at the end of this class. */
+    private static final Map<Key, UnicodeSet> leadCharsSets = new EnumMap<Key, UnicodeSet>(Key.class);
+
+    /**
+     * Returns the cached set for {@code key}.
+     *
+     * @param key the set to look up
+     * @return the frozen set stored for {@code key}
+     */
+    public static UnicodeSet get(Key key) {
+        return unicodeSets.get(key);
+    }
+
+    /**
+     * Returns the cached lead-character set for {@code key}.
+     *
+     * @param key the set whose lead characters are wanted
+     * @return the frozen lead-character set stored for {@code key}
+     */
+    public static UnicodeSet getLeadChars(Key key) {
+        return leadCharsSets.get(key);
+    }
+
+    /**
+     * Returns {@code key1} if its set contains {@code str}.
+     *
+     * @param str the string to look for
+     * @param key1 the only candidate
+     * @return {@code key1} if its set contains {@code str}, otherwise null
+     */
+    public static Key chooseFrom(String str, Key key1) {
+        return get(key1).contains(str) ? key1 : null;
+    }
+
+    /**
+     * Returns the first candidate, in argument order, whose set contains {@code str}.
+     *
+     * @param str the string to look for
+     * @param key1 the first candidate
+     * @param key2 the second candidate
+     * @return the first matching key, or null if neither set contains {@code str}
+     */
+    public static Key chooseFrom(String str, Key key1, Key key2) {
+        return get(key1).contains(str) ? key1 : chooseFrom(str, key2);
+    }
+
+    /**
+     * Returns the first candidate, in argument order, whose set contains {@code str}.
+     *
+     * @param str the string to look for
+     * @param key1 the first candidate
+     * @param key2 the second candidate
+     * @param key3 the third candidate
+     * @return the first matching key, or null if no set contains {@code str}
+     */
+    public static Key chooseFrom(String str, Key key1, Key key2, Key key3) {
+        return get(key1).contains(str) ? key1 : chooseFrom(str, key2, key3);
+    }
+
+    /**
+     * Returns the key of the cached set that is the union of the sets for two separator keys. The
+     * two keys may be passed in either order.
+     *
+     * @param key1 one separator key
+     * @param key2 the other separator key
+     * @return the key of the cached union, or null if no cached set corresponds to this pair
+     */
+    public static Key unionOf(Key key1, Key key2) {
+        // Compare unordered pairs instead of sorting the arguments by ordinal(): the STRICT_* keys
+        // are declared after OTHER_GROUPING_SEPARATORS, so an ordinal-sorted pair can never have
+        // STRICT_COMMA or STRICT_PERIOD first and OTHER_GROUPING_SEPARATORS second.
+        if (isPair(key1, key2, Key.COMMA, Key.PERIOD_OR_OTHER)) {
+            // 1.234,567
+            return Key.COMMA_OR_PERIOD_OR_OTHER;
+
+        } else if (isPair(key1, key2, Key.COMMA, Key.OTHER_GROUPING_SEPARATORS)) {
+            // 1'234,567
+            return Key.COMMA_OR_OTHER;
+
+        } else if (isPair(key1, key2, Key.PERIOD, Key.COMMA_OR_OTHER)) {
+            // 1,234.567
+            return Key.COMMA_OR_PERIOD_OR_OTHER;
+
+        } else if (isPair(key1, key2, Key.PERIOD, Key.OTHER_GROUPING_SEPARATORS)) {
+            // 1'234.567
+            return Key.PERIOD_OR_OTHER;
+
+        } else if (isPair(key1, key2, Key.STRICT_COMMA, Key.STRICT_PERIOD_OR_OTHER)) {
+            // Strict 1.234,567
+            return Key.STRICT_COMMA_OR_PERIOD_OR_OTHER;
+
+        } else if (isPair(key1, key2, Key.STRICT_COMMA, Key.OTHER_GROUPING_SEPARATORS)) {
+            // Strict 1'234,567
+            return Key.STRICT_COMMA_OR_OTHER;
+
+        } else if (isPair(key1, key2, Key.STRICT_PERIOD, Key.STRICT_COMMA_OR_OTHER)) {
+            // Strict 1,234.567
+            return Key.STRICT_COMMA_OR_PERIOD_OR_OTHER;
+
+        } else if (isPair(key1, key2, Key.STRICT_PERIOD, Key.OTHER_GROUPING_SEPARATORS)) {
+            // Strict 1'234.567
+            return Key.STRICT_PERIOD_OR_OTHER;
+
+        }
+
+        return null;
+    }
+
+    /** Returns whether {@code (a, b)} is {@code (x, y)} or {@code (y, x)}. */
+    private static boolean isPair(Key a, Key b, Key x, Key y) {
+        return (a == x && b == y) || (a == y && b == x);
+    }
+
+    /** Builds a new frozen set holding everything in the cached sets for {@code k1} and {@code k2}. */
+    private static UnicodeSet computeUnion(Key k1, Key k2) {
+        return new UnicodeSet().addAll(get(k1)).addAll(get(k2)).freeze();
+    }
+
+    /** Three-set variant of {@link #computeUnion(Key, Key)}. */
+    private static UnicodeSet computeUnion(Key k1, Key k2, Key k3) {
+        return new UnicodeSet().addAll(get(k1)).addAll(get(k2)).addAll(get(k3)).freeze();
+    }
+
+    // Populates both maps. computeUnion reads its inputs back out of unicodeSets, so each base set
+    // must be stored before any union that uses it.
+    static {
+        // BiDi characters are skipped over and ignored at any point in the string, even in strict mode.
+        unicodeSets.put(Key.BIDI, new UnicodeSet("[[\\u200E\\u200F\\u061C]]").freeze());
+
+        // This set was decided after discussion with icu-design@. See ticket #13309.
+        // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property).
+        unicodeSets.put(Key.WHITESPACE, new UnicodeSet("[[:Zs:][\\u0009]]").freeze());
+
+        unicodeSets.put(Key.DEFAULT_IGNORABLES, computeUnion(Key.BIDI, Key.WHITESPACE));
+        unicodeSets.put(Key.STRICT_IGNORABLES, get(Key.BIDI));
+
+        // TODO: Re-generate these sets from the UCD. They probably haven't been updated in a while.
+        unicodeSets.put(Key.COMMA,
+                new UnicodeSet("[,\\u060C\\u066B\\u3001\\uFE10\\uFE11\\uFE50\\uFE51\\uFF0C\\uFF64]").freeze());
+        unicodeSets.put(Key.STRICT_COMMA, new UnicodeSet("[,\\u066B\\uFE10\\uFE50\\uFF0C]").freeze());
+        unicodeSets.put(Key.PERIOD, new UnicodeSet("[.\\u2024\\u3002\\uFE12\\uFE52\\uFF0E\\uFF61]").freeze());
+        unicodeSets.put(Key.STRICT_PERIOD, new UnicodeSet("[.\\u2024\\uFE52\\uFF0E\\uFF61]").freeze());
+        unicodeSets.put(Key.OTHER_GROUPING_SEPARATORS,
+                new UnicodeSet("[\\ '\\u00A0\\u066C\\u2000-\\u200A\\u2018\\u2019\\u202F\\u205F\\u3000\\uFF07]")
+                        .freeze());
+
+        unicodeSets.put(Key.COMMA_OR_OTHER, computeUnion(Key.COMMA, Key.OTHER_GROUPING_SEPARATORS));
+        unicodeSets.put(Key.PERIOD_OR_OTHER, computeUnion(Key.PERIOD, Key.OTHER_GROUPING_SEPARATORS));
+        unicodeSets.put(Key.COMMA_OR_PERIOD_OR_OTHER,
+                computeUnion(Key.COMMA, Key.PERIOD, Key.OTHER_GROUPING_SEPARATORS));
+        unicodeSets.put(Key.STRICT_COMMA_OR_OTHER, computeUnion(Key.STRICT_COMMA, Key.OTHER_GROUPING_SEPARATORS));
+        unicodeSets.put(Key.STRICT_PERIOD_OR_OTHER, computeUnion(Key.STRICT_PERIOD, Key.OTHER_GROUPING_SEPARATORS));
+        unicodeSets.put(Key.STRICT_COMMA_OR_PERIOD_OR_OTHER,
+                computeUnion(Key.STRICT_COMMA, Key.STRICT_PERIOD, Key.OTHER_GROUPING_SEPARATORS));
+
+        // The int arguments are start/end code point pairs, so each repeated value is one character.
+        unicodeSets.put(Key.MINUS_SIGN,
+                new UnicodeSet(0x002D,
+                        0x002D,
+                        0x207B,
+                        0x207B,
+                        0x208B,
+                        0x208B,
+                        0x2212,
+                        0x2212,
+                        0x2796,
+                        0x2796,
+                        0xFE63,
+                        0xFE63,
+                        0xFF0D,
+                        0xFF0D).freeze());
+        unicodeSets.put(Key.PLUS_SIGN,
+                new UnicodeSet(0x002B,
+                        0x002B,
+                        0x207A,
+                        0x207A,
+                        0x208A,
+                        0x208A,
+                        0x2795,
+                        0x2795,
+                        0xFB29,
+                        0xFB29,
+                        0xFE62,
+                        0xFE62,
+                        0xFF0B,
+                        0xFF0B).freeze());
+
+        // TODO: Fill in the next three sets.
+        unicodeSets.put(Key.PERCENT_SIGN, new UnicodeSet("[%٪]").freeze());
+        unicodeSets.put(Key.PERMILLE_SIGN, new UnicodeSet("[‰؉]").freeze());
+        unicodeSets.put(Key.INFINITY, new UnicodeSet("[∞]").freeze());
+
+        unicodeSets.put(Key.DIGITS, new UnicodeSet("[:digit:]").freeze());
+
+        // Build the lead-character set for every key from the set stored above.
+        for (Key key : Key.values()) {
+            UnicodeSet leadChars = new UnicodeSet();
+            ParsingUtils.putLeadSurrogates(get(key), leadChars);
+            leadCharsSets.put(key, leadChars.freeze());
+        }
+    }
+}
author	Shane Carr <shane@unicode.org>
	Sat, 16 Dec 2017 10:04:40 +0000 (10:04 +0000)
committer	Shane Carr <shane@unicode.org>
	Sat, 16 Dec 2017 10:04:40 +0000 (10:04 +0000)
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/DecimalMatcher.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/IgnorablesMatcher.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/MinusSignMatcher.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NanMatcher.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/PercentMatcher.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/PermilleMatcher.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/PlusSignMatcher.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/RangeMatcher.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/SeparatorSetUtils.java	[deleted file]	patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/SymbolMatcher.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java	[new file with mode: 0644]	patch \| blob