ICU-13513 Integrating some of Andy's feedback. Moving code unit vs. code point logic...

author Shane Carr <shane@unicode.org>

Thu, 18 Jan 2018 10:50:36 +0000 (10:50 +0000)

committer Shane Carr <shane@unicode.org>

Thu, 18 Jan 2018 10:50:36 +0000 (10:50 +0000)
author Shane Carr <shane@unicode.org>
Thu, 18 Jan 2018 10:50:36 +0000 (10:50 +0000)
committer Shane Carr <shane@unicode.org>
Thu, 18 Jan 2018 10:50:36 +0000 (10:50 +0000)
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/TextTrieMap.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/TextTrieMap.java

index 71662ddfd2b4b78e21d38e8c8442d5999a68b965..b178f3a79c7d80470e8fd9473863c1fa0301579a 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/TextTrieMap.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/TextTrieMap.java
@@ -90,11 +90,11 @@ public class TextTrieMap<V> {
      }
  
      public void find(CharSequence text, ResultHandler<V> handler) {
-        find(text, 0, handler, new Output());
+        find(text, 0, handler, null);
      }
  
      public void find(CharSequence text, int offset, ResultHandler<V> handler) {
-        find(text, offset, handler, new Output());
+        find(text, offset, handler, null);
      }
  
      private void find(CharSequence text, int offset, ResultHandler<V> handler, Output output) {
@@ -116,8 +116,8 @@ public class TextTrieMap<V> {
          }
      }
  
-    public void putLeadChars(UnicodeSet output) {
-        _root.putLeadChars(output);
+    public void putLeadCodePoints(UnicodeSet output) {
+        _root.putLeadCodePoints(output);
      }
  
      /**
@@ -363,7 +363,9 @@ public class TextTrieMap<V> {
                  return null;
              }
              if (!chitr.hasNext()) {
-                output.partialMatch = true;
+                if (output != null) {
+                    output.partialMatch = true;
+                }
                  return null;
              }
              Node match = null;
@@ -382,12 +384,24 @@ public class TextTrieMap<V> {
              return match;
          }
  
-        public void putLeadChars(UnicodeSet output) {
+        public void putLeadCodePoints(UnicodeSet output) {
              if (_children == null) {
                  return;
              }
              for (Node child : _children) {
-                output.add(child._text[0]);
+                char c0 = child._text[0];
+                if (!UCharacter.isHighSurrogate(c0)) {
+                    output.add(c0);
+                } else if (child.charCount() >= 2) {
+                    output.add(Character.codePointAt(child._text, 0));
+                } else if (child._children != null) {
+                    // Construct all possible code points from grandchildren.
+                    for (Node grandchild : child._children) {
+                        char c1 = grandchild._text[0];
+                        int cp = Character.toCodePoint(c0, c1);
+                        output.add(cp);
+                    }
+                }
              }
          }
  
@@ -465,7 +479,9 @@ public class TextTrieMap<V> {
              int idx = 1;
              while (idx < _text.length) {
                  if(!chitr.hasNext()) {
-                    output.partialMatch = true;
+                    if (output != null) {
+                        output.partialMatch = true;
+                    }
                      matched = false;
                      break;
                  }
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/AffixMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/AffixMatcher.java

index e8385ca715826d871067dc1feb10a962d146b650..a0d1ba05887a3ca1328ce3bbbeb4db0fe6028bd6 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/AffixMatcher.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/AffixMatcher.java
@@ -44,7 +44,7 @@ public class AffixMatcher implements NumberParseMatcher {
              AffixPatternProvider patternInfo,
              NumberParserImpl output,
              IgnorablesMatcher ignorables,
-            boolean includeUnpaired) {
+            int parseFlags) {
          // Lazy-initialize the StringBuilder.
          StringBuilder sb = null;
  
@@ -53,9 +53,11 @@ public class AffixMatcher implements NumberParseMatcher {
          ArrayList<AffixMatcher> matchers = new ArrayList<AffixMatcher>(6);
  
          sb = getCleanAffix(patternInfo, AffixPatternProvider.FLAG_POS_PREFIX, ignorables.getSet(), sb);
-        String posPrefix = toStringOrEmpty(sb);
+        String posPrefix = ParsingUtils.maybeFold(toStringOrEmpty(sb), parseFlags);
          sb = getCleanAffix(patternInfo, AffixPatternProvider.FLAG_POS_SUFFIX, ignorables.getSet(), sb);
-        String posSuffix = toStringOrEmpty(sb);
+        String posSuffix = ParsingUtils.maybeFold(toStringOrEmpty(sb), parseFlags);
+
+        boolean includeUnpaired = 0 != (parseFlags & ParsingUtils.PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES);
  
          if (!posPrefix.isEmpty() || !posSuffix.isEmpty()) {
              matchers.add(getInstance(posPrefix, posSuffix, 0));
@@ -67,9 +69,9 @@ public class AffixMatcher implements NumberParseMatcher {
  
          if (patternInfo.hasNegativeSubpattern()) {
              sb = getCleanAffix(patternInfo, AffixPatternProvider.FLAG_NEG_PREFIX, ignorables.getSet(), sb);
-            String negPrefix = toStringOrEmpty(sb);
+            String negPrefix = ParsingUtils.maybeFold(toStringOrEmpty(sb), parseFlags);
              sb = getCleanAffix(patternInfo, AffixPatternProvider.FLAG_NEG_SUFFIX, ignorables.getSet(), sb);
-            String negSuffix = toStringOrEmpty(sb);
+            String negSuffix = ParsingUtils.maybeFold(toStringOrEmpty(sb), parseFlags);
  
              if (negPrefix.equals(posPrefix) && negSuffix.equals(posSuffix)) {
                  // No-op: favor the positive AffixMatcher
@@ -115,6 +117,8 @@ public class AffixMatcher implements NumberParseMatcher {
      }
  
      private AffixMatcher(String prefix, String suffix, int flags) {
+        assert prefix != null;
+        assert suffix != null;
          this.prefix = prefix;
          this.suffix = suffix;
          this.flags = flags;
@@ -157,11 +161,11 @@ public class AffixMatcher implements NumberParseMatcher {
      }
  
      @Override
-    public UnicodeSet getLeadChars(boolean ignoreCase) {
-        UnicodeSet leadChars = new UnicodeSet();
-        ParsingUtils.putLeadingChar(prefix, leadChars, ignoreCase);
-        ParsingUtils.putLeadingChar(suffix, leadChars, ignoreCase);
-        return leadChars.freeze();
+    public UnicodeSet getLeadCodePoints() {
+        UnicodeSet leadCodePoints = new UnicodeSet();
+        ParsingUtils.putLeadCodePoint(prefix, leadCodePoints);
+        ParsingUtils.putLeadCodePoint(suffix, leadCodePoints);
+        return leadCodePoints.freeze();
      }
  
      @Override
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyMatcher.java

index da8d9ddc53fc2662de4a1226be80518dfff13a44..222f26c0944336b772d20e7d5e92488af48497b4 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyMatcher.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyMatcher.java
@@ -15,14 +15,16 @@ public class CurrencyMatcher implements NumberParseMatcher {
      private final String currency1;
      private final String currency2;
  
-    public static NumberParseMatcher getInstance(Currency currency, ULocale loc) {
-        return new CurrencyMatcher(currency, loc);
+    public static NumberParseMatcher getInstance(Currency currency, ULocale loc, int setupFlags) {
+        return new CurrencyMatcher(currency.getSubtype(),
+                ParsingUtils.maybeFold(currency.getSymbol(loc), setupFlags),
+                ParsingUtils.maybeFold(currency.getCurrencyCode(), setupFlags));
      }
  
-    private CurrencyMatcher(Currency currency, ULocale loc) {
-        isoCode = currency.getSubtype();
-        currency1 = currency.getSymbol(loc);
-        currency2 = currency.getCurrencyCode();
+    private CurrencyMatcher(String isoCode, String currency1, String currency2) {
+        this.isoCode = isoCode;
+        this.currency1 = currency1;
+        this.currency2 = currency2;
      }
  
      @Override
@@ -49,11 +51,11 @@ public class CurrencyMatcher implements NumberParseMatcher {
      }
  
      @Override
-    public UnicodeSet getLeadChars(boolean ignoreCase) {
-        UnicodeSet leadChars = new UnicodeSet();
-        ParsingUtils.putLeadingChar(currency1, leadChars, ignoreCase);
-        ParsingUtils.putLeadingChar(currency2, leadChars, ignoreCase);
-        return leadChars.freeze();
+    public UnicodeSet getLeadCodePoints() {
+        UnicodeSet leadCodePoints = new UnicodeSet();
+        ParsingUtils.putLeadCodePoint(currency1, leadCodePoints);
+        ParsingUtils.putLeadCodePoint(currency2, leadCodePoints);
+        return leadCodePoints.freeze();
      }
  
      @Override
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyTrieMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyTrieMatcher.java

index 3e194f657f20c2a60955c5c7304d26fddb3e17ec..b7bf734678d6a94fac68084ad0c91885e7c1d426 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyTrieMatcher.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyTrieMatcher.java
@@ -25,6 +25,8 @@ public class CurrencyTrieMatcher implements NumberParseMatcher {
      }
  
      private CurrencyTrieMatcher(ULocale locale) {
+        // TODO: The currency trie does not currently have an option for case folding.  It defaults to using
+        // case folding on long names but not on symbols.
          longNameTrie = Currency.getParsingTrie(locale, Currency.LONG_NAME);
          symbolTrie = Currency.getParsingTrie(locale, Currency.SYMBOL_NAME);
      }
@@ -49,11 +51,11 @@ public class CurrencyTrieMatcher implements NumberParseMatcher {
      }
  
      @Override
-    public UnicodeSet getLeadChars(boolean ignoreCase) {
-        UnicodeSet leadChars = new UnicodeSet();
-        longNameTrie.putLeadChars(leadChars);
-        symbolTrie.putLeadChars(leadChars);
-        return leadChars.freeze();
+    public UnicodeSet getLeadCodePoints() {
+        UnicodeSet leadCodePoints = new UnicodeSet();
+        longNameTrie.putLeadCodePoints(leadCodePoints);
+        symbolTrie.putLeadCodePoints(leadCodePoints);
+        return leadCodePoints.freeze();
      }
  
      @Override
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/DecimalMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/DecimalMatcher.java

index e7f7f0730f3a349f1f7729debc92e799b342b0c0..d041f0bbc09319cccde1c2d129f7d8f22c716892 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/DecimalMatcher.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/DecimalMatcher.java
@@ -5,6 +5,7 @@ package com.ibm.icu.impl.number.parse;
  import com.ibm.icu.impl.number.DecimalQuantity_DualStorageBCD;
  import com.ibm.icu.impl.number.parse.UnicodeSetStaticCache.Key;
  import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.number.Grouper;
  import com.ibm.icu.text.DecimalFormatSymbols;
  import com.ibm.icu.text.UnicodeSet;
  
@@ -14,48 +15,57 @@ import com.ibm.icu.text.UnicodeSet;
   */
  public class DecimalMatcher implements NumberParseMatcher {
  
-    public boolean requireGroupingMatch = false;
-    public boolean decimalEnabled = true;
-    public boolean groupingEnabled = true;
-    public int grouping1 = 3;
-    public int grouping2 = 3;
-    public boolean integerOnly = false;
-    public boolean isScientific = false;
+    private final boolean requireGroupingMatch;
+    private final boolean groupingDisabled;
+    private final int grouping1;
+    private final int grouping2;
+    private final boolean integerOnly;
+    private final boolean isScientific;
  
-    private UnicodeSet groupingUniSet = null;
-    private UnicodeSet decimalUniSet = null;
-    private UnicodeSet separatorSet = null;
-    private UnicodeSet separatorLeadChars = null;
-    private String[] digitStrings = null;
-    private boolean frozen;
+    // Assumption: these sets all consist of single code points. If this assumption needs to be broken,
+    // fix getLeadCodePoints() as well as matching logic. Be careful of the performance impact.
+    private final UnicodeSet groupingUniSet;
+    private final UnicodeSet decimalUniSet;
+    private final UnicodeSet separatorSet;
+    private final UnicodeSet leadSet;
+    private final String[] digitStrings;
  
-    public DecimalMatcher() {
-        frozen = false;
+    public static DecimalMatcher getInstance(
+            DecimalFormatSymbols symbols,
+            Grouper grouper,
+            int parseFlags) {
+        // TODO: Cache popular instances?
+        return new DecimalMatcher(symbols, grouper, parseFlags);
      }
  
-    public void freeze(DecimalFormatSymbols symbols, boolean monetarySeparators, boolean isStrict) {
-        assert !frozen;
-        frozen = true;
-
-        String groupingSeparator = monetarySeparators ? symbols.getMonetaryGroupingSeparatorString()
-                : symbols.getGroupingSeparatorString();
-        String decimalSeparator = monetarySeparators ? symbols.getMonetaryDecimalSeparatorString()
-                : symbols.getDecimalSeparatorString();
+    private DecimalMatcher(DecimalFormatSymbols symbols, Grouper grouper, int parseFlags) {
          Key groupingKey, decimalKey;
+        String groupingSeparator, decimalSeparator;
+        if (0 != (parseFlags & ParsingUtils.PARSE_FLAG_MONETARY_SEPARATORS)) {
+            groupingSeparator = symbols.getMonetaryGroupingSeparatorString();
+            decimalSeparator = symbols.getMonetaryDecimalSeparatorString();
+        } else {
+            groupingSeparator = symbols.getGroupingSeparatorString();
+            decimalSeparator = symbols.getDecimalSeparatorString();
+        }
  
          // Attempt to find values in the static cache
-        if (isStrict) {
-            decimalKey = UnicodeSetStaticCache.chooseFrom(decimalSeparator, Key.STRICT_COMMA, Key.STRICT_PERIOD);
+        if (0 != (parseFlags & ParsingUtils.PARSE_FLAG_STRICT_SEPARATORS)) {
+            decimalKey = UnicodeSetStaticCache
+                    .chooseFrom(decimalSeparator, Key.STRICT_COMMA, Key.STRICT_PERIOD);
              if (decimalKey == Key.STRICT_COMMA) {
                  // Decimal is comma; grouping should be period or custom
-                groupingKey = UnicodeSetStaticCache.chooseFrom(groupingSeparator, Key.STRICT_PERIOD_OR_OTHER);
+                groupingKey = UnicodeSetStaticCache.chooseFrom(groupingSeparator,
+                        Key.STRICT_PERIOD_OR_OTHER);
              } else if (decimalKey == Key.STRICT_PERIOD) {
                  // Decimal is period; grouping should be comma or custom
-                groupingKey = UnicodeSetStaticCache.chooseFrom(groupingSeparator, Key.STRICT_COMMA_OR_OTHER);
+                groupingKey = UnicodeSetStaticCache.chooseFrom(groupingSeparator,
+                        Key.STRICT_COMMA_OR_OTHER);
              } else {
                  // Decimal is custom; grouping can be either comma or period or custom
-                groupingKey = UnicodeSetStaticCache
-                        .chooseFrom(groupingSeparator, Key.STRICT_COMMA_OR_OTHER, Key.STRICT_PERIOD_OR_OTHER);
+                groupingKey = UnicodeSetStaticCache.chooseFrom(groupingSeparator,
+                        Key.STRICT_COMMA_OR_OTHER,
+                        Key.STRICT_PERIOD_OR_OTHER);
              }
          } else {
              decimalKey = UnicodeSetStaticCache.chooseFrom(decimalSeparator, Key.COMMA, Key.PERIOD);
@@ -73,35 +83,46 @@ public class DecimalMatcher implements NumberParseMatcher {
          }
  
          // Get the sets from the static cache if they were found
+        UnicodeSet _groupingUniSet = null, _decimalUniSet = null, _separatorSet = null, _leadSet = null;
          if (groupingKey != null && decimalKey != null) {
-            groupingUniSet = UnicodeSetStaticCache.get(groupingKey);
-            decimalUniSet = UnicodeSetStaticCache.get(decimalKey);
+            _groupingUniSet = UnicodeSetStaticCache.get(groupingKey);
+            _decimalUniSet = UnicodeSetStaticCache.get(decimalKey);
              Key separatorKey = UnicodeSetStaticCache.unionOf(groupingKey, decimalKey);
              if (separatorKey != null) {
-                separatorSet = UnicodeSetStaticCache.get(separatorKey);
-                separatorLeadChars = UnicodeSetStaticCache.getLeadChars(separatorKey);
+                _separatorSet = UnicodeSetStaticCache.get(separatorKey);
+                Key leadKey = UnicodeSetStaticCache.unionOf(Key.DIGITS, separatorKey);
+                if (leadKey != null) {
+                    _leadSet = UnicodeSetStaticCache.get(leadKey);
+                }
              }
          } else if (groupingKey != null) {
-            groupingUniSet = UnicodeSetStaticCache.get(groupingKey);
+            _groupingUniSet = UnicodeSetStaticCache.get(groupingKey);
          } else if (decimalKey != null) {
-            decimalUniSet = UnicodeSetStaticCache.get(decimalKey);
+            _decimalUniSet = UnicodeSetStaticCache.get(decimalKey);
          }
  
-        // Resolve fallbacks if we don't have sets from the static cache
-        if (groupingUniSet == null) {
-            groupingUniSet = new UnicodeSet().add(groupingSeparator).freeze();
-        }
-        if (decimalUniSet == null) {
-            decimalUniSet = new UnicodeSet().add(decimalSeparator).freeze();
-        }
-        if (separatorSet == null) {
-            separatorSet = new UnicodeSet().addAll(groupingUniSet).addAll(decimalUniSet).freeze();
-        }
+        // Finish resolving fallbacks
+        groupingUniSet = _groupingUniSet != null ? _groupingUniSet
+                : new UnicodeSet().add(groupingSeparator.codePointAt(0)).freeze();
+        decimalUniSet = _decimalUniSet != null ? _decimalUniSet
+                : new UnicodeSet().add(decimalSeparator.codePointAt(0)).freeze();
+        separatorSet = _separatorSet != null ? _separatorSet
+                : new UnicodeSet().addAll(groupingUniSet).addAll(decimalUniSet).freeze();
+        leadSet = _leadSet; // null if not available
  
          int cpZero = symbols.getCodePointZero();
          if (cpZero == -1 || !UCharacter.isDigit(cpZero) || UCharacter.digit(cpZero) != 0) {
-            digitStrings = symbols.getDigitStrings();
+            digitStrings = symbols.getDigitStringsLocal();
+        } else {
+            digitStrings = null;
          }
+
+        requireGroupingMatch = 0 != (parseFlags & ParsingUtils.PARSE_FLAG_STRICT_GROUPING_SIZE);
+        groupingDisabled = 0 != (parseFlags & ParsingUtils.PARSE_FLAG_GROUPING_DISABLED);
+        grouping1 = grouper.getPrimary();
+        grouping2 = grouper.getSecondary();
+        integerOnly = 0 != (parseFlags & ParsingUtils.PARSE_FLAG_INTEGER_ONLY);
+        isScientific = 0 != (parseFlags & ParsingUtils.PARSE_FLAG_DECIMAL_SCIENTIFIC);
      }
  
      @Override
@@ -110,7 +131,6 @@ public class DecimalMatcher implements NumberParseMatcher {
      }
  
      public boolean match(StringSegment segment, ParsedNumber result, boolean negativeExponent) {
-        assert frozen;
          if (result.seenNumber() && !isScientific) {
              // A number has already been consumed.
              return false;
@@ -177,16 +197,18 @@ public class DecimalMatcher implements NumberParseMatcher {
                  if (separator == -1) {
                      // First separator; could be either grouping or decimal.
                      separator = cp;
-                    if (groupingEnabled && requireGroupingMatch && groupingUniSet.contains(cp)
+                    if (!groupingDisabled
+                            && requireGroupingMatch
+                            && groupingUniSet.contains(cp)
                              && (currGroup == 0 || currGroup > grouping2)) {
                          break;
                      }
-                } else if (groupingEnabled && separator == cp && groupingUniSet.contains(cp)) {
+                } else if (!groupingDisabled && separator == cp && groupingUniSet.contains(cp)) {
                      // Second or later grouping separator.
                      if (requireGroupingMatch && currGroup != grouping2) {
                          break;
                      }
-                } else if (groupingEnabled && separator != cp && decimalUniSet.contains(cp)) {
+                } else if (!groupingDisabled && separator != cp && decimalUniSet.contains(cp)) {
                      // Decimal separator after a grouping separator.
                      if (requireGroupingMatch && currGroup != grouping1) {
                          break;
@@ -234,13 +256,15 @@ public class DecimalMatcher implements NumberParseMatcher {
                  result.quantity.truncate();
                  segment.setOffset(lastSeparatorOffset);
              }
-        } else if (separator != -1 && !groupingEnabled) {
+        } else if (separator != -1 && groupingDisabled) {
              // The final separator was a grouping separator, but we aren't accepting grouping.
              // Reset the offset to immediately before that grouping separator.
              result.quantity.adjustMagnitude(-currGroup);
              result.quantity.truncate();
              segment.setOffset(lastSeparatorOffset);
-        } else if (separator != -1 && requireGroupingMatch && groupingUniSet.contains(separator)
+        } else if (separator != -1
+                && requireGroupingMatch
+                && groupingUniSet.contains(separator)
                  && currGroup != grouping1) {
              // The final separator was a grouping separator, and we have a mismatched grouping size.
              // Reset the offset to the beginning of the number.
@@ -252,24 +276,25 @@ public class DecimalMatcher implements NumberParseMatcher {
              // segment.setOffset(initialOffset);
          }
  
-        return segment.length() == 0 || hasPartialPrefix || segment.isLeadingSurrogate();
+        return segment.length() == 0 || hasPartialPrefix;
      }
  
      @Override
-    public UnicodeSet getLeadChars(boolean ignoreCase) {
-        UnicodeSet leadChars = new UnicodeSet();
-        leadChars.addAll(UnicodeSetStaticCache.getLeadChars(Key.DIGITS));
+    public UnicodeSet getLeadCodePoints() {
+        if (digitStrings == null && leadSet != null) {
+            return leadSet;
+        }
+
+        UnicodeSet leadCodePoints = new UnicodeSet();
+        // Assumption: the sets are all single code points.
+        leadCodePoints.addAll(UnicodeSetStaticCache.get(Key.DIGITS));
+        leadCodePoints.addAll(separatorSet);
          if (digitStrings != null) {
              for (int i = 0; i < digitStrings.length; i++) {
-                ParsingUtils.putLeadingChar(digitStrings[i], leadChars, ignoreCase);
+                ParsingUtils.putLeadCodePoint(digitStrings[i], leadCodePoints);
              }
          }
-        if (separatorLeadChars != null) {
-            leadChars.addAll(separatorLeadChars);
-        } else {
-            ParsingUtils.putLeadSurrogates(separatorSet, leadChars);
-        }
-        return leadChars.freeze();
+        return leadCodePoints.freeze();
      }
  
      @Override
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/IgnorablesMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/IgnorablesMatcher.java

index 610572dea8cf7dfb12519b4c37de361a5551d720..854af3cde6b953439f54a74d14ee46cd74ff9fbe 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/IgnorablesMatcher.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/IgnorablesMatcher.java
@@ -26,13 +26,13 @@ public class IgnorablesMatcher extends RangeMatcher {
      }
  
      @Override
-    public UnicodeSet getLeadChars(boolean ignoreCase) {
+    public UnicodeSet getLeadCodePoints() {
          if (this == DEFAULT) {
-            return UnicodeSetStaticCache.getLeadChars(UnicodeSetStaticCache.Key.DEFAULT_IGNORABLES);
+            return UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.DEFAULT_IGNORABLES);
          } else if (this == STRICT) {
-            return UnicodeSetStaticCache.getLeadChars(UnicodeSetStaticCache.Key.STRICT_IGNORABLES);
+            return UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.STRICT_IGNORABLES);
          } else {
-            return super.getLeadChars(ignoreCase);
+            return super.getLeadCodePoints();
          }
      }
  
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/MatcherUtils.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/MatcherUtils.java

deleted file mode 100644 (file)

index 325837f..0000000
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/MatcherUtils.java
+++ /dev/null
@@ -1,15 +0,0 @@
-// © 2017 and later: Unicode, Inc. and others.
-// License & terms of use: http://www.unicode.org/copyright.html#License
-package com.ibm.icu.impl.number.parse;
-
-/**
- * @author sffc
- *
- */
-public class MatcherUtils {
-    public static boolean isValidCodePoint(int cp) {
-        return Character.isValidCodePoint(cp)
-                && (Character.isSupplementaryCodePoint(cp) || !Character.isSurrogate((char) cp));
-    }
-
-}
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NanMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NanMatcher.java

index 63d7972693740e67a49f73ecc31b37d420bf41dd..a1187ce24ce75f4f9012501bd4b872033f2f4bc4 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NanMatcher.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NanMatcher.java
@@ -2,6 +2,7 @@
  // License & terms of use: http://www.unicode.org/copyright.html#License
  package com.ibm.icu.impl.number.parse;
  
+import com.ibm.icu.lang.UCharacter;
  import com.ibm.icu.text.DecimalFormatSymbols;
  import com.ibm.icu.text.UnicodeSet;
  
@@ -11,23 +12,34 @@ import com.ibm.icu.text.UnicodeSet;
   */
  public class NanMatcher extends SymbolMatcher {
  
-    private static final NanMatcher DEFAULT = new NanMatcher();
+    private static final NanMatcher DEFAULT = new NanMatcher("NaN");
+    private static final NanMatcher DEFAULT_FOLDED = new NanMatcher(UCharacter.foldCase("NaN", true));
  
-    public static NanMatcher getInstance(DecimalFormatSymbols symbols) {
-        String symbolString = symbols.getNaN();
+    public static NanMatcher getInstance(DecimalFormatSymbols symbols, int parseFlags) {
+        String symbolString = ParsingUtils.maybeFold(symbols.getNaN(), parseFlags);
          if (DEFAULT.string.equals(symbolString)) {
              return DEFAULT;
+        } else if (DEFAULT_FOLDED.string.equals(symbolString)) {
+            return DEFAULT_FOLDED;
          } else {
              return new NanMatcher(symbolString);
          }
      }
  
      private NanMatcher(String symbolString) {
-        super(symbolString, DEFAULT.uniSet);
+        super(symbolString, UnicodeSet.EMPTY);
      }
  
-    private NanMatcher() {
-        super("NaN", UnicodeSet.EMPTY);
+    @Override
+    public UnicodeSet getLeadCodePoints() {
+        // Overriding this here to allow use of statically allocated sets
+        if (this == DEFAULT) {
+            return UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.CAPITAL_N);
+        } else if (this == DEFAULT_FOLDED) {
+            return UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.FOLDED_N);
+        } else {
+            return super.getLeadCodePoints();
+        }
      }
  
      @Override
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParseMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParseMatcher.java

index 760c30013c9894135e2b07d3e3a664f1747837d3..82893ed7d11a86c6828cb31fe23aca1df02e1d7e 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParseMatcher.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParseMatcher.java
@@ -29,7 +29,7 @@ public interface NumberParseMatcher {
       * this matcher unless a segment begins with a char in this set. To make this matcher always run, return
       * {@link UnicodeSet#ALL_CODE_POINTS}.
       */
-    public UnicodeSet getLeadChars(boolean ignoreCase);
+    public UnicodeSet getLeadCodePoints();
  
      /**
       * Method called at the end of a parse, after all matchers have failed to consume any more chars. Allows a matcher
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java

index 9bc433516f3ff739cc2517053a9f68ca874bca37..8137130ad636adfed22a690f13ed03a1216004b8 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java
@@ -14,8 +14,10 @@ import com.ibm.icu.impl.number.CustomSymbolCurrency;
  import com.ibm.icu.impl.number.DecimalFormatProperties;
  import com.ibm.icu.impl.number.Parse.ParseMode;
  import com.ibm.icu.impl.number.PatternStringParser;
+import com.ibm.icu.impl.number.PatternStringParser.ParsedPatternInfo;
  import com.ibm.icu.impl.number.PropertiesAffixPatternProvider;
  import com.ibm.icu.impl.number.RoundingUtils;
+import com.ibm.icu.number.Grouper;
  import com.ibm.icu.text.DecimalFormatSymbols;
  import com.ibm.icu.text.UnicodeSet;
  import com.ibm.icu.util.Currency;
@@ -33,23 +35,26 @@ public class NumberParserImpl {
      public static NumberParserImpl createParserFromPattern(String pattern, boolean strictGrouping) {
          // Temporary frontend for testing.
  
-        NumberParserImpl parser = new NumberParserImpl(true, true);
+        int parseFlags = ParsingUtils.PARSE_FLAG_IGNORE_CASE
+                | ParsingUtils.PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES;
+        if (strictGrouping) {
+            parseFlags |= ParsingUtils.PARSE_FLAG_STRICT_GROUPING_SIZE;
+        }
+
+        NumberParserImpl parser = new NumberParserImpl(parseFlags, true);
          ULocale locale = new ULocale("en_IN");
          DecimalFormatSymbols symbols = DecimalFormatSymbols.getInstance(locale);
          IgnorablesMatcher ignorables = IgnorablesMatcher.DEFAULT;
  
-        AffixPatternProvider patternInfo = PatternStringParser.parseToPatternInfo(pattern);
-        AffixMatcher.generateFromAffixPatternProvider(patternInfo, parser, ignorables, true);
+        ParsedPatternInfo patternInfo = PatternStringParser.parseToPatternInfo(pattern);
+        AffixMatcher.generateFromAffixPatternProvider(patternInfo, parser, ignorables, parseFlags);
+
+        Grouper grouper = Grouper.defaults().withLocaleData(patternInfo);
  
          parser.addMatcher(ignorables);
-        DecimalMatcher decimalMatcher = new DecimalMatcher();
-        decimalMatcher.requireGroupingMatch = strictGrouping;
-        decimalMatcher.grouping1 = 3;
-        decimalMatcher.grouping2 = 2;
-        decimalMatcher.freeze(symbols, false, false);
-        parser.addMatcher(decimalMatcher);
+        parser.addMatcher(DecimalMatcher.getInstance(symbols, grouper, parseFlags));
          parser.addMatcher(MinusSignMatcher.getInstance(symbols));
-        parser.addMatcher(new ScientificMatcher(symbols));
+        parser.addMatcher(ScientificMatcher.getInstance(symbols, grouper, parseFlags));
          parser.addMatcher(CurrencyTrieMatcher.getInstance(locale));
          parser.addMatcher(new RequireNumberMatcher());
  
@@ -90,7 +95,8 @@ public class NumberParserImpl {
                  currency = Currency.getInstance(result.currencyCode);
              } else {
                  assert 0 != (result.flags & ParsedNumber.FLAG_HAS_DEFAULT_CURRENCY);
-                currency = CustomSymbolCurrency.resolve(properties.getCurrency(), symbols.getULocale(), symbols);
+                currency = CustomSymbolCurrency
+                        .resolve(properties.getCurrency(), symbols.getULocale(), symbols);
              }
              return new CurrencyAmount(result.getNumber(), currency);
          } else {
@@ -110,23 +116,44 @@ public class NumberParserImpl {
              DecimalFormatSymbols symbols,
              boolean parseCurrency,
              boolean optimize) {
-        NumberParserImpl parser = new NumberParserImpl(!properties.getParseCaseSensitive(), optimize);
+
          ULocale locale = symbols.getULocale();
+        AffixPatternProvider patternInfo = new PropertiesAffixPatternProvider(properties);
          Currency currency = CustomSymbolCurrency.resolve(properties.getCurrency(), locale, symbols);
          boolean isStrict = properties.getParseMode() == ParseMode.STRICT;
-        IgnorablesMatcher ignorables = isStrict ? IgnorablesMatcher.STRICT : IgnorablesMatcher.DEFAULT;
-
          boolean decimalSeparatorRequired = properties.getDecimalPatternMatchRequired()
-                ? (properties.getDecimalSeparatorAlwaysShown() || properties.getMaximumFractionDigits() != 0)
+                ? (properties.getDecimalSeparatorAlwaysShown()
+                        || properties.getMaximumFractionDigits() != 0)
                  : false;
+        Grouper grouper = Grouper.defaults().withProperties(properties);
+        int parseFlags = 0;
+        if (!properties.getParseCaseSensitive()) {
+            parseFlags |= ParsingUtils.PARSE_FLAG_IGNORE_CASE;
+        }
+        if (properties.getParseIntegerOnly()) {
+            parseFlags |= ParsingUtils.PARSE_FLAG_INTEGER_ONLY;
+        }
+        if (isStrict) {
+            parseFlags |= ParsingUtils.PARSE_FLAG_STRICT_GROUPING_SIZE;
+        } else {
+            parseFlags |= ParsingUtils.PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES;
+        }
+        if (grouper.getPrimary() == -1) {
+            parseFlags |= ParsingUtils.PARSE_FLAG_GROUPING_DISABLED;
+        }
+        if (parseCurrency || patternInfo.hasCurrencySign()) {
+            parseFlags |= ParsingUtils.PARSE_FLAG_MONETARY_SEPARATORS;
+        }
+        IgnorablesMatcher ignorables = isStrict ? IgnorablesMatcher.STRICT : IgnorablesMatcher.DEFAULT;
+
+        NumberParserImpl parser = new NumberParserImpl(parseFlags, optimize);
  
          //////////////////////
          /// AFFIX MATCHERS ///
          //////////////////////
  
          // Set up a pattern modifier with mostly defaults to generate AffixMatchers.
-        AffixPatternProvider patternInfo = new PropertiesAffixPatternProvider(properties);
-        AffixMatcher.generateFromAffixPatternProvider(patternInfo, parser, ignorables, !isStrict);
+        AffixMatcher.generateFromAffixPatternProvider(patternInfo, parser, ignorables, parseFlags);
  
          ////////////////////////
          /// CURRENCY MATCHER ///
@@ -134,18 +161,20 @@ public class NumberParserImpl {
  
          if (parseCurrency || patternInfo.hasCurrencySign()) {
              parser.addMatcher(CurrencyTrieMatcher.getInstance(locale));
-            parser.addMatcher(CurrencyMatcher.getInstance(currency, locale));
+            parser.addMatcher(CurrencyMatcher.getInstance(currency, locale, parseFlags));
          }
  
          ///////////////////////////////
          /// OTHER STANDARD MATCHERS ///
          ///////////////////////////////
  
-        if (!isStrict || patternInfo.containsSymbolType(AffixUtils.TYPE_PLUS_SIGN) || properties.getSignAlwaysShown()) {
+        if (!isStrict
+                || patternInfo.containsSymbolType(AffixUtils.TYPE_PLUS_SIGN)
+                || properties.getSignAlwaysShown()) {
              parser.addMatcher(PlusSignMatcher.getInstance(symbols));
          }
          parser.addMatcher(MinusSignMatcher.getInstance(symbols));
-        parser.addMatcher(NanMatcher.getInstance(symbols));
+        parser.addMatcher(NanMatcher.getInstance(symbols, parseFlags));
          parser.addMatcher(PercentMatcher.getInstance(symbols));
          parser.addMatcher(PermilleMatcher.getInstance(symbols));
          parser.addMatcher(InfinityMatcher.getInstance(symbols));
@@ -154,17 +183,9 @@ public class NumberParserImpl {
              parser.addMatcher(new PaddingMatcher(padString));
          }
          parser.addMatcher(ignorables);
-        DecimalMatcher decimalMatcher = new DecimalMatcher();
-        decimalMatcher.requireGroupingMatch = isStrict;
-        decimalMatcher.groupingEnabled = properties.getGroupingSize() > 0;
-        decimalMatcher.decimalEnabled = properties.getDecimalPatternMatchRequired() ? decimalSeparatorRequired : true;
-        decimalMatcher.grouping1 = properties.getGroupingSize();
-        decimalMatcher.grouping2 = properties.getSecondaryGroupingSize();
-        decimalMatcher.integerOnly = properties.getParseIntegerOnly();
-        decimalMatcher.freeze(symbols, parseCurrency || patternInfo.hasCurrencySign(), isStrict);
-        parser.addMatcher(decimalMatcher);
+        parser.addMatcher(DecimalMatcher.getInstance(symbols, grouper, parseFlags));
          if (!properties.getParseNoExponent()) {
-            parser.addMatcher(new ScientificMatcher(symbols));
+            parser.addMatcher(ScientificMatcher.getInstance(symbols, grouper, parseFlags));
          }
  
          //////////////////
@@ -195,9 +216,9 @@ public class NumberParserImpl {
          return parser;
      }
  
-    private final boolean ignoreCase;
+    private final int parseFlags;
      private final List<NumberParseMatcher> matchers;
-    private final List<UnicodeSet> leadCharses;
+    private final List<UnicodeSet> leadCodePointses;
      private Comparator<ParsedNumber> comparator;
      private boolean frozen;
  
@@ -205,43 +226,44 @@ public class NumberParserImpl {
       * Creates a new, empty parser.
       *
       * @param ignoreCase
-     *            If true, perform case-folding. This parameter needs to go into the constructor because its value is
-     *            used during the construction of the matcher chain.
+     *            If true, perform case-folding. This parameter needs to go into the constructor because
+     *            its value is used during the construction of the matcher chain.
       * @param optimize
-     *            If true, compute "lead chars" UnicodeSets for the matchers. This reduces parsing runtime but increases
-     *            construction runtime. If the parser is going to be used only once or twice, set this to false; if it
-     *            is going to be used hundreds of times, set it to true.
+     *            If true, compute "lead code points" UnicodeSets for the matchers. This reduces
+     *            parsing runtime but increases construction runtime. If the parser is going to be
+     *            used only once or twice, set this to false; if it is going to be used hundreds of
+     *            times, set it to true.
       */
-    public NumberParserImpl(boolean ignoreCase, boolean optimize) {
+    public NumberParserImpl(int parseFlags, boolean optimize) {
          matchers = new ArrayList<NumberParseMatcher>();
          if (optimize) {
-            leadCharses = new ArrayList<UnicodeSet>();
+            leadCodePointses = new ArrayList<UnicodeSet>();
          } else {
-            leadCharses = null;
+            leadCodePointses = null;
          }
          comparator = ParsedNumber.COMPARATOR; // default value
-        this.ignoreCase = ignoreCase;
+        this.parseFlags = parseFlags;
          frozen = false;
      }
  
      public void addMatcher(NumberParseMatcher matcher) {
          assert !frozen;
          this.matchers.add(matcher);
-        if (leadCharses != null) {
-            UnicodeSet leadChars = matcher.getLeadChars(ignoreCase);
-            assert leadChars.isFrozen();
-            this.leadCharses.add(leadChars);
+        if (leadCodePointses != null) {
+            UnicodeSet leadCodePoints = matcher.getLeadCodePoints();
+            assert leadCodePoints.isFrozen();
+            this.leadCodePointses.add(leadCodePoints);
          }
      }
  
      public void addMatchers(Collection<? extends NumberParseMatcher> matchers) {
          assert !frozen;
          this.matchers.addAll(matchers);
-        if (leadCharses != null) {
+        if (leadCodePointses != null) {
              for (NumberParseMatcher matcher : matchers) {
-                UnicodeSet leadChars = matcher.getLeadChars(ignoreCase);
-                assert leadChars.isFrozen();
-                this.leadCharses.add(leadChars);
+                UnicodeSet leadCodePoints = matcher.getLeadCodePoints();
+                assert leadCodePoints.isFrozen();
+                this.leadCodePointses.add(leadCodePoints);
              }
          }
      }
@@ -263,8 +285,8 @@ public class NumberParserImpl {
       * Primary entrypoint to parsing code path.
       *
       * @param input
-     *            The string to parse. This is a String, not CharSequence, to enforce assumptions about immutability
-     *            (CharSequences are not guaranteed to be immutable).
+     *            The string to parse. This is a String, not CharSequence, to enforce assumptions about
+     *            immutability (CharSequences are not guaranteed to be immutable).
       * @param start
       *            The index into the string at which to start parsing.
       * @param greedy
@@ -274,7 +296,7 @@ public class NumberParserImpl {
       */
      public void parse(String input, int start, boolean greedy, ParsedNumber result) {
          assert frozen;
-        StringSegment segment = new StringSegment(input, ignoreCase);
+        StringSegment segment = new StringSegment(ParsingUtils.maybeFold(input, parseFlags));
          segment.adjustOffset(start);
          if (greedy) {
              parseGreedyRecursive(segment, result);
@@ -293,10 +315,9 @@ public class NumberParserImpl {
          }
  
          int initialOffset = segment.getOffset();
-        char leadChar = leadCharses == null ? 0
-                : ignoreCase ? ParsingUtils.getCaseFoldedLeadingChar(segment) : segment.charAt(0);
+        int leadCp = segment.getCodePoint();
          for (int i = 0; i < matchers.size(); i++) {
-            if (leadCharses != null && !leadCharses.get(i).contains(leadChar)) {
+            if (leadCodePointses != null && !leadCodePointses.get(i).contains(leadCp)) {
                  continue;
              }
              NumberParseMatcher matcher = matchers.get(i);
@@ -304,7 +325,8 @@ public class NumberParserImpl {
              if (segment.getOffset() != initialOffset) {
                  // In a greedy parse, recurse on only the first match.
                  parseGreedyRecursive(segment, result);
-                // The following line resets the offset so that the StringSegment says the same across the function
+                // The following line resets the offset so that the StringSegment stays the same across
+                // the function
                  // call boundary. Since we recurse only once, this line is not strictly necessary.
                  segment.setOffset(initialOffset);
                  return;
@@ -329,10 +351,11 @@ public class NumberParserImpl {
          for (int i = 0; i < matchers.size(); i++) {
              NumberParseMatcher matcher = matchers.get(i);
              // In a non-greedy parse, we attempt all possible matches and pick the best.
-            for (int charsToConsume = 1; charsToConsume <= segment.length(); charsToConsume++) {
-                candidate.copyFrom(initial);
+            for (int charsToConsume = 0; charsToConsume < segment.length();) {
+                charsToConsume += Character.charCount(Character.codePointAt(segment, charsToConsume));
  
                  // Run the matcher on a segment of the current length.
+                candidate.copyFrom(initial);
                  segment.setLength(charsToConsume);
                  boolean maybeMore = matcher.match(segment, candidate);
                  segment.resetLength();
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ParsingUtils.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ParsingUtils.java

index 412dd4983107f8801d60eeed2522678e570fdde1..cda00b9aa7a3575a9c95e43cb20b4016a6a68b49 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ParsingUtils.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ParsingUtils.java
@@ -3,7 +3,6 @@
  package com.ibm.icu.impl.number.parse;
  
  import com.ibm.icu.lang.UCharacter;
-import com.ibm.icu.text.UTF16;
  import com.ibm.icu.text.UnicodeSet;
  import com.ibm.icu.text.UnicodeSet.EntryRange;
  
@@ -12,50 +11,40 @@ import com.ibm.icu.text.UnicodeSet.EntryRange;
   */
  public class ParsingUtils {
  
-    /**
-     * Adds all chars and lead surrogates from input into output.
-     */
-    public static void putLeadSurrogates(UnicodeSet input, UnicodeSet output) {
-        if (input.isEmpty()) {
-            return;
-        }
+    public static final int PARSE_FLAG_IGNORE_CASE = 0x0001;
+    public static final int PARSE_FLAG_MONETARY_SEPARATORS = 0x0002;
+    public static final int PARSE_FLAG_STRICT_SEPARATORS = 0x0004;
+    public static final int PARSE_FLAG_STRICT_GROUPING_SIZE = 0x0008;
+    public static final int PARSE_FLAG_INTEGER_ONLY = 0x0010;
+    public static final int PARSE_FLAG_GROUPING_DISABLED = 0x0020;
+    public static final int PARSE_FLAG_DECIMAL_SCIENTIFIC = 0x0040;
+    public static final int PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES = 0x0080;
+
+    public static void putLeadCodePoints(UnicodeSet input, UnicodeSet output) {
          for (EntryRange range : input.ranges()) {
-            if (range.codepointEnd <= 0xFFFF) {
-                // All BMP chars
-                output.add(range.codepoint, range.codepointEnd);
-            } else {
-                // Need to get the lead surrogates
-                // TODO: Make this more efficient?
-                if (range.codepoint <= 0xFFFF) {
-                    output.add(range.codepoint, 0xFFFF);
-                }
-                for (int cp = Math.max(0x10000, range.codepoint); cp <= range.codepointEnd; cp++) {
-                    output.add(UTF16.getLeadSurrogate(cp));
-                }
-            }
+            output.add(range.codepoint, range.codepointEnd);
+        }
+        for (String str : input.strings()) {
+            output.add(str.codePointAt(0));
          }
      }
  
-    /**
-     * Adds the first char of the given string to leadChars, performing case-folding if necessary.
-     */
-    public static void putLeadingChar(String str, UnicodeSet leadChars, boolean ignoreCase) {
-        if (str.isEmpty()) {
-            return;
-        }
-        if (ignoreCase) {
-            leadChars.add(getCaseFoldedLeadingChar(str));
-        } else {
-            leadChars.add(str.charAt(0));
+    public static void putLeadCodePoint(String input, UnicodeSet output) {
+        if (!input.isEmpty()) {
+            output.add(input.codePointAt(0));
          }
      }
  
-    public static char getCaseFoldedLeadingChar(CharSequence str) {
-        int cp = UCharacter.foldCase(Character.codePointAt(str, 0), true);
-        if (cp <= 0xFFFF) {
-            return (char) cp;
+    private static final UnicodeSet LETTERS = new UnicodeSet("[:letter:]").freeze();
+
+    /**
+     * Case-folds the string if IGNORE_CASE flag is set; otherwise, returns the same string.
+     */
+    public static String maybeFold(String input, int parseFlags) {
+        if (0 != (parseFlags & PARSE_FLAG_IGNORE_CASE) && LETTERS.containsSome(input)) {
+            return UCharacter.foldCase(input, true);
          } else {
-            return UTF16.getLeadSurrogate(cp);
+            return input;
          }
      }
  
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/RangeMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/RangeMatcher.java

index 8451059359ba0c632a10cfeb5a0de5e6cb512056..36d7076a9f6994bd855d9fbeec6d5fd5b9536591 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/RangeMatcher.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/RangeMatcher.java
@@ -35,7 +35,7 @@ public abstract class RangeMatcher implements NumberParseMatcher {
              }
  
              // If we get here, the code point didn't match the uniSet.
-            return segment.isLeadingSurrogate();
+            return false;
          }
  
          // If we get here, we consumed the entire string segment.
@@ -43,10 +43,10 @@ public abstract class RangeMatcher implements NumberParseMatcher {
      }
  
      @Override
-    public UnicodeSet getLeadChars(boolean ignoreCase) {
-        UnicodeSet leadChars = new UnicodeSet();
-        ParsingUtils.putLeadSurrogates(uniSet, leadChars);
-        return leadChars.freeze();
+    public UnicodeSet getLeadCodePoints() {
+        UnicodeSet leadCodePoints = new UnicodeSet();
+        ParsingUtils.putLeadCodePoints(uniSet, leadCodePoints);
+        return leadCodePoints.freeze();
      }
  
      @Override
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ScientificMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ScientificMatcher.java

index 7f9dd7da519bf079cbd4d8cea3318eec39be645e..71ef5d5f1af3173d501736a30df400157e380c4e 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ScientificMatcher.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ScientificMatcher.java
@@ -2,6 +2,7 @@
  // License & terms of use: http://www.unicode.org/copyright.html#License
  package com.ibm.icu.impl.number.parse;
  
+import com.ibm.icu.number.Grouper;
  import com.ibm.icu.text.DecimalFormatSymbols;
  import com.ibm.icu.text.UnicodeSet;
  
@@ -14,13 +15,19 @@ public class ScientificMatcher implements NumberParseMatcher {
      private final String exponentSeparatorString;
      private final DecimalMatcher exponentMatcher;
  
-    public ScientificMatcher(DecimalFormatSymbols symbols) {
-        exponentSeparatorString = symbols.getExponentSeparator();
-        exponentMatcher = new DecimalMatcher();
-        exponentMatcher.isScientific = true;
-        exponentMatcher.groupingEnabled = false;
-        exponentMatcher.decimalEnabled = false;
-        exponentMatcher.freeze(symbols, false, false);
+    public static ScientificMatcher getInstance(
+            DecimalFormatSymbols symbols,
+            Grouper grouper,
+            int parseFlags) {
+        // TODO: Static-initialize most common instances?
+        return new ScientificMatcher(symbols, grouper, parseFlags);
+    }
+
+    private ScientificMatcher(DecimalFormatSymbols symbols, Grouper grouper, int parseFlags) {
+        exponentSeparatorString = ParsingUtils.maybeFold(symbols.getExponentSeparator(), parseFlags);
+        exponentMatcher = DecimalMatcher.getInstance(symbols,
+                grouper,
+                ParsingUtils.PARSE_FLAG_DECIMAL_SCIENTIFIC | ParsingUtils.PARSE_FLAG_INTEGER_ONLY);
      }
  
      @Override
@@ -76,10 +83,15 @@ public class ScientificMatcher implements NumberParseMatcher {
      }
  
      @Override
-    public UnicodeSet getLeadChars(boolean ignoreCase) {
-        UnicodeSet leadChars = new UnicodeSet();
-        ParsingUtils.putLeadingChar(exponentSeparatorString, leadChars, ignoreCase);
-        return leadChars.freeze();
+    public UnicodeSet getLeadCodePoints() {
+        int cp = exponentSeparatorString.codePointAt(0);
+        if (cp == 'E') {
+            return UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.CAPITAL_E);
+        } else if (cp == 'e') {
+            return UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.FOLDED_E);
+        } else {
+            return new UnicodeSet().add(cp).freeze();
+        }
      }
  
      @Override
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/StringSegment.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/StringSegment.java

index abf747d78ad1c5fcc9d8c941444b7aef45f94877..0bf8b0c33e24d5ce2d69d3b3bbf83078e158d127 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/StringSegment.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/StringSegment.java
@@ -2,11 +2,9 @@
  // License & terms of use: http://www.unicode.org/copyright.html#License
  package com.ibm.icu.impl.number.parse;
  
-import com.ibm.icu.lang.UCharacter;
-
  /**
- * A mutable class allowing for a String with a variable offset and length. The charAt, length, and subSequence methods
- * all operate relative to the fixed offset into the String.
+ * A mutable class allowing for a String with a variable offset and length. The charAt, length, and
+ * subSequence methods all operate relative to the fixed offset into the String.
   *
   * @author sffc
   */
@@ -14,13 +12,11 @@ public class StringSegment implements CharSequence {
      private final String str;
      private int start;
      private int end;
-    private final boolean ignoreCase;
  
-    public StringSegment(String str, boolean ignoreCase) {
+    public StringSegment(String str) {
          this.str = str;
          this.start = 0;
          this.end = str.length();
-        this.ignoreCase = ignoreCase;
      }
  
      public int getOffset() {
@@ -66,7 +62,8 @@ public class StringSegment implements CharSequence {
      }
  
      /**
-     * Returns the first code point in the string segment, or -1 if the string starts with an invalid code point.
+     * Returns the first code point in the string segment, or -1 if the string starts with an invalid
+     * code point.
       */
      public int getCodePoint() {
          assert start < end;
@@ -81,36 +78,17 @@ public class StringSegment implements CharSequence {
      }
  
      /**
-     * Returns whether the segment is one char in length, and that the char is a leading surrogate.
-     */
-    public boolean isLeadingSurrogate() {
-        return (end - start == 1) && Character.isHighSurrogate(str.charAt(start));
-    }
-
-    /**
-     * Returns the length of the prefix shared by this StringSegment and the given CharSequence. For example, if this
-     * string segment is "aab", and the char sequence is "aac", this method returns 2, since the first 2 characters are
-     * the same.
+     * Returns the length of the prefix shared by this StringSegment and the given CharSequence. For
+     * example, if this string segment is "aab", and the char sequence is "aac", this method returns 2,
+     * since the first 2 characters are the same.
       */
      public int getCommonPrefixLength(CharSequence other) {
          int offset = 0;
          for (; offset < Math.min(length(), other.length());) {
-            if (ignoreCase) {
-                // NOTE: Character.codePointAt() returns the leading surrogate if it is the only char left in the
-                // string. UCharacter.foldCase() will simply return the same integer since it is not a valid code point.
-                int cp1 = Character.codePointAt(this, offset);
-                int cp2 = Character.codePointAt(other, offset);
-                if (cp1 != cp2 && UCharacter.foldCase(cp1, true) != UCharacter.foldCase(cp2, true)) {
-                    break;
-                }
-                offset += Character.charCount(cp1);
-            } else {
-                // Case folding is not necessary. Use a slightly faster code path comparing chars with chars.
-                if (charAt(offset) != other.charAt(offset)) {
-                    break;
-                }
-                offset++;
+            if (charAt(offset) != other.charAt(offset)) {
+                break;
              }
+            offset++;
          }
          return offset;
      }
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/SymbolMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/SymbolMatcher.java

index 11af03339d19ea96a74fdce349cdaf68cf27c94c..863e9c83a893bae7949077a49a935b833126403a 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/SymbolMatcher.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/SymbolMatcher.java
@@ -11,7 +11,6 @@ import com.ibm.icu.text.UnicodeSet;
  public abstract class SymbolMatcher implements NumberParseMatcher {
      protected final String string;
      protected final UnicodeSet uniSet;
-    protected final UnicodeSet leadChars;
  
      // TODO: Implement this class using only UnicodeSet and not String?
      // How to deal with case folding?
@@ -19,13 +18,11 @@ public abstract class SymbolMatcher implements NumberParseMatcher {
      protected SymbolMatcher(String symbolString, UnicodeSet symbolUniSet) {
          string = symbolString;
          uniSet = symbolUniSet;
-        leadChars = null;
      }
  
      protected SymbolMatcher(UnicodeSetStaticCache.Key key) {
          string = "";
          uniSet = UnicodeSetStaticCache.get(key);
-        leadChars = UnicodeSetStaticCache.getLeadChars(key);
      }
  
      @Override
@@ -43,7 +40,7 @@ public abstract class SymbolMatcher implements NumberParseMatcher {
          }
  
          if (string.isEmpty()) {
-            return segment.isLeadingSurrogate();
+            return false;
          }
          int overlap = segment.getCommonPrefixLength(string);
          if (overlap == string.length()) {
@@ -51,19 +48,20 @@ public abstract class SymbolMatcher implements NumberParseMatcher {
              accept(segment, result);
              return false;
          }
-        return overlap == segment.length() || segment.isLeadingSurrogate();
+        return overlap == segment.length();
      }
  
      @Override
-    public UnicodeSet getLeadChars(boolean ignoreCase) {
-        if (leadChars != null) {
-            return leadChars;
+    public UnicodeSet getLeadCodePoints() {
+        if (string == null || string.isEmpty()) {
+            // Assumption: for sets from UnicodeSetStaticCache, uniSet == leadCodePoints.
+            return uniSet;
          }
  
-        UnicodeSet leadChars = new UnicodeSet();
-        ParsingUtils.putLeadSurrogates(uniSet, leadChars);
-        ParsingUtils.putLeadingChar(string, leadChars, ignoreCase);
-        return leadChars.freeze();
+        UnicodeSet leadCodePoints = new UnicodeSet();
+        ParsingUtils.putLeadCodePoints(uniSet, leadCodePoints);
+        ParsingUtils.putLeadCodePoint(string, leadCodePoints);
+        return leadCodePoints.freeze();
      }
  
      @Override
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java

index 3a3a2d70f116ac48528af5eb06c4d6578ccdb141..dad2bb7ed9c828d6ea94916bae49c8428c246437 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java
@@ -8,8 +8,14 @@ import java.util.Map;
  import com.ibm.icu.text.UnicodeSet;
  
  /**
- * @author sffc
+ * This class statically initializes UnicodeSets useful for number parsing. Microbenchmarks show this to
+ * bring a very sizeable performance boost.
+ *
+ * IMPORTANT ASSUMPTION: All of the sets contain code points (no strings) and they are all case-folded.
+ * If this assumption were ever broken, logic in classes such as SymbolMatcher would need to be updated
+ * in order to return well-formed sets upon calls to getLeadCodePoints().
   *
+ * @author sffc
   */
  public class UnicodeSetStaticCache {
      public static enum Key {
@@ -42,19 +48,26 @@ public class UnicodeSetStaticCache {
  
          // Other
          DIGITS,
+        CAPITAL_N,
+        FOLDED_N,
+        CAPITAL_E,
+        FOLDED_E,
+
+        // Combined Separators with Digits (for lead code points)
+        DIGITS_OR_COMMA_OR_OTHER,
+        DIGITS_OR_PERIOD_OR_OTHER,
+        DIGITS_OR_COMMA_OR_PERIOD_OR_OTHER,
+        DIGITS_OR_STRICT_COMMA_OR_OTHER,
+        DIGITS_OR_STRICT_PERIOD_OR_OTHER,
+        DIGITS_OR_STRICT_COMMA_OR_PERIOD_OR_OTHER,
      };
  
      private static final Map<Key, UnicodeSet> unicodeSets = new EnumMap<Key, UnicodeSet>(Key.class);
-    private static final Map<Key, UnicodeSet> leadCharsSets = new EnumMap<Key, UnicodeSet>(Key.class);
  
      public static UnicodeSet get(Key key) {
          return unicodeSets.get(key);
      }
  
-    public static UnicodeSet getLeadChars(Key key) {
-        return leadCharsSets.get(key);
-    }
-
      public static Key chooseFrom(String str, Key key1) {
          return get(key1).contains(str) ? key1 : null;
      }
@@ -107,6 +120,23 @@ public class UnicodeSetStaticCache {
              // Strict 1'234.567
              return Key.STRICT_PERIOD_OR_OTHER;
  
+        } else if (key1 == Key.COMMA_OR_OTHER && key2 == Key.DIGITS) {
+            return Key.DIGITS_OR_COMMA_OR_OTHER;
+
+        } else if (key1 == Key.PERIOD_OR_OTHER && key2 == Key.DIGITS) {
+            return Key.DIGITS_OR_PERIOD_OR_OTHER;
+
+        } else if (key1 == Key.COMMA_OR_PERIOD_OR_OTHER && key2 == Key.DIGITS) {
+            return Key.DIGITS_OR_COMMA_OR_PERIOD_OR_OTHER;
+
+        } else if (key1 == Key.STRICT_COMMA_OR_OTHER && key2 == Key.DIGITS) {
+            return Key.DIGITS_OR_STRICT_COMMA_OR_OTHER;
+
+        } else if (key1 == Key.STRICT_PERIOD_OR_OTHER && key2 == Key.DIGITS) {
+            return Key.DIGITS_OR_STRICT_PERIOD_OR_OTHER;
+
+        } else if (key1 == Key.STRICT_COMMA_OR_PERIOD_OR_OTHER && key2 == Key.DIGITS) {
+            return Key.DIGITS_OR_STRICT_COMMA_OR_PERIOD_OR_OTHER;
          }
  
          return null;
@@ -143,8 +173,10 @@ public class UnicodeSetStaticCache {
          unicodeSets.put(Key.PERIOD_OR_OTHER, computeUnion(Key.PERIOD, Key.OTHER_GROUPING_SEPARATORS));
          unicodeSets.put(Key.COMMA_OR_PERIOD_OR_OTHER,
                  computeUnion(Key.COMMA, Key.PERIOD, Key.OTHER_GROUPING_SEPARATORS));
-        unicodeSets.put(Key.STRICT_COMMA_OR_OTHER, computeUnion(Key.STRICT_COMMA, Key.OTHER_GROUPING_SEPARATORS));
-        unicodeSets.put(Key.STRICT_PERIOD_OR_OTHER, computeUnion(Key.STRICT_PERIOD, Key.OTHER_GROUPING_SEPARATORS));
+        unicodeSets.put(Key.STRICT_COMMA_OR_OTHER,
+                computeUnion(Key.STRICT_COMMA, Key.OTHER_GROUPING_SEPARATORS));
+        unicodeSets.put(Key.STRICT_PERIOD_OR_OTHER,
+                computeUnion(Key.STRICT_PERIOD, Key.OTHER_GROUPING_SEPARATORS));
          unicodeSets.put(Key.STRICT_COMMA_OR_PERIOD_OR_OTHER,
                  computeUnion(Key.STRICT_COMMA, Key.STRICT_PERIOD, Key.OTHER_GROUPING_SEPARATORS));
  
@@ -157,11 +189,20 @@ public class UnicodeSetStaticCache {
          unicodeSets.put(Key.INFINITY, new UnicodeSet("[∞]").freeze());
  
          unicodeSets.put(Key.DIGITS, new UnicodeSet("[:digit:]").freeze());
-
-        for (Key key : Key.values()) {
-            UnicodeSet leadChars = new UnicodeSet();
-            ParsingUtils.putLeadSurrogates(get(key), leadChars);
-            leadCharsSets.put(key, leadChars.freeze());
-        }
+        unicodeSets.put(Key.CAPITAL_N, new UnicodeSet("[N]").freeze());
+        unicodeSets.put(Key.FOLDED_N, new UnicodeSet("[n]").freeze());
+        unicodeSets.put(Key.CAPITAL_E, new UnicodeSet("[E]").freeze());
+        unicodeSets.put(Key.FOLDED_E, new UnicodeSet("[e]").freeze());
+
+        unicodeSets.put(Key.DIGITS_OR_COMMA_OR_OTHER, computeUnion(Key.DIGITS, Key.COMMA_OR_OTHER));
+        unicodeSets.put(Key.DIGITS_OR_PERIOD_OR_OTHER, computeUnion(Key.DIGITS, Key.PERIOD_OR_OTHER));
+        unicodeSets.put(Key.DIGITS_OR_COMMA_OR_PERIOD_OR_OTHER,
+                computeUnion(Key.DIGITS, Key.COMMA_OR_PERIOD_OR_OTHER));
+        unicodeSets.put(Key.DIGITS_OR_STRICT_COMMA_OR_OTHER,
+                computeUnion(Key.DIGITS, Key.STRICT_COMMA_OR_OTHER));
+        unicodeSets.put(Key.DIGITS_OR_STRICT_PERIOD_OR_OTHER,
+                computeUnion(Key.DIGITS, Key.STRICT_PERIOD_OR_OTHER));
+        unicodeSets.put(Key.DIGITS_OR_STRICT_COMMA_OR_PERIOD_OR_OTHER,
+                computeUnion(Key.DIGITS, Key.STRICT_COMMA_OR_PERIOD_OR_OTHER));
      }
  }
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ValidationMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ValidationMatcher.java

index 4fbbe814ddcfbfad9de309ddc6325a898068a58f..bfe5a6b54919b9c93add7e0fb6d7005e89a16e7a 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ValidationMatcher.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ValidationMatcher.java
@@ -15,7 +15,7 @@ public abstract class ValidationMatcher implements NumberParseMatcher {
      }
  
      @Override
-    public UnicodeSet getLeadChars(boolean ignoreCase) {
+    public UnicodeSet getLeadCodePoints() {
          return UnicodeSet.EMPTY;
      }
  
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/number/Grouper.java b/icu4j/main/classes/core/src/com/ibm/icu/number/Grouper.java

index b3e4318c871a815e08feaf326cfd5300da1c9de7..3b31e2ee3183114c5ef12bd5923d11ae9f40b6a2 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/number/Grouper.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/number/Grouper.java
@@ -2,6 +2,7 @@
  // License & terms of use: http://www.unicode.org/copyright.html#License
  package com.ibm.icu.number;
  
+import com.ibm.icu.impl.number.DecimalFormatProperties;
  import com.ibm.icu.impl.number.DecimalQuantity;
  import com.ibm.icu.impl.number.PatternStringParser.ParsedPatternInfo;
  
@@ -84,7 +85,30 @@ public class Grouper {
          }
      }
  
-    Grouper withLocaleData(ParsedPatternInfo patternInfo) {
+    /**
+     * @internal
+     * @deprecated This API is ICU internal only.
+     */
+    @Deprecated
+    public Grouper withProperties(DecimalFormatProperties properties) {
+        if (grouping1 != -2) {
+            return this;
+        }
+        byte grouping1 = (byte) properties.getGroupingSize();
+        byte grouping2 = (byte) properties.getSecondaryGroupingSize();
+        int minGrouping = properties.getMinimumGroupingDigits();
+        grouping1 = grouping1 > 0 ? grouping1 : grouping2 > 0 ? grouping2 : -1;
+        grouping2 = grouping2 > 0 ? grouping2 : grouping1;
+        // TODO: Is it important to handle minGrouping > 2?
+        return getInstance(grouping1, grouping2, minGrouping == 2);
+    }
+
+    /**
+     * @internal
+     * @deprecated This API is ICU internal only.
+     */
+    @Deprecated
+    public Grouper withLocaleData(ParsedPatternInfo patternInfo) {
          if (grouping1 != -2) {
              return this;
          }
@@ -112,4 +136,22 @@ public class Grouper {
                  && (position % grouping2) == 0
                  && value.getUpperDisplayMagnitude() - grouping1 + 1 >= (min2 ? 2 : 1);
      }
+
+    /**
+     * @internal
+     * @deprecated This API is ICU internal only.
+     */
+    @Deprecated
+    public byte getPrimary() {
+        return grouping1;
+    }
+
+    /**
+     * @internal
+     * @deprecated This API is ICU internal only.
+     */
+    @Deprecated
+    public byte getSecondary() {
+        return grouping2;
+    }
  }
 \ No newline at end of file
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/number/NumberPropertyMapper.java b/icu4j/main/classes/core/src/com/ibm/icu/number/NumberPropertyMapper.java

index ff69c077fa3bd97e7b620b07d036f1f4600fc882..8abcc5c84eae00000829aefc512dd582e793bf07 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/number/NumberPropertyMapper.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/number/NumberPropertyMapper.java
@@ -193,14 +193,7 @@ final class NumberPropertyMapper {
          // GROUPING STRATEGY //
          ///////////////////////
  
-        int grouping1 = properties.getGroupingSize();
-        int grouping2 = properties.getSecondaryGroupingSize();
-        int minGrouping = properties.getMinimumGroupingDigits();
-        assert grouping1 >= -2; // value of -2 means to forward no grouping information
-        grouping1 = grouping1 > 0 ? grouping1 : grouping2 > 0 ? grouping2 : grouping1;
-        grouping2 = grouping2 > 0 ? grouping2 : grouping1;
-        // TODO: Is it important to handle minGrouping > 2?
-        macros.grouper = Grouper.getInstance((byte) grouping1, (byte) grouping2, minGrouping == 2);
+        macros.grouper = Grouper.defaults().withProperties(properties);
  
          /////////////
          // PADDING //
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberParserTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberParserTest.java

index ccba72a584995221597bb2dfc518ab9230493b89..ff64a36f11f90b10025b078c6f2f6d91a9385b3f 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberParserTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberParserTest.java
@@ -35,14 +35,14 @@ public class NumberParserTest {
                  { 3, "𝟱𝟭𝟰𝟮𝟯x", "0", 10, 51423. },
                  { 3, " 𝟱𝟭𝟰𝟮𝟯", "0", 11, 51423. },
                  { 3, "𝟱𝟭𝟰𝟮𝟯 ", "0", 10, 51423. },
-                { 7, "𝟱𝟭,𝟰𝟮𝟯", "0", 11, 51423. },
-                { 7, "𝟳,𝟴𝟵,𝟱𝟭,𝟰𝟮𝟯", "0", 19, 78951423. },
-                { 4, "𝟳𝟴,𝟵𝟱𝟭,𝟰𝟮𝟯", "0", 11, 78951. },
-                { 7, "𝟳𝟴,𝟵𝟱𝟭.𝟰𝟮𝟯", "0", 18, 78951.423 },
-                { 7, "𝟳𝟴,𝟬𝟬𝟬", "0", 11, 78000. },
-                { 7, "𝟳𝟴,𝟬𝟬𝟬.𝟬𝟬𝟬", "0", 18, 78000. },
-                { 7, "𝟳𝟴,𝟬𝟬𝟬.𝟬𝟮𝟯", "0", 18, 78000.023 },
-                { 7, "𝟳𝟴.𝟬𝟬𝟬.𝟬𝟮𝟯", "0", 11, 78. },
+                { 7, "𝟱𝟭,𝟰𝟮𝟯", "#,##,##0", 11, 51423. },
+                { 7, "𝟳,𝟴𝟵,𝟱𝟭,𝟰𝟮𝟯", "#,##,##0", 19, 78951423. },
+                { 4, "𝟳𝟴,𝟵𝟱𝟭,𝟰𝟮𝟯", "#,##,##0", 11, 78951. },
+                { 7, "𝟳𝟴,𝟵𝟱𝟭.𝟰𝟮𝟯", "#,##,##0", 18, 78951.423 },
+                { 7, "𝟳𝟴,𝟬𝟬𝟬", "#,##,##0", 11, 78000. },
+                { 7, "𝟳𝟴,𝟬𝟬𝟬.𝟬𝟬𝟬", "#,##,##0", 18, 78000. },
+                { 7, "𝟳𝟴,𝟬𝟬𝟬.𝟬𝟮𝟯", "#,##,##0", 18, 78000.023 },
+                { 7, "𝟳𝟴.𝟬𝟬𝟬.𝟬𝟮𝟯", "#,##,##0", 11, 78. },
                  { 3, "-𝟱𝟭𝟰𝟮𝟯", "0", 11, -51423. },
                  { 3, "-𝟱𝟭𝟰𝟮𝟯-", "0", 11, -51423. },
                  { 3, "a51423US dollars", "a0¤¤¤", 16, 51423. },
@@ -68,9 +68,11 @@ public class NumberParserTest {
                  { 3, "𝟱.𝟭𝟰𝟮E𝟯", "0", 12, 5142. },
                  { 3, "𝟱.𝟭𝟰𝟮E-𝟯", "0", 13, 0.005142 },
                  { 3, "𝟱.𝟭𝟰𝟮e-𝟯", "0", 13, 0.005142 },
-                { 3, "5,142.50 Canadian dollars", "0", 25, 5142.5 },
+                { 7, "5,142.50 Canadian dollars", "#,##,##0", 25, 5142.5 },
                  // { 3, "a$  b5", "a ¤ b0", 6, 5.0 }, // TODO: Does not work
-                { 7, ".00", "0", 3, 0.0 },
+                { 3, "📺1.23", "📺0;📻0", 6, 1.23 },
+                { 3, "📻1.23", "📺0;📻0", 6, -1.23 },
+                { 3, ".00", "0", 3, 0.0 },
                  { 3, "0", "0", 1, 0.0 } };
  
          for (Object[] cas : cases) {
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/StringSegmentTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/StringSegmentTest.java

index 86878bec0603585c6514955f63cfb69209a777be..016fa581c98653baa06d8d3798f72a4ce213e36d 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/StringSegmentTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/StringSegmentTest.java
@@ -3,8 +3,6 @@
  package com.ibm.icu.dev.test.number;
  
  import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
  
  import org.junit.Test;
  
@@ -19,7 +17,7 @@ public class StringSegmentTest {
  
      @Test
      public void testOffset() {
-        StringSegment segment = new StringSegment(SAMPLE_STRING, false);
+        StringSegment segment = new StringSegment(SAMPLE_STRING);
          assertEquals(0, segment.getOffset());
          segment.adjustOffset(3);
          assertEquals(3, segment.getOffset());
@@ -31,7 +29,7 @@ public class StringSegmentTest {
  
      @Test
      public void testLength() {
-        StringSegment segment = new StringSegment(SAMPLE_STRING, false);
+        StringSegment segment = new StringSegment(SAMPLE_STRING);
          assertEquals(11, segment.length());
          segment.adjustOffset(3);
          assertEquals(8, segment.length());
@@ -45,7 +43,7 @@ public class StringSegmentTest {
  
      @Test
      public void testCharAt() {
-        StringSegment segment = new StringSegment(SAMPLE_STRING, false);
+        StringSegment segment = new StringSegment(SAMPLE_STRING);
          assertCharSequenceEquals(SAMPLE_STRING, segment);
          segment.adjustOffset(3);
          assertCharSequenceEquals("radio 📻", segment);
@@ -55,7 +53,7 @@ public class StringSegmentTest {
  
      @Test
      public void testGetCodePoint() {
-        StringSegment segment = new StringSegment(SAMPLE_STRING, false);
+        StringSegment segment = new StringSegment(SAMPLE_STRING);
          assertEquals(0x1F4FB, segment.getCodePoint());
          segment.setLength(1);
          assertEquals(-1, segment.getCodePoint());
@@ -66,20 +64,9 @@ public class StringSegmentTest {
          assertEquals(0x20, segment.getCodePoint());
      }
  
-    @Test
-    public void testIsLeadingSurrogate() {
-        StringSegment segment = new StringSegment(SAMPLE_STRING, false);
-        assertFalse(segment.isLeadingSurrogate());
-        segment.setLength(1);
-        assertTrue(segment.isLeadingSurrogate());
-        segment.adjustOffset(1);
-        segment.setLength(1);
-        assertFalse(segment.isLeadingSurrogate()); // trail, not lead
-    }
-
      @Test
      public void testCommonPrefixLength() {
-        StringSegment segment = new StringSegment(SAMPLE_STRING, false);
+        StringSegment segment = new StringSegment(SAMPLE_STRING);
          assertEquals(11, segment.getCommonPrefixLength(SAMPLE_STRING));
          assertEquals(4, segment.getCommonPrefixLength("📻 r"));
          assertEquals(3, segment.getCommonPrefixLength("📻 x"));
@@ -101,15 +88,6 @@ public class StringSegmentTest {
          assertEquals(0, segment.getCommonPrefixLength("foo"));
      }
  
-    @Test
-    public void testIgnoreCase() {
-        StringSegment segment = new StringSegment(SAMPLE_STRING, true);
-        assertEquals(11, segment.getCommonPrefixLength(SAMPLE_STRING));
-        assertEquals(0, segment.getCommonPrefixLength("x"));
-        segment.setOffset(3);
-        assertEquals(5, segment.getCommonPrefixLength("RAdiO"));
-    }
-
      private static void assertCharSequenceEquals(CharSequence a, CharSequence b) {
          assertEquals(a.length(), b.length());
          for (int i = 0; i < a.length(); i++) {
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/TextTrieMapTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/TextTrieMapTest.java

index 4266d9c37d6ad73277230dcdfe676ac123da31ba..291708208b72f778f4bb4eeb2a5df7683af70280 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/TextTrieMapTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/TextTrieMapTest.java
@@ -17,6 +17,7 @@ import org.junit.runners.JUnit4;
  
  import com.ibm.icu.dev.test.TestFmwk;
  import com.ibm.icu.impl.TextTrieMap;
+import com.ibm.icu.text.UnicodeSet;
  
  @RunWith(JUnit4.class)
  public class TextTrieMapTest extends TestFmwk {
@@ -33,6 +34,7 @@ public class TextTrieMapTest extends TestFmwk {
      private static final Integer SUP2 = new Integer(9);
      private static final Integer SUP3 = new Integer(10);
      private static final Integer SUP4 = new Integer(11);
+    private static final Integer SUP5 = new Integer(12);
  
      private static final Integer FOO = new Integer(-1);
      private static final Integer BAR = new Integer(-2);
@@ -63,6 +65,9 @@ public class TextTrieMapTest extends TestFmwk {
          {"L📺1", SUP2}, // L, 0xD83D, 0xDCFA, 1
          {"L📻", SUP3}, // L, 0xD83D, 0xDCFB
          {"L🃏", SUP4}, // L, 0xD83C, 0xDCCF
+        {"📺", SUP5}, // 0xD83D, 0xDCFA
+        {"📻", SUP5}, // 0xD83D, 0xDCFB
+        {"🃏", SUP5}, // 0xD83C, 0xDCCF
      };
  
      private static final Object[][] TESTCASES = {
@@ -174,6 +179,30 @@ public class TextTrieMapTest extends TestFmwk {
              checkParse(map, test, expecteds, true);
          }
  
+        logln("Test for partial match");
+        for (Object[] cas : TESTDATA) {
+            String str = (String) cas[0];
+            for (int i = 0; i < str.length() - 1; i++) {
+                TextTrieMap.Output output = new TextTrieMap.Output();
+                map.get(str.substring(0, i), 0, output);
+                assertTrue("Partial string means partial match", output.partialMatch);
+            }
+            String bad = str + "x";
+            TextTrieMap.Output output = new TextTrieMap.Output();
+            map.get(bad, 0, output);
+            assertFalse("No partial match on bad string", output.partialMatch);
+        }
+        TextTrieMap.Output output = new TextTrieMap.Output();
+        map.get("Sunday", 0, output);
+        assertFalse("No partial match on string with no continuation", output.partialMatch);
+
+        logln("Test for LeadCodePoints");
+        // Note: The 📺 and 📻 have the same lead surrogate
+        UnicodeSet expectedLeadCodePoints = new UnicodeSet("[SMTWFL📺📻🃏]");
+        UnicodeSet actualLeadCodePoints = new UnicodeSet();
+        map.putLeadCodePoints(actualLeadCodePoints);
+        assertEquals("leadCodePoints", expectedLeadCodePoints, actualLeadCodePoints);
+
          // Add duplicated entry
          map.put("Sunday", FOO);
          // Add duplicated entry with different casing
@@ -217,6 +246,29 @@ public class TextTrieMapTest extends TestFmwk {
              checkParse(map, test, expecteds, false);
          }
  
+        logln("Test for partial match");
+        for (Object[] cas : TESTDATA) {
+            String str = (String) cas[0];
+            for (int i = 0; i < str.length() - 1; i++) {
+                TextTrieMap.Output output = new TextTrieMap.Output();
+                map.get(str.substring(0, i), 0, output);
+                assertTrue("Partial string means partial match", output.partialMatch);
+            }
+            String bad = str + "x";
+            TextTrieMap.Output output = new TextTrieMap.Output();
+            map.get(bad, 0, output);
+            assertFalse("No partial match on bad string", output.partialMatch);
+        }
+        TextTrieMap.Output output = new TextTrieMap.Output();
+        map.get("Sunday", 0, output);
+        assertFalse("No partial match on string with no continuation", output.partialMatch);
+
+        logln("Test for LeadCodePoints");
+        UnicodeSet expectedLeadCodePoints = new UnicodeSet("[smtwfl📺📻🃏]");
+        UnicodeSet actualLeadCodePoints = new UnicodeSet();
+        map.putLeadCodePoints(actualLeadCodePoints);
+        assertEquals("leadCodePoints", expectedLeadCodePoints, actualLeadCodePoints);
+
          // Add duplicated entry
          map.put("Sunday", FOO);
          // Add duplicated entry with different casing
author	Shane Carr <shane@unicode.org>
	Thu, 18 Jan 2018 10:50:36 +0000 (10:50 +0000)
committer	Shane Carr <shane@unicode.org>
	Thu, 18 Jan 2018 10:50:36 +0000 (10:50 +0000)
icu4j/main/classes/core/src/com/ibm/icu/impl/TextTrieMap.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/AffixMatcher.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyMatcher.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyTrieMatcher.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/DecimalMatcher.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/IgnorablesMatcher.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/MatcherUtils.java	[deleted file]	patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NanMatcher.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParseMatcher.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ParsingUtils.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/RangeMatcher.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ScientificMatcher.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/StringSegment.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/SymbolMatcher.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ValidationMatcher.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/number/Grouper.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/number/NumberPropertyMapper.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberParserTest.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/StringSegmentTest.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/TextTrieMapTest.java		patch \| blob \| history