ICU-13513 Generalizing UnicodeSetStaticCache to cover more locales.

author Shane Carr <shane@unicode.org>
Fri, 19 Jan 2018 05:03:44 +0000 (05:03 +0000)
committer Shane Carr <shane@unicode.org>
Fri, 19 Jan 2018 05:03:44 +0000 (05:03 +0000)
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NanMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NanMatcher.java

index a1187ce24ce75f4f9012501bd4b872033f2f4bc4..7664e1e72b259d34a5caca78cc42bcf8e635629a 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NanMatcher.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NanMatcher.java
@@ -33,10 +33,10 @@ public class NanMatcher extends SymbolMatcher {
      @Override
      public UnicodeSet getLeadCodePoints() {
          // Overriding this here to allow use of statically allocated sets
-        if (this == DEFAULT) {
-            return UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.CAPITAL_N);
-        } else if (this == DEFAULT_FOLDED) {
-            return UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.FOLDED_N);
+        int leadCp = string.codePointAt(0);
+        UnicodeSet s = UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.NAN_LEAD);
+        if (s.contains(leadCp)) {
+            return s;
          } else {
              return super.getLeadCodePoints();
          }
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java

index 8137130ad636adfed22a690f13ed03a1216004b8..01d5b20600b0b67ec3ed763987057adb24319fc6 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java
@@ -32,7 +32,8 @@ import com.ibm.icu.util.ULocale;
   */
  public class NumberParserImpl {
      @Deprecated
-    public static NumberParserImpl createParserFromPattern(String pattern, boolean strictGrouping) {
+    public static NumberParserImpl createParserFromPattern(
+            ULocale locale, String pattern, boolean strictGrouping) {
          // Temporary frontend for testing.
  
          int parseFlags = ParsingUtils.PARSE_FLAG_IGNORE_CASE
@@ -42,7 +43,6 @@ public class NumberParserImpl {
          }
  
          NumberParserImpl parser = new NumberParserImpl(parseFlags, true);
-        ULocale locale = new ULocale("en_IN");
          DecimalFormatSymbols symbols = DecimalFormatSymbols.getInstance(locale);
          IgnorablesMatcher ignorables = IgnorablesMatcher.DEFAULT;
  
@@ -54,6 +54,7 @@ public class NumberParserImpl {
          parser.addMatcher(ignorables);
          parser.addMatcher(DecimalMatcher.getInstance(symbols, grouper, parseFlags));
          parser.addMatcher(MinusSignMatcher.getInstance(symbols));
+        parser.addMatcher(NanMatcher.getInstance(symbols, parseFlags));
          parser.addMatcher(ScientificMatcher.getInstance(symbols, grouper, parseFlags));
          parser.addMatcher(CurrencyTrieMatcher.getInstance(locale));
          parser.addMatcher(new RequireNumberMatcher());
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ParsingUtils.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ParsingUtils.java

index cda00b9aa7a3575a9c95e43cb20b4016a6a68b49..892f00f0f9acf97c41f6a2b73da01b85cba49364 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ParsingUtils.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ParsingUtils.java
@@ -35,13 +35,12 @@ public class ParsingUtils {
          }
      }
  
-    private static final UnicodeSet LETTERS = new UnicodeSet("[:letter:]").freeze();
-
      /**
       * Case-folds the string if IGNORE_CASE flag is set; otherwise, returns the same string.
       */
      public static String maybeFold(String input, int parseFlags) {
-        if (0 != (parseFlags & PARSE_FLAG_IGNORE_CASE) && LETTERS.containsSome(input)) {
+        UnicodeSet cwcf = UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.CWCF);
+        if (0 != (parseFlags & PARSE_FLAG_IGNORE_CASE) && cwcf.containsSome(input)) {
              return UCharacter.foldCase(input, true);
          } else {
              return input;
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ScientificMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ScientificMatcher.java

index 71ef5d5f1af3173d501736a30df400157e380c4e..c05e75fa80e4ef0aae995708792dd0aac8f814e1 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ScientificMatcher.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ScientificMatcher.java
@@ -84,13 +84,12 @@ public class ScientificMatcher implements NumberParseMatcher {
  
      @Override
      public UnicodeSet getLeadCodePoints() {
-        int cp = exponentSeparatorString.codePointAt(0);
-        if (cp == 'E') {
-            return UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.CAPITAL_E);
-        } else if (cp == 'e') {
-            return UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.FOLDED_E);
+        int leadCp = exponentSeparatorString.codePointAt(0);
+        UnicodeSet s = UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.SCIENTIFIC_LEAD);
+        if (s.contains(leadCp)) {
+            return s;
          } else {
-            return new UnicodeSet().add(cp).freeze();
+            return new UnicodeSet().add(leadCp).freeze();
          }
      }
  
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java

index dad2bb7ed9c828d6ea94916bae49c8428c246437..3da729f80a10cb7730c20414caed85fc9141b085 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java
@@ -48,10 +48,9 @@ public class UnicodeSetStaticCache {
  
          // Other
          DIGITS,
-        CAPITAL_N,
-        FOLDED_N,
-        CAPITAL_E,
-        FOLDED_E,
+        NAN_LEAD,
+        SCIENTIFIC_LEAD,
+        CWCF,
  
          // Combined Separators with Digits (for lead code points)
          DIGITS_OR_COMMA_OR_OTHER,
@@ -189,10 +188,12 @@ public class UnicodeSetStaticCache {
          unicodeSets.put(Key.INFINITY, new UnicodeSet("[∞]").freeze());
  
          unicodeSets.put(Key.DIGITS, new UnicodeSet("[:digit:]").freeze());
-        unicodeSets.put(Key.CAPITAL_N, new UnicodeSet("[N]").freeze());
-        unicodeSets.put(Key.FOLDED_N, new UnicodeSet("[n]").freeze());
-        unicodeSets.put(Key.CAPITAL_E, new UnicodeSet("[E]").freeze());
-        unicodeSets.put(Key.FOLDED_E, new UnicodeSet("[e]").freeze());
+        // Note: locale fi translation of NaN starts with 'e' (conflicts with scientific?)
+        unicodeSets.put(Key.NAN_LEAD,
+                new UnicodeSet("[NnТтmeՈոс¤НнчTtsҳ\u975e\u1002\u0e9a\u10d0\u0f68\u0644\u0646]")
+                        .freeze());
+        unicodeSets.put(Key.SCIENTIFIC_LEAD, new UnicodeSet("[Ee×·е\u0627]").freeze());
+        unicodeSets.put(Key.CWCF, new UnicodeSet("[:CWCF:]").freeze());
  
          unicodeSets.put(Key.DIGITS_OR_COMMA_OR_OTHER, computeUnion(Key.DIGITS, Key.COMMA_OR_OTHER));
          unicodeSets.put(Key.DIGITS_OR_PERIOD_OR_OTHER, computeUnion(Key.DIGITS, Key.PERIOD_OR_OTHER));
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberParserTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberParserTest.java

index ff64a36f11f90b10025b078c6f2f6d91a9385b3f..665398e6784bc51e13dc5ac4a3085b8435c24851 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberParserTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberParserTest.java
@@ -4,11 +4,13 @@ package com.ibm.icu.dev.test.number;
  
  import static org.junit.Assert.assertEquals;
  import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
  
  import org.junit.Test;
  
  import com.ibm.icu.impl.number.parse.NumberParserImpl;
  import com.ibm.icu.impl.number.parse.ParsedNumber;
+import com.ibm.icu.util.ULocale;
  
  /**
   * @author sffc
@@ -69,7 +71,7 @@ public class NumberParserTest {
                  { 3, "𝟱.𝟭𝟰𝟮E-𝟯", "0", 13, 0.005142 },
                  { 3, "𝟱.𝟭𝟰𝟮e-𝟯", "0", 13, 0.005142 },
                  { 7, "5,142.50 Canadian dollars", "#,##,##0", 25, 5142.5 },
-                // { 3, "a$  b5", "a ¤ b0", 6, 5.0 }, // TODO: Does not work
+                // { 3, "a$ b5", "a ¤ b0", 6, 5.0 }, // TODO: Does not work
                  { 3, "📺1.23", "📺0;📻0", 6, 1.23 },
                  { 3, "📻1.23", "📺0;📻0", 6, -1.23 },
                  { 3, ".00", "0", 3, 0.0 },
@@ -81,7 +83,8 @@ public class NumberParserTest {
              String pattern = (String) cas[2];
              int expectedCharsConsumed = (Integer) cas[3];
              double resultDouble = (Double) cas[4];
-            NumberParserImpl parser = NumberParserImpl.createParserFromPattern(pattern, false);
+            NumberParserImpl parser = NumberParserImpl
+                    .createParserFromPattern(ULocale.ENGLISH, pattern, false);
              String message = "Input <" + input + "> Parser " + parser;
  
              if (0 != (flags & 0x01)) {
@@ -104,7 +107,7 @@ public class NumberParserTest {
  
              if (0 != (flags & 0x04)) {
                  // Test with strict separators
-                parser = NumberParserImpl.createParserFromPattern(pattern, true);
+                parser = NumberParserImpl.createParserFromPattern(ULocale.ENGLISH, pattern, true);
                  ParsedNumber resultObject = new ParsedNumber();
                  parser.parse(input, true, resultObject);
                  assertNotNull(message, resultObject.quantity);
@@ -113,4 +116,21 @@ public class NumberParserTest {
              }
          }
      }
+
+    @Test
+    public void testLocaleFi() {
+        // This case is interesting because locale fi has NaN starting with 'e', the same as scientific
+        NumberParserImpl parser = NumberParserImpl
+                .createParserFromPattern(new ULocale("fi"), "0", false);
+
+        ParsedNumber resultObject = new ParsedNumber();
+        parser.parse("epäluku", false, resultObject);
+        assertTrue(resultObject.success());
+        assertEquals(Double.NaN, resultObject.getNumber().doubleValue(), 0.0);
+
+        resultObject = new ParsedNumber();
+        parser.parse("1.2e3", false, resultObject);
+        assertTrue(resultObject.success());
+        assertEquals(12000.0, resultObject.getNumber().doubleValue(), 0.0);
+    }
  }
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/UnicodeSetStaticCacheTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/UnicodeSetStaticCacheTest.java

new file mode 100644 (file)

index 0000000..1701186
--- /dev/null
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/UnicodeSetStaticCacheTest.java
@@ -0,0 +1,107 @@
+// © 2018 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package com.ibm.icu.dev.test.number;
+
+import static com.ibm.icu.impl.number.parse.UnicodeSetStaticCache.get;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import org.junit.Test;
+
+import com.ibm.icu.impl.number.parse.UnicodeSetStaticCache;
+import com.ibm.icu.impl.number.parse.UnicodeSetStaticCache.Key;
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.text.DecimalFormatSymbols;
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.util.ULocale;
+
+/**
+ * @author sffc
+ *
+ */
+public class UnicodeSetStaticCacheTest {
+
+    @Test
+    public void testSetCoverage() {
+        // Lenient comma/period should be supersets of strict comma/period;
+        // it also makes the coverage logic cheaper.
+        assertTrue("COMMA should be superset of STRICT_COMMA",
+                get(Key.COMMA).containsAll(get(Key.STRICT_COMMA)));
+        assertTrue("PERIOD should be superset of STRICT_PERIOD",
+                get(Key.PERIOD).containsAll(get(Key.STRICT_PERIOD)));
+
+        UnicodeSet decimals = get(Key.STRICT_COMMA).cloneAsThawed().addAll(get(Key.STRICT_PERIOD))
+                .freeze();
+        UnicodeSet grouping = decimals.cloneAsThawed().addAll(get(Key.OTHER_GROUPING_SEPARATORS))
+                .freeze();
+        UnicodeSet plusSign = get(Key.PLUS_SIGN);
+        UnicodeSet minusSign = get(Key.MINUS_SIGN);
+        UnicodeSet percent = get(Key.PERCENT_SIGN);
+        UnicodeSet permille = get(Key.PERMILLE_SIGN);
+        UnicodeSet infinity = get(Key.INFINITY);
+        UnicodeSet nanLead = get(Key.NAN_LEAD);
+        UnicodeSet scientificLead = get(Key.SCIENTIFIC_LEAD);
+
+        for (ULocale locale : ULocale.getAvailableLocales()) {
+            DecimalFormatSymbols dfs = DecimalFormatSymbols.getInstance(locale);
+
+            assertInSet(locale, decimals, dfs.getDecimalSeparatorString());
+            assertInSet(locale, grouping, dfs.getGroupingSeparatorString());
+            assertInSet(locale, plusSign, dfs.getPlusSignString());
+            assertInSet(locale, minusSign, dfs.getMinusSignString());
+            assertInSet(locale, percent, dfs.getPercentString());
+            assertInSet(locale, permille, dfs.getPerMillString());
+            assertInSet(locale, infinity, dfs.getInfinity());
+            assertInSet(locale, nanLead, dfs.getNaN().codePointAt(0));
+            assertInSet(locale, nanLead, UCharacter.foldCase(dfs.getNaN(), true).codePointAt(0));
+            assertInSet(locale,
+                    scientificLead,
+                    UCharacter.foldCase(dfs.getExponentSeparator(), true).codePointAt(0));
+        }
+    }
+
+    @Test
+    public void testFrozen() {
+        for (Key key : Key.values()) {
+            assertTrue(get(key).isFrozen());
+        }
+    }
+
+    @Test
+    public void testUnions() {
+        for (Key key1 : Key.values()) {
+            for (Key key2 : Key.values()) {
+                Key key3 = UnicodeSetStaticCache.unionOf(key1, key2);
+                if (key3 != null) {
+                    UnicodeSet s1 = get(key1);
+                    UnicodeSet s2 = get(key2);
+                    UnicodeSet s3 = get(key3);
+                    UnicodeSet s1_s2 = s1.cloneAsThawed().addAll(s2);
+                    assertEquals(key1 + "/" + key2 + "/" + key3, s1_s2, s3);
+                }
+            }
+        }
+    }
+
+    static void assertInSet(ULocale locale, UnicodeSet set, String str) {
+        if (str.codePointCount(0, str.length()) != 1) {
+            // Ignore locale strings with more than one code point (usually a bidi mark)
+            return;
+        }
+        assertInSet(locale, set, str.codePointAt(0));
+    }
+
+    static void assertInSet(ULocale locale, UnicodeSet set, int cp) {
+        // If this test case fails, add the specified code point to the corresponding set in
+        // UnicodeSetStaticCache.java
+        assertTrue(
+                locale
+                        + " U+"
+                        + Integer.toHexString(cp)
+                        + " ("
+                        + UCharacter.toString(cp)
+                        + ") should be in "
+                        + set,
+                set.contains(cp));
+    }
+}
author	Shane Carr <shane@unicode.org>
	Fri, 19 Jan 2018 05:03:44 +0000 (05:03 +0000)
committer	Shane Carr <shane@unicode.org>
	Fri, 19 Jan 2018 05:03:44 +0000 (05:03 +0000)
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NanMatcher.java		patch | blob | history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java		patch | blob | history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ParsingUtils.java		patch | blob | history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/ScientificMatcher.java		patch | blob | history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java		patch | blob | history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberParserTest.java		patch | blob | history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/UnicodeSetStaticCacheTest.java	[new file with mode: 0644]	patch | blob