ICU-13309 Changing number parsing to accept only horizontal whitespace, not vertical...

author Shane Carr <shane@unicode.org>

Thu, 26 Oct 2017 21:53:50 +0000 (21:53 +0000)

committer Shane Carr <shane@unicode.org>

Thu, 26 Oct 2017 21:53:50 +0000 (21:53 +0000)
author Shane Carr <shane@unicode.org>
Thu, 26 Oct 2017 21:53:50 +0000 (21:53 +0000)
committer Shane Carr <shane@unicode.org>
Thu, 26 Oct 2017 21:53:50 +0000 (21:53 +0000)
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/Parse.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/Parse.java

index 79bc9e038744cdf6068bc10c8029e751f22b7431..50d0f3c70ee911df6617d5beca9325e44ab3ccaf 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/Parse.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/Parse.java
@@ -132,9 +132,10 @@ public class Parse {
      INSIDE_AFFIX_PATTERN;
    }
  
-  // TODO: Does this set make sense for the whitespace characters?
+  // This set was decided after discussion with icu-design@. See ticket #13309.
+  // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property).
    private static final UnicodeSet UNISET_WHITESPACE =
-      new UnicodeSet("[[:whitespace:][\\u2000-\\u200D]]").freeze();
+      new UnicodeSet("[[:Zs:][\\u0009]]").freeze();
  
    // BiDi characters are skipped over and ignored at any point in the string, even in strict mode.
    private static final UnicodeSet UNISET_BIDI =
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormat.java b/icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormat.java

index 5772a08c56f6a2af7db423b2f45f40acc2818807..73bd53415ffb7b76f99400c95b31da128a0b0b50 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormat.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormat.java
@@ -202,7 +202,9 @@ import com.ibm.icu.util.ULocale.Category;
   * pattern string and the input string. For example, the pattern "# %" matches "35 %" (with a single
   * space), "35%" (with no space), "35&nbsp;%" (with a non-breaking space), and "35&nbsp; %" (with
   * multiple spaces). Arbitrary ignorables are also allowed at boundaries between the parts of the
- * number: prefix, number, exponent separator, and suffix.
+ * number: prefix, number, exponent separator, and suffix. Ignorable whitespace characters are those
+ * having the Unicode "blank" property for regular expressions, defined in UTS #18 Annex C, which is
+ * "horizontal" whitespace, like spaces and tabs, but not "vertical" whitespace, like line breaks.
   *
   * <p>If {@link #parse(String, ParsePosition)} fails to parse a string, it returns <code>null</code>
   * and leaves the parse position unchanged. The convenience method {@link #parse(String)} indicates
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/NumberFormat.java b/icu4j/main/classes/core/src/com/ibm/icu/text/NumberFormat.java

index dab77744d63110c9273656543a32ea214bdedf09..e97ca196dbfe0469eac36ac0b05988b8bf9e01fe 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/NumberFormat.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/NumberFormat.java
@@ -424,6 +424,9 @@ public abstract class NumberFormat extends UFormat {
       * <p>Does not throw an exception; if no object can be parsed, index is
       * unchanged!
       *
+     * <p>For more detail on parsing, see the "Parsing" header in the class
+     * documentation of {@link DecimalFormat}.
+     *
       * @see #isParseIntegerOnly
       * @see DecimalFormat#setParseBigDecimal
       * @see java.text.Format#parseObject(String, ParsePosition)
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java

index 102ca85a07a530fe6781382b9b6c9c645860b3ee..aabb7ce3ae02c974765e266c085456dae22b122c 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java
@@ -63,6 +63,7 @@ import com.ibm.icu.text.NumberFormat.SimpleNumberFormatFactory;
  import com.ibm.icu.text.NumberingSystem;
  import com.ibm.icu.text.PluralRules;
  import com.ibm.icu.text.RuleBasedNumberFormat;
+import com.ibm.icu.text.UnicodeSet;
  import com.ibm.icu.util.Currency;
  import com.ibm.icu.util.Currency.CurrencyUsage;
  import com.ibm.icu.util.CurrencyAmount;
@@ -438,8 +439,8 @@ public class NumberFormatTest extends TestFmwk {
                  {" $ 124 ", "6", "-1"},
                  {"124$", "3", "-1"},
                  {"124 $", "3", "-1"},
-                {"$124\u200D", "4", "-1"},
-                {"$\u200D124", "5", "-1"},
+                {"$124\u200A", "4", "-1"},
+                {"$\u200A124", "5", "-1"},
          };
          NumberFormat foo = NumberFormat.getCurrencyInstance();
          for (int i = 0; i < DATA.length; ++i) {
@@ -1712,6 +1713,29 @@ public class NumberFormatTest extends TestFmwk {
          expect(fmt, "ab  1234", n);
          expect(fmt, "a b1234", n);
          expect(fmt, "a   b1234", n);
+        expect(fmt, " a b 1234", n);
+
+        // Horizontal whitespace is allowed, but not vertical whitespace.
+        expect(fmt, "\ta\u00A0b\u20001234", n);
+        expect(fmt, "a   \u200A    b1234", n);
+        expectParseException(fmt, "\nab1234", n);
+        expectParseException(fmt, "a    \n   b1234", n);
+        expectParseException(fmt, "a    \u0085   b1234", n);
+        expectParseException(fmt, "a    \u2028   b1234", n);
+
+        // Test all characters in the UTS 18 "blank" set stated in the API docstring.
+        UnicodeSet blanks = new UnicodeSet("[[:Zs:][\\u0009]]").freeze();
+        for (String space : blanks) {
+            String str = "a  " + space + "  b1234";
+            expect(fmt, str, n);
+        }
+
+        // Test that other whitespace characters do not work
+        UnicodeSet otherWhitespace = new UnicodeSet("[[:whitespace:]]").removeAll(blanks).freeze();
+        for (String space : otherWhitespace) {
+            String str = "a  " + space + "  b1234";
+            expectParseException(fmt, str, n);
+        }
      }
  
      /**
@@ -2676,6 +2700,16 @@ public class NumberFormatTest extends TestFmwk {
          expect(fmt, str, new Long(n));
      }
  
+    /** Asserts that {@code fmt} rejects {@code str}; {@code n} only labels the failure message. */
+    public void expectParseException(DecimalFormat fmt, String str, Number n) {
+        try {
+            Number num = fmt.parse(str);
+            errln("Expected failure, but passed: " + n + " on " + fmt.toPattern() + " -> " + num + " for \"" + str + "\"");
+        } catch (ParseException expected) {
+            // Rejection is the passing outcome for this helper, so there is nothing to report.
+        }
+    }
+
      private void expectCurrency(NumberFormat nf, Currency curr,
              double value, String string) {
          DecimalFormat fmt = (DecimalFormat) nf;
author	Shane Carr <shane@unicode.org>
	Thu, 26 Oct 2017 21:53:50 +0000 (21:53 +0000)
committer	Shane Carr <shane@unicode.org>
	Thu, 26 Oct 2017 21:53:50 +0000 (21:53 +0000)
icu4j/main/classes/core/src/com/ibm/icu/impl/number/Parse.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormat.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/text/NumberFormat.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java		patch \| blob \| history