From 7c59127769b93cc5560cbc7a591a6b9f8d19cb3d Mon Sep 17 00:00:00 2001 From: Shane Carr Date: Thu, 26 Oct 2017 21:53:50 +0000 Subject: [PATCH] ICU-13309 Changing number parsing to accept only horizontal whitespace, not vertical whitespace or control characters. X-SVN-Rev: 40646 --- .../src/com/ibm/icu/impl/number/Parse.java | 5 ++- .../src/com/ibm/icu/text/DecimalFormat.java | 4 +- .../src/com/ibm/icu/text/NumberFormat.java | 3 ++ .../icu/dev/test/format/NumberFormatTest.java | 38 ++++++++++++++++++- 4 files changed, 45 insertions(+), 5 deletions(-) diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/Parse.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/Parse.java index 79bc9e03874..50d0f3c70ee 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/Parse.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/Parse.java @@ -132,9 +132,10 @@ public class Parse { INSIDE_AFFIX_PATTERN; } - // TODO: Does this set make sense for the whitespace characters? + // This set was decided after discussion with icu-design@. See ticket #13309. + // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property). private static final UnicodeSet UNISET_WHITESPACE = - new UnicodeSet("[[:whitespace:][\\u2000-\\u200D]]").freeze(); + new UnicodeSet("[[:Zs:][\\u0009]]").freeze(); // BiDi characters are skipped over and ignored at any point in the string, even in strict mode. private static final UnicodeSet UNISET_BIDI = diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormat.java b/icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormat.java index 5772a08c56f..73bd53415ff 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormat.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormat.java @@ -202,7 +202,9 @@ import com.ibm.icu.util.ULocale.Category; * pattern string and the input string. 
For example, the pattern "# %" matches "35 %" (with a single * space), "35%" (with no space), "35 %" (with a non-breaking space), and "35  %" (with * multiple spaces). Arbitrary ignorables are also allowed at boundaries between the parts of the - * number: prefix, number, exponent separator, and suffix. + * number: prefix, number, exponent separator, and suffix. Ignorable whitespace characters are those + * having the Unicode "blank" property for regular expressions, defined in UTS #18 Annex C, which is + * "horizontal" whitespace, like spaces and tabs, but not "vertical" whitespace, like line breaks. * *

If {@link #parse(String, ParsePosition)} fails to parse a string, it returns null * and leaves the parse position unchanged. The convenience method {@link #parse(String)} indicates diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/NumberFormat.java b/icu4j/main/classes/core/src/com/ibm/icu/text/NumberFormat.java index dab77744d63..e97ca196dbf 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/NumberFormat.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/NumberFormat.java @@ -424,6 +424,9 @@ public abstract class NumberFormat extends UFormat { *

Does not throw an exception; if no object can be parsed, index is * unchanged! * + *

For more detail on parsing, see the "Parsing" header in the class + * documentation of {@link DecimalFormat}. + * * @see #isParseIntegerOnly * @see DecimalFormat#setParseBigDecimal * @see java.text.Format#parseObject(String, ParsePosition) diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java index 102ca85a07a..aabb7ce3ae0 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java @@ -63,6 +63,7 @@ import com.ibm.icu.text.NumberFormat.SimpleNumberFormatFactory; import com.ibm.icu.text.NumberingSystem; import com.ibm.icu.text.PluralRules; import com.ibm.icu.text.RuleBasedNumberFormat; +import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.util.Currency; import com.ibm.icu.util.Currency.CurrencyUsage; import com.ibm.icu.util.CurrencyAmount; @@ -438,8 +439,8 @@ public class NumberFormatTest extends TestFmwk { {" $ 124 ", "6", "-1"}, {"124$", "3", "-1"}, {"124 $", "3", "-1"}, - {"$124\u200D", "4", "-1"}, - {"$\u200D124", "5", "-1"}, + {"$124\u200A", "4", "-1"}, + {"$\u200A124", "5", "-1"}, }; NumberFormat foo = NumberFormat.getCurrencyInstance(); for (int i = 0; i < DATA.length; ++i) { @@ -1712,6 +1713,29 @@ public class NumberFormatTest extends TestFmwk { expect(fmt, "ab 1234", n); expect(fmt, "a b1234", n); expect(fmt, "a b1234", n); + expect(fmt, " a b 1234", n); + + // Horizontal whitespace is allowed, but not vertical whitespace. + expect(fmt, "\ta\u00A0b\u20001234", n); + expect(fmt, "a \u200A b1234", n); + expectParseException(fmt, "\nab1234", n); + expectParseException(fmt, "a \n b1234", n); + expectParseException(fmt, "a \u0085 b1234", n); + expectParseException(fmt, "a \u2028 b1234", n); + + // Test all characters in the UTS 18 "blank" set stated in the API docstring. 
+ UnicodeSet blanks = new UnicodeSet("[[:Zs:][\\u0009]]").freeze(); + for (String space : blanks) { + String str = "a " + space + " b1234"; + expect(fmt, str, n); + } + + // Test that other whitespace characters do not work + UnicodeSet otherWhitespace = new UnicodeSet("[[:whitespace:]]").removeAll(blanks).freeze(); + for (String space : otherWhitespace) { + String str = "a " + space + " b1234"; + expectParseException(fmt, str, n); + } } /** @@ -2676,6 +2700,16 @@ public class NumberFormatTest extends TestFmwk { expect(fmt, str, new Long(n)); } + /** Expects {@code fmt.parse(str)} to throw ParseException; reports an error via errln if the parse succeeds. {@code n} is used only in that error message. */ + public void expectParseException(DecimalFormat fmt, String str, Number n) { + Number num = null; + try { + num = fmt.parse(str); + errln("Expected failure, but passed: " + n + " on " + fmt.toPattern() + " -> " + num); + } catch (ParseException e) { /* expected: the parse is supposed to fail, so there is nothing to do */ + } + } + private void expectCurrency(NumberFormat nf, Currency curr, double value, String string) { DecimalFormat fmt = (DecimalFormat) nf; -- 2.40.0