ICU-13084 Updating set of ignorable control characters to [:DI:] (Default_Ignorable_Code_Point).

author Shane Carr <shane@unicode.org>

Wed, 28 Feb 2018 03:42:32 +0000 (03:42 +0000)

committer Shane Carr <shane@unicode.org>

Wed, 28 Feb 2018 03:42:32 +0000 (03:42 +0000)
author Shane Carr <shane@unicode.org>
Wed, 28 Feb 2018 03:42:32 +0000 (03:42 +0000)
committer Shane Carr <shane@unicode.org>
Wed, 28 Feb 2018 03:42:32 +0000 (03:42 +0000)
diff --git a/icu4c/source/i18n/numparse_unisets.cpp b/icu4c/source/i18n/numparse_unisets.cpp

index fc0274f2a3a2a4ccf7a80de1606658d8b364328b..0a8ec2bebb2311100558773683bda2480d8bf2f8 100644 (file)
--- a/icu4c/source/i18n/numparse_unisets.cpp
+++ b/icu4c/source/i18n/numparse_unisets.cpp
@@ -63,8 +63,9 @@ void U_CALLCONV initNumberParseUniSets(UErrorCode& status) {
  
      gUnicodeSets[EMPTY] = new UnicodeSet();
  
-    // BiDi characters are skipped over and ignored at any point in the string, even in strict mode.
-    gUnicodeSets[BIDI] = new UnicodeSet(u"[[\\u200E\\u200F\\u061C]]", status);
+    // These characters are skipped over and ignored at any point in the string, even in strict mode.
+    // See ticket #13084.
+    gUnicodeSets[BIDI] = new UnicodeSet(u"[[:DI:]]", status);
  
      // This set was decided after discussion with icu-design@. See ticket #13309.
      // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property).
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java

index 5ab70817041d2eed239dd29b7919dc59032fa15f..edc0e99114f03c6a34cd039fd646433a569749e1 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java
@@ -81,8 +81,9 @@ public class UnicodeSetStaticCache {
      }
  
      static {
-        // BiDi characters are skipped over and ignored at any point in the string, even in strict mode.
-        unicodeSets.put(Key.BIDI, new UnicodeSet("[[\\u200E\\u200F\\u061C]]").freeze());
+        // These characters are skipped over and ignored at any point in the string, even in strict mode.
+        // See ticket #13084.
+        unicodeSets.put(Key.BIDI, new UnicodeSet("[[:DI:]]").freeze());
  
          // This set was decided after discussion with icu-design@. See ticket #13309.
          // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property).
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormat.java b/icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormat.java

index 5f68fe60469c7a3cb67a2c2a0f3c03e110bd9672..37e40646663bcb3842b7ab7a6c6289bbad19f9f4 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormat.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormat.java
@@ -198,7 +198,7 @@ import com.ibm.icu.util.ULocale.Category;
   * example, a formatter instance gotten from NumberFormat.getInstance(ULocale,
   * NumberFormat.CURRENCYSTYLE) can parse both "USD1.00" and "3.00 US dollars".
   *
- * <p>Whitespace characters (lenient mode) and bidi control characters (lenient and strict mode),
+ * <p>Whitespace characters (lenient mode) and control characters (lenient and strict mode),
   * collectively called "ignorables", do not need to match in identity or quantity between the
   * pattern string and the input string. For example, the pattern "# %" matches "35 %" (with a single
   * space), "35%" (with no space), "35&nbsp;%" (with a non-breaking space), and "35&nbsp; %" (with
@@ -206,6 +206,7 @@ import com.ibm.icu.util.ULocale.Category;
   * number: prefix, number, exponent separator, and suffix. Ignorable whitespace characters are those
   * having the Unicode "blank" property for regular expressions, defined in UTS #18 Annex C, which is
   * "horizontal" whitespace, like spaces and tabs, but not "vertical" whitespace, like line breaks.
+ * Ignorable control characters are those in the Unicode set [:Default_Ignorable_Code_Point:].
   *
   * <p>If {@link #parse(String, ParsePosition)} fails to parse a string, it returns <code>null</code>
   * and leaves the parse position unchanged. The convenience method {@link #parse(String)} indicates
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java

index 4697abd5da48ef8cd0fe1ba0555458d659f6472c..370a843f51049104e16c0f001c9cebbb77f3514a 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java
@@ -1722,11 +1722,32 @@ public class NumberFormatTest extends TestFmwk {
          // Test all characters in the UTS 18 "blank" set stated in the API docstring.
          UnicodeSet blanks = new UnicodeSet("[[:Zs:][\\u0009]]").freeze();
          for (String space : blanks) {
-            String str = "a  " + space + "  b1234";
+            String str = "a " + space + " b1234c  ";
+            expect(fmt, str, n);
+        }
+
+        // Arbitrary whitespace is not accepted in strict mode.
+        fmt.setParseStrict(true);
+        for (String space : blanks) {
+            String str = "a " + space + " b1234c  ";
+            expectParseException(fmt, str, n);
+        }
+
+        // Test default ignorable characters.  These should work in both lenient and strict.
+        UnicodeSet defaultIgnorables = new UnicodeSet("[[:Default_Ignorable_Code_Point:]]").freeze();
+        fmt.setParseStrict(false);
+        for (String ignorable : defaultIgnorables) {
+            String str = "a b " + ignorable + "1234c  ";
+            expect(fmt, str, n);
+        }
+        fmt.setParseStrict(true);
+        for (String ignorable : defaultIgnorables) {
+            String str = "a b " + ignorable + "1234c  ";
              expect(fmt, str, n);
          }
  
          // Test that other whitespace characters do not work
+        fmt.setParseStrict(false);
          UnicodeSet otherWhitespace = new UnicodeSet("[[:whitespace:]]").removeAll(blanks).freeze();
          for (String space : otherWhitespace) {
              String str = "a  " + space + "  b1234";
author	Shane Carr <shane@unicode.org>
	Wed, 28 Feb 2018 03:42:32 +0000 (03:42 +0000)
committer	Shane Carr <shane@unicode.org>
	Wed, 28 Feb 2018 03:42:32 +0000 (03:42 +0000)
icu4c/source/i18n/numparse_unisets.cpp		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormat.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java		patch \| blob \| history