From: Shane Carr <shane@unicode.org>
Date: Wed, 29 Mar 2017 02:31:49 +0000 (+0000)
Subject: ICU-13060 Documentation and parsing whitespace/bidi tweaks.
X-Git-Tag: release-59-rc~21
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=08dcc7e82e0433576f74ddb7ff3fd5f2c97eef6e;p=icu

ICU-13060 Documentation and parsing whitespace/bidi tweaks.

X-SVN-Rev: 39951
---

diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/Parse.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/Parse.java
index ae8d6d91f20..a04c2433f38 100644
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/Parse.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/Parse.java
@@ -11,6 +11,7 @@ import java.util.Iterator;
 import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
 
+import com.ibm.icu.impl.PatternProps;
 import com.ibm.icu.impl.StandardPlural;
 import com.ibm.icu.impl.TextTrieMap;
 import com.ibm.icu.impl.number.formatters.BigDecimalMultiplier;
@@ -1129,7 +1130,7 @@ public class Parse {
             }
             acceptGrouping(cp, StateName.AFTER_INTEGER_DIGIT, state, item);
             if (state.length > 0 && mode == ParseMode.FAST) break;
-            acceptBidi(cp, StateName.AFTER_INTEGER_DIGIT, state, item);
+            acceptBidi(cp, StateName.BEFORE_SUFFIX, state, item);
             if (state.length > 0 && mode == ParseMode.FAST) break;
             acceptPadding(cp, StateName.BEFORE_SUFFIX, state, item);
             if (state.length > 0 && mode == ParseMode.FAST) break;
@@ -1155,7 +1156,7 @@ public class Parse {
             // We encountered a decimal point
             acceptFractionDigit(cp, StateName.AFTER_FRACTION_DIGIT, state, item);
             if (state.length > 0 && mode == ParseMode.FAST) break;
-            acceptBidi(cp, StateName.AFTER_FRACTION_DIGIT, state, item);
+            acceptBidi(cp, StateName.BEFORE_SUFFIX, state, item);
             if (state.length > 0 && mode == ParseMode.FAST) break;
             acceptPadding(cp, StateName.BEFORE_SUFFIX, state, item);
             if (state.length > 0 && mode == ParseMode.FAST) break;
@@ -1184,7 +1185,7 @@ public class Parse {
             break;
 
           case AFTER_EXPONENT_DIGIT:
-            acceptBidi(cp, StateName.AFTER_EXPONENT_DIGIT, state, item);
+            acceptBidi(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item);
             acceptPadding(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item);
             acceptExponentDigit(cp, StateName.AFTER_EXPONENT_DIGIT, state, item);
             if (mode == ParseMode.LENIENT || mode == ParseMode.STRICT) {
@@ -1257,17 +1258,17 @@ public class Parse {
             break;
 
           case INSIDE_STRING:
-            acceptStringOffset(cp, state, item);
+            long added0 = acceptStringOffset(cp, state, item);
             // Accept arbitrary bidi in the middle of strings.
-            if (state.length == 0 && UNISET_BIDI.contains(cp)) {
+            if (added0 == 0L && UNISET_BIDI.contains(cp)) {
               state.getNext().copyFrom(item, item.name, cp);
             }
             break;
 
           case INSIDE_AFFIX_PATTERN:
-            long added = acceptAffixPatternOffset(cp, state, item);
-            // Accept arbitrary bidi and whitespace (if lenient) in the middle of affixes.
-            if (added == 0L && isIgnorable(cp, state)) {
+            long added1 = acceptAffixPatternOffset(cp, state, item);
+            // Accept arbitrary bidi and whitespace (if lenient) in the middle of affix patterns.
+            if (added1 == 0L && isIgnorable(cp, state)) {
               state.getNext().copyFrom(item, item.name, cp);
             }
             break;
@@ -1758,8 +1759,8 @@ public class Parse {
     if (!next.sawSuffix && holder.s.isEmpty()) next.score += 5;
   }
 
-  private static void acceptStringOffset(int cp, ParserState state, StateItem item) {
-    acceptString(
+  private static long acceptStringOffset(int cp, ParserState state, StateItem item) {
+    return acceptString(
         cp,
         item.returnTo1,
         item.returnTo2,
@@ -1813,20 +1814,20 @@ public class Parse {
       referenceCp = Character.codePointAt(str, offset);
       count = Character.charCount(referenceCp);
       equals = codePointEquals(cp, referenceCp, state);
-      if (!UNISET_BIDI.contains(cp)) break;
+      if (!UNISET_BIDI.contains(referenceCp)) break;
     }
 
     if (equals) {
       // Matches first code point of the string
       StateItem next = state.getNext().copyFrom(item, null, cp);
 
-      // Skip over ignorable code points in the middle of the string.
+      // Skip over bidi code points in the middle or end of the string.
       // They will be accepted in the main loop.
       offset += count;
       for (; offset < str.length(); offset += count) {
         referenceCp = Character.codePointAt(str, offset);
         count = Character.charCount(referenceCp);
-        if (!UNISET_BIDI.contains(cp)) break;
+        if (!UNISET_BIDI.contains(referenceCp)) break;
       }
 
       if (offset < str.length()) {
@@ -1879,7 +1880,7 @@ public class Parse {
       tag = AffixPatternUtils.nextToken(tag, str);
       typeOrCp = AffixPatternUtils.getTypeOrCp(tag);
       hasNext = AffixPatternUtils.hasNext(tag, str);
-      if (typeOrCp < 0 || !isIgnorable(typeOrCp, state)) break;
+      if (typeOrCp < 0 || !isPatternIgnorable(typeOrCp, state)) break;
     }
 
     // Convert from the returned tag to a code point, string, or currency to check
@@ -1926,18 +1927,16 @@ public class Parse {
     while (hasNext) {
       long futureTag = AffixPatternUtils.nextToken(tag, str);
       int futureTypeOrCp = AffixPatternUtils.getTypeOrCp(futureTag);
-      if (futureTypeOrCp < 0 || !isIgnorable(futureTypeOrCp, state)) break;
+      if (futureTypeOrCp < 0 || !isPatternIgnorable(futureTypeOrCp, state)) break;
       tag = futureTag;
       typeOrCp = futureTypeOrCp;
       hasNext = AffixPatternUtils.hasNext(tag, str);
     }
 
     long added = 0L;
-    if (resolvedCp >= 0) {
+    if (resolvedCp >= 0 && codePointEquals(cp, resolvedCp, state)) {
       // Code point
-      if (!codePointEquals(cp, resolvedCp, state)) return 0L;
       StateItem next = state.getNext().copyFrom(item, null, cp);
-
       if (hasNext) {
         // Additional tokens in affix string.
         next.name = StateName.INSIDE_AFFIX_PATTERN;
@@ -1948,6 +1947,7 @@ public class Parse {
         next.trailingCount = 0;
         next.returnTo1 = null;
       }
+      next.score += 1; // reward for consuming code point from pattern
       added |= 1L << state.lastInsertedIndex();
     }
     if (resolvedMinusSign) {
@@ -2180,11 +2180,26 @@ public class Parse {
    *
    * @param cp The code point to test. Returns false if cp is negative.
    * @param state The current {@link ParserState}, used for determining strict mode.
-   * @return true if cp is bidi or whitespace in lenient mode; false otherwise.
+   * @return true if cp is ignorable; false otherwise.
    */
   private static boolean isIgnorable(int cp, ParserState state) {
     if (cp < 0) return false;
     if (UNISET_BIDI.contains(cp)) return true;
     return state.mode == ParseMode.LENIENT && UNISET_WHITESPACE.contains(cp);
   }
+
+  /**
+   * Checks whether the given code point is "ignorable" in pattern syntax. This includes all
+   * characters that are normally ignorable in {@link #isIgnorable} plus characters having the
+   * Pattern_White_Space property.
+   *
+   * @param cp The code point to test. Returns false if cp is negative.
+   * @param state The current {@link ParserState}, used for determining strict mode.
+   * @return true if cp is ignorable; false otherwise.
+   */
+  private static boolean isPatternIgnorable(int cp, ParserState state) {
+    if (cp < 0) return false;
+    if (state.mode == ParseMode.LENIENT && PatternProps.isWhiteSpace(cp)) return true;
+    return isIgnorable(cp, state);
+  }
 }
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormat.java b/icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormat.java
index fbe9915b548..dbdb41fe578 100644
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormat.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormat.java
@@ -198,6 +198,15 @@ import com.ibm.icu.util.ULocale.Category;
  * and leaves the parse position unchanged. The convenience method {@link #parse(String)} indicates
  * parse failure by throwing a {@link java.text.ParseException}.
  *
+ * <p>Under the hood, a state table parsing engine is used. To debug a parsing failure during
+ * development, use the following pattern to print details about the state table transitions:
+ *
+ * <pre>
+ * com.ibm.icu.impl.number.Parse.DEBUGGING = true;
+ * df.parse("123.45", ppos);
+ * com.ibm.icu.impl.number.Parse.DEBUGGING = false;
+ * </pre>
+ *
  * <h3>Thread Safety and Best Practices</h3>
  *
  * <p>Starting with ICU 59, instance of DecimalFormat are thread-safe.
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/data/numberformattestspecification.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/data/numberformattestspecification.txt
index 0b4d412a89c..83ece37db3a 100644
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/data/numberformattestspecification.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/data/numberformattestspecification.txt
@@ -1439,3 +1439,62 @@ parse	output	breaks
 // K and J return null
  9 999	9999	JK
 
+test parse ignorables
+set locale ar
+// Note: Prefixes contain RLMs, as do some of the test cases.
+set pattern x aâb0c df 
+set negativePrefix y gâh
+set negativeSuffix i jk 
+begin
+parse	output	breaks
+x aâb56c df 	56
+x  aâb56c df 	56	K
+x ab56c df 	56	K
+x ab56c df	56	JK
+x ab56c df  	56	K
+x ab56	56	JK
+x a b56	56	JK
+56cdf	56	JK
+56c df	56	JK
+56cd f	56	JK
+56câdâf	56	JK
+56cdf 	56	JK
+56c dâf 	56	JK
+56âc df 	56	JK
+y gâh56i jk 	-56
+y  gâh56i jk 	-56	KS
+y gh56i jk 	-56	K
+y gh56i jk	-56	JK
+y gh56i jk  	-56	K
+y gh56	-56	JK
+y g h56	-56	JKS
+// S stops parsing at the 'i' for these and returns -56
+56ijk	-56	JK
+56i jk	-56	JK
+56ij k	-56	JK
+56iâjâk	-56	JK
+56ijk 	-56	JK
+56i jâk 	-56	JK
+56âi jk 	-56	JK
+// S gets 56 (accepts ' ' gs grouping); J and K get null
+5 6	5	JKS
+5â6	5	JK
+
+test parse spaces in grouping
+// This test gives the ideal behavior of these cases, which
+// none of the implementations currently support.
+set locale en
+set pattern #,##0
+begin
+parse	output	breaks
+// J and S get "12" here
+1 2	1	JS
+1 23	1	JS
+// K gets 1 here; doesn't pick up the grouping separator
+1 234	1234	K
+
+
+
+
+
+
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java
index 0ac2e58bacc..1ad5884c5cd 100644
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java
@@ -4850,13 +4850,16 @@ public class NumberFormatTest extends TestFmwk {
     @Test
     public void Test11839() {
         DecimalFormatSymbols dfs = new DecimalFormatSymbols(ULocale.ENGLISH);
-        dfs.setMinusSign('â¸');
-        dfs.setPlusSign('â'); //  â  U+2214 DOT PLUS
+        dfs.setMinusSignString("aâ¸");
+        dfs.setPlusSignString("bâ"); //  â  U+2214 DOT PLUS
         DecimalFormat df = new DecimalFormat("0.00+;0.00-", dfs);
         String result = df.format(-1.234);
-        assertEquals("Locale-specific minus sign should be used", "1.23â¸", result);
+        assertEquals("Locale-specific minus sign should be used", "1.23aâ¸", result);
         result = df.format(1.234);
-        assertEquals("Locale-specific plus sign should be used", "1.23â", result);
+        assertEquals("Locale-specific plus sign should be used", "1.23bâ", result);
+        // Test round-trip with parse
+        expect2(df, -456, "456.00aâ¸");
+        expect2(df, 456, "456.00bâ");
     }
 
     @Test
@@ -5171,6 +5174,24 @@ public class NumberFormatTest extends TestFmwk {
         }
     }
 
+    @Test
+    public void testParseIgnorables() {
+        // Also see the test case "test parse ignorables" in numberformattestspecification.txt
+        DecimalFormatSymbols dfs = DecimalFormatSymbols.getInstance();
+        dfs.setPercentString("\u200E%\u200E");
+        DecimalFormat df = new DecimalFormat("0%;-0a", dfs);
+        ParsePosition ppos = new ParsePosition(0);
+        Number result = df.parse("42\u200E%\u200E ", ppos);
+        assertEquals("Should parse as percentage", new BigDecimal("0.42"), result);
+        // NOTE: This behavior is specified only in 59. It is probably okay if it changes in the future.
+        assertEquals("Should not consume the trailing bidi even though it is in the symbol", 4, ppos.getIndex());
+        ppos.setIndex(0);
+        result = df.parse("-42a\u200E ", ppos);
+        assertEquals("Should parse as percent", new BigDecimal("-0.42"), result);
+        // NOTE: This behavior is specified only in 59. It is probably okay if it changes in the future.
+        assertEquals("Should not consume the trailing bidi or whitespace", 4, ppos.getIndex());
+    }
+
     @Test
     public void testSignificantDigitsMode() {
         String[][] allExpected = {