ICU-13060 Documentation and parsing whitespace/bidi tweaks.

author Shane Carr <shane@unicode.org>

Wed, 29 Mar 2017 02:31:49 +0000 (02:31 +0000)

committer Shane Carr <shane@unicode.org>

Wed, 29 Mar 2017 02:31:49 +0000 (02:31 +0000)
author Shane Carr <shane@unicode.org>
Wed, 29 Mar 2017 02:31:49 +0000 (02:31 +0000)
committer Shane Carr <shane@unicode.org>
Wed, 29 Mar 2017 02:31:49 +0000 (02:31 +0000)
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/Parse.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/Parse.java

index ae8d6d91f20cbe5a0551e246464001c27802c4d6..a04c2433f3816c4977a4b903e7b461ea0f1dcb3f 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/Parse.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/Parse.java
@@ -11,6 +11,7 @@ import java.util.Iterator;
  import java.util.Set;
  import java.util.concurrent.ConcurrentHashMap;
  
+import com.ibm.icu.impl.PatternProps;
  import com.ibm.icu.impl.StandardPlural;
  import com.ibm.icu.impl.TextTrieMap;
  import com.ibm.icu.impl.number.formatters.BigDecimalMultiplier;
@@ -1129,7 +1130,7 @@ public class Parse {
              }
              acceptGrouping(cp, StateName.AFTER_INTEGER_DIGIT, state, item);
              if (state.length > 0 && mode == ParseMode.FAST) break;
-            acceptBidi(cp, StateName.AFTER_INTEGER_DIGIT, state, item);
+            acceptBidi(cp, StateName.BEFORE_SUFFIX, state, item);
              if (state.length > 0 && mode == ParseMode.FAST) break;
              acceptPadding(cp, StateName.BEFORE_SUFFIX, state, item);
              if (state.length > 0 && mode == ParseMode.FAST) break;
@@ -1155,7 +1156,7 @@ public class Parse {
              // We encountered a decimal point
              acceptFractionDigit(cp, StateName.AFTER_FRACTION_DIGIT, state, item);
              if (state.length > 0 && mode == ParseMode.FAST) break;
-            acceptBidi(cp, StateName.AFTER_FRACTION_DIGIT, state, item);
+            acceptBidi(cp, StateName.BEFORE_SUFFIX, state, item);
              if (state.length > 0 && mode == ParseMode.FAST) break;
              acceptPadding(cp, StateName.BEFORE_SUFFIX, state, item);
              if (state.length > 0 && mode == ParseMode.FAST) break;
@@ -1184,7 +1185,7 @@ public class Parse {
              break;
  
            case AFTER_EXPONENT_DIGIT:
-            acceptBidi(cp, StateName.AFTER_EXPONENT_DIGIT, state, item);
+            acceptBidi(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item);
              acceptPadding(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item);
              acceptExponentDigit(cp, StateName.AFTER_EXPONENT_DIGIT, state, item);
              if (mode == ParseMode.LENIENT || mode == ParseMode.STRICT) {
@@ -1257,17 +1258,17 @@ public class Parse {
              break;
  
            case INSIDE_STRING:
-            acceptStringOffset(cp, state, item);
+            long added0 = acceptStringOffset(cp, state, item);
              // Accept arbitrary bidi in the middle of strings.
-            if (state.length == 0 && UNISET_BIDI.contains(cp)) {
+            if (added0 == 0L && UNISET_BIDI.contains(cp)) {
                state.getNext().copyFrom(item, item.name, cp);
              }
              break;
  
            case INSIDE_AFFIX_PATTERN:
-            long added = acceptAffixPatternOffset(cp, state, item);
-            // Accept arbitrary bidi and whitespace (if lenient) in the middle of affixes.
-            if (added == 0L && isIgnorable(cp, state)) {
+            long added1 = acceptAffixPatternOffset(cp, state, item);
+            // Accept arbitrary bidi and whitespace (if lenient) in the middle of affix patterns.
+            if (added1 == 0L && isIgnorable(cp, state)) {
                state.getNext().copyFrom(item, item.name, cp);
              }
              break;
@@ -1758,8 +1759,8 @@ public class Parse {
      if (!next.sawSuffix && holder.s.isEmpty()) next.score += 5;
    }
  
-  private static void acceptStringOffset(int cp, ParserState state, StateItem item) {
-    acceptString(
+  private static long acceptStringOffset(int cp, ParserState state, StateItem item) {
+    return acceptString(
          cp,
          item.returnTo1,
          item.returnTo2,
@@ -1813,20 +1814,20 @@ public class Parse {
        referenceCp = Character.codePointAt(str, offset);
        count = Character.charCount(referenceCp);
        equals = codePointEquals(cp, referenceCp, state);
-      if (!UNISET_BIDI.contains(cp)) break;
+      if (!UNISET_BIDI.contains(referenceCp)) break;
      }
  
      if (equals) {
        // Matches first code point of the string
        StateItem next = state.getNext().copyFrom(item, null, cp);
  
-      // Skip over ignorable code points in the middle of the string.
+      // Skip over bidi code points in the middle or end of the string.
        // They will be accepted in the main loop.
        offset += count;
        for (; offset < str.length(); offset += count) {
          referenceCp = Character.codePointAt(str, offset);
          count = Character.charCount(referenceCp);
-        if (!UNISET_BIDI.contains(cp)) break;
+        if (!UNISET_BIDI.contains(referenceCp)) break;
        }
  
        if (offset < str.length()) {
@@ -1879,7 +1880,7 @@ public class Parse {
        tag = AffixPatternUtils.nextToken(tag, str);
        typeOrCp = AffixPatternUtils.getTypeOrCp(tag);
        hasNext = AffixPatternUtils.hasNext(tag, str);
-      if (typeOrCp < 0 || !isIgnorable(typeOrCp, state)) break;
+      if (typeOrCp < 0 || !isPatternIgnorable(typeOrCp, state)) break;
      }
  
      // Convert from the returned tag to a code point, string, or currency to check
@@ -1926,18 +1927,16 @@ public class Parse {
      while (hasNext) {
        long futureTag = AffixPatternUtils.nextToken(tag, str);
        int futureTypeOrCp = AffixPatternUtils.getTypeOrCp(futureTag);
-      if (futureTypeOrCp < 0 || !isIgnorable(futureTypeOrCp, state)) break;
+      if (futureTypeOrCp < 0 || !isPatternIgnorable(futureTypeOrCp, state)) break;
        tag = futureTag;
        typeOrCp = futureTypeOrCp;
        hasNext = AffixPatternUtils.hasNext(tag, str);
      }
  
      long added = 0L;
-    if (resolvedCp >= 0) {
+    if (resolvedCp >= 0 && codePointEquals(cp, resolvedCp, state)) {
        // Code point
-      if (!codePointEquals(cp, resolvedCp, state)) return 0L;
        StateItem next = state.getNext().copyFrom(item, null, cp);
-
        if (hasNext) {
          // Additional tokens in affix string.
          next.name = StateName.INSIDE_AFFIX_PATTERN;
@@ -1948,6 +1947,7 @@ public class Parse {
          next.trailingCount = 0;
          next.returnTo1 = null;
        }
+      next.score += 1; // reward for consuming code point from pattern
        added |= 1L << state.lastInsertedIndex();
      }
      if (resolvedMinusSign) {
@@ -2180,11 +2180,26 @@ public class Parse {
     *
     * @param cp The code point to test. Returns false if cp is negative.
     * @param state The current {@link ParserState}, used for determining strict mode.
-   * @return true if cp is bidi or whitespace in lenient mode; false otherwise.
+   * @return true if cp is ignorable; false otherwise.
     */
    private static boolean isIgnorable(int cp, ParserState state) {
      if (cp < 0) return false;
      if (UNISET_BIDI.contains(cp)) return true;
      return state.mode == ParseMode.LENIENT && UNISET_WHITESPACE.contains(cp);
    }
+
+  /**
+   * Checks whether the given code point is "ignorable" in pattern syntax. This includes all
+   * characters that are normally ignorable in {@link #isIgnorable} plus characters having the
+   * Pattern_White_Space property.
+   *
+   * @param cp The code point to test. Returns false if cp is negative.
+   * @param state The current {@link ParserState}, used for determining strict mode.
+   * @return true if cp is ignorable; false otherwise.
+   */
+  private static boolean isPatternIgnorable(int cp, ParserState state) {
+    if (cp < 0) return false;
+    if (state.mode == ParseMode.LENIENT && PatternProps.isWhiteSpace(cp)) return true;
+    return isIgnorable(cp, state);
+  }
  }
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormat.java b/icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormat.java

index fbe9915b548349a8a034a5df4e6d8592487b3268..dbdb41fe578a2f0c7251b417af1c8739750a4b5b 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormat.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormat.java
@@ -198,6 +198,15 @@ import com.ibm.icu.util.ULocale.Category;
   * and leaves the parse position unchanged. The convenience method {@link #parse(String)} indicates
   * parse failure by throwing a {@link java.text.ParseException}.
   *
+ * <p>Under the hood, a state table parsing engine is used. To debug a parsing failure during
+ * development, use the following pattern to print details about the state table transitions:
+ *
+ * <pre>
+ * com.ibm.icu.impl.number.Parse.DEBUGGING = true;
+ * df.parse("123.45", ppos);
+ * com.ibm.icu.impl.number.Parse.DEBUGGING = false;
+ * </pre>
+ *
   * <h3>Thread Safety and Best Practices</h3>
   *
   * <p>Starting with ICU 59, instance of DecimalFormat are thread-safe.
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/data/numberformattestspecification.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/data/numberformattestspecification.txt

index 0b4d412a89ce8ae3a992bddd919eb53017efa544..83ece37db3a730de1672528c4dd7422b250dda16 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/data/numberformattestspecification.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/data/numberformattestspecification.txt
@@ -1439,3 +1439,62 @@ parse    output  breaks
  // K and J return null
   9 999 9999    JK
  
+test parse ignorables
+set locale ar
+// Note: Prefixes contain RLMs, as do some of the test cases.
+set pattern x a‎b0c df 
+set negativePrefix y g‎h
+set negativeSuffix i jk 
+begin
+parse  output  breaks
+x a‎b56c df  56
+x  a‎b56c df         56      K
+x ab56c df     56      K
+x ab56c df     56      JK
+x ab56c df     56      K
+x ab56 56      JK
+x a b56        56      JK
+56cdf  56      JK
+56c df 56      JK
+56cd f 56      JK
+56c‎d‎f    56      JK
+56cdf  56      JK
+56c d‎f      56      JK
+56‎c df      56      JK
+y g‎h56i jk  -56
+y  g‎h56i jk         -56     KS
+y gh56i jk     -56     K
+y gh56i jk     -56     JK
+y gh56i jk     -56     K
+y gh56 -56     JK
+y g h56        -56     JKS
+// S stops parsing at the 'i' for these and returns -56
+56ijk  -56     JK
+56i jk -56     JK
+56ij k -56     JK
+56i‎j‎k    -56     JK
+56ijk  -56     JK
+56i j‎k      -56     JK
+56‎i jk      -56     JK
+// S gets 56 (accepts ' ' gs grouping); J and K get null
+5 6    5       JKS
+5‎6  5       JK
+
+test parse spaces in grouping
+// This test gives the ideal behavior of these cases, which
+// none of the implementations currently support.
+set locale en
+set pattern #,##0
+begin
+parse  output  breaks
+// J and S get "12" here
+1 2    1       JS
+1 23   1       JS
+// K gets 1 here; doesn't pick up the grouping separator
+1 234  1234    K
+
+
+
+
+
+
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java

index 0ac2e58baccf639a9809a2aad947e7b17830bb4a..1ad5884c5cdaeb61f5170b4181de8bd113be8e29 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java
@@ -4850,13 +4850,16 @@ public class NumberFormatTest extends TestFmwk {
      @Test
      public void Test11839() {
          DecimalFormatSymbols dfs = new DecimalFormatSymbols(ULocale.ENGLISH);
-        dfs.setMinusSign('∸');
-        dfs.setPlusSign('∔'); //  ∔  U+2214 DOT PLUS
+        dfs.setMinusSignString("a∸");
+        dfs.setPlusSignString("b∔"); //  ∔  U+2214 DOT PLUS
          DecimalFormat df = new DecimalFormat("0.00+;0.00-", dfs);
          String result = df.format(-1.234);
-        assertEquals("Locale-specific minus sign should be used", "1.23∸", result);
+        assertEquals("Locale-specific minus sign should be used", "1.23a∸", result);
          result = df.format(1.234);
-        assertEquals("Locale-specific plus sign should be used", "1.23∔", result);
+        assertEquals("Locale-specific plus sign should be used", "1.23b∔", result);
+        // Test round-trip with parse
+        expect2(df, -456, "456.00a∸");
+        expect2(df, 456, "456.00b∔");
      }
  
      @Test
@@ -5171,6 +5174,24 @@ public class NumberFormatTest extends TestFmwk {
          }
      }
  
+    @Test
+    public void testParseIgnorables() {
+        // Also see the test case "test parse ignorables" in numberformattestspecification.txt
+        DecimalFormatSymbols dfs = DecimalFormatSymbols.getInstance();
+        dfs.setPercentString("\u200E%\u200E");
+        DecimalFormat df = new DecimalFormat("0%;-0a", dfs);
+        ParsePosition ppos = new ParsePosition(0);
+        Number result = df.parse("42\u200E%\u200E ", ppos);
+        assertEquals("Should parse as percentage", new BigDecimal("0.42"), result);
+        // NOTE: This behavior is specified only in 59. It is probably okay if it changes in the future.
+        assertEquals("Should not consume the trailing bidi even though it is in the symbol", 4, ppos.getIndex());
+        ppos.setIndex(0);
+        result = df.parse("-42a\u200E ", ppos);
+        assertEquals("Should parse as percent", new BigDecimal("-0.42"), result);
+        // NOTE: This behavior is specified only in 59. It is probably okay if it changes in the future.
+        assertEquals("Should not consume the trailing bidi or whitespace", 4, ppos.getIndex());
+    }
+
      @Test
      public void testSignificantDigitsMode() {
          String[][] allExpected = {
author	Shane Carr <shane@unicode.org>
	Wed, 29 Mar 2017 02:31:49 +0000 (02:31 +0000)
committer	Shane Carr <shane@unicode.org>
	Wed, 29 Mar 2017 02:31:49 +0000 (02:31 +0000)
icu4j/main/classes/core/src/com/ibm/icu/impl/number/Parse.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormat.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/data/numberformattestspecification.txt		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java		patch \| blob \| history