ICU-11230 Change number parsing to require at least 2 digits in a group after the...

author Shane Carr <shane@unicode.org>

Fri, 18 May 2018 00:52:43 +0000 (00:52 +0000)

committer Shane Carr <shane@unicode.org>

Fri, 18 May 2018 00:52:43 +0000 (00:52 +0000)
author Shane Carr <shane@unicode.org>
Fri, 18 May 2018 00:52:43 +0000 (00:52 +0000)
committer Shane Carr <shane@unicode.org>
Fri, 18 May 2018 00:52:43 +0000 (00:52 +0000)
diff --git a/icu4c/source/i18n/number_decimalquantity.cpp b/icu4c/source/i18n/number_decimalquantity.cpp

index 738b075a064e2ddbc31eea71bc80b0c93170ce9b..65a267b561c2bc9ea0a3e9c6abfa0558a9af05c6 100644 (file)
--- a/icu4c/source/i18n/number_decimalquantity.cpp
+++ b/icu4c/source/i18n/number_decimalquantity.cpp
@@ -603,6 +603,14 @@ void DecimalQuantity::toDecNum(DecNum& output, UErrorCode& status) const {
      output.setTo(ubcd.getAlias(), precision, scale, isNegative(), status);
  }
  
+void DecimalQuantity::truncate() {
+    if (scale < 0) {
+        shiftRight(-scale);
+        scale = 0;
+        compact();
+    }
+}
+
  void DecimalQuantity::roundToMagnitude(int32_t magnitude, RoundingMode roundingMode, UErrorCode& status) {
      // The position in the BCD at which rounding will be performed; digits to the right of position
      // will be rounded away.
diff --git a/icu4c/source/i18n/number_decimalquantity.h b/icu4c/source/i18n/number_decimalquantity.h

index be961629b4964e871c1e21950df225b140b37af6..7619ce2e4d1e07e15ef40144d0a50bc2356bb5a4 100644 (file)
--- a/icu4c/source/i18n/number_decimalquantity.h
+++ b/icu4c/source/i18n/number_decimalquantity.h
@@ -81,6 +81,9 @@ class U_I18N_API DecimalQuantity : public IFixedDecimal, public UMemory {
      void roundToIncrement(double roundingIncrement, RoundingMode roundingMode,
                            int32_t maxFrac, UErrorCode& status);
  
+    /** Removes all fraction digits. */
+    void truncate();
+
      /**
       * Rounds the number to a specified magnitude (power of ten).
       *
diff --git a/icu4c/source/i18n/numparse_decimal.cpp b/icu4c/source/i18n/numparse_decimal.cpp

index a8d63c3f91e4ce45f3aba417baad70c0dbdee36c..74da8f310835d6dddde00d90f5b1da805cc54227 100644 (file)
--- a/icu4c/source/i18n/numparse_decimal.cpp
+++ b/icu4c/source/i18n/numparse_decimal.cpp
@@ -108,6 +108,7 @@ bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, int8_t
      UnicodeString actualDecimalString = decimalSeparator;
      int32_t groupedDigitCount = 0; // tracking count of digits delimited by grouping separator
      int32_t backupOffset = -1; // used for preserving the last confirmed position
+    int32_t smallGroupBackupOffset = -1; // used to back up behind groups of size 1
      bool afterFirstGrouping = false;
      bool seenGrouping = false;
      bool seenDecimal = false;
@@ -147,6 +148,8 @@ bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, int8_t
              // Digit was found.
              // Check for grouping size violation
              if (backupOffset != -1) {
+                smallGroupBackupOffset = backupOffset;
+                backupOffset = -1;
                  if (requireGroupingMatch) {
                      // comma followed by digit, so group before comma is a secondary
                      // group. If there was a group separator before that, the group
@@ -157,9 +160,14 @@ bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, int8_t
                          strictFail = true;
                          break;
                      }
+                } else {
+                    // #11230: don't accept groups after the first with only 1 digit.
+                    // The logic to back up and remove the lone digit is lower down.
+                    if (afterFirstGrouping && groupedDigitCount == 1) {
+                        break;
+                    }
                  }
                  afterFirstGrouping = true;
-                backupOffset = -1;
                  groupedDigitCount = 0;
              }
  
@@ -266,6 +274,30 @@ bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, int8_t
          strictFail = true;
      }
  
+    // #11230: don't accept groups after the first with only 1 digit.
+    // Behavior in this case is to back up before that 1-digit group.
+    if (!seenDecimal && afterFirstGrouping && groupedDigitCount == 1) {
+        if (segment.length() == 0) {
+            // Strings like "9,999" where we looked at only the first 3 chars.
+            // Ask for a longer segment.
+            hasPartialPrefix = true;
+        }
+        segment.setOffset(smallGroupBackupOffset);
+        result.setCharsConsumed(segment);
+        if (smallGroupBackupOffset == initialOffset) {
+            // Strings like ",9"
+            // Reset to no quantity seen.
+            result.quantity.clear();
+            result.quantity.bogus = true;
+        } else {
+            // Strings like "9,9"
+            // Remove the lone digit from the result quantity.
+            U_ASSERT(!result.quantity.bogus);
+            result.quantity.adjustMagnitude(-1);
+            result.quantity.truncate();
+        }
+    }
+
      if (requireGroupingMatch && strictFail) {
          result = backupResult;
          segment.setOffset(initialOffset);
diff --git a/icu4c/source/test/testdata/numberformattestspecification.txt b/icu4c/source/test/testdata/numberformattestspecification.txt

index 7836662fed08f3d87f960cf4c297046beb474113..f9b3c7e339425681209edb943ecd5ed68de88cbf 100644 (file)
--- a/icu4c/source/test/testdata/numberformattestspecification.txt
+++ b/icu4c/source/test/testdata/numberformattestspecification.txt
@@ -774,17 +774,18 @@ parse     output  breaks
  +1,23,4567.89,01       1234567.89
  +1,23,456.78.9 123456.78
  +12.34,56      12.34
-+79,,20,3      79203
-+79  20 3      79203   K
++79,,20,33     792033
+// JDK gets 79
++79  20 33     792033  K
  // Parsing stops at comma as it is different from other separators
-+79  20,3      7920    K
-+79,,20 3      7920
++79  20,33     7920    K
++79,,20 33     7920
  +  79  79      K
-+,79,,20,3     79203
++,79,,20,33    792033
  +7920d3        7920
  // Whitespace immediately after prefix doesn't count as digit separator
  // in C but is does in H
-+ ,79,,20,3    79203   HK
++ ,79,,20,33   792033  HK
  (  19 45)      -1945   K
  // C allows trailing separators when there is a prefix and suffix.
  // H allows trailing separators only when there is just a prefix.
@@ -999,7 +1000,7 @@ parse      output  breaks
  १३.३१‍       13.31   
  123'456        123456
  524'1.3        5241.3
-३'१‍     31
+३'११‍  311
  
  test parse with European-style comma/period
  set locale pt
@@ -1468,13 +1469,33 @@ maxFractionDigits       format  output  breaks
  test ticket 11230
  set locale en
  begin
-pattern        parse   output  breaks
-// K and J return null; S, C, and P return 99
-#,##0   9 9    99      K
-// K returns null
-#,##0   9 999  9999    K
-0       9 9    9       HK
-0       9 999  9       HK
+pattern        lenient parse   output  breaks
+// Groups after the first group need 2 digits to be accepted.
+// JDK does not see space as grouping and parses most of these as 9.
+#,##0  1       9 9     9       H
+#,##0  1       9 99    999     K
+#,##0  1       9 999   9999    K
+#,##0  1       9 9 9   9       H
+#,##0  1       ,9      fail    HK
+#,##0  1       99,.0   99
+0      1       9 9     9
+0      1       9 99    9
+0      1       9 999   9
+0      1       9 9 9   9
+0      1       ,9      fail
+0      1       99,.0   99
+#,##0  0       9 9     fail    K
+#,##0  0       9 99    fail    K
+#,##0  0       9 999   9999    K
+#,##0  0       9 9 9   fail    K
+#,##0  0       ,9      fail    K
+#,##0  0       99,.0   fail    K
+0      0       9 9     9
+0      0       9 99    9
+0      0       9 999   9
+0      0       9 9 9   9
+0      0       ,9      fail
+0      0       99,.0   99
  
  test parse ignorables
  set locale ar
@@ -1521,13 +1542,13 @@ y g h56 -56     HK
  test parse spaces in grouping
  // This test gives the ideal behavior of these cases, which
  // none of the implementations currently support.
+// Similar to the test above for ticket #11230
  set locale en
  set pattern #,##0
  begin
  parse  output  breaks
-// C, H, S, and P get "12" here
-1 2    1       CHJP
-1 23   1       CHJP
+1 2    1       H
+1 23   123     K
  // K gets 1 here; doesn't pick up the grouping separator
  1 234  1234    K
  
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/DecimalQuantity_AbstractBCD.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/DecimalQuantity_AbstractBCD.java

index 567a17ce0d51b1f0014bfec29dba357380585ab0..00a64bbe2ff09d3b9bc46fc4700f6d744a30d02d 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/DecimalQuantity_AbstractBCD.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/DecimalQuantity_AbstractBCD.java
@@ -732,6 +732,15 @@ public abstract class DecimalQuantity_AbstractBCD implements DecimalQuantity {
      private static final int SECTION_LOWER_EDGE = -1;
      private static final int SECTION_UPPER_EDGE = -2;
  
+    /** Removes all fraction digits. */
+    public void truncate() {
+        if (scale < 0) {
+            shiftRight(-scale);
+            scale = 0;
+            compact();
+        }
+    }
+
      @Override
      public void roundToMagnitude(int magnitude, MathContext mathContext) {
          // The position in the BCD at which rounding will be performed; digits to the right of position
@@ -1042,7 +1051,7 @@ public abstract class DecimalQuantity_AbstractBCD implements DecimalQuantity {
       * is the caller's responsibility to follow-up with a call to {@link #compact}.
       *
       * @param numDigits
-     *            The number of zeros to add.
+     *            The number of digits to remove.
       */
      protected abstract void shiftRight(int numDigits);
  
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/DecimalMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/DecimalMatcher.java

index 91595de606510c1f15d20c4e8a8ba4de3c2add88..a7422c9a52f24a6c77712def6d6cb72af2cd0b2a 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/DecimalMatcher.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/DecimalMatcher.java
@@ -2,9 +2,9 @@
  // License & terms of use: http://www.unicode.org/copyright.html#License
  package com.ibm.icu.impl.number.parse;
  
-import com.ibm.icu.impl.StringSegment;
  import com.ibm.icu.impl.StaticUnicodeSets;
  import com.ibm.icu.impl.StaticUnicodeSets.Key;
+import com.ibm.icu.impl.StringSegment;
  import com.ibm.icu.impl.number.DecimalQuantity_DualStorageBCD;
  import com.ibm.icu.impl.number.Grouper;
  import com.ibm.icu.lang.UCharacter;
@@ -132,6 +132,7 @@ public class DecimalMatcher implements NumberParseMatcher {
          String actualDecimalString = decimalSeparator;
          int groupedDigitCount = 0; // tracking count of digits delimited by grouping separator
          int backupOffset = -1; // used for preserving the last confirmed position
+        int smallGroupBackupOffset = -1; // used to back up behind groups of size 1
          boolean afterFirstGrouping = false;
          boolean seenGrouping = false;
          boolean seenDecimal = false;
@@ -171,6 +172,8 @@ public class DecimalMatcher implements NumberParseMatcher {
                  // Digit was found.
                  // Check for grouping size violation
                  if (backupOffset != -1) {
+                    smallGroupBackupOffset = backupOffset;
+                    backupOffset = -1;
                      if (requireGroupingMatch) {
                          // comma followed by digit, so group before comma is a secondary
                          // group. If there was a group separator before that, the group
@@ -181,9 +184,14 @@ public class DecimalMatcher implements NumberParseMatcher {
                              strictFail = true;
                              break;
                          }
+                    } else {
+                        // #11230: don't accept groups after the first with only 1 digit.
+                        // The logic to back up and remove the lone digit is lower down.
+                        if (afterFirstGrouping && groupedDigitCount == 1) {
+                            break;
+                        }
                      }
                      afterFirstGrouping = true;
-                    backupOffset = -1;
                      groupedDigitCount = 0;
                  }
  
@@ -294,6 +302,29 @@ public class DecimalMatcher implements NumberParseMatcher {
              strictFail = true;
          }
  
+        // #11230: don't accept groups after the first with only 1 digit.
+        // Behavior in this case is to back up before that 1-digit group.
+        if (!seenDecimal && afterFirstGrouping && groupedDigitCount == 1) {
+            if (segment.length() == 0) {
+                // Strings like "9,999" where we looked at only the first 3 chars.
+                // Ask for a longer segment.
+                hasPartialPrefix = true;
+            }
+            segment.setOffset(smallGroupBackupOffset);
+            result.setCharsConsumed(segment);
+            if (smallGroupBackupOffset == initialOffset) {
+                // Strings like ",9"
+                // Reset to no quantity seen.
+                result.quantity = null;
+            } else {
+                // Strings like "9,9"
+                // Remove the lone digit from the result quantity.
+                assert result.quantity != null;
+                result.quantity.adjustMagnitude(-1);
+                result.quantity.truncate();
+            }
+        }
+
          if (requireGroupingMatch && strictFail) {
              result.copyFrom(backupResult);
              segment.setOffset(initialOffset);
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/data/numberformattestspecification.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/data/numberformattestspecification.txt

index 7836662fed08f3d87f960cf4c297046beb474113..f9b3c7e339425681209edb943ecd5ed68de88cbf 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/data/numberformattestspecification.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/data/numberformattestspecification.txt
@@ -774,17 +774,18 @@ parse     output  breaks
  +1,23,4567.89,01       1234567.89
  +1,23,456.78.9 123456.78
  +12.34,56      12.34
-+79,,20,3      79203
-+79  20 3      79203   K
++79,,20,33     792033
+// JDK gets 79
++79  20 33     792033  K
  // Parsing stops at comma as it is different from other separators
-+79  20,3      7920    K
-+79,,20 3      7920
++79  20,33     7920    K
++79,,20 33     7920
  +  79  79      K
-+,79,,20,3     79203
++,79,,20,33    792033
  +7920d3        7920
  // Whitespace immediately after prefix doesn't count as digit separator
  // in C but is does in H
-+ ,79,,20,3    79203   HK
++ ,79,,20,33   792033  HK
  (  19 45)      -1945   K
  // C allows trailing separators when there is a prefix and suffix.
  // H allows trailing separators only when there is just a prefix.
@@ -999,7 +1000,7 @@ parse      output  breaks
  १३.३१‍       13.31   
  123'456        123456
  524'1.3        5241.3
-३'१‍     31
+३'११‍  311
  
  test parse with European-style comma/period
  set locale pt
@@ -1468,13 +1469,33 @@ maxFractionDigits       format  output  breaks
  test ticket 11230
  set locale en
  begin
-pattern        parse   output  breaks
-// K and J return null; S, C, and P return 99
-#,##0   9 9    99      K
-// K returns null
-#,##0   9 999  9999    K
-0       9 9    9       HK
-0       9 999  9       HK
+pattern        lenient parse   output  breaks
+// Groups after the first group need 2 digits to be accepted.
+// JDK does not see space as grouping and parses most of these as 9.
+#,##0  1       9 9     9       H
+#,##0  1       9 99    999     K
+#,##0  1       9 999   9999    K
+#,##0  1       9 9 9   9       H
+#,##0  1       ,9      fail    HK
+#,##0  1       99,.0   99
+0      1       9 9     9
+0      1       9 99    9
+0      1       9 999   9
+0      1       9 9 9   9
+0      1       ,9      fail
+0      1       99,.0   99
+#,##0  0       9 9     fail    K
+#,##0  0       9 99    fail    K
+#,##0  0       9 999   9999    K
+#,##0  0       9 9 9   fail    K
+#,##0  0       ,9      fail    K
+#,##0  0       99,.0   fail    K
+0      0       9 9     9
+0      0       9 99    9
+0      0       9 999   9
+0      0       9 9 9   9
+0      0       ,9      fail
+0      0       99,.0   99
  
  test parse ignorables
  set locale ar
@@ -1521,13 +1542,13 @@ y g h56 -56     HK
  test parse spaces in grouping
  // This test gives the ideal behavior of these cases, which
  // none of the implementations currently support.
+// Similar to the test above for ticket #11230
  set locale en
  set pattern #,##0
  begin
  parse  output  breaks
-// C, H, S, and P get "12" here
-1 2    1       CHJP
-1 23   1       CHJP
+1 2    1       H
+1 23   123     K
  // K gets 1 here; doesn't pick up the grouping separator
  1 234  1234    K
  
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java

index 604f4c6d820eedf9a79ad66ad987fdaa4618303b..2f22d9d591450e51dd19b388200ef2a7448b89e7 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java
@@ -2846,6 +2846,7 @@ public class NumberFormatTest extends TestFmwk {
  
      @Test
      public void TestStrictParse() {
+        // Pass both strict and lenient:
          String[] pass = {
                  "0",           // single zero before end of text is not leading
                  "0 ",          // single zero at end of number is not leading
@@ -2869,10 +2870,9 @@ public class NumberFormatTest extends TestFmwk {
                  "-999,999",    // see ticket #6863
                  "-9,999,999",  // see ticket #6863
          };
+        // Pass lenient, fail strict:
          String[] fail = {
                  "1,2",       // wrong number of digits after group separator
-                ",0",        // leading group separator before zero
-                ",1",        // leading group separator before digit
                  ",.02",      // leading group separator before decimal
                  "1,.02",     // group separator before decimal
                  "1,45",      // wrong number of digits in primary group
@@ -2882,9 +2882,14 @@ public class NumberFormatTest extends TestFmwk {
                  "12,34,567", // wrong number of digits in secondary group
                  "1,23,456,7890", // wrong number of digits in primary and secondary groups
          };
+        // Fail both lenient and strict:
+        String[] failBoth = {
+                ",0",        // leading group separator before a single digit
+                ",1",        // leading group separator before a single digit
+        };
  
          DecimalFormat nf = (DecimalFormat) NumberFormat.getInstance(Locale.ENGLISH);
-        runStrictParseBatch(nf, pass, fail);
+        runStrictParseBatch(nf, pass, fail, failBoth);
  
          String[] scientificPass = {
                  "0E2",      // single zero before exponent is ok
@@ -2894,9 +2899,11 @@ public class NumberFormatTest extends TestFmwk {
          };
          String[] scientificFail = {
          };
+        String[] scientificFailBoth = {
+        };
  
          nf = (DecimalFormat) NumberFormat.getInstance(Locale.ENGLISH);
-        runStrictParseBatch(nf, scientificPass, scientificFail);
+        runStrictParseBatch(nf, scientificPass, scientificFail, scientificFailBoth);
  
          String[] mixedPass = {
                  "12,34,567",
@@ -2910,18 +2917,22 @@ public class NumberFormatTest extends TestFmwk {
                  "12,34,56, that ",
                  "12,34,56 that",
          };
+        String[] mixedFailBoth = {
+        };
  
          nf = new DecimalFormat("#,##,##0.#");
-        runStrictParseBatch(nf, mixedPass, mixedFail);
+        runStrictParseBatch(nf, mixedPass, mixedFail, mixedFailBoth);
      }
  
-    void runStrictParseBatch(DecimalFormat nf, String[] pass, String[] fail) {
+    void runStrictParseBatch(DecimalFormat nf, String[] pass, String[] fail, String[] failBoth) {
          nf.setParseStrict(false);
          runStrictParseTests("should pass", nf, pass, true);
          runStrictParseTests("should also pass", nf, fail, true);
+        runStrictParseTests("should fail", nf, failBoth, false);
          nf.setParseStrict(true);
          runStrictParseTests("should still pass", nf, pass, true);
          runStrictParseTests("should fail", nf, fail, false);
+        runStrictParseTests("should still fail", nf, failBoth, false);
      }
  
      void runStrictParseTests(String msg, DecimalFormat nf, String[] tests, boolean pass) {
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/RBNFParseTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/RBNFParseTest.java

index daf1219158fdbeb633ab1ac14cc9f1762cddd1ae..bd994eb140b1189a42e30196ceab1a0a36604ecd 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/RBNFParseTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/RBNFParseTest.java
@@ -142,12 +142,12 @@ public class RBNFParseTest extends TestFmwk {
          logln(n.toString());
  
          String[][] lists = {
-            { "1,2", "twelve", "un virgule deux" },
-            { "1,2 million", "twelve million", "un virgule deux" },
-            { "1,2 millions", "twelve million", "un million deux cent mille" },
-            { "1.2", "one point two", "douze" },
-            { "1.2 million", "one million two hundred thousand", "douze" },
-            { "1.2 millions", "one million two hundred thousand", "douze millions" },
+            { "1,2", "one", "un virgule deux" },
+            { "1,2 million", "one", "un virgule deux" },
+            { "1,2 millions", "one", "un million deux cent mille" },
+            { "1.2", "one point two", "un" },
+            { "1.2 million", "one million two hundred thousand", "un" },
+            { "1.2 millions", "one million two hundred thousand", "un" },
          };
  
          Locale.setDefault(Locale.FRANCE);
author	Shane Carr <shane@unicode.org>
	Fri, 18 May 2018 00:52:43 +0000 (00:52 +0000)
committer	Shane Carr <shane@unicode.org>
	Fri, 18 May 2018 00:52:43 +0000 (00:52 +0000)
icu4c/source/i18n/number_decimalquantity.cpp		patch \| blob \| history
icu4c/source/i18n/number_decimalquantity.h		patch \| blob \| history
icu4c/source/i18n/numparse_decimal.cpp		patch \| blob \| history
icu4c/source/test/testdata/numberformattestspecification.txt		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/DecimalQuantity_AbstractBCD.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/DecimalMatcher.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/data/numberformattestspecification.txt		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/RBNFParseTest.java		patch \| blob \| history