ICU-13228 Adding more symbols to localized notation mapper function, including suppor...

author Shane Carr <shane@unicode.org>

Wed, 21 Jun 2017 00:38:25 +0000 (00:38 +0000)

committer Shane Carr <shane@unicode.org>

Wed, 21 Jun 2017 00:38:25 +0000 (00:38 +0000)
author Shane Carr <shane@unicode.org>
Wed, 21 Jun 2017 00:38:25 +0000 (00:38 +0000)
committer Shane Carr <shane@unicode.org>
Wed, 21 Jun 2017 00:38:25 +0000 (00:38 +0000)
diff --git a/icu4c/source/test/testdata/numberformattestspecification.txt b/icu4c/source/test/testdata/numberformattestspecification.txt

index 69478e4b07290200c43ef360471542d8662ed3dd..7e763681374b5af528fdfe80d67d2034a6a9be86 100644 (file)
--- a/icu4c/source/test/testdata/numberformattestspecification.txt
+++ b/icu4c/source/test/testdata/numberformattestspecification.txt
@@ -693,6 +693,18 @@ en #0%     0.4376  44%
  // This next test breaks JDK. JDK doesn't multiply by 100.
  fa     \u0025\u00a0\u0023\u0030        0.4376  \u200e\u066a\u00a0\u06f4\u06f4  K
  
+test localized pattern basic symbol coverage
+begin
+locale localizedPattern        toPattern       breaks
+it     #.##0,00        #,##0.00
+// JDK either doesn't know sl uses this character for minus sign
+// or doesn't support minus sign in localized pattern
+sl     #.##0;#.##0−  #,##0;#,##0-    K
+// JDK does not have data for "×10^" in this locale
+en_SE  0,00×10^0;0,00×10^0-  0.00E0;0.00E0-  K
+// JDK does not seem to transform the digits in localized patterns
+ar_SA  #\u066C##\u0660\u066B\u0660\u0660\u061Ba#       #,##0.00;a#,##0.00      K
+
  test toPattern
  set locale en
  begin
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/PatternString.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/PatternString.java

index c43f4a8df728823fc39f245023f687c194599ad8..18cdafa51cf0ccaef1b6d6200910bbc9b5ad2bef 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/PatternString.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/PatternString.java
@@ -278,6 +278,11 @@ public class PatternString {
     * pattern "0.000" means "decimal" in standard notation (as it does in every other locale), but it
     * means "grouping" in localized notation.
     *
+   * <p>A greedy string-substitution strategy is used to substitute locale symbols. If two symbols
+   * are identical or one symbol is a prefix of another, the result is not well-defined.
+   *
+   * <p>Locale symbols are not allowed to contain the ASCII quote character.
+   *
     * @param input The pattern to convert.
     * @param symbols The symbols corresponding to the localized pattern.
     * @param toLocalized true to convert from standard to localized notation; false to convert from
@@ -288,100 +293,136 @@ public class PatternString {
     */
    @Deprecated
    public static String convertLocalized(
-      CharSequence input, DecimalFormatSymbols symbols, boolean toLocalized) {
+      String input, DecimalFormatSymbols symbols, boolean toLocalized) {
      if (input == null) return null;
  
-    /// This is not the prettiest function in the world, but it gets the job done. ///
-
-    // Construct a table of code points to be converted between localized and standard.
-    int[][] table = new int[6][2];
+    // Construct a table of strings to be converted between localized and standard.
+    String[][] table = new String[21][2];
      int standIdx = toLocalized ? 0 : 1;
      int localIdx = toLocalized ? 1 : 0;
-    table[0][standIdx] = '%';
-    table[0][localIdx] = symbols.getPercent();
-    table[1][standIdx] = '‰';
-    table[1][localIdx] = symbols.getPerMill();
-    table[2][standIdx] = '.';
-    table[2][localIdx] = symbols.getDecimalSeparator();
-    table[3][standIdx] = ',';
-    table[3][localIdx] = symbols.getGroupingSeparator();
-    table[4][standIdx] = '-';
-    table[4][localIdx] = symbols.getMinusSign();
-    table[5][standIdx] = '+';
-    table[5][localIdx] = symbols.getPlusSign();
-
-    // Special case: localIdx characters are NOT allowed to be quotes, like in de_CH.
-    // Use '’' instead.
+    table[0][standIdx] = "%";
+    table[0][localIdx] = symbols.getPercentString();
+    table[1][standIdx] = "‰";
+    table[1][localIdx] = symbols.getPerMillString();
+    table[2][standIdx] = ".";
+    table[2][localIdx] = symbols.getDecimalSeparatorString();
+    table[3][standIdx] = ",";
+    table[3][localIdx] = symbols.getGroupingSeparatorString();
+    table[4][standIdx] = "-";
+    table[4][localIdx] = symbols.getMinusSignString();
+    table[5][standIdx] = "+";
+    table[5][localIdx] = symbols.getPlusSignString();
+    table[6][standIdx] = ";";
+    table[6][localIdx] = Character.toString(symbols.getPatternSeparator());
+    table[7][standIdx] = "@";
+    table[7][localIdx] = Character.toString(symbols.getSignificantDigit());
+    table[8][standIdx] = "E";
+    table[8][localIdx] = symbols.getExponentSeparator();
+    table[9][standIdx] = "*";
+    table[9][localIdx] = Character.toString(symbols.getPadEscape());
+    table[10][standIdx] = "#";
+    table[10][localIdx] = Character.toString(symbols.getDigit());
+    for (int i = 0; i < 10; i++) {
+      table[11 + i][standIdx] = Character.toString((char) ('0' + i));
+      table[11 + i][localIdx] = symbols.getDigitStringsLocal()[i];
+    }
+
+    // Special case: quotes are NOT allowed to be in any localIdx strings.
+    // Substitute them with '’' instead.
      for (int i = 0; i < table.length; i++) {
-      if (table[i][localIdx] == '\'') {
-        table[i][localIdx] = '’';
-      }
+      table[i][localIdx] = table[i][localIdx].replace('\'', '’');
      }
  
-    // Iterate through the string and convert
-    int offset = 0;
-    int state = 0;
+    // Iterate through the string and convert.
+    // State table:
+    //  0 => base state
+    //  1 => first char inside a quoted sequence in input and output string
+    //  2 => inside a quoted sequence in input and output string
+    //  3 => first char after a close quote in input string;
+    //       close quote still needs to be written to output string
+    //  4 => base state in input string; inside quoted sequence in output string
+    //  5 => first char inside a quoted sequence in input string;
+    //       inside quoted sequence in output string
      StringBuilder result = new StringBuilder();
-    for (; offset < input.length(); ) {
-      int cp = Character.codePointAt(input, offset);
-      int cpToAppend = cp;
-
-      if (state == 1 || state == 3 || state == 4) {
-        // Inside user-specified quote
-        if (cp == '\'') {
-          if (state == 1) {
-            state = 0;
-          } else if (state == 3) {
-            state = 2;
-            cpToAppend = -1;
-          } else {
-            state = 2;
-          }
+    int state = 0;
+    outer:
+    for (int offset = 0; offset < input.length(); offset++) {
+      char ch = input.charAt(offset);
+
+      // Handle a quote character (state shift)
+      if (ch == '\'') {
+        if (state == 0) {
+          result.append('\'');
+          state = 1;
+          continue;
+        } else if (state == 1) {
+          result.append('\'');
+          state = 0;
+          continue;
+        } else if (state == 2) {
+          state = 3;
+          continue;
+        } else if (state == 3) {
+          result.append('\'');
+          result.append('\'');
+          state = 1;
+          continue;
+        } else if (state == 4) {
+          state = 5;
+          continue;
+        } else {
+          assert state == 5;
+          result.append('\'');
+          result.append('\'');
+          state = 4;
+          continue;
          }
-      } else {
-        // Base state or inside special character quote
-        if (cp == '\'') {
-          if (state == 2 && offset + 1 < input.length()) {
-            int nextCp = Character.codePointAt(input, offset + 1);
-            if (nextCp == '\'') {
-              // escaped quote
-              state = 4;
-            } else {
-              // begin user-specified quote sequence
-              // we are already in a quote sequence, so omit the opening quote
-              state = 3;
-              cpToAppend = -1;
+      }
+
+      if (state == 0 || state == 3 || state == 4) {
+        for (String[] pair : table) {
+          // Perform a greedy match on this symbol string
+          if (input.regionMatches(offset, pair[0], 0, pair[0].length())) {
+            // Skip ahead past this region for the next iteration
+            offset += pair[0].length() - 1;
+            if (state == 3 || state == 4) {
+              result.append('\'');
+              state = 0;
              }
-          } else {
-            state = 1;
+            result.append(pair[1]);
+            continue outer;
            }
-        } else {
-          boolean needsSpecialQuote = false;
-          for (int i = 0; i < table.length; i++) {
-            if (table[i][0] == cp) {
-              cpToAppend = table[i][1];
-              needsSpecialQuote = false; // in case an earlier translation triggered it
-              break;
-            } else if (table[i][1] == cp) {
-              needsSpecialQuote = true;
+        }
+        // No replacement found.  Check if a special quote is necessary
+        for (String[] pair : table) {
+          if (input.regionMatches(offset, pair[1], 0, pair[1].length())) {
+            if (state == 0) {
+              result.append('\'');
+              state = 4;
              }
-          }
-          if (state == 0 && needsSpecialQuote) {
-            state = 2;
-            result.appendCodePoint('\'');
-          } else if (state == 2 && !needsSpecialQuote) {
-            state = 0;
-            result.appendCodePoint('\'');
+            result.append(ch);
+            continue outer;
            }
          }
+        // Still nothing.  Copy the char verbatim.  (Add a close quote if necessary)
+        if (state == 3 || state == 4) {
+          result.append('\'');
+          state = 0;
+        }
+        result.append(ch);
+      } else {
+        assert state == 1 || state == 2 || state == 5;
+        result.append(ch);
+        state = 2;
        }
-      if (cpToAppend != -1) {
-        result.appendCodePoint(cpToAppend);
-      }
-      offset += Character.charCount(cp);
      }
-    if (state == 2) {
-      result.appendCodePoint('\'');
+    // Resolve final quotes
+    if (state == 3 || state == 4) {
+      result.append('\'');
+      state = 0;
+    }
+    if (state != 0) {
+      throw new IllegalArgumentException("Malformed localized pattern: unterminated quote");
      }
      return result.toString();
    }
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/data/numberformattestspecification.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/data/numberformattestspecification.txt

index 69478e4b07290200c43ef360471542d8662ed3dd..7e763681374b5af528fdfe80d67d2034a6a9be86 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/data/numberformattestspecification.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/data/numberformattestspecification.txt
@@ -693,6 +693,18 @@ en #0%     0.4376  44%
  // This next test breaks JDK. JDK doesn't multiply by 100.
  fa     \u0025\u00a0\u0023\u0030        0.4376  \u200e\u066a\u00a0\u06f4\u06f4  K
  
+test localized pattern basic symbol coverage
+begin
+locale localizedPattern        toPattern       breaks
+it     #.##0,00        #,##0.00
+// JDK either doesn't know sl uses this character for minus sign
+// or doesn't support minus sign in localized pattern
+sl     #.##0;#.##0−  #,##0;#,##0-    K
+// JDK does not have data for "×10^" in this locale
+en_SE  0,00×10^0;0,00×10^0-  0.00E0;0.00E0-  K
+// JDK does not seem to transform the digits in localized patterns
+ar_SA  #\u066C##\u0660\u066B\u0660\u0660\u061Ba#       #,##0.00;a#,##0.00      K
+
  test toPattern
  set locale en
  begin
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatDataDrivenTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatDataDrivenTest.java

index 58fb5f8b38c1991e85a9d656116476642360d7dc..dd1ecaf7a4325f4e28fb56b450be48e3041d271a 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatDataDrivenTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatDataDrivenTest.java
@@ -753,7 +753,10 @@ public class NumberFormatDataDrivenTest {
              properties.setNegativeSuffix(tuple.negativeSuffix);
            }
            if (tuple.localizedPattern != null) {
-            // TODO
+            DecimalFormatSymbols symbols = DecimalFormatSymbols.getInstance(tuple.locale);
+            String converted =
+                PatternString.convertLocalized(tuple.localizedPattern, symbols, false);
+            PatternString.parseToExistingProperties(converted, properties);
            }
            if (tuple.lenient != null) {
              properties.setParseMode(tuple.lenient == 0 ? ParseMode.STRICT : ParseMode.LENIENT);
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java

index 312e70c3da11a13daa38966b5436b28f3885232e..5f173035b2dc978f41b68e622fdcdbea76f5db47 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java
@@ -51,6 +51,7 @@ import com.ibm.icu.text.DecimalFormat;
  import com.ibm.icu.text.DecimalFormat.PropertySetter;
  import com.ibm.icu.text.DecimalFormat.SignificantDigitsMode;
  import com.ibm.icu.text.DecimalFormatSymbols;
+import com.ibm.icu.text.DecimalFormat_ICU58;
  import com.ibm.icu.text.DisplayContext;
  import com.ibm.icu.text.MeasureFormat;
  import com.ibm.icu.text.NumberFormat;
@@ -1597,6 +1598,59 @@ public class NumberFormatTest extends TestFmwk {
          }
      }
  
+    @Test
+    public void TestLocalizedPatternSymbolCoverage() {
+        String[] standardPatterns = { "#,##0.05+%;#,##0.05-%", "* @@@E0‰" };
+        String[] standardPatterns58 = { "#,##0.05+%;#,##0.05-%", "* @@@E0‰;* -@@@E0‰" };
+        String[] localizedPatterns = { "▰⁖▰▰໐⁘໐໕†⁜⁙▰⁖▰▰໐⁘໐໕‡⁜", "⁂ ⁕⁕⁕⁑⁑໐‱" };
+        String[] localizedPatterns58 = { "▰⁖▰▰໐⁘໐໕+⁜⁙▰⁖▰▰໐⁘໐໕‡⁜", "⁂ ⁕⁕⁕⁑⁑໐‱⁙⁂ ‡⁕⁕⁕⁑⁑໐‱" };
+
+        DecimalFormatSymbols dfs = new DecimalFormatSymbols();
+        dfs.setGroupingSeparator('⁖');
+        dfs.setDecimalSeparator('⁘');
+        dfs.setPatternSeparator('⁙');
+        dfs.setDigit('▰');
+        dfs.setZeroDigit('໐');
+        dfs.setSignificantDigit('⁕');
+        dfs.setPlusSign('†');
+        dfs.setMinusSign('‡');
+        dfs.setPercent('⁜');
+        dfs.setPerMill('‱');
+        dfs.setExponentSeparator("⁑⁑"); // tests multi-char sequence
+        dfs.setPadEscape('⁂');
+
+        for (int i=0; i<2; i++) {
+            String standardPattern = standardPatterns[i];
+            String standardPattern58 = standardPatterns58[i];
+            String localizedPattern = localizedPatterns[i];
+            String localizedPattern58 = localizedPatterns58[i];
+
+            DecimalFormat df1 = new DecimalFormat("#", dfs);
+            df1.applyPattern(standardPattern);
+            DecimalFormat df2 = new DecimalFormat("#", dfs);
+            df2.applyLocalizedPattern(localizedPattern);
+            assertEquals("DecimalFormat instances should be equal",
+                    df1, df2);
+            assertEquals("toPattern should match on localizedPattern instance",
+                    standardPattern, df2.toPattern());
+            assertEquals("toLocalizedPattern should match on standardPattern instance",
+                    localizedPattern, df1.toLocalizedPattern());
+
+            // Note: ICU 58 does not support plus signs in patterns
+            // Note: ICU 58 always prints the negative part of scientific notation patterns,
+            //       even when the negative part is not necessary
+            DecimalFormat_ICU58 df3 = new DecimalFormat_ICU58("#", dfs);
+            df3.applyPattern(standardPattern); // Reading standardPattern is OK
+            DecimalFormat_ICU58 df4 = new DecimalFormat_ICU58("#", dfs);
+            df4.applyLocalizedPattern(localizedPattern58);
+            // Note: DecimalFormat#equals() is broken on ICU 58
+            assertEquals("toPattern should match on ICU58 localizedPattern instance",
+                    standardPattern58, df4.toPattern());
+            assertEquals("toLocalizedPattern should match on ICU58 standardPattern instance",
+                    localizedPattern58, df3.toLocalizedPattern());
+        }
+    }
+
      @Test
      public void TestParseNull() throws ParseException {
          DecimalFormat df = new DecimalFormat();
author	Shane Carr <shane@unicode.org>
	Wed, 21 Jun 2017 00:38:25 +0000 (00:38 +0000)
committer	Shane Carr <shane@unicode.org>
	Wed, 21 Jun 2017 00:38:25 +0000 (00:38 +0000)
icu4c/source/test/testdata/numberformattestspecification.txt		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/number/PatternString.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/data/numberformattestspecification.txt		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatDataDrivenTest.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java		patch \| blob \| history