ICU-13177 Moving "codePointZero" optimization from PositiveDecimalFormat.java to...

author Shane Carr <shane@unicode.org>

Sat, 29 Apr 2017 01:33:06 +0000 (01:33 +0000)

committer Shane Carr <shane@unicode.org>

Sat, 29 Apr 2017 01:33:06 +0000 (01:33 +0000)
author Shane Carr <shane@unicode.org>
Sat, 29 Apr 2017 01:33:06 +0000 (01:33 +0000)
committer Shane Carr <shane@unicode.org>
Sat, 29 Apr 2017 01:33:06 +0000 (01:33 +0000)
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/formatters/PositiveDecimalFormat.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/formatters/PositiveDecimalFormat.java

index b4a33dcc8ac17411772ee8a8835c9207498b7078..fdb26062f4c07b351373827605312db0f98f1ce1 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/formatters/PositiveDecimalFormat.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/formatters/PositiveDecimalFormat.java
@@ -123,26 +123,14 @@ public class PositiveDecimalFormat implements Format.TargetFormat {
        decimalSeparator = symbols.getDecimalSeparatorString();
      }
  
-    // Check to see if we can use code points instead of strings (~15% format performance boost)
-    int _codePointZero = -1;
-    String[] _digitStrings = symbols.getDigitStringsLocal();
-    for (int i = 0; i < _digitStrings.length; i++) {
-      int cp = Character.codePointAt(_digitStrings[i], 0);
-      int cc = Character.charCount(cp);
-      if (cc != _digitStrings[i].length()) {
-        _codePointZero = -1;
-        break;
-      } else if (i == 0) {
-        _codePointZero = cp;
-      } else if (cp != _codePointZero + i) {
-        _codePointZero = -1;
-        break;
-      }
-    }
+    // Check to see if we can use code points instead of strings
+    int _codePointZero = symbols.getCodePointZero();
      if (_codePointZero != -1) {
+      // Fast Path (~9% faster than slow path when formatting long strings)
        digitStrings = null;
        codePointZero = _codePointZero;
      } else {
+      // Slow Path
        digitStrings = symbols.getDigitStrings(); // makes a copy
        codePointZero = -1;
      }
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormatSymbols.java b/icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormatSymbols.java

index 07e812bcd2a8555aa5aa337826f9802ae3e5a31b..e2354c1d213a6387dd0f5dfa53ca928acb9bd8e3 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormatSymbols.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormatSymbols.java
@@ -215,6 +215,9 @@ public class DecimalFormatSymbols implements Cloneable, Serializable {
                  digits[i] = d;
              }
          }
+
+        // Update codePointZero: it is simply zeroDigit.
+        codePointZero = zeroDigit;
      }
  
      /**
@@ -231,6 +234,11 @@ public class DecimalFormatSymbols implements Cloneable, Serializable {
      /**
       * Returns the array of strings used as digits, in order from 0 through 9
       * Package private method - doesn't create a defensively copy.
+     *
+     * <p><strong>WARNING:</strong> Mutating the returned array will cause undefined behavior.
+     * If you need to change the value of the array, use {@link #getDigitStrings} and {@link
+     * #setDigitStrings} instead.
+     *
       * @return the array of digit strings
       * @internal
       * @deprecated This API is ICU internal only.
@@ -240,6 +248,19 @@ public class DecimalFormatSymbols implements Cloneable, Serializable {
          return digitStrings;
      }
  
+    /**
+     * If the digit strings array corresponds to a sequence of increasing code points, this method
+     * returns the code point corresponding to the first entry in the digit strings array. If the
+     * digit strings array is <em>not</em> a sequence of increasing code points, returns -1.
+     *
+     * @internal
+     * @deprecated This API is ICU internal only.
+     */
+    @Deprecated
+    public int getCodePointZero() {
+        return codePointZero;
+    }
+
      /**
      * {@icu} Sets the array of strings used as digits, in order from 0 through 9
      * <p>
@@ -266,22 +287,48 @@ public class DecimalFormatSymbols implements Cloneable, Serializable {
          }
  
          // Scan input array and create char[] representation if possible
+        // Also update codePointZero if possible
          String[] tmpDigitStrings = new String[10];
          char[] tmpDigits = new char[10];
+        int tmpCodePointZero = -1;
          for (int i = 0; i < 10; i++) {
-            if (digitStrings[i] == null) {
+            String digitStr = digitStrings[i];
+            if (digitStr == null) {
                  throw new IllegalArgumentException("The input digit string array contains a null element");
              }
-            tmpDigitStrings[i] = digitStrings[i];
-            if (tmpDigits != null && digitStrings[i].length() == 1) {
-                tmpDigits[i] = digitStrings[i].charAt(0);
+            tmpDigitStrings[i] = digitStr;
+            int cp, cc;
+            if (digitStr.length() == 0) {
+                cp = -1;
+                cc = 0;
              } else {
-                // contains digit string with multiple UTF-16 code units
+                cp = Character.codePointAt(digitStrings[i], 0);
+                cc = Character.charCount(cp);
+            }
+            if (cc == digitStr.length()) {
+                // One code point in this digit.
+                // If it is 1 UTF-16 code unit long, set it in tmpDigits.
+                if (cc == 1) {
+                    tmpDigits[i] = (char) cp;
+                } else {
+                    tmpDigits = null;
+                }
+                // Check for validity of tmpCodePointZero.
+                if (i == 0) {
+                    tmpCodePointZero = cp;
+                } else if (cp != tmpCodePointZero + i) {
+                    tmpCodePointZero = -1;
+                }
+            } else {
+                // More than one code point in this digit.
+                // codePointZero and tmpDigits are going to be invalid.
+                tmpCodePointZero = -1;
                  tmpDigits = null;
              }
          }
  
          this.digitStrings = tmpDigitStrings;
+        this.codePointZero = tmpCodePointZero;
  
          if (tmpDigits == null) {
              // fallback to the default digit chars
@@ -1545,8 +1592,11 @@ public class DecimalFormatSymbols implements Cloneable, Serializable {
  
          serialVersionOnStream = currentSerialVersion;
  
-    // recreate
-    currency = Currency.getInstance(intlCurrencySymbol);
+        // recreate
+        currency = Currency.getInstance(intlCurrencySymbol);
+
+        // Refresh digitStrings in order to populate codePointZero
+        setDigitStrings(digitStrings);
      }
  
      /**
@@ -1569,6 +1619,22 @@ public class DecimalFormatSymbols implements Cloneable, Serializable {
       */
      private String digitStrings[];
  
+    /**
+     * Dealing with code points is faster than dealing with strings when formatting. Because of
+     * this, we maintain a value containing the zero code point that is used whenever digitStrings
+     * represents a sequence of ten code points in order.
+     *
+     * <p>If the value stored here is positive, it means that the code point stored in this value
+     * corresponds to the digitStrings array, and zeroCodePoint can be used instead of the
+     * digitStrings array for the purposes of efficient formatting; if -1, then digitStrings does
+     * *not* contain a sequence of code points, and it must be used directly.
+     *
+     * <p>It is assumed that zeroCodePoint always shadows the value in digitStrings. zeroCodePoint
+     * should never be set directly; rather, it should be updated only when digitStrings mutates.
+     * That is, the flow of information is digitStrings -> zeroCodePoint, not the other way.
+     */
+    private transient int codePointZero;
+
      /**
       * Character used for thousands separator.
       *
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/IntlTestDecimalFormatSymbols.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/IntlTestDecimalFormatSymbols.java

index 56688a236b9d414d557e8dd6ae9dcc405802efb1..caf78abcbe91cfee271613802fab80762336e390 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/IntlTestDecimalFormatSymbols.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/IntlTestDecimalFormatSymbols.java
@@ -269,6 +269,7 @@ public class IntlTestDecimalFormatSymbols extends com.ibm.icu.dev.test.TestFmwk
              "\uD801\uDCA0", "\uD801\uDCA1", "\uD801\uDCA2", "\uD801\uDCA3", "\uD801\uDCA4",
              "\uD801\uDCA5", "\uD801\uDCA6", "\uD801\uDCA7", "\uD801\uDCA8", "\uD801\uDCA9"
          };
+        final String[] differentDigitStrings = {"0", "b", "3", "d", "5", "f", "7", "h", "9", "j"};
  
          DecimalFormatSymbols symbols = new DecimalFormatSymbols(Locale.ENGLISH);
  
@@ -276,6 +277,9 @@ public class IntlTestDecimalFormatSymbols extends com.ibm.icu.dev.test.TestFmwk
          if (!Arrays.equals(symbols.getDigitStrings(), osmanyaDigitStrings)) {
              errln("ERROR: Osmanya digits (supplementary) should be set");
          }
+        if (Character.codePointAt(osmanyaDigitStrings[0], 0) != symbols.getCodePointZero()) {
+            errln("ERROR: Code point zero be Osmanya code point zero");
+        }
          if (defZero != symbols.getZeroDigit()) {
              errln("ERROR: Zero digit should be 0");
          }
@@ -283,10 +287,21 @@ public class IntlTestDecimalFormatSymbols extends com.ibm.icu.dev.test.TestFmwk
              errln("ERROR: Char digits should be Latin digits");
          }
  
+        symbols.setDigitStrings(differentDigitStrings);
+        if (!Arrays.equals(symbols.getDigitStrings(), differentDigitStrings)) {
+            errln("ERROR: Different digits should be set");
+        }
+        if (-1 != symbols.getCodePointZero()) {
+            errln("ERROR: Code point zero should be invalid");
+        }
+
          // Reset digits to Latin
          symbols.setZeroDigit(defZero);
          if (!Arrays.equals(symbols.getDigitStrings(), defDigitStrings)) {
              errln("ERROR: Latin digits should be set" + symbols.getDigitStrings()[0]);
          }
+        if (defZero != symbols.getCodePointZero()) {
+            errln("ERROR: Code point zero be ASCII 0");
+        }
      }
  }
author	Shane Carr <shane@unicode.org>
	Sat, 29 Apr 2017 01:33:06 +0000 (01:33 +0000)
committer	Shane Carr <shane@unicode.org>
	Sat, 29 Apr 2017 01:33:06 +0000 (01:33 +0000)
icu4j/main/classes/core/src/com/ibm/icu/impl/number/formatters/PositiveDecimalFormat.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormatSymbols.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/IntlTestDecimalFormatSymbols.java		patch \| blob \| history