ICU-13574 Adding codePointZero logic to ICU4C, added to ICU4J in r40091.

author Shane Carr <shane@unicode.org>

Thu, 8 Feb 2018 01:51:09 +0000 (01:51 +0000)

committer Shane Carr <shane@unicode.org>

Thu, 8 Feb 2018 01:51:09 +0000 (01:51 +0000)
author Shane Carr <shane@unicode.org>
Thu, 8 Feb 2018 01:51:09 +0000 (01:51 +0000)
committer Shane Carr <shane@unicode.org>
Thu, 8 Feb 2018 01:51:09 +0000 (01:51 +0000)
diff --git a/icu4c/source/i18n/dcfmtsym.cpp b/icu4c/source/i18n/dcfmtsym.cpp

index d321a82f8a9550f49eff66b1dbd5a01dbbd2e915..6651d8a89a2fa4ab04b019508a89c3b0bc4abf23 100644 (file)
--- a/icu4c/source/i18n/dcfmtsym.cpp
+++ b/icu4c/source/i18n/dcfmtsym.cpp
@@ -38,6 +38,7 @@
  #include "uresimp.h"
  #include "ureslocs.h"
  #include "charstr.h"
+#include "uassert.h"
  
  // *****************************************************************************
  // class DecimalFormatSymbols
@@ -530,6 +531,8 @@ DecimalFormatSymbols::initialize() {
      fSymbols[kExponentMultiplicationSymbol] = (UChar)0xd7; // 'x' multiplication symbol for exponents
      fIsCustomCurrencySymbol = FALSE; 
      fIsCustomIntlCurrencySymbol = FALSE;
+    fCodePointZero = 0x30;
+    U_ASSERT(fCodePointZero == fSymbols[kZeroDigitSymbol].char32At(0));
  
  }
  
diff --git a/icu4c/source/i18n/unicode/dcfmtsym.h b/icu4c/source/i18n/unicode/dcfmtsym.h

index 0308f2c2fd1cbe4c5cb59dd76ab58a9ac2549fde..c6da623034cb616d9b19e2da02aabf0c04aafb72 100644 (file)
--- a/icu4c/source/i18n/unicode/dcfmtsym.h
+++ b/icu4c/source/i18n/unicode/dcfmtsym.h
@@ -392,6 +392,13 @@ public:
      inline UBool isCustomIntlCurrencySymbol() const {
          return fIsCustomIntlCurrencySymbol;
      }
+
+    /**
+     * Returns the current value of fCodePointZero: the code point of the zero digit
+     * when it is usable as a shortcut for the digit symbols, or -1 when the digit
+     * symbols have to be read individually.
+     *
+     * @return the cached zero-digit code point, or -1.
+     * @internal For ICU use only
+     */
+    inline UChar32 getCodePointZero() const {
+        return fCodePointZero;
+    }
  #endif  /* U_HIDE_INTERNAL_API */
  
      /**
@@ -440,6 +447,22 @@ private:
       */
      UnicodeString fNoSymbol;
  
+    /**
+     * Dealing with code points is faster than dealing with strings when formatting. Because of
+     * this, we maintain a value containing the zero code point that is used whenever digitStrings
+     * represents a sequence of ten code points in order.
+     *
+     * <p>If the value stored here is positive, it means that the code point stored in this value
+     * corresponds to the digitStrings array, and codePointZero can be used instead of the
+     * digitStrings array for the purposes of efficient formatting; if -1, then digitStrings does
+     * *not* contain a sequence of code points, and it must be used directly.
+     *
+     * <p>It is assumed that codePointZero always shadows the value in digitStrings. codePointZero
+     * should never be set directly; rather, it should be updated only when digitStrings mutates.
+     * That is, the flow of information is digitStrings -> codePointZero, not the other way.
+     */
+    UChar32 fCodePointZero;
+
      Locale locale;
  
      char actualLocale[ULOC_FULLNAME_CAPACITY];
@@ -493,13 +516,17 @@ DecimalFormatSymbols::setSymbol(ENumberFormatSymbol symbol, const UnicodeString
  
      // If the zero digit is being set to a known zero digit according to Unicode,
      // then we automatically set the corresponding 1-9 digits
-    if ( propogateDigits && symbol == kZeroDigitSymbol && value.countChar32() == 1 ) {
+    // Also record updates to fCodePointZero. Be conservative if in doubt.
+    if (symbol == kZeroDigitSymbol) {
          UChar32 sym = value.char32At(0);
-        if ( u_charDigitValue(sym) == 0 ) {
+        if ( propogateDigits && u_charDigitValue(sym) == 0 && value.countChar32() == 1 ) {
+            fCodePointZero = sym;
              for ( int8_t i = 1 ; i<= 9 ; i++ ) {
                  sym++;
                  fSymbols[(int)kOneDigitSymbol+i-1] = UnicodeString(sym);
              }
+        } else {
+            fCodePointZero = -1;
          }
      }
  }
diff --git a/icu4c/source/test/intltest/tsdcfmsy.cpp b/icu4c/source/test/intltest/tsdcfmsy.cpp

index 90198e070f4f513f4191029e08f12eaed581eac1..0cbd784ec873055427075d0938d12dafeb6febcb 100644 (file)
--- a/icu4c/source/test/intltest/tsdcfmsy.cpp
+++ b/icu4c/source/test/intltest/tsdcfmsy.cpp
@@ -23,6 +23,7 @@ void IntlTestDecimalFormatSymbols::runIndexedTest( int32_t index, UBool exec, co
      TESTCASE_AUTO_BEGIN;
      TESTCASE_AUTO(testSymbols);
      TESTCASE_AUTO(testLastResortData);
+    TESTCASE_AUTO(testDigitSymbols);
      TESTCASE_AUTO(testNumberingSystem);
      TESTCASE_AUTO_END;
  }
@@ -249,6 +250,54 @@ void IntlTestDecimalFormatSymbols::testLastResortData() {
      Verify(1234567.25, "#,##0.##", *lastResort, "1,234,567.25");
  }
  
+void IntlTestDecimalFormatSymbols::testDigitSymbols() {
+    // This test does more in ICU4J than in ICU4C right now.
+    // In ICU4C, it is basically just a test for codePointZero.
+    UChar defZero = u'0';
+    UChar32 osmanyaZero = U'\U000104A0';
+    static const UChar* osmanyaDigitStrings[] = {
+        u"\U000104A0", u"\U000104A1", u"\U000104A2", u"\U000104A3", u"\U000104A4",
+        u"\U000104A5", u"\U000104A6", u"\U000104A7", u"\U000104A8", u"\U000104A9"
+    };
+
+    IcuTestErrorCode status(*this, "testDigitSymbols()");
+    DecimalFormatSymbols symbols(Locale("en"), status);
+
+    if (defZero != symbols.getCodePointZero()) {
+        errln("ERROR: Code point zero be ASCII 0");
+    }
+
+    for (int32_t i=0; i<=9; i++) {
+        DecimalFormatSymbols::ENumberFormatSymbol key =
+            i == 0
+            ? DecimalFormatSymbols::kZeroDigitSymbol
+            : static_cast<DecimalFormatSymbols::ENumberFormatSymbol>
+                (DecimalFormatSymbols::kOneDigitSymbol + i);
+        symbols.setSymbol(key, UnicodeString(osmanyaDigitStrings[i]), FALSE);
+    }
+    // NOTE: in ICU4J, the calculation of codePointZero is smarter;
+    // in ICU4C, it is more conservative and is only set if propogateDigits is true.
+    if (-1 != symbols.getCodePointZero()) {
+        errln("ERROR: Code point zero be invalid");
+    }
+
+    // Check Osmanya codePointZero
+    symbols.setSymbol(
+        DecimalFormatSymbols::kZeroDigitSymbol,
+        UnicodeString(osmanyaDigitStrings[0]), TRUE);
+    if (osmanyaZero != symbols.getCodePointZero()) {
+        errln("ERROR: Code point zero be Osmanya code point zero");
+    }
+
+    // Reset digits to Latin
+    symbols.setSymbol(
+        DecimalFormatSymbols::kZeroDigitSymbol,
+        UnicodeString(defZero));
+    if (defZero != symbols.getCodePointZero()) {
+        errln("ERROR: Code point zero be ASCII 0");
+    }
+}
+
  void IntlTestDecimalFormatSymbols::testNumberingSystem() {
      IcuTestErrorCode errorCode(*this, "testNumberingSystem");
      struct testcase {
diff --git a/icu4c/source/test/intltest/tsdcfmsy.h b/icu4c/source/test/intltest/tsdcfmsy.h

index 1fd1dfdfba3dc357ecdffe6287d4c5a33bb0826d..1922941b847d65205fb0de9683d660a38b05f53d 100644 (file)
--- a/icu4c/source/test/intltest/tsdcfmsy.h
+++ b/icu4c/source/test/intltest/tsdcfmsy.h
@@ -28,6 +28,7 @@ private:
       */
      void testSymbols(/*char *par*/);
      void testLastResortData();
+    void testDigitSymbols();
      void testNumberingSystem();
  
       /** helper functions**/
author	Shane Carr <shane@unicode.org>
	Thu, 8 Feb 2018 01:51:09 +0000 (01:51 +0000)
committer	Shane Carr <shane@unicode.org>
	Thu, 8 Feb 2018 01:51:09 +0000 (01:51 +0000)
icu4c/source/i18n/dcfmtsym.cpp		patch \| blob \| history
icu4c/source/i18n/unicode/dcfmtsym.h		patch \| blob \| history
icu4c/source/test/intltest/tsdcfmsy.cpp		patch \| blob \| history
icu4c/source/test/intltest/tsdcfmsy.h		patch \| blob \| history