ICU-13574 Porting the parsing utility classes StringSegment and UnicodeSetStaticCache...

author Shane Carr <shane@unicode.org>

Tue, 6 Feb 2018 07:52:58 +0000 (07:52 +0000)

committer Shane Carr <shane@unicode.org>

Tue, 6 Feb 2018 07:52:58 +0000 (07:52 +0000)
author Shane Carr <shane@unicode.org>
Tue, 6 Feb 2018 07:52:58 +0000 (07:52 +0000)
committer Shane Carr <shane@unicode.org>
Tue, 6 Feb 2018 07:52:58 +0000 (07:52 +0000)
diff --git a/icu4c/source/i18n/Makefile.in b/icu4c/source/i18n/Makefile.in

index dda6050af53e8f483ba7bdfa865276873bdb69da..2b9cca70556488b5cdbdb7aad17813f2a74e400a 100644 (file)
--- a/icu4c/source/i18n/Makefile.in
+++ b/icu4c/source/i18n/Makefile.in
@@ -107,7 +107,8 @@ number_affixutils.o number_compact.o number_decimalquantity.o \
  number_decimfmtprops.o number_fluent.o number_formatimpl.o number_grouping.o \
  number_integerwidth.o number_longnames.o number_modifiers.o number_notation.o \
  number_padding.o number_patternmodifier.o number_patternstring.o \
-number_rounding.o number_scientific.o number_stringbuilder.o
+number_rounding.o number_scientific.o number_stringbuilder.o \
+numparse_stringsegment.o numparse_unisets.o
  
  
  ## Header files to install
diff --git a/icu4c/source/i18n/numparse_stringsegment.cpp b/icu4c/source/i18n/numparse_stringsegment.cpp

new file mode 100644 (file)

index 0000000..ecabab5
--- /dev/null
+++ b/icu4c/source/i18n/numparse_stringsegment.cpp
@@ -0,0 +1,79 @@
+// © 2018 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
+
+#include "numparse_types.h"
+#include "numparse_stringsegment.h"
+#include "putilimp.h"
+#include "unicode/utf16.h"
+
+using namespace icu;
+using namespace icu::numparse;
+using namespace icu::numparse::impl;
+
+
+StringSegment::StringSegment(const UnicodeString &str) : fStr(str), fStart(0), fEnd(str.length()) {}
+
+int32_t StringSegment::getOffset() const {
+    return fStart;
+}
+
+void StringSegment::setOffset(int32_t start) {
+    fStart = start;
+}
+
+void StringSegment::adjustOffset(int32_t delta) {
+    fStart += delta;
+}
+
+void StringSegment::setLength(int32_t length) {
+    fEnd = fStart + length;
+}
+
+void StringSegment::resetLength() {
+    fEnd = fStr.length();
+}
+
+int32_t StringSegment::length() const {
+    return fEnd - fStart;
+}
+
+char16_t StringSegment::charAt(int32_t index) const {
+    return fStr.charAt(index + fStart);
+}
+
+UChar32 StringSegment::codePointAt(int32_t index) const {
+    return fStr.char32At(index + fStart);
+}
+
+UnicodeString StringSegment::toUnicodeString() const {
+    return UnicodeString(fStr, fStart, fEnd - fStart);
+}
+
+UChar32 StringSegment::getCodePoint() const {
+    char16_t lead = fStr.charAt(fStart);
+    if (U16_IS_LEAD(lead) && fStart + 1 < fEnd) {
+        return fStr.char32At(fStart);
+    } else if (U16_IS_SURROGATE(lead)) {
+        return -1;
+    } else {
+        return lead;
+    }
+}
+
+// Counts how many leading UTF-16 code units of this segment match the start of `other`.
+// The comparison is an exact code-unit comparison; it stops at the first mismatch or as soon
+// as either the segment or `other` is exhausted.
+int32_t StringSegment::getCommonPrefixLength(const UnicodeString &other) {
+    // Neither length changes inside the loop, so the bound is computed once.
+    const int32_t limit = uprv_min(length(), other.length());
+    int32_t matched = 0;
+    while (matched < limit && charAt(matched) == other.charAt(matched)) {
+        matched++;
+    }
+    return matched;
+}
+
+
+#endif /* #if !UCONFIG_NO_FORMATTING */
diff --git a/icu4c/source/i18n/numparse_stringsegment.h b/icu4c/source/i18n/numparse_stringsegment.h

new file mode 100644 (file)

index 0000000..30f11af
--- /dev/null
+++ b/icu4c/source/i18n/numparse_stringsegment.h
@@ -0,0 +1,79 @@
+// © 2018 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
+#ifndef __NUMPARSE_STRINGSEGMENT_H__
+#define __NUMPARSE_STRINGSEGMENT_H__
+
+#include "numparse_types.h"
+#include "number_types.h"
+#include "unicode/unistr.h"
+
+U_NAMESPACE_BEGIN
+namespace numparse {
+namespace impl {
+
+/**
+ * A mutable view over a UnicodeString with a variable offset and length. The length, charAt,
+ * codePointAt, and toUnicodeString methods all operate relative to the current offset.
+ *
+ * @author sffc
+ */
+class StringSegment : public UMemory, public ::icu::number::impl::CharSequence {
+  public:
+    explicit StringSegment(const UnicodeString &str);
+
+    int32_t getOffset() const;
+
+    void setOffset(int32_t start);
+
+    /**
+     * Equivalent to <code>setOffset(getOffset()+delta)</code>.
+     *
+     * <p>
+     * This method is usually called by a Matcher to register that a char was consumed. If the char is
+     * strong (it usually is, except for things like whitespace), follow this with a call to
+     * {@link ParsedNumber#setCharsConsumed}. For more information on strong chars, see that method.
+     */
+    void adjustOffset(int32_t delta);
+
+    void setLength(int32_t length);
+
+    void resetLength();
+
+    int32_t length() const override;
+
+    char16_t charAt(int32_t index) const override;
+
+    UChar32 codePointAt(int32_t index) const override;
+
+    UnicodeString toUnicodeString() const override;
+
+    /**
+     * Returns the first code point in the string segment, or -1 if the string starts with an invalid
+     * code point.
+     */
+    UChar32 getCodePoint() const;
+
+    /**
+     * Returns the length of the prefix shared by this StringSegment and the given UnicodeString. For
+     * example, if this string segment is "aab", and the other string is "aac", this method returns 2,
+     * since the first 2 characters are the same.
+     */
+    int32_t getCommonPrefixLength(const UnicodeString &other);
+
+  private:
+    const UnicodeString fStr;
+    int32_t fStart;
+    int32_t fEnd;
+};
+
+
+} // namespace impl
+} // namespace numparse
+U_NAMESPACE_END
+
+#endif //__NUMPARSE_STRINGSEGMENT_H__
+#endif /* #if !UCONFIG_NO_FORMATTING */
diff --git a/icu4c/source/i18n/numparse_types.h b/icu4c/source/i18n/numparse_types.h

new file mode 100644 (file)

index 0000000..b607f36
--- /dev/null
+++ b/icu4c/source/i18n/numparse_types.h
@@ -0,0 +1,22 @@
+// © 2018 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
+#ifndef __NUMPARSE_TYPES_H__
+#define __NUMPARSE_TYPES_H__
+
+#include "unicode/uobject.h"
+
+U_NAMESPACE_BEGIN
+namespace numparse {
+namespace impl {
+
+
+} // namespace impl
+} // namespace numparse
+U_NAMESPACE_END
+
+#endif //__NUMPARSE_TYPES_H__
+#endif /* #if !UCONFIG_NO_FORMATTING */
diff --git a/icu4c/source/i18n/numparse_unisets.cpp b/icu4c/source/i18n/numparse_unisets.cpp

new file mode 100644 (file)

index 0000000..8477870
--- /dev/null
+++ b/icu4c/source/i18n/numparse_unisets.cpp
@@ -0,0 +1,124 @@
+// © 2018 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
+
+#include "numparse_unisets.h"
+#include "numparse_types.h"
+#include "umutex.h"
+#include "ucln_in.h"
+#include "unicode/uniset.h"
+
+using namespace icu;
+using namespace icu::numparse;
+using namespace icu::numparse::impl;
+using namespace icu::numparse::impl::unisets;
+
+
+namespace {
+
+UnicodeSet* gUnicodeSets[COUNT] = {};
+
+UnicodeSet* computeUnion(Key k1, Key k2) {
+    UnicodeSet* result = new UnicodeSet();
+    if (result == nullptr) {
+        return nullptr;
+    }
+    result->addAll(*gUnicodeSets[k1]);
+    result->addAll(*gUnicodeSets[k2]);
+    result->freeze();
+    return result;
+}
+
+UnicodeSet* computeUnion(Key k1, Key k2, Key k3) {
+    UnicodeSet* result = new UnicodeSet();
+    if (result == nullptr) {
+        return nullptr;
+    }
+    result->addAll(*gUnicodeSets[k1]);
+    result->addAll(*gUnicodeSets[k2]);
+    result->addAll(*gUnicodeSets[k3]);
+    result->freeze();
+    return result;
+}
+
+icu::UInitOnce gNumberParseUniSetsInitOnce = U_INITONCE_INITIALIZER;
+
+// Cleanup callback registered through ucln_i18n_registerCleanup(). Releases every cached set and
+// re-arms the init-once guard, so that a later unisets::get() rebuilds the cache instead of
+// returning one of the slots that were just set to nullptr.
+UBool U_CALLCONV cleanupNumberParseUnitSets() {
+    for (int32_t i = 0; i < COUNT; i++) {
+        delete gUnicodeSets[i];
+        gUnicodeSets[i] = nullptr;
+    }
+    // Without this, umtx_initOnce() would consider the cache still initialized after cleanup.
+    gNumberParseUniSetsInitOnce.reset();
+    return TRUE;
+}
+
+// One-time initializer for the static UnicodeSet cache; unisets::get() runs it through
+// umtx_initOnce(). Registers the cleanup callback, builds every entry of gUnicodeSets
+// (pattern-based sets first, then the unions derived from them), and finally freezes all of them.
+// Pattern parse errors are reported through `status`.
+void U_CALLCONV initNumberParseUniSets(UErrorCode &status) {
+    ucln_i18n_registerCleanup(UCLN_I18N_NUMPARSE_UNISETS, cleanupNumberParseUnitSets);
+#define NEW_UNISET(pattern, status) new UnicodeSet(UnicodeString(pattern), status)
+
+    // BiDi characters are skipped over and ignored at any point in the string, even in strict mode.
+    gUnicodeSets[BIDI] = NEW_UNISET(u"[[\\u200E\\u200F\\u061C]]", status);
+
+    // This set was decided after discussion with icu-design@. See ticket #13309.
+    // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property).
+    gUnicodeSets[WHITESPACE] = NEW_UNISET(u"[[:Zs:][\\u0009]]", status);
+
+    gUnicodeSets[DEFAULT_IGNORABLES] = computeUnion(BIDI, WHITESPACE);
+    // Each slot must own its set: the cleanup callback deletes every entry of gUnicodeSets, so
+    // storing the BIDI pointer here as well would free the same object twice.
+    gUnicodeSets[STRICT_IGNORABLES] = new UnicodeSet(*gUnicodeSets[BIDI]);
+
+    // TODO: Re-generate these sets from the UCD. They probably haven't been updated in a while.
+    gUnicodeSets[COMMA] = NEW_UNISET(u"[,،٫、︐︑﹐﹑，､]", status);
+    gUnicodeSets[STRICT_COMMA] = NEW_UNISET(u"[,٫︐﹐，]", status);
+    gUnicodeSets[PERIOD] = NEW_UNISET(u"[.․。︒﹒．｡]", status);
+    gUnicodeSets[STRICT_PERIOD] = NEW_UNISET(u"[.․﹒．｡]", status);
+    gUnicodeSets[OTHER_GROUPING_SEPARATORS] = NEW_UNISET(
+            u"['٬‘’＇\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]", status);
+    gUnicodeSets[ALL_SEPARATORS] = computeUnion(COMMA, PERIOD, OTHER_GROUPING_SEPARATORS);
+    gUnicodeSets[STRICT_ALL_SEPARATORS] = computeUnion(
+            STRICT_COMMA, STRICT_PERIOD, OTHER_GROUPING_SEPARATORS);
+
+    gUnicodeSets[MINUS_SIGN] = NEW_UNISET(u"[-⁻₋−➖﹣－]", status);
+    gUnicodeSets[PLUS_SIGN] = NEW_UNISET(u"[+⁺₊➕﬩﹢＋]", status);
+
+    gUnicodeSets[PERCENT_SIGN] = NEW_UNISET(u"[%٪]", status);
+    gUnicodeSets[PERMILLE_SIGN] = NEW_UNISET(u"[‰؉]", status);
+    gUnicodeSets[INFINITY] = NEW_UNISET(u"[∞]", status);
+
+    gUnicodeSets[DIGITS] = NEW_UNISET(u"[:digit:]", status);
+    gUnicodeSets[NAN_LEAD] = NEW_UNISET(
+            u"[NnТтmeՈոс¤НнчTtsҳ\u975e\u1002\u0e9a\u10d0\u0f68\u0644\u0646]", status);
+    gUnicodeSets[SCIENTIFIC_LEAD] = NEW_UNISET(u"[Ee×·е\u0627]", status);
+    gUnicodeSets[CWCF] = NEW_UNISET(u"[:CWCF:]", status);
+
+    gUnicodeSets[DIGITS_OR_ALL_SEPARATORS] = computeUnion(DIGITS, ALL_SEPARATORS);
+    gUnicodeSets[DIGITS_OR_STRICT_ALL_SEPARATORS] = computeUnion(DIGITS, STRICT_ALL_SEPARATORS);
+
+    // Freeze every set, including the ones built directly from patterns above.
+    for (int32_t i = 0; i < COUNT; i++) {
+        gUnicodeSets[i]->freeze();
+    }
+}
+
+}
+
+const UnicodeSet* unisets::get(Key key) {
+    UErrorCode localStatus = U_ZERO_ERROR;
+    umtx_initOnce(gNumberParseUniSetsInitOnce, &initNumberParseUniSets, localStatus);
+    if (U_FAILURE(localStatus)) {
+        // TODO: This returns non-null in Java, and callers assume that.
+        return nullptr;
+    }
+    return gUnicodeSets[key];
+}
+
+// Returns key1 if the cached set for key1 contains str; otherwise COUNT, which the Key enum
+// also uses to indicate "null". get() returns nullptr when the cache failed to initialize;
+// that case is treated as "no match" instead of dereferencing the null pointer.
+Key unisets::chooseFrom(UnicodeString str, Key key1) {
+    const UnicodeSet* candidates = get(key1);
+    return (candidates != nullptr && candidates->contains(str)) ? key1 : COUNT;
+}
+
+// Two-key variant: returns key1 if its set contains str, otherwise falls back to the
+// single-key check on key2 (which yields key2 or COUNT). A nullptr set counts as "no match".
+Key unisets::chooseFrom(UnicodeString str, Key key1, Key key2) {
+    const UnicodeSet* candidates = get(key1);
+    if (candidates != nullptr && candidates->contains(str)) {
+        return key1;
+    }
+    const UnicodeSet* fallback = get(key2);
+    return (fallback != nullptr && fallback->contains(str)) ? key2 : COUNT;
+}
+
+
+#endif /* #if !UCONFIG_NO_FORMATTING */
diff --git a/icu4c/source/i18n/numparse_unisets.h b/icu4c/source/i18n/numparse_unisets.h

new file mode 100644 (file)

index 0000000..1d92361
--- /dev/null
+++ b/icu4c/source/i18n/numparse_unisets.h
@@ -0,0 +1,72 @@
+// © 2018 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
+#ifndef __NUMPARSE_UNISETS_H__
+#define __NUMPARSE_UNISETS_H__
+
+#include "numparse_types.h"
+#include "unicode/uniset.h"
+
+U_NAMESPACE_BEGIN namespace numparse {
+namespace impl {
+namespace unisets {
+
+enum Key {
+    // Ignorables
+            BIDI,
+    WHITESPACE,
+    DEFAULT_IGNORABLES,
+    STRICT_IGNORABLES,
+
+    // Separators
+    // Notes:
+    // - COMMA is a superset of STRICT_COMMA
+    // - PERIOD is a superset of STRICT_PERIOD
+    // - ALL_SEPARATORS is the union of COMMA, PERIOD, and OTHER_GROUPING_SEPARATORS
+    // - STRICT_ALL_SEPARATORS is the union of STRICT_COMMA, STRICT_PERIOD, and OTHER_GRP_SEPARATORS
+            COMMA,
+    PERIOD,
+    STRICT_COMMA,
+    STRICT_PERIOD,
+    OTHER_GROUPING_SEPARATORS,
+    ALL_SEPARATORS,
+    STRICT_ALL_SEPARATORS,
+
+    // Symbols
+    // TODO: NaN?
+            MINUS_SIGN,
+    PLUS_SIGN,
+    PERCENT_SIGN,
+    PERMILLE_SIGN,
+    INFINITY,
+
+    // Other
+            DIGITS,
+    NAN_LEAD,
+    SCIENTIFIC_LEAD,
+    CWCF,
+
+    // Combined Separators with Digits (for lead code points)
+            DIGITS_OR_ALL_SEPARATORS,
+    DIGITS_OR_STRICT_ALL_SEPARATORS,
+
+    // The number of elements in the enum.  Also used to indicate null.
+            COUNT
+};
+
+const UnicodeSet* get(Key key);
+
+Key chooseFrom(UnicodeString str, Key key1);
+
+Key chooseFrom(UnicodeString str, Key key1, Key key2);
+
+} // namespace unisets
+} // namespace impl
+} // namespace numparse
+U_NAMESPACE_END
+
+#endif //__NUMPARSE_UNISETS_H__
+#endif /* #if !UCONFIG_NO_FORMATTING */
diff --git a/icu4c/source/i18n/ucln_in.h b/icu4c/source/i18n/ucln_in.h

index 40a5c36d87a9f71fd0e05d56c7dd3ffa8e1a6376..d9e8741e7f296ea12974a16162db038fdbe625a6 100644 (file)
--- a/icu4c/source/i18n/ucln_in.h
+++ b/icu4c/source/i18n/ucln_in.h
@@ -26,6 +26,7 @@ as the functions are suppose to be called.
  It's usually best to have child dependencies called first. */
  typedef enum ECleanupI18NType {
      UCLN_I18N_START = -1,
+    UCLN_I18N_NUMPARSE_UNISETS,
      UCLN_I18N_CURRENCY_SPACING,
      UCLN_I18N_SPOOF,
      UCLN_I18N_SPOOFDATA,
diff --git a/icu4c/source/test/intltest/Makefile.in b/icu4c/source/test/intltest/Makefile.in

index d41ef25f52159b73d9faac5f693994bd64a20625..ed1aa256b1499e010bc1a17e8be9e6c96dc5debe 100644 (file)
--- a/icu4c/source/test/intltest/Makefile.in
+++ b/icu4c/source/test/intltest/Makefile.in
@@ -64,7 +64,7 @@ scientificnumberformattertest.o datadrivennumberformattestsuite.o \
  numberformattesttuple.o numberformat2test.o pluralmaptest.o \
  numbertest_affixutils.o numbertest_api.o numbertest_decimalquantity.o \
  numbertest_modifiers.o numbertest_patternmodifier.o numbertest_patternstring.o \
-numbertest_stringbuilder.o
+numbertest_stringbuilder.o numbertest_stringsegment.o numbertest_unisets.o
  
  DEPS = $(OBJECTS:.o=.d)
  
diff --git a/icu4c/source/test/intltest/numbertest.h b/icu4c/source/test/intltest/numbertest.h

index 9d4ffb7cef0d2afbc832b536f2785fad1610fe8b..60743ed5a1a0c90b87fcce1fcfdcff8b667e0ae2 100644 (file)
--- a/icu4c/source/test/intltest/numbertest.h
+++ b/icu4c/source/test/intltest/numbertest.h
@@ -9,9 +9,13 @@
  #include "number_stringbuilder.h"
  #include "intltest.h"
  #include "number_affixutils.h"
+#include "numparse_stringsegment.h"
+#include "unicode/locid.h"
  
  using namespace icu::number;
  using namespace icu::number::impl;
+using namespace icu::numparse;
+using namespace icu::numparse::impl;
  
  ////////////////////////////////////////////////////////////////////////////////////////
  // INSTRUCTIONS:                                                                      //
@@ -178,6 +182,30 @@ class NumberStringBuilderTest : public IntlTest {
      void assertEqualsImpl(const UnicodeString &a, const NumberStringBuilder &b);
  };
  
+class StringSegmentTest : public IntlTest {
+  public:
+    void testOffset();
+    void testLength();
+    void testCharAt();
+    void testGetCodePoint();
+    void testCommonPrefixLength();
+
+    void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par = 0);
+};
+
+class UniSetsTest : public IntlTest {
+  public:
+    void testSetCoverage();
+
+    void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par = 0);
+
+  private:
+    void assertInSet(const UnicodeString& localeName, const UnicodeString &setName,
+                     const UnicodeSet& set, const UnicodeString& str);
+    void assertInSet(const UnicodeString& localeName, const UnicodeString &setName,
+                     const UnicodeSet& set, UChar32 cp);
+};
+
  
  // NOTE: This macro is identical to the one in itformat.cpp
  #define TESTCLASS(id, TestClass)          \
@@ -206,6 +234,8 @@ class NumberTest : public IntlTest {
          TESTCLASS(4, PatternModifierTest);
          TESTCLASS(5, PatternStringTest);
          TESTCLASS(6, NumberStringBuilderTest);
+        TESTCLASS(7, StringSegmentTest);
+        TESTCLASS(8, UniSetsTest);
          default: name = ""; break; // needed to end loop
          }
      }
diff --git a/icu4c/source/test/intltest/numbertest_stringsegment.cpp b/icu4c/source/test/intltest/numbertest_stringsegment.cpp

new file mode 100644 (file)

index 0000000..519642e
--- /dev/null
+++ b/icu4c/source/test/intltest/numbertest_stringsegment.cpp
@@ -0,0 +1,94 @@
+// © 2018 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
+
+#include "numbertest.h"
+#include "numparse_stringsegment.h"
+
+static const char16_t* SAMPLE_STRING = u"📻 radio 📻";
+
+void StringSegmentTest::runIndexedTest(int32_t index, UBool exec, const char*&name, char*) {
+    if (exec) {
+        logln("TestSuite StringSegmentTest: ");
+    }
+    TESTCASE_AUTO_BEGIN;
+        TESTCASE_AUTO(testOffset);
+        TESTCASE_AUTO(testLength);
+        TESTCASE_AUTO(testCharAt);
+        TESTCASE_AUTO(testGetCodePoint);
+        TESTCASE_AUTO(testCommonPrefixLength);
+    TESTCASE_AUTO_END;
+}
+
+void StringSegmentTest::testOffset() {
+    StringSegment segment(SAMPLE_STRING);
+    assertEquals("Initial Offset", 0, segment.getOffset());
+    segment.adjustOffset(3);
+    assertEquals("Adjust A", 3, segment.getOffset());
+    segment.adjustOffset(2);
+    assertEquals("Adjust B", 5, segment.getOffset());
+    segment.setOffset(4);
+    assertEquals("Set Offset", 4, segment.getOffset());
+}
+
+void StringSegmentTest::testLength() {
+    StringSegment segment(SAMPLE_STRING);
+    assertEquals("Initial length", 11, segment.length());
+    segment.adjustOffset(3);
+    assertEquals("Adjust", 8, segment.length());
+    segment.setLength(4);
+    assertEquals("Set Length", 4, segment.length());
+    segment.setOffset(5);
+    assertEquals("After adjust offset", 2, segment.length());
+    segment.resetLength();
+    assertEquals("After reset length", 6, segment.length());
+}
+
+void StringSegmentTest::testCharAt() {
+    StringSegment segment(SAMPLE_STRING);
+    assertEquals("Initial", SAMPLE_STRING, segment.toUnicodeString());
+    segment.adjustOffset(3);
+    assertEquals("After adjust-offset", UnicodeString(u"radio 📻"), segment.toUnicodeString());
+    segment.setLength(5);
+    assertEquals("After adjust-length", UnicodeString(u"radio"), segment.toUnicodeString());
+}
+
+// Checks getCodePoint() on SAMPLE_STRING, which starts with a supplementary code point
+// (two UTF-16 code units) followed by a space.
+void StringSegmentTest::testGetCodePoint() {
+    StringSegment segment(SAMPLE_STRING);
+    assertEquals("Double-width code point", 0x1F4FB, segment.getCodePoint());
+    // Segment truncated to the lead surrogate only.
+    segment.setLength(1);
+    assertEquals("Invalid A", -1, segment.getCodePoint());
+    // Segment starting at the trail surrogate.
+    segment.resetLength();
+    segment.adjustOffset(1);
+    assertEquals("Invalid B", -1, segment.getCodePoint());
+    segment.adjustOffset(1);
+    assertEquals("Valid again", 0x20, segment.getCodePoint());
+}
+
+void StringSegmentTest::testCommonPrefixLength() {
+    StringSegment segment(SAMPLE_STRING);
+    assertEquals("", 11, segment.getCommonPrefixLength(SAMPLE_STRING));
+    assertEquals("", 4, segment.getCommonPrefixLength(u"📻 r"));
+    assertEquals("", 3, segment.getCommonPrefixLength(u"📻 x"));
+    assertEquals("", 0, segment.getCommonPrefixLength(u"x"));
+    assertEquals("", 0, segment.getCommonPrefixLength(u""));
+    segment.adjustOffset(3);
+    assertEquals("", 0, segment.getCommonPrefixLength(u"RADiO"));
+    assertEquals("", 5, segment.getCommonPrefixLength(u"radio"));
+    assertEquals("", 2, segment.getCommonPrefixLength(u"rafio"));
+    assertEquals("", 0, segment.getCommonPrefixLength(u"fadio"));
+    assertEquals("", 0, segment.getCommonPrefixLength(u""));
+    segment.setLength(3);
+    assertEquals("", 3, segment.getCommonPrefixLength(u"radio"));
+    assertEquals("", 2, segment.getCommonPrefixLength(u"rafio"));
+    assertEquals("", 0, segment.getCommonPrefixLength(u"fadio"));
+    assertEquals("", 0, segment.getCommonPrefixLength(u""));
+    segment.resetLength();
+    segment.setOffset(11); // end of string
+    assertEquals("", 0, segment.getCommonPrefixLength(u"foo"));
+}
+
+#endif
diff --git a/icu4c/source/test/intltest/numbertest_unisets.cpp b/icu4c/source/test/intltest/numbertest_unisets.cpp

new file mode 100644 (file)

index 0000000..a41f3f6
--- /dev/null
+++ b/icu4c/source/test/intltest/numbertest_unisets.cpp
@@ -0,0 +1,99 @@
+// © 2018 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
+
+#include "numbertest.h"
+#include "numparse_unisets.h"
+#include "unicode/dcfmtsym.h"
+
+#include <iostream>
+#include <cstr.h>
+
+using icu::numparse::impl::unisets::get;
+
+void UniSetsTest::runIndexedTest(int32_t index, UBool exec, const char*&name, char*) {
+    if (exec) {
+        logln("TestSuite UniSetsTest: ");
+    }
+    TESTCASE_AUTO_BEGIN;
+        TESTCASE_AUTO(testSetCoverage);
+    TESTCASE_AUTO_END;
+}
+
+// Verifies that the static parsing sets cover the symbols used by every available locale:
+// for each locale, each single-code-point DecimalFormatSymbols symbol checked below must be
+// contained in the corresponding set.
+void UniSetsTest::testSetCoverage() {
+    UErrorCode status = U_ZERO_ERROR;
+
+    // Lenient comma/period should be supersets of strict comma/period;
+    // it also makes the coverage logic cheaper.
+    assertTrue(
+            "COMMA should be superset of STRICT_COMMA",
+            get(unisets::COMMA)->containsAll(*get(unisets::STRICT_COMMA)));
+    assertTrue(
+            "PERIOD should be superset of STRICT_PERIOD",
+            get(unisets::PERIOD)->containsAll(*get(unisets::STRICT_PERIOD)));
+
+    // decimals = strict comma + strict period; grouping = decimals + other grouping separators.
+    UnicodeSet decimals;
+    decimals.addAll(*get(unisets::STRICT_COMMA));
+    decimals.addAll(*get(unisets::STRICT_PERIOD));
+    decimals.freeze();
+    UnicodeSet grouping;
+    grouping.addAll(decimals);
+    grouping.addAll(*get(unisets::OTHER_GROUPING_SEPARATORS));
+    grouping.freeze();
+
+    const UnicodeSet &plusSign = *get(unisets::PLUS_SIGN);
+    const UnicodeSet &minusSign = *get(unisets::MINUS_SIGN);
+    const UnicodeSet &percent = *get(unisets::PERCENT_SIGN);
+    const UnicodeSet &permille = *get(unisets::PERMILLE_SIGN);
+    const UnicodeSet &infinity = *get(unisets::INFINITY);
+    const UnicodeSet &nanLead = *get(unisets::NAN_LEAD);
+    const UnicodeSet &scientificLead = *get(unisets::SCIENTIFIC_LEAD);
+
+    int32_t localeCount;
+    const Locale* allAvailableLocales = Locale::getAvailableLocales(localeCount);
+    for (int32_t i = 0; i < localeCount; i++) {
+        Locale locale = allAvailableLocales[i];
+        DecimalFormatSymbols dfs(locale, status);
+        UnicodeString localeName;
+        locale.getDisplayName(localeName);
+        assertSuccess(UnicodeString("Making DFS for ") + localeName, status);
+
+        // The macro passes the set variable's own name as the set name shown on failure.
+#define ASSERT_IN_SET(name, foo) assertInSet(localeName, UnicodeString("" #name ""), name, foo)
+        ASSERT_IN_SET(decimals, dfs.getConstSymbol(DecimalFormatSymbols::kDecimalSeparatorSymbol));
+        ASSERT_IN_SET(grouping, dfs.getConstSymbol(DecimalFormatSymbols::kGroupingSeparatorSymbol));
+        ASSERT_IN_SET(plusSign, dfs.getConstSymbol(DecimalFormatSymbols::kPlusSignSymbol));
+        ASSERT_IN_SET(minusSign, dfs.getConstSymbol(DecimalFormatSymbols::kMinusSignSymbol));
+        ASSERT_IN_SET(percent, dfs.getConstSymbol(DecimalFormatSymbols::kPercentSymbol));
+        ASSERT_IN_SET(permille, dfs.getConstSymbol(DecimalFormatSymbols::kPerMillSymbol));
+        ASSERT_IN_SET(infinity, dfs.getConstSymbol(DecimalFormatSymbols::kInfinitySymbol));
+        // The lead sets are checked against the first code point, raw and case-folded.
+        ASSERT_IN_SET(nanLead, dfs.getConstSymbol(DecimalFormatSymbols::kNaNSymbol).char32At(0));
+        ASSERT_IN_SET(nanLead,
+                u_foldCase(dfs.getConstSymbol(DecimalFormatSymbols::kNaNSymbol).char32At(0), 0));
+        ASSERT_IN_SET(scientificLead,
+                u_foldCase(dfs.getConstSymbol(DecimalFormatSymbols::kExponentialSymbol).char32At(0), 0));
+    }
+}
+
+void UniSetsTest::assertInSet(const UnicodeString &localeName, const UnicodeString &setName,
+                              const UnicodeSet &set, const UnicodeString &str) {
+    if (str.countChar32(0, str.length()) != 1) {
+        // Ignore locale strings with more than one code point (usually a bidi mark)
+        return;
+    }
+    assertInSet(localeName, setName, set, str.char32At(0));
+}
+
+void UniSetsTest::assertInSet(const UnicodeString &localeName, const UnicodeString &setName,
+                              const UnicodeSet &set, UChar32 cp) {
+    // If this test case fails, add the specified code point to the corresponding set in
+    // UnicodeSetStaticCache.java and numparse_unisets.cpp
+    assertTrue(
+            localeName + UnicodeString(u" ") + UnicodeString(cp) + UnicodeString(u" is missing in ") +
+            setName, set.contains(cp));
+}
+
+
+#endif
author	Shane Carr <shane@unicode.org>
	Tue, 6 Feb 2018 07:52:58 +0000 (07:52 +0000)
committer	Shane Carr <shane@unicode.org>
	Tue, 6 Feb 2018 07:52:58 +0000 (07:52 +0000)
icu4c/source/i18n/Makefile.in		patch \| blob \| history
icu4c/source/i18n/numparse_stringsegment.cpp	[new file with mode: 0644]	patch \| blob
icu4c/source/i18n/numparse_stringsegment.h	[new file with mode: 0644]	patch \| blob
icu4c/source/i18n/numparse_types.h	[new file with mode: 0644]	patch \| blob
icu4c/source/i18n/numparse_unisets.cpp	[new file with mode: 0644]	patch \| blob
icu4c/source/i18n/numparse_unisets.h	[new file with mode: 0644]	patch \| blob
icu4c/source/i18n/ucln_in.h		patch \| blob \| history
icu4c/source/test/intltest/Makefile.in		patch \| blob \| history
icu4c/source/test/intltest/numbertest.h		patch \| blob \| history
icu4c/source/test/intltest/numbertest_stringsegment.cpp	[new file with mode: 0644]	patch \| blob
icu4c/source/test/intltest/numbertest_unisets.cpp	[new file with mode: 0644]	patch \| blob