From: Shane Carr Date: Tue, 6 Feb 2018 07:52:58 +0000 (+0000) Subject: ICU-13574 Porting the parsing utility classes StringSegment and UnicodeSetStaticCache... X-Git-Tag: release-62-rc~200^2~142^2 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=237acf183a1c5842103487b1e787e5a0f21a8a9e;p=icu ICU-13574 Porting the parsing utility classes StringSegment and UnicodeSetStaticCache to C++. X-SVN-Rev: 40841 --- diff --git a/icu4c/source/i18n/Makefile.in b/icu4c/source/i18n/Makefile.in index dda6050af53..2b9cca70556 100644 --- a/icu4c/source/i18n/Makefile.in +++ b/icu4c/source/i18n/Makefile.in @@ -107,7 +107,8 @@ number_affixutils.o number_compact.o number_decimalquantity.o \ number_decimfmtprops.o number_fluent.o number_formatimpl.o number_grouping.o \ number_integerwidth.o number_longnames.o number_modifiers.o number_notation.o \ number_padding.o number_patternmodifier.o number_patternstring.o \ -number_rounding.o number_scientific.o number_stringbuilder.o +number_rounding.o number_scientific.o number_stringbuilder.o \ +numparse_stringsegment.o numparse_unisets.o ## Header files to install diff --git a/icu4c/source/i18n/numparse_stringsegment.cpp b/icu4c/source/i18n/numparse_stringsegment.cpp new file mode 100644 index 00000000000..ecabab5faa8 --- /dev/null +++ b/icu4c/source/i18n/numparse_stringsegment.cpp @@ -0,0 +1,79 @@ +// © 2018 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
+
+#include "numparse_types.h"
+#include "numparse_stringsegment.h"
+#include "putilimp.h"
+#include "unicode/utf16.h"
+
+using namespace icu;
+using namespace icu::numparse;
+using namespace icu::numparse::impl;
+
+
+// Creates a segment whose window initially covers all of `str`.
+StringSegment::StringSegment(const UnicodeString &str) : fStr(str), fStart(0), fEnd(str.length()) {}
+
+// Returns the index into the underlying string at which the segment currently starts.
+int32_t StringSegment::getOffset() const {
+    return fStart;
+}
+
+// Moves the start of the segment to the absolute index `start`. The end index is not touched,
+// so the segment length changes accordingly.
+void StringSegment::setOffset(int32_t start) {
+    fStart = start;
+}
+
+// Moves the start of the segment by `delta` code units. The end index is not touched.
+void StringSegment::adjustOffset(int32_t delta) {
+    fStart += delta;
+}
+
+// Places the end of the segment `length` code units after the current start.
+void StringSegment::setLength(int32_t length) {
+    fEnd = fStart + length;
+}
+
+// Moves the end of the segment back to the end of the underlying string.
+void StringSegment::resetLength() {
+    fEnd = fStr.length();
+}
+
+// Number of UTF-16 code units between the current start and end of the segment.
+int32_t StringSegment::length() const {
+    return fEnd - fStart;
+}
+
+// Code unit at `index`, counted from the current start of the segment.
+char16_t StringSegment::charAt(int32_t index) const {
+    return fStr.charAt(index + fStart);
+}
+
+// Code point at `index`, counted from the current start of the segment.
+UChar32 StringSegment::codePointAt(int32_t index) const {
+    return fStr.char32At(index + fStart);
+}
+
+// Copies the code units currently inside the segment into a new UnicodeString.
+UnicodeString StringSegment::toUnicodeString() const {
+    return UnicodeString(fStr, fStart, fEnd - fStart);
+}
+
+// Returns the code point at the start of the segment, or -1 if the segment starts with a
+// surrogate code unit that does not form a complete pair inside the segment.
+// NOTE: fStart is not compared against fEnd first, so the segment is assumed to be non-empty.
+UChar32 StringSegment::getCodePoint() const {
+    char16_t lead = fStr.charAt(fStart);
+    if (!U16_IS_SURROGATE(lead)) {
+        return lead;
+    }
+    // A supplementary code point needs a lead surrogate that is followed, still inside the
+    // segment, by a trail surrogate. Testing only for "lead plus one more unit" would make
+    // char32At() hand back the lone lead surrogate instead of -1.
+    if (U16_IS_LEAD(lead) && fStart + 1 < fEnd && U16_IS_TRAIL(fStr.charAt(fStart + 1))) {
+        return fStr.char32At(fStart);
+    }
+    return -1;
+}
+
+// Counts how many leading code units of the segment are equal to the leading code units of
+// `other`. The comparison is exact (no case folding) and stops at the shorter of the two.
+int32_t StringSegment::getCommonPrefixLength(const UnicodeString &other) {
+    // Neither length changes while scanning, so the bound is computed once.
+    const int32_t limit = uprv_min(length(), other.length());
+    int32_t offset = 0;
+    while (offset < limit && charAt(offset) == other.charAt(offset)) {
+        offset++;
+    }
+    return offset;
+}
+
+
+#endif /* #if !UCONFIG_NO_FORMATTING */
diff --git a/icu4c/source/i18n/numparse_stringsegment.h b/icu4c/source/i18n/numparse_stringsegment.h
new file mode 100644
index 00000000000..30f11af7a19
--- /dev/null
+++ b/icu4c/source/i18n/numparse_stringsegment.h
@@ -0,0 
+1,79 @@
+// © 2018 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
+#ifndef __NUMPARSE_STRINGSEGMENT_H__
+#define __NUMPARSE_STRINGSEGMENT_H__
+
+#include "numparse_types.h"
+#include "number_types.h"
+#include "unicode/unistr.h"
+
+U_NAMESPACE_BEGIN
+namespace numparse {
+namespace impl {
+
+/**
+ * A mutable window onto a UnicodeString, defined by a start offset and an end index that can
+ * both be moved. The length, charAt, codePointAt, and toUnicodeString methods all operate
+ * relative to the current start offset.
+ *
+ * The class does no bounds checking of its own: offsets and lengths are stored exactly as given.
+ *
+ * @author sffc
+ */
+class StringSegment : public UMemory, public ::icu::number::impl::CharSequence {
+  public:
+    /** Creates a segment that initially covers the whole of str. */
+    explicit StringSegment(const UnicodeString &str);
+
+    /** Returns the index in the underlying string at which the segment currently starts. */
+    int32_t getOffset() const;
+
+    /** Sets the start of the segment to an absolute index; the end index is left unchanged. */
+    void setOffset(int32_t start);
+
+    /**
+     * Equivalent to setOffset(getOffset()+delta).
+     *
+     * This method is usually called by a Matcher to register that a char was consumed. If the char is
+     * strong (it usually is, except for things like whitespace), follow this with a call to
+     * {@link ParsedNumber#setCharsConsumed}. For more information on strong chars, see that method.
+     */
+    void adjustOffset(int32_t delta);
+
+    /** Sets the end of the segment to the current start offset plus length. */
+    void setLength(int32_t length);
+
+    /** Moves the end of the segment back to the end of the underlying string. */
+    void resetLength();
+
+    /** Returns the number of UTF-16 code units between the start and the end of the segment. */
+    int32_t length() const override;
+
+    /** Returns the code unit at index, counted from the current start offset. */
+    char16_t charAt(int32_t index) const override;
+
+    /** Returns the code point at index, counted from the current start offset. */
+    UChar32 codePointAt(int32_t index) const override;
+
+    /** Returns a copy of the code units that are currently inside the segment. */
+    UnicodeString toUnicodeString() const override;
+
+    /**
+     * Returns the first code point in the string segment, or -1 if the string starts with an invalid
+     * code point.
+     */
+    UChar32 getCodePoint() const;
+
+    /**
+     * Returns the length of the prefix shared by this StringSegment and the given UnicodeString. For
+     * example, if this string segment is "aab", and the other string is "aac", this method returns 2,
+     * since the first 2 characters are the same.
+     */
+    int32_t getCommonPrefixLength(const UnicodeString &other);
+
+  private:
+    // The segment's own UnicodeString, initialized from the constructor argument.
+    const UnicodeString fStr;
+    // Index in fStr of the first code unit of the segment.
+    int32_t fStart;
+    // Index in fStr one past the last code unit of the segment.
+    int32_t fEnd;
+};
+
+
+} // namespace impl
+} // namespace numparse
+U_NAMESPACE_END
+
+#endif //__NUMPARSE_STRINGSEGMENT_H__
+#endif /* #if !UCONFIG_NO_FORMATTING */
diff --git a/icu4c/source/i18n/numparse_types.h b/icu4c/source/i18n/numparse_types.h
new file mode 100644
index 00000000000..b607f36cc99
--- /dev/null
+++ b/icu4c/source/i18n/numparse_types.h
@@ -0,0 +1,22 @@
+// © 2018 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
+#ifndef __NUMPARSE_TYPES_H__
+#define __NUMPARSE_TYPES_H__
+
+#include "unicode/uobject.h"
+
+U_NAMESPACE_BEGIN
+namespace numparse {
+namespace impl {
+
+
+} // namespace impl
+} // namespace numparse
+U_NAMESPACE_END
+
+#endif //__NUMPARSE_TYPES_H__
+#endif /* #if !UCONFIG_NO_FORMATTING */
diff --git a/icu4c/source/i18n/numparse_unisets.cpp b/icu4c/source/i18n/numparse_unisets.cpp
new file mode 100644
index 00000000000..8477870e29d
--- /dev/null
+++ b/icu4c/source/i18n/numparse_unisets.cpp
@@ -0,0 +1,124 @@
+// © 2018 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
+
+#include "numparse_unisets.h"
+#include "numparse_types.h"
+#include "umutex.h"
+#include "ucln_in.h"
+#include "unicode/uniset.h"
+
+using namespace icu;
+using namespace icu::numparse;
+using namespace icu::numparse::impl;
+using namespace icu::numparse::impl::unisets;
+
+
+namespace {
+
+// One slot per unisets::Key. Every non-null slot owns its own set object: the cleanup function
+// deletes each slot independently, so two slots must never hold the same pointer.
+UnicodeSet* gUnicodeSets[COUNT] = {};
+
+// Builds a new frozen set holding the union of the cache entries named by keys[0..count).
+// Returns nullptr if any of those entries has not been created or if the allocation fails.
+UnicodeSet* unionOfKeys(const Key* keys, int32_t count) {
+    for (int32_t i = 0; i < count; i++) {
+        if (gUnicodeSets[keys[i]] == nullptr) {
+            return nullptr;
+        }
+    }
+    UnicodeSet* result = new UnicodeSet();
+    if (result == nullptr) {
+        return nullptr;
+    }
+    for (int32_t i = 0; i < count; i++) {
+        result->addAll(*gUnicodeSets[keys[i]]);
+    }
+    result->freeze();
+    return result;
+}
+
+// Union of two cache entries; see unionOfKeys for the failure behavior.
+UnicodeSet* computeUnion(Key k1, Key k2) {
+    const Key keys[] = {k1, k2};
+    return unionOfKeys(keys, 2);
+}
+
+// Union of three cache entries; see unionOfKeys for the failure behavior.
+UnicodeSet* computeUnion(Key k1, Key k2, Key k3) {
+    const Key keys[] = {k1, k2, k3};
+    return unionOfKeys(keys, 3);
+}
+
+icu::UInitOnce gNumberParseUniSetsInitOnce = U_INITONCE_INITIALIZER;
+
+// Cleanup callback registered with ucln: frees every cached set and re-arms the init-once guard
+// so that the cache is rebuilt if unisets::get() is called again after cleanup.
+UBool U_CALLCONV cleanupNumberParseUnitSets() {
+    for (int32_t i = 0; i < COUNT; i++) {
+        delete gUnicodeSets[i];
+        gUnicodeSets[i] = nullptr;
+    }
+    gNumberParseUniSetsInitOnce.reset();
+    return TRUE;
+}
+
+// Populates gUnicodeSets. Runs once under umtx_initOnce; on failure `status` is set and
+// unisets::get() returns nullptr.
+void U_CALLCONV initNumberParseUniSets(UErrorCode &status) {
+    ucln_i18n_registerCleanup(UCLN_I18N_NUMPARSE_UNISETS, cleanupNumberParseUnitSets);
+#define NEW_UNISET(pattern, status) new UnicodeSet(UnicodeString(pattern), status)
+
+    // BiDi characters are skipped over and ignored at any point in the string, even in strict mode.
+    const char16_t* bidiPattern = u"[[\\u200E\\u200F\\u061C]]";
+    gUnicodeSets[BIDI] = NEW_UNISET(bidiPattern, status);
+
+    // This set was decided after discussion with icu-design@. See ticket #13309.
+    // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property).
+    gUnicodeSets[WHITESPACE] = NEW_UNISET(u"[[:Zs:][\\u0009]]", status);
+
+    gUnicodeSets[DEFAULT_IGNORABLES] = computeUnion(BIDI, WHITESPACE);
+    // Same contents as BIDI, but a separate object: storing the BIDI pointer in a second slot
+    // would make the cleanup function delete the same set twice.
+    gUnicodeSets[STRICT_IGNORABLES] = NEW_UNISET(bidiPattern, status);
+
+    // TODO: Re-generate these sets from the UCD. They probably haven't been updated in a while.
+    gUnicodeSets[COMMA] = NEW_UNISET(u"[,،٫、︐︑﹐﹑,、]", status);
+    gUnicodeSets[STRICT_COMMA] = NEW_UNISET(u"[,٫︐﹐,]", status);
+    gUnicodeSets[PERIOD] = NEW_UNISET(u"[.․。︒﹒.。]", status);
+    gUnicodeSets[STRICT_PERIOD] = NEW_UNISET(u"[.․﹒.。]", status);
+    gUnicodeSets[OTHER_GROUPING_SEPARATORS] = NEW_UNISET(
+            u"['٬‘’'\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]", status);
+    gUnicodeSets[ALL_SEPARATORS] = computeUnion(COMMA, PERIOD, OTHER_GROUPING_SEPARATORS);
+    gUnicodeSets[STRICT_ALL_SEPARATORS] = computeUnion(
+            STRICT_COMMA, STRICT_PERIOD, OTHER_GROUPING_SEPARATORS);
+
+    gUnicodeSets[MINUS_SIGN] = NEW_UNISET(u"[-⁻₋−➖﹣-]", status);
+    gUnicodeSets[PLUS_SIGN] = NEW_UNISET(u"[+⁺₊➕﬩﹢+]", status);
+
+    gUnicodeSets[PERCENT_SIGN] = NEW_UNISET(u"[%٪]", status);
+    gUnicodeSets[PERMILLE_SIGN] = NEW_UNISET(u"[‰؉]", status);
+    gUnicodeSets[INFINITY] = NEW_UNISET(u"[∞]", status);
+
+    gUnicodeSets[DIGITS] = NEW_UNISET(u"[:digit:]", status);
+    gUnicodeSets[NAN_LEAD] = NEW_UNISET(
+            u"[NnТтmeՈոс¤НнчTtsҳ\u975e\u1002\u0e9a\u10d0\u0f68\u0644\u0646]", status);
+    gUnicodeSets[SCIENTIFIC_LEAD] = NEW_UNISET(u"[Ee×·е\u0627]", status);
+    gUnicodeSets[CWCF] = NEW_UNISET(u"[:CWCF:]", status);
+
+    gUnicodeSets[DIGITS_OR_ALL_SEPARATORS] = computeUnion(DIGITS, ALL_SEPARATORS);
+    gUnicodeSets[DIGITS_OR_STRICT_ALL_SEPARATORS] = computeUnion(DIGITS, STRICT_ALL_SEPARATORS);
+
+#undef NEW_UNISET
+
+    for (int32_t i = 0; i < COUNT; i++) {
+        if (gUnicodeSets[i] == nullptr) {
+            // A `new` above (or inside computeUnion) returned nullptr. Report it instead of
+            // dereferencing the empty slot; an earlier error code is kept if there is one.
+            if (U_SUCCESS(status)) {
+                status = U_MEMORY_ALLOCATION_ERROR;
+            }
+            continue;
+        }
+        gUnicodeSets[i]->freeze();
+    }
+}
+
+}
+
+// Returns the cached set for `key`, building the cache on first use. Returns nullptr if `key`
+// is not a real set (for example COUNT, which callers use as "no key") or if building failed.
+const UnicodeSet* unisets::get(Key key) {
+    if (static_cast<int32_t>(key) < 0 || static_cast<int32_t>(key) >= COUNT) {
+        // gUnicodeSets has exactly COUNT slots; anything else would index outside the array.
+        return nullptr;
+    }
+    UErrorCode localStatus = U_ZERO_ERROR;
+    umtx_initOnce(gNumberParseUniSetsInitOnce, &initNumberParseUniSets, localStatus);
+    if (U_FAILURE(localStatus)) {
+        // TODO: This returns non-null in Java, and callers assume that.
+        return nullptr;
+    }
+    return gUnicodeSets[key];
+}
+
+// Returns key1 if its set contains `str`, otherwise COUNT. A set that could not be obtained
+// is treated as not containing `str`.
+Key unisets::chooseFrom(UnicodeString str, Key key1) {
+    const UnicodeSet* set = get(key1);
+    return (set != nullptr && set->contains(str)) ? key1 : COUNT;
+}
+
+// Returns the first of key1, key2 whose set contains `str`, otherwise COUNT.
+Key unisets::chooseFrom(UnicodeString str, Key key1, Key key2) {
+    const UnicodeSet* set = get(key1);
+    return (set != nullptr && set->contains(str)) ? key1 : chooseFrom(str, key2);
+}
+
+
+#endif /* #if !UCONFIG_NO_FORMATTING */
diff --git a/icu4c/source/i18n/numparse_unisets.h b/icu4c/source/i18n/numparse_unisets.h
new file mode 100644
index 00000000000..1d923613e98
--- /dev/null
+++ b/icu4c/source/i18n/numparse_unisets.h
@@ -0,0 +1,72 @@
+// © 2018 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
+#ifndef __NUMPARSE_UNISETS_H__
+#define __NUMPARSE_UNISETS_H__
+
+#include "numparse_types.h"
+#include "unicode/uniset.h"
+
+U_NAMESPACE_BEGIN namespace numparse {
+namespace impl {
+namespace unisets {
+
+// Identifies one of the UnicodeSets that get() serves. The enumerator values are used directly
+// as array indices by the implementation, so COUNT must stay last.
+enum Key {
+    // Ignorables
+    BIDI,
+    WHITESPACE,
+    DEFAULT_IGNORABLES,
+    STRICT_IGNORABLES,
+
+    // Separators
+    // Notes:
+    // - COMMA is a superset of STRICT_COMMA
+    // - PERIOD is a superset of STRICT_PERIOD
+    // - ALL_SEPARATORS is the union of COMMA, PERIOD, and OTHER_GROUPING_SEPARATORS
+    // - STRICT_ALL_SEPARATORS is the union of STRICT_COMMA, STRICT_PERIOD, and
+    //   OTHER_GROUPING_SEPARATORS
+    COMMA,
+    PERIOD,
+    STRICT_COMMA,
+    STRICT_PERIOD,
+    OTHER_GROUPING_SEPARATORS,
+    ALL_SEPARATORS,
+    STRICT_ALL_SEPARATORS,
+
+    // Symbols
+    // TODO: NaN?
+    MINUS_SIGN,
+    PLUS_SIGN,
+    PERCENT_SIGN,
+    PERMILLE_SIGN,
+    INFINITY,
+
+    // Other
+    DIGITS,
+    NAN_LEAD,
+    SCIENTIFIC_LEAD,
+    CWCF,
+
+    // Combined Separators with Digits (for lead code points)
+    DIGITS_OR_ALL_SEPARATORS,
+    DIGITS_OR_STRICT_ALL_SEPARATORS,
+
+    // The number of elements in the enum. Also used to indicate null.
+    COUNT
+};
+
+// Returns the set for the given key, or nullptr if the sets could not be initialized.
+// The returned set belongs to the implementation; callers must not delete it.
+const UnicodeSet* get(Key key);
+
+// Returns key1 if the set for key1 contains str; otherwise returns COUNT.
+Key chooseFrom(UnicodeString str, Key key1);
+
+// Returns key1 if the set for key1 contains str, else key2 if the set for key2 contains str;
+// otherwise returns COUNT.
+Key chooseFrom(UnicodeString str, Key key1, Key key2);
+
+} // namespace unisets
+} // namespace impl
+} // namespace numparse
+U_NAMESPACE_END
+
+#endif //__NUMPARSE_UNISETS_H__
+#endif /* #if !UCONFIG_NO_FORMATTING */
diff --git a/icu4c/source/i18n/ucln_in.h b/icu4c/source/i18n/ucln_in.h
index 40a5c36d87a..d9e8741e7f2 100644
--- a/icu4c/source/i18n/ucln_in.h
+++ b/icu4c/source/i18n/ucln_in.h
@@ -26,6 +26,7 @@ as the functions are suppose to be called.
 It's usually best to have child dependencies called first. 
*/
 typedef enum ECleanupI18NType {
     UCLN_I18N_START = -1,
+    UCLN_I18N_NUMPARSE_UNISETS,
     UCLN_I18N_CURRENCY_SPACING,
     UCLN_I18N_SPOOF,
     UCLN_I18N_SPOOFDATA,
diff --git a/icu4c/source/test/intltest/Makefile.in b/icu4c/source/test/intltest/Makefile.in
index d41ef25f521..ed1aa256b14 100644
--- a/icu4c/source/test/intltest/Makefile.in
+++ b/icu4c/source/test/intltest/Makefile.in
@@ -64,7 +64,7 @@ scientificnumberformattertest.o datadrivennumberformattestsuite.o \
 numberformattesttuple.o numberformat2test.o pluralmaptest.o \
 numbertest_affixutils.o numbertest_api.o numbertest_decimalquantity.o \
 numbertest_modifiers.o numbertest_patternmodifier.o numbertest_patternstring.o \
-numbertest_stringbuilder.o
+numbertest_stringbuilder.o numbertest_stringsegment.o numbertest_unisets.o
 
 DEPS = $(OBJECTS:.o=.d)
 
diff --git a/icu4c/source/test/intltest/numbertest.h b/icu4c/source/test/intltest/numbertest.h
index 9d4ffb7cef0..60743ed5a1a 100644
--- a/icu4c/source/test/intltest/numbertest.h
+++ b/icu4c/source/test/intltest/numbertest.h
@@ -9,9 +9,13 @@
 #include "number_stringbuilder.h"
 #include "intltest.h"
 #include "number_affixutils.h"
+#include "numparse_stringsegment.h"
+#include "unicode/locid.h"
 
 using namespace icu::number;
 using namespace icu::number::impl;
+using namespace icu::numparse;
+using namespace icu::numparse::impl;
 
 ////////////////////////////////////////////////////////////////////////////////////////
 // INSTRUCTIONS: //
@@ -178,6 +182,30 @@ class NumberStringBuilderTest : public IntlTest {
     void assertEqualsImpl(const UnicodeString &a, const NumberStringBuilder &b);
 };
 
+// Tests for numparse::impl::StringSegment (offset/length bookkeeping and code point access).
+class StringSegmentTest : public IntlTest {
+  public:
+    void testOffset();
+    void testLength();
+    void testCharAt();
+    void testGetCodePoint();
+    void testCommonPrefixLength();
+
+    void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par = 0);
+};
+
+// Checks that the static parsing sets cover the symbols used by every available locale.
+class UniSetsTest : public IntlTest {
+  public:
+    void testSetCoverage();
+
+    void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par = 0);
+
+  private:
+    // Asserts that `set` contains `str`; strings that are not exactly one code point are skipped.
+    void assertInSet(const UnicodeString& localeName, const UnicodeString &setName,
+                     const UnicodeSet& set, const UnicodeString& str);
+    // Asserts that `set` contains the code point `cp`.
+    void assertInSet(const UnicodeString& localeName, const UnicodeString &setName,
+                     const UnicodeSet& set, UChar32 cp);
+};
+
 // NOTE: This macro is identical to the one in itformat.cpp
 #define TESTCLASS(id, TestClass) \
@@ -206,6 +234,8 @@ class NumberTest : public IntlTest {
         TESTCLASS(4, PatternModifierTest);
         TESTCLASS(5, PatternStringTest);
         TESTCLASS(6, NumberStringBuilderTest);
+        TESTCLASS(7, StringSegmentTest);
+        TESTCLASS(8, UniSetsTest);
         default: name = ""; break; // needed to end loop
         }
     }
diff --git a/icu4c/source/test/intltest/numbertest_stringsegment.cpp b/icu4c/source/test/intltest/numbertest_stringsegment.cpp
new file mode 100644
index 00000000000..519642e49a2
--- /dev/null
+++ b/icu4c/source/test/intltest/numbertest_stringsegment.cpp
@@ -0,0 +1,94 @@
+// © 2018 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
+
+#include "numbertest.h"
+#include "numparse_stringsegment.h"
+
+// 11 UTF-16 code units: each radio emoji is a surrogate pair, so "radio" starts at index 3.
+static const char16_t* SAMPLE_STRING = u"📻 radio 📻";
+
+void StringSegmentTest::runIndexedTest(int32_t index, UBool exec, const char*&name, char*) {
+    if (exec) {
+        logln("TestSuite StringSegmentTest: ");
+    }
+    TESTCASE_AUTO_BEGIN;
+    TESTCASE_AUTO(testOffset);
+    TESTCASE_AUTO(testLength);
+    TESTCASE_AUTO(testCharAt);
+    TESTCASE_AUTO(testGetCodePoint);
+    TESTCASE_AUTO(testCommonPrefixLength);
+    TESTCASE_AUTO_END;
+}
+
+// adjustOffset() is relative to the current offset; setOffset() is absolute.
+void StringSegmentTest::testOffset() {
+    StringSegment segment(SAMPLE_STRING);
+    assertEquals("Initial Offset", 0, segment.getOffset());
+    segment.adjustOffset(3);
+    assertEquals("Adjust A", 3, segment.getOffset());
+    segment.adjustOffset(2);
+    assertEquals("Adjust B", 5, segment.getOffset());
+    segment.setOffset(4);
+    assertEquals("Set Offset", 4, segment.getOffset());
+}
+
+// Moving the offset keeps the end index fixed, so the length shrinks or grows with it.
+void StringSegmentTest::testLength() {
+    StringSegment segment(SAMPLE_STRING);
+    assertEquals("Initial length", 11, segment.length());
+    segment.adjustOffset(3);
+    assertEquals("Adjust", 8, segment.length());
+    segment.setLength(4);
+    assertEquals("Set Length", 4, segment.length());
+    // End index is still 7 (3 + 4), so starting at 5 leaves 2 code units.
+    segment.setOffset(5);
+    assertEquals("After adjust offset", 2, segment.length());
+    segment.resetLength();
+    assertEquals("After reset length", 6, segment.length());
+}
+
+void StringSegmentTest::testCharAt() {
+    StringSegment segment(SAMPLE_STRING);
+    assertEquals("Initial", SAMPLE_STRING, segment.toUnicodeString());
+    segment.adjustOffset(3);
+    assertEquals("After adjust-offset", UnicodeString(u"radio 📻"), segment.toUnicodeString());
+    segment.setLength(5);
+    assertEquals("After adjust-length", UnicodeString(u"radio"), segment.toUnicodeString());
+}
+
+void StringSegmentTest::testGetCodePoint() {
+    StringSegment segment(SAMPLE_STRING);
+    assertEquals("Double-width code point", 0x1F4FB, segment.getCodePoint());
+    // Only the lead surrogate is inside the segment.
+    segment.setLength(1);
+    assertEquals("Invalid A", -1, segment.getCodePoint());
+    segment.resetLength();
+    // The segment now starts on the trail surrogate.
+    segment.adjustOffset(1);
+    assertEquals("Invalid B", -1, segment.getCodePoint());
+    segment.adjustOffset(1);
+    assertEquals("Valid again", 0x20, segment.getCodePoint());
+}
+
+// The prefix length is counted in UTF-16 code units and is limited by the segment length.
+void StringSegmentTest::testCommonPrefixLength() {
+    StringSegment segment(SAMPLE_STRING);
+    assertEquals("", 11, segment.getCommonPrefixLength(SAMPLE_STRING));
+    assertEquals("", 4, segment.getCommonPrefixLength(u"📻 r"));
+    assertEquals("", 3, segment.getCommonPrefixLength(u"📻 x"));
+    assertEquals("", 0, segment.getCommonPrefixLength(u"x"));
+    assertEquals("", 0, segment.getCommonPrefixLength(u""));
+    segment.adjustOffset(3);
+    assertEquals("", 0, segment.getCommonPrefixLength(u"RADiO"));
+    assertEquals("", 5, segment.getCommonPrefixLength(u"radio"));
+    assertEquals("", 2, segment.getCommonPrefixLength(u"rafio"));
+    assertEquals("", 0, segment.getCommonPrefixLength(u"fadio"));
+    assertEquals("", 0, segment.getCommonPrefixLength(u""));
+    segment.setLength(3);
+    assertEquals("", 3, segment.getCommonPrefixLength(u"radio"));
+    assertEquals("", 2, segment.getCommonPrefixLength(u"rafio"));
+    assertEquals("", 0, segment.getCommonPrefixLength(u"fadio"));
+    assertEquals("", 0, segment.getCommonPrefixLength(u""));
+    segment.resetLength();
+    segment.setOffset(11); // end of string
+    assertEquals("", 0, segment.getCommonPrefixLength(u"foo"));
+}
+
+#endif
diff --git a/icu4c/source/test/intltest/numbertest_unisets.cpp b/icu4c/source/test/intltest/numbertest_unisets.cpp
new file mode 100644
index 00000000000..a41f3f6efb4
--- /dev/null
+++ b/icu4c/source/test/intltest/numbertest_unisets.cpp
@@ -0,0 +1,99 @@
+// © 2018 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
+
+#include "numbertest.h"
+#include "numparse_unisets.h"
+#include "unicode/dcfmtsym.h"
+
+// NOTE(review): the header names of the next two #include lines are missing from this copy of
+// the patch (angle-bracket text was lost); restore them from the original commit.
+#include
+#include
+
+using icu::numparse::impl::unisets::get;
+
+void UniSetsTest::runIndexedTest(int32_t index, UBool exec, const char*&name, char*) {
+    if (exec) {
+        logln("TestSuite UniSetsTest: ");
+    }
+    TESTCASE_AUTO_BEGIN;
+    TESTCASE_AUTO(testSetCoverage);
+    TESTCASE_AUTO_END;
+}
+
+// For every available locale, checks that the locale's number symbols are members of the
+// corresponding static parsing sets.
+void UniSetsTest::testSetCoverage() {
+    UErrorCode status = U_ZERO_ERROR;
+
+    // Lenient comma/period should be supersets of strict comma/period;
+    // it also makes the coverage logic cheaper.
+    assertTrue(
+            "COMMA should be superset of STRICT_COMMA",
+            get(unisets::COMMA)->containsAll(*get(unisets::STRICT_COMMA)));
+    assertTrue(
+            "PERIOD should be superset of STRICT_PERIOD",
+            get(unisets::PERIOD)->containsAll(*get(unisets::STRICT_PERIOD)));
+
+    UnicodeSet decimals;
+    decimals.addAll(*get(unisets::STRICT_COMMA));
+    decimals.addAll(*get(unisets::STRICT_PERIOD));
+    decimals.freeze();
+    UnicodeSet grouping;
+    grouping.addAll(decimals);
+    grouping.addAll(*get(unisets::OTHER_GROUPING_SEPARATORS));
+    // Freeze the set that was just built (decimals is already frozen above).
+    grouping.freeze();
+
+    const UnicodeSet &plusSign = *get(unisets::PLUS_SIGN);
+    const UnicodeSet &minusSign = *get(unisets::MINUS_SIGN);
+    const UnicodeSet &percent = *get(unisets::PERCENT_SIGN);
+    const UnicodeSet &permille = *get(unisets::PERMILLE_SIGN);
+    const UnicodeSet &infinity = *get(unisets::INFINITY);
+    const UnicodeSet &nanLead = *get(unisets::NAN_LEAD);
+    const UnicodeSet &scientificLead = *get(unisets::SCIENTIFIC_LEAD);
+
+    int32_t localeCount;
+    const Locale* allAvailableLocales = Locale::getAvailableLocales(localeCount);
+    for (int32_t i = 0; i < localeCount; i++) {
+        Locale locale = allAvailableLocales[i];
+        DecimalFormatSymbols dfs(locale, status);
+        UnicodeString localeName;
+        locale.getDisplayName(localeName);
+        assertSuccess(UnicodeString("Making DFS for ") + localeName, status);
+
+// Passes the variable name of the set along as the set name shown in failure messages.
+#define ASSERT_IN_SET(name, foo) assertInSet(localeName, UnicodeString("" #name ""), name, foo)
+        ASSERT_IN_SET(decimals, dfs.getConstSymbol(DecimalFormatSymbols::kDecimalSeparatorSymbol));
+        ASSERT_IN_SET(grouping, dfs.getConstSymbol(DecimalFormatSymbols::kGroupingSeparatorSymbol));
+        ASSERT_IN_SET(plusSign, dfs.getConstSymbol(DecimalFormatSymbols::kPlusSignSymbol));
+        ASSERT_IN_SET(minusSign, dfs.getConstSymbol(DecimalFormatSymbols::kMinusSignSymbol));
+        ASSERT_IN_SET(percent, dfs.getConstSymbol(DecimalFormatSymbols::kPercentSymbol));
+        ASSERT_IN_SET(permille, dfs.getConstSymbol(DecimalFormatSymbols::kPerMillSymbol));
+        ASSERT_IN_SET(infinity, dfs.getConstSymbol(DecimalFormatSymbols::kInfinitySymbol));
+        ASSERT_IN_SET(nanLead, dfs.getConstSymbol(DecimalFormatSymbols::kNaNSymbol).char32At(0));
+        ASSERT_IN_SET(nanLead,
+                u_foldCase(dfs.getConstSymbol(DecimalFormatSymbols::kNaNSymbol).char32At(0), 0));
+        ASSERT_IN_SET(scientificLead,
+                u_foldCase(dfs.getConstSymbol(DecimalFormatSymbols::kExponentialSymbol).char32At(0), 0));
+    }
+}
+
+void UniSetsTest::assertInSet(const UnicodeString &localeName, const UnicodeString &setName,
+                              const UnicodeSet &set, const UnicodeString &str) {
+    if (str.countChar32(0, str.length()) != 1) {
+        // Ignore locale strings with more than one code point (usually a bidi mark)
+        return;
+    }
+    assertInSet(localeName, setName, set, str.char32At(0));
+}
+
+void UniSetsTest::assertInSet(const UnicodeString &localeName, const UnicodeString &setName,
+                              const UnicodeSet &set, UChar32 cp) {
+    // If this test case fails, add the specified code point to the corresponding set in
+    // UnicodeSetStaticCache.java and numparse_unisets.cpp
+    assertTrue(
+            localeName + UnicodeString(u" ") + UnicodeString(cp) + UnicodeString(u" is missing in ") +
+            setName, set.contains(cp));
+}
+
+
+#endif