From: Markus Scherer Date: Tue, 29 Jun 2021 17:27:09 +0000 (+0000) Subject: ICU-21525 UnicodeSet.hasString(), UnicodeSetIterator.skipToStrings() & C API X-Git-Tag: cldr/2021-08-11~22 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=e4e2ae9544d59f23ac14cbf5763570ef0826e561;p=icu ICU-21525 UnicodeSet.hasString(), UnicodeSetIterator.skipToStrings() & C API --- diff --git a/icu4c/source/common/unicode/uniset.h b/icu4c/source/common/unicode/uniset.h index 8403c4026c3..4ebb5e71a91 100644 --- a/icu4c/source/common/unicode/uniset.h +++ b/icu4c/source/common/unicode/uniset.h @@ -771,8 +771,12 @@ public: * Note than the elements of a set may include both individual * codepoints and strings. * + * This is slower than getRangeCount() because + * it counts the code points of all ranges. + * * @return the number of elements in this set (its cardinality). * @stable ICU 2.0 + * @see getRangeCount */ virtual int32_t size(void) const; @@ -784,6 +788,14 @@ public: */ virtual UBool isEmpty(void) const; +#ifndef U_HIDE_DRAFT_API + /** + * @return true if this set contains multi-character strings or the empty string. + * @draft ICU 70 + */ + UBool hasStrings() const; +#endif // U_HIDE_DRAFT_API + /** * Returns true if this set contains the given character. * This function works faster with a frozen set. @@ -1064,8 +1076,14 @@ public: /** * Returns the character at the given index within this set, where * the set is ordered by ascending code point. If the index is - * out of range, return (UChar32)-1. The inverse of this method is - * indexOf(). + * out of range for characters, returns (UChar32)-1. + * The inverse of this method is indexOf(). + * + * For iteration, this is slower than UnicodeSetIterator or + * getRangeCount()/getRangeStart()/getRangeEnd(), + * because for each call it skips linearly over index + * characters in the ranges. + * * @param index an index from 0..size()-1 * @return the character at the given index, or (UChar32)-1. 
* @stable ICU 2.4 @@ -1567,7 +1585,6 @@ private: void swapBuffers(void); UBool allocateStrings(UErrorCode &status); - UBool hasStrings() const; int32_t stringsSize() const; UBool stringsContains(const UnicodeString &s) const; diff --git a/icu4c/source/common/unicode/uset.h b/icu4c/source/common/unicode/uset.h index 1d0daf9d098..3621230641c 100644 --- a/icu4c/source/common/unicode/uset.h +++ b/icu4c/source/common/unicode/uset.h @@ -851,6 +851,16 @@ uset_removeAllStrings(USet* set); U_CAPI UBool U_EXPORT2 uset_isEmpty(const USet* set); +#ifndef U_HIDE_DRAFT_API +/** + * @param set the set + * @return true if this set contains multi-character strings or the empty string. + * @draft ICU 70 + */ +U_CAPI UBool U_EXPORT2 +uset_hasStrings(const USet *set); +#endif // U_HIDE_DRAFT_API + /** * Returns true if the given USet contains the given character. * This function works faster with a frozen set. @@ -901,8 +911,13 @@ uset_indexOf(const USet* set, UChar32 c); /** * Returns the character at the given index within this set, where * the set is ordered by ascending code point. If the index is - * out of range, return (UChar32)-1. The inverse of this method is - * indexOf(). + * out of range for characters, returns (UChar32)-1. + * The inverse of this method is indexOf(). + * + * For iteration, this is slower than uset_getRangeCount()/uset_getItemCount() + * with uset_getItem(), because for each call it skips linearly over index + * characters in the ranges. + * * @param set the set * @param charIndex an index from 0..size()-1 to obtain the char for * @return the character at the given index, or (UChar32)-1. @@ -912,16 +927,34 @@ U_CAPI UChar32 U_EXPORT2 uset_charAt(const USet* set, int32_t charIndex); /** - * Returns the number of characters and strings contained in the given - * USet. + * Returns the number of characters and strings contained in this set. + * The last (uset_getItemCount() - uset_getRangeCount()) items are strings. 
+ * + * This is slower than uset_getRangeCount() and uset_getItemCount() because + * it counts the code points of all ranges. + * * @param set the set * @return a non-negative integer counting the characters and strings * contained in set * @stable ICU 2.4 + * @see uset_getRangeCount */ U_CAPI int32_t U_EXPORT2 uset_size(const USet* set); +#ifndef U_HIDE_DRAFT_API +/** + * @param set the set + * @return the number of ranges in this set. + * @draft ICU 70 + * @see uset_getItemCount + * @see uset_getItem + * @see uset_size + */ +U_CAPI int32_t U_EXPORT2 +uset_getRangeCount(const USet *set); +#endif // U_HIDE_DRAFT_API + /** * Returns the number of items in this set. An item is either a range * of characters or a single multicharacter string. @@ -935,20 +968,30 @@ uset_getItemCount(const USet* set); /** * Returns an item of this set. An item is either a range of - * characters or a single multicharacter string. + * characters or a single multicharacter string (which can be the empty string). + * + * If itemIndex is less than uset_getRangeCount(), then this function returns 0, + * and the range is *start..*end. + * + * If itemIndex is at least uset_getRangeCount() and less than uset_getItemCount(), then + * this function copies the string into str[strCapacity] and + * returns the length of the string (0 for the empty string). + * + * If itemIndex is out of range, then this function returns -1. + * + * Note that 0 is returned for each range as well as for the empty string. + * * @param set the set - * @param itemIndex a non-negative integer in the range 0.. 
- * uset_getItemCount(set)-1 - * @param start pointer to variable to receive first character - * in range, inclusive - * @param end pointer to variable to receive last character in range, - * inclusive + * @param itemIndex a non-negative integer in the range 0..uset_getItemCount(set)-1 + * @param start pointer to variable to receive first character in range, inclusive; + * can be NULL for a string item + * @param end pointer to variable to receive last character in range, inclusive; + * can be NULL for a string item * @param str buffer to receive the string, may be NULL * @param strCapacity capacity of str, or 0 if str is NULL - * @param ec error code - * @return the length of the string (>= 2), or 0 if the item is a - * range, in which case it is the range *start..*end, or -1 if - * itemIndex is out of range + * @param ec error code; U_INDEX_OUTOFBOUNDS_ERROR if the itemIndex is out of range + * @return the length of the string (0 or >= 2), or 0 if the item is a range, + * or -1 if the itemIndex is out of range * @stable ICU 2.4 */ U_CAPI int32_t U_EXPORT2 diff --git a/icu4c/source/common/unicode/usetiter.h b/icu4c/source/common/unicode/usetiter.h index a817ef72b3a..831565415db 100644 --- a/icu4c/source/common/unicode/usetiter.h +++ b/icu4c/source/common/unicode/usetiter.h @@ -60,6 +60,9 @@ class UnicodeString; * } * } * + * + * To iterate over only the strings, start with skipToStrings(). + * * @author M. Davis * @stable ICU 2.4 */ @@ -170,6 +173,25 @@ class U_COMMON_API UnicodeSetIterator : public UObject { */ const UnicodeString& getString(); +#ifndef U_HIDE_DRAFT_API + /** + * Skips over the remaining code points/ranges, if any. + * A following call to next() or nextRange() will yield a string, if there is one. + * No-op if next() would return false, or if it would yield a string anyway. + * + * @return *this + * @draft ICU 70 + * @see UnicodeSet#strings() + */ + inline UnicodeSetIterator &skipToStrings() { + // Finish code point/range iteration. 
+ range = endRange; + endElement = -1; + nextElement = 0; + return *this; + } +#endif // U_HIDE_DRAFT_API + /** * Advances the iteration position to the next element in the set, * which can be either a single code point or a string. @@ -281,13 +303,16 @@ class U_COMMON_API UnicodeSetIterator : public UObject { */ int32_t stringCount; + private: + /** * Points to the string to use when the caller asks for a * string and the current iteration item is a code point, not a string. - * @internal */ UnicodeString *cpString; + protected: + /** Copy constructor. Disallowed. * @stable ICU 2.4 */ @@ -306,7 +331,7 @@ class U_COMMON_API UnicodeSetIterator : public UObject { }; inline UBool UnicodeSetIterator::isString() const { - return codepoint == (UChar32)IS_STRING; + return codepoint < 0; } inline UChar32 UnicodeSetIterator::getCodepoint() const { diff --git a/icu4c/source/common/uset.cpp b/icu4c/source/common/uset.cpp index a7e3046dbf2..c131fd91e13 100644 --- a/icu4c/source/common/uset.cpp +++ b/icu4c/source/common/uset.cpp @@ -196,6 +196,11 @@ uset_isEmpty(const USet* set) { return ((const UnicodeSet*) set)->UnicodeSet::isEmpty(); } +U_CAPI UBool U_EXPORT2 +uset_hasStrings(const USet* set) { + return ((const UnicodeSet*) set)->UnicodeSet::hasStrings(); +} + U_CAPI UBool U_EXPORT2 uset_contains(const USet* set, UChar32 c) { return ((const UnicodeSet*) set)->UnicodeSet::contains(c); @@ -296,6 +301,11 @@ private: }; U_NAMESPACE_END +U_CAPI int32_t U_EXPORT2 +uset_getRangeCount(const USet *set) { + return ((const UnicodeSet *)set)->UnicodeSet::getRangeCount(); +} + U_CAPI int32_t U_EXPORT2 uset_getItemCount(const USet* uset) { const UnicodeSet& set = *(const UnicodeSet*)uset; @@ -330,11 +340,6 @@ uset_getItem(const USet* uset, int32_t itemIndex, } } -//U_CAPI int32_t U_EXPORT2 -//uset_getRangeCount(const USet* set) { -// return ((const UnicodeSet*) set)->getRangeCount(); -//} -// //U_CAPI UBool U_EXPORT2 //uset_getRange(const USet* set, int32_t rangeIndex, // UChar32* pStart, 
UChar32* pEnd) { diff --git a/icu4c/source/test/cintltst/usettest.c b/icu4c/source/test/cintltst/usettest.c index 9fe2362fa22..87f8fc2b38b 100644 --- a/icu4c/source/test/cintltst/usettest.c +++ b/icu4c/source/test/cintltst/usettest.c @@ -6,12 +6,15 @@ * Corporation and others. All Rights Reserved. ********************************************************************** */ + +#include +#include +#include + #include "unicode/uset.h" #include "unicode/ustring.h" #include "cintltst.h" #include "cmemory.h" -#include -#include #define TEST(x) addTest(root, &x, "uset/" # x) @@ -101,6 +104,9 @@ static void TestAPI() { /* [ABC] */ set = uset_open(0x0041, 0x0043); expect(set, "ABC", "DEF{ab}", NULL); + if(uset_hasStrings(set)) { + log_err("uset_hasStrings([ABC]) = true"); + } uset_close(set); /* [a-c{ab}] */ @@ -113,6 +119,9 @@ static void TestAPI() { if(!uset_resemblesPattern(PAT, PAT_LEN, 0)) { log_err("uset_resemblesPattern of PAT failed\n"); } + if(!uset_hasStrings(set)) { + log_err("uset_hasStrings([a-c{ab}]) = false"); + } expect(set, "abc{ab}", "def{bc}", &ec); /* [a-d{ab}] */ @@ -167,6 +176,9 @@ static void TestAPI() { return; } expect(set, "0123456789ABCDEFabcdef", "GHIjkl{bc}", NULL); + if (uset_size(set) != 22 || uset_getRangeCount(set) != 3 || uset_getItemCount(set) != 3) { + log_err("line %d: uset_size()/uset_getRangeCount()/uset_getItemCount() wrong", __LINE__); + } /* [ab] */ uset_clear(set); @@ -243,6 +255,9 @@ static void TestAPI() { return; } expect(set, "abcdef{ch}{sch}", "", NULL); + if (uset_size(set) != 8 || uset_getRangeCount(set) != 1 || uset_getItemCount(set) != 3) { + log_err("line %d: uset_size()/uset_getRangeCount()/uset_getItemCount() wrong", __LINE__); + } uset_retainString(set, u"sch", 3); expect(set, "{sch}", "abcdef{ch}", NULL); @@ -400,10 +415,12 @@ static void expectItems(const USet* set, char *pat; UErrorCode ec; int32_t expectedSize = 0; + int32_t rangeCount = uset_getRangeCount(set); int32_t itemCount = uset_getItemCount(set); int32_t 
itemIndex = 0; UChar32 start = 1, end = 0; int32_t itemLen = 0, length; + bool isString = false; ec = U_ZERO_ERROR; length = uset_toPattern(set, ustr, sizeof(ustr), TRUE, &ec); @@ -435,17 +452,26 @@ static void expectItems(const USet* set, return; } - itemLen = uset_getItem(set, itemIndex, &start, &end, - itemStr, sizeof(itemStr), &ec); + // Pass in NULL pointers where we expect them to be ok. + if (itemIndex < rangeCount) { + itemLen = uset_getItem(set, itemIndex, &start, &end, NULL, 0, &ec); + } else { + itemLen = uset_getItem(set, itemIndex, NULL, NULL, + itemStr, UPRV_LENGTHOF(itemStr), &ec); + isString = true; + } if (U_FAILURE(ec) || itemLen < 0) { log_err("FAIL: uset_getItem => %s\n", u_errorName(ec)); return; } - if (itemLen == 0) { + if (!isString) { log_verbose("Ok: %s item %d is %c-%c\n", pat, itemIndex, oneUCharToChar(start), oneUCharToChar(end)); + if (itemLen != 0) { + log_err("FAIL: uset_getItem(%d) => length %d\n", itemIndex, itemLen); + } } else { itemStr[itemLen] = 0; u_UCharsToChars(itemStr, buf, itemLen+1); @@ -469,7 +495,7 @@ static void expectItems(const USet* set, u_charsToUChars(stringStart, ustr, stringLength); ustr[stringLength] = 0; - if (itemLen == 0) { + if (!isString) { log_err("FAIL: for %s expect \"%s\" next, but got a char\n", pat, strCopy); return; @@ -488,18 +514,19 @@ static void expectItems(const USet* set, u_charsToUChars(p, ustr, 1); c = ustr[0]; - if (itemLen != 0) { + if (isString) { log_err("FAIL: for %s expect '%c' next, but got a string\n", pat, *p); return; } - if (c != start++) { + if (c != start) { log_err("FAIL: for %s expect '%c' next\n", pat, *p); return; } + ++start; ++p; } } diff --git a/icu4c/source/test/intltest/csdetest.cpp b/icu4c/source/test/intltest/csdetest.cpp index d285f0ae0c7..69e12de2f7e 100644 --- a/icu4c/source/test/intltest/csdetest.cpp +++ b/icu4c/source/test/intltest/csdetest.cpp @@ -780,7 +780,7 @@ void CharsetDetectionTest::Ticket6394Test() { return; } - UnicodeSet setOfCharsetNames; // 
UnicodSets can hold strings. + UnicodeSet setOfCharsetNames; // UnicodeSets can hold strings. int32_t i; for (i=0; i nextElement + perRange)) { - endElement = nextElement + perRange; + const UnicodeSet *p = &set; + bool unchanged = true; + for (int32_t i = 0; i < rangeCount; ++i) { + int32_t start = set.getRangeStart(i); + int32_t end = set.getRangeEnd(i); + int32_t newEnd = start + perRange; + if (end > newEnd) { + if (unchanged) { + copy = set; + p = &copy; + unchanged = false; + } + copy.remove(newEnd + 1, end); + } } + return *p; } +} // namespace + //-------------------------------------------------------------------- // RTTest Interface //-------------------------------------------------------------------- @@ -587,8 +561,8 @@ void RTTest::test2(UBool quickRt, int32_t density) { return; } - AbbreviatedUnicodeSetIterator usi; - AbbreviatedUnicodeSetIterator usi2; + UnicodeSetIterator usi; + UnicodeSetIterator usi2; parent->logln("Checking that at least one irrelevant character is not NFC'ed"); // string is from NFC_NO in the UCD @@ -702,13 +676,14 @@ void RTTest::test2(UBool quickRt, int32_t density) { UnicodeSet sourceRangeMinusFailures(sourceRange); sourceRangeMinusFailures.removeAll(failSourceTarg); - - usi.reset(sourceRangeMinusFailures, quickRt, density); + + UnicodeSet copy, copy2; + usi.reset(abbreviateSet(sourceRangeMinusFailures, quickRt, density, copy)); for (;;) { if (!usi.next() || usi.isString()) break; UChar32 c = usi.getCodepoint(); - usi2.reset(sourceRangeMinusFailures, quickRt, density); + usi2.reset(abbreviateSet(sourceRangeMinusFailures, quickRt, density, copy2)); for (;;) { if (!usi2.next() || usi2.isString()) break; UChar32 d = usi2.getCodepoint(); @@ -816,7 +791,7 @@ void RTTest::test2(UBool quickRt, int32_t density) { targetRangeMinusFailures.removeAll(failTargSource); targetRangeMinusFailures.removeAll(failRound); - usi.reset(targetRangeMinusFailures, quickRt, density); + usi.reset(abbreviateSet(targetRangeMinusFailures, quickRt, density,
copy)); UnicodeString targ2; UnicodeString reverse2; UnicodeString targD; @@ -830,7 +805,7 @@ void RTTest::test2(UBool quickRt, int32_t density) { return; } - usi2.reset(targetRangeMinusFailures, quickRt, density); + usi2.reset(abbreviateSet(targetRangeMinusFailures, quickRt, density, copy2)); for (;;) { if (!usi2.next() || usi2.isString()) break; diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index a19f229a3a7..56bdc629651 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -99,6 +99,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec, TESTCASE_AUTO(TestUnusedCcc); TESTCASE_AUTO(TestDeepPattern); TESTCASE_AUTO(TestEmptyString); + TESTCASE_AUTO(TestSkipToStrings); TESTCASE_AUTO_END; } @@ -882,6 +883,8 @@ void UnicodeSetTest::TestStrings() { if (U_FAILURE(ec)) { errln("FAIL: couldn't construct test sets"); } + assertFalse("[a-c].hasStrings()", testList[0]->hasStrings()); + assertTrue("[{ll}{ch}a-z].hasStrings()", testList[2]->hasStrings()); for (int32_t i = 0; testList[i] != NULL; i+=2) { if (U_SUCCESS(ec)) { @@ -896,7 +899,7 @@ void UnicodeSetTest::TestStrings() { } delete testList[i]; delete testList[i+1]; - } + } } /** @@ -4059,3 +4062,49 @@ void UnicodeSetTest::TestEmptyString() { assertTrue("frozen containsNone", set.containsNone(u"def")); assertFalse("frozen containsSome", set.containsSome(u"def")); } + +void UnicodeSetTest::assertNext(UnicodeSetIterator &iter, const UnicodeString &expected) { + assertTrue(expected + ".next()", iter.next()); + assertEquals(expected + ".getString()", expected, iter.getString()); +} + +void UnicodeSetTest::TestSkipToStrings() { + IcuTestErrorCode errorCode(*this, "TestSkipToStrings"); + UnicodeSet set(u"[0189{}{ch}]", errorCode); + UnicodeSetIterator iter(set); + assertNext(iter.skipToStrings(), u""); + assertNext(iter, u"ch"); + assertFalse("no next", iter.next()); + + iter.reset(); + assertNext(iter, u"0"); + 
assertNext(iter, u"1"); + assertNext(iter, u"8"); + assertNext(iter, u"9"); + assertNext(iter, u""); + assertNext(iter, u"ch"); + assertFalse("no next", iter.next()); + + iter.reset(); + assertNext(iter, u"0"); + iter.skipToStrings(); + assertNext(iter, u""); + assertNext(iter, u"ch"); + assertFalse("no next", iter.next()); + + iter.reset(); + iter.nextRange(); + assertNext(iter, u"8"); + iter.skipToStrings(); + assertNext(iter, u""); + assertNext(iter, u"ch"); + assertFalse("no next", iter.next()); + + iter.reset(); + iter.nextRange(); + iter.nextRange(); + iter.nextRange(); + iter.skipToStrings(); + assertNext(iter, u"ch"); + assertFalse("no next", iter.next()); +} diff --git a/icu4c/source/test/intltest/usettest.h b/icu4c/source/test/intltest/usettest.h index 7d2816a3d99..44d1f02dbb6 100644 --- a/icu4c/source/test/intltest/usettest.h +++ b/icu4c/source/test/intltest/usettest.h @@ -19,6 +19,7 @@ #include "unicode/unistr.h" #include "unicode/uniset.h" #include "unicode/ucnv_err.h" +#include "unicode/usetiter.h" #include "intltest.h" #include "cmemory.h" @@ -96,6 +97,9 @@ private: void TestDeepPattern(); void TestEmptyString(); + void assertNext(UnicodeSetIterator &iter, const UnicodeString &expected); + void TestSkipToStrings(); + private: UBool toPatternAux(UChar32 start, UChar32 end); diff --git a/icu4j/main/classes/core/src/com/ibm/icu/number/NumberFormatter.java b/icu4j/main/classes/core/src/com/ibm/icu/number/NumberFormatter.java index 7a1f6c0b48e..715c0f28ef6 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/number/NumberFormatter.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/number/NumberFormatter.java @@ -487,9 +487,8 @@ public final class NumberFormatter { *
  • AUTO: 0.90, 1.00, 1.10 *
  • HIDE_IF_WHOLE: 0.90, 1, 1.10 * - * + * * @draft ICU 69 - * @provisional This API might change or be removed in a future release. */ public static enum TrailingZeroDisplay { /** @@ -498,7 +497,7 @@ public final class NumberFormatter { * @draft ICU 69 */ AUTO, - + /** * Same as AUTO, but hide trailing zeros after the decimal separator if they are all zero. * diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java index d41ff99ea80..4053242b87b 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java @@ -829,10 +829,6 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa } } - boolean hasStrings() { - return !strings.isEmpty(); - } - /** * Returns the number of elements in this set (its cardinality) * Note than the elements of a set may include both individual @@ -860,6 +856,14 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa return len == 1 && !hasStrings(); } + /** + * @return true if this set contains multi-character strings or the empty string. + * @draft ICU 70 + */ + public boolean hasStrings() { + return !strings.isEmpty(); + } + /** * Implementation of UnicodeMatcher API. Returns true if * this set contains any character whose low byte is the given diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSetIterator.java b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSetIterator.java index 2780104485e..88f0c9cd635 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSetIterator.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSetIterator.java @@ -14,7 +14,7 @@ import java.util.Iterator; * UnicodeSetIterator iterates over the contents of a UnicodeSet. It * iterates over either code points or code point ranges. 
After all * code points or ranges have been returned, it returns the - * multicharacter strings of the UnicodSet, if any. + * multicharacter strings of the UnicodeSet, if any. * *

    To iterate over code points and multicharacter strings, * use a loop like this: @@ -34,10 +34,16 @@ import java.util.Iterator; * } * } * + * + *

    To iterate over only the strings, start with new UnicodeSetIterator(set).skipToStrings(). + * *

    Warning: For speed, UnicodeSet iteration does not check for concurrent modification. * Do not alter the UnicodeSet while iterating. * @author M. Davis * @stable ICU 2.0 + * @see UnicodeSet#ranges() + * @see UnicodeSet#strings() + * @see UnicodeSet#iterator() */ public class UnicodeSetIterator { @@ -94,6 +100,23 @@ public class UnicodeSetIterator { reset(new UnicodeSet()); } + /** + * Skips over the remaining code points/ranges, if any. + * A following call to next() or nextRange() will yield a string, if there is one. + * No-op if next() would return false, or if it would yield a string anyway. + * + * @return this + * @draft ICU 70 + * @see UnicodeSet#strings() + */ + public UnicodeSetIterator skipToStrings() { + // Finish code point/range iteration. + range = endRange; + endElement = -1; + nextElement = 0; + return this; + } + /** * Returns the next element in the set, either a single code point * or a string. If there are no more elements in the set, return @@ -234,39 +257,15 @@ public class UnicodeSetIterator { private int endRange = 0; private int range = 0; - /** - * @internal - * @deprecated This API is ICU internal only. - */ - @Deprecated - public UnicodeSet getSet() { - return set; - } - - /** - * @internal - * @deprecated This API is ICU internal only. - */ - @Deprecated - protected int endElement; - /** - * @internal - * @deprecated This API is ICU internal only. - */ - @Deprecated - protected int nextElement; - private Iterator stringIterator = null; + private int endElement; + private int nextElement; /** * Invariant: stringIterator is null when there are no (more) strings remaining */ + private Iterator stringIterator = null; - /** - * @internal - * @deprecated This API is ICU internal only. 
- */ - @Deprecated - protected void loadRange(int aRange) { + private void loadRange(int aRange) { nextElement = set.getRangeStart(aRange); endElement = set.getRangeEnd(aRange); } diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java index f3c1b0fe088..0edc7fd6b06 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java @@ -819,6 +819,8 @@ public class UnicodeSetTest extends TestFmwk { {new UnicodeSet('a','z').add('A', 'Z').retain('M','m').complement('X'), new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]")}, }; + assertFalse("[a-c].hasStrings()", testList[0][0].hasStrings()); + assertTrue("[{ll}{ch}a-z].hasStrings()", testList[1][0].hasStrings()); for (int i = 0; i < testList.length; ++i) { if (!testList[i][0].equals(testList[i][1])) { @@ -2420,17 +2422,6 @@ public class UnicodeSetTest extends TestFmwk { return Utility.unescape(s); } - /* Test the method public UnicodeSet getSet() */ - @Test - public void TestGetSet() { - UnicodeSetIterator us = new UnicodeSetIterator(); - try { - us.getSet(); - } catch (Exception e) { - errln("UnicodeSetIterator.getSet() was not suppose to given an " + "an exception."); - } - } - /* Tests the method public UnicodeSet add(Collection source) */ @Test public void TestAddCollection() { @@ -2840,4 +2831,50 @@ public class UnicodeSetTest extends TestFmwk { assertTrue("frozen containsNone", set.containsNone("def")); assertFalse("frozen containsSome", set.containsSome("def")); } + + private void assertNext(UnicodeSetIterator iter, String expected) { + assertTrue(expected + ".next()", iter.next()); + assertEquals(expected + ".getString()", expected, iter.getString()); + } + + @Test + public void TestSkipToStrings() { + UnicodeSet set = new UnicodeSet("[0189{}{ch}]"); + UnicodeSetIterator iter = new UnicodeSetIterator(set).skipToStrings(); + 
assertNext(iter, ""); + assertNext(iter, "ch"); + assertFalse("no next", iter.next()); + + iter.reset(); + assertNext(iter, "0"); + assertNext(iter, "1"); + assertNext(iter, "8"); + assertNext(iter, "9"); + assertNext(iter, ""); + assertNext(iter, "ch"); + assertFalse("no next", iter.next()); + + iter.reset(); + assertNext(iter, "0"); + iter.skipToStrings(); + assertNext(iter, ""); + assertNext(iter, "ch"); + assertFalse("no next", iter.next()); + + iter.reset(); + iter.nextRange(); + assertNext(iter, "8"); + iter.skipToStrings(); + assertNext(iter, ""); + assertNext(iter, "ch"); + assertFalse("no next", iter.next()); + + iter.reset(); + iter.nextRange(); + iter.nextRange(); + iter.nextRange(); + iter.skipToStrings(); + assertNext(iter, "ch"); + assertFalse("no next", iter.next()); + } } diff --git a/icu4j/main/tests/framework/src/com/ibm/icu/dev/util/UnicodeMapIterator.java b/icu4j/main/tests/framework/src/com/ibm/icu/dev/util/UnicodeMapIterator.java index ba32ab007f7..657e653db98 100644 --- a/icu4j/main/tests/framework/src/com/ibm/icu/dev/util/UnicodeMapIterator.java +++ b/icu4j/main/tests/framework/src/com/ibm/icu/dev/util/UnicodeMapIterator.java @@ -17,7 +17,7 @@ import com.ibm.icu.text.UTF16; * UnicodeSetIterator iterates over the contents of a UnicodeSet. It * iterates over either code points or code point ranges. After all * code points or ranges have been returned, it returns the - * multicharacter strings of the UnicodSet, if any. + * multicharacter strings of the UnicodeSet, if any. * *

    To iterate over code points, use a loop like this: *

    @@ -106,7 +106,7 @@ public class UnicodeMapIterator {
          * false.  If codepoint == IS_STRING, the value is a
          * string in the string field.  Otherwise the value is a
          * single code point in the codepoint field.
    -     * 
    +     *
          * 

    The order of iteration is all code points in sorted order, * followed by all strings sorted order. codepointEnd is * undefined after calling this method. string is @@ -135,7 +135,7 @@ public class UnicodeMapIterator { if (stringIterator == null) return false; codepoint = IS_STRING; // signal that value is actually a string - string = (String)stringIterator.next(); + string = stringIterator.next(); if (!stringIterator.hasNext()) stringIterator = null; return true; } @@ -147,7 +147,7 @@ public class UnicodeMapIterator { * string in the string field. Otherwise the value is a * range of one or more code points from codepoint to * codepointeEnd inclusive. - * + * *

    The order of iteration is all code points ranges in sorted * order, followed by all strings sorted order. Ranges are * disjoint and non-contiguous. string is undefined @@ -180,7 +180,7 @@ public class UnicodeMapIterator { if (stringIterator == null) return false; codepoint = IS_STRING; // signal that value is actually a string - string = (String)stringIterator.next(); + string = stringIterator.next(); if (!stringIterator.hasNext()) stringIterator = null; return true; } @@ -198,13 +198,13 @@ public class UnicodeMapIterator { /** * Resets this iterator to the start of the set. - * @return + * @return */ public UnicodeMapIterator reset() { endRange = map.getRangeCount() - 1; // both next*() methods will test: if (nextElement <= endElement) // we set them to fail this test, which will cause them to load the first range - nextElement = 0; + nextElement = 0; endElement = -1; range = -1; diff --git a/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/RoundTripTest.java b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/RoundTripTest.java index 2cbf17310a6..8284cd09fe7 100644 --- a/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/RoundTripTest.java +++ b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/RoundTripTest.java @@ -77,45 +77,33 @@ public class RoundTripTest extends TestFmwk { static String KATAKANA_ITERATION = "[\u30FD\u30FE]"; static String HIRAGANA_ITERATION = "[\u309D\u309E]"; - //------------------------------------------------------------------ - // AbbreviatedUnicodeSetIterator - //------------------------------------------------------------------ - - static class AbbreviatedUnicodeSetIterator extends UnicodeSetIterator { - - private boolean abbreviated; - private int perRange; - - public AbbreviatedUnicodeSetIterator() { - super(); - abbreviated = false; - } - - @Override - public void reset(UnicodeSet newSet) { - reset(newSet, false); - } - - public void reset(UnicodeSet newSet, boolean abb) { - reset(newSet, 
abb, 100); + /** + * If abbreviated=true, returns a set with only a sampling of the original code points. + * density is the approximate total number of code points to be returned for the entire set. + */ + private static UnicodeSet abbreviateSet(UnicodeSet set, boolean abbreviated, int density) { + if (!abbreviated) { + return set; } - - public void reset(UnicodeSet newSet, boolean abb, int density) { - super.reset(newSet); - abbreviated = abb; - perRange = newSet.getRangeCount(); - if (perRange != 0) { - perRange = density / perRange; - } + int rangeCount = set.getRangeCount(); + int perRange = rangeCount; + if (perRange != 0) { + perRange = density / perRange; } - - @Override - protected void loadRange(int myRange) { - super.loadRange(myRange); - if (abbreviated && (endElement > nextElement + perRange)) { - endElement = nextElement + perRange; + boolean unchanged = true; + for (int i = 0; i < rangeCount; ++i) { + int start = set.getRangeStart(i); + int end = set.getRangeEnd(i); + int newEnd = start + perRange; + if (end > newEnd) { + if (unchanged) { + set = set.cloneAsThawed(); + unchanged = false; + } + set.remove(newEnd + 1, end); } } + return set; } //-------------------------------------------------------------------- @@ -1295,8 +1283,8 @@ public class RoundTripTest extends TestFmwk { return false; } - AbbreviatedUnicodeSetIterator usi = new AbbreviatedUnicodeSetIterator(); - AbbreviatedUnicodeSetIterator usi2 = new AbbreviatedUnicodeSetIterator(); + UnicodeSetIterator usi = new UnicodeSetIterator(); + UnicodeSetIterator usi2 = new UnicodeSetIterator(); Transliterator sourceToTarget; Transliterator targetToSource; @@ -1454,7 +1442,7 @@ public class RoundTripTest extends TestFmwk { boolean quickRt = TestFmwk.getExhaustiveness() < 10; - usi.reset(sourceRangeMinusFailures, quickRt, density); + usi.reset(abbreviateSet(sourceRangeMinusFailures, quickRt, density)); while (usi.next()) { int c = usi.codepoint; @@ -1466,7 +1454,7 @@ public class RoundTripTest extends
TestFmwk { if (failSourceTarg.get(d)) continue; */ TestFmwk.logln(count + "/" + pairLimit + " Checking starting with " + UTF16.valueOf(c)); - usi2.reset(sourceRangeMinusFailures, quickRt, density); + usi2.reset(abbreviateSet(sourceRangeMinusFailures, quickRt, density)); while (usi2.next()) { int d = usi2.codepoint; @@ -1561,7 +1549,7 @@ public class RoundTripTest extends TestFmwk { !targetRange.contains(c)) continue; */ - usi.reset(targetRangeMinusFailures, quickRt, density); + usi.reset(abbreviateSet(targetRangeMinusFailures, quickRt, density)); while (usi.next()) { int c = usi.codepoint; @@ -1574,7 +1562,7 @@ public class RoundTripTest extends TestFmwk { !targetRange.contains(d)) continue; */ TestFmwk.logln(count + "/" + pairLimit + " Checking starting with " + UTF16.valueOf(c)); - usi2.reset(targetRangeMinusFailures, quickRt, density); + usi2.reset(abbreviateSet(targetRangeMinusFailures, quickRt, density)); while (usi2.next()) {