* Note that the elements of a set may include both individual
* codepoints and strings.
*
+ * This is slower than getRangeCount() because
+ * it counts the code points of all ranges.
+ *
* @return the number of elements in this set (its cardinality).
* @stable ICU 2.0
+ * @see getRangeCount
*/
virtual int32_t size(void) const;
*/
virtual UBool isEmpty(void) const;
+#ifndef U_HIDE_DRAFT_API
+ /**
+ * @return true if this set contains multi-character strings or the empty string.
+ * @draft ICU 70
+ */
+ UBool hasStrings() const;
+#endif // U_HIDE_DRAFT_API
+
/**
* Returns true if this set contains the given character.
* This function works faster with a frozen set.
/**
* Returns the character at the given index within this set, where
* the set is ordered by ascending code point. If the index is
- * out of range, return (UChar32)-1. The inverse of this method is
- * <code>indexOf()</code>.
+ * out of range for characters, returns (UChar32)-1.
+ * The inverse of this method is <code>indexOf()</code>.
+ *
+ * For iteration, this is slower than UnicodeSetIterator or
+ * getRangeCount()/getRangeStart()/getRangeEnd(),
+ * because for each call it skips linearly over <code>index</code>
+ * characters in the ranges.
+ *
* @param index an index from 0..size()-1
* @return the character at the given index, or (UChar32)-1.
* @stable ICU 2.4
void swapBuffers(void);
UBool allocateStrings(UErrorCode &status);
- UBool hasStrings() const;
int32_t stringsSize() const;
UBool stringsContains(const UnicodeString &s) const;
U_CAPI UBool U_EXPORT2
uset_isEmpty(const USet* set);
+#ifndef U_HIDE_DRAFT_API
+/**
+ * @param set the set
+ * @return true if this set contains multi-character strings or the empty string.
+ * @draft ICU 70
+ */
+U_CAPI UBool U_EXPORT2
+uset_hasStrings(const USet *set);
+#endif // U_HIDE_DRAFT_API
+
/**
* Returns true if the given USet contains the given character.
* This function works faster with a frozen set.
/**
* Returns the character at the given index within this set, where
* the set is ordered by ascending code point. If the index is
- * out of range, return (UChar32)-1. The inverse of this method is
- * <code>indexOf()</code>.
+ * out of range for characters, returns (UChar32)-1.
+ * The inverse of this method is <code>indexOf()</code>.
+ *
+ * For iteration, this is slower than uset_getRangeCount()/uset_getItemCount()
+ * with uset_getItem(), because for each call it skips linearly over <code>index</code>
+ * characters in the ranges.
+ *
* @param set the set
* @param charIndex an index from 0..size()-1 to obtain the char for
* @return the character at the given index, or (UChar32)-1.
uset_charAt(const USet* set, int32_t charIndex);
/**
- * Returns the number of characters and strings contained in the given
- * USet.
+ * Returns the number of characters and strings contained in this set.
+ * The last (uset_getItemCount() - uset_getRangeCount()) items are strings.
+ *
+ * This is slower than uset_getRangeCount() and uset_getItemCount() because
+ * it counts the code points of all ranges.
+ *
* @param set the set
* @return a non-negative integer counting the characters and strings
* contained in set
* @stable ICU 2.4
+ * @see uset_getRangeCount
*/
U_CAPI int32_t U_EXPORT2
uset_size(const USet* set);
+#ifndef U_HIDE_DRAFT_API
+/**
+ * @param set the set
+ * @return the number of ranges in this set.
+ * @draft ICU 70
+ * @see uset_getItemCount
+ * @see uset_getItem
+ * @see uset_size
+ */
+U_CAPI int32_t U_EXPORT2
+uset_getRangeCount(const USet *set);
+#endif // U_HIDE_DRAFT_API
+
/**
* Returns the number of items in this set. An item is either a range
* of characters or a single multicharacter string.
/**
* Returns an item of this set. An item is either a range of
- * characters or a single multicharacter string.
+ * characters or a single multicharacter string (which can be the empty string).
+ *
+ * If <code>itemIndex</code> is less than uset_getRangeCount(), then this function returns 0,
+ * and the range is <code>*start</code>..<code>*end</code>.
+ *
+ * If <code>itemIndex</code> is at least uset_getRangeCount() and less than uset_getItemCount(), then
+ * this function copies the string into <code>str[strCapacity]</code> and
+ * returns the length of the string (0 for the empty string).
+ *
+ * If <code>itemIndex</code> is out of range, then this function returns -1.
+ *
+ * Note that 0 is returned for each range as well as for the empty string.
+ *
* @param set the set
- * @param itemIndex a non-negative integer in the range 0..
- * uset_getItemCount(set)-1
- * @param start pointer to variable to receive first character
- * in range, inclusive
- * @param end pointer to variable to receive last character in range,
- * inclusive
+ * @param itemIndex a non-negative integer in the range 0..uset_getItemCount(set)-1
+ * @param start pointer to variable to receive first character in range, inclusive;
+ * can be NULL for a string item
+ * @param end pointer to variable to receive last character in range, inclusive;
+ * can be NULL for a string item
* @param str buffer to receive the string, may be NULL
* @param strCapacity capacity of str, or 0 if str is NULL
- * @param ec error code
- * @return the length of the string (>= 2), or 0 if the item is a
- * range, in which case it is the range *start..*end, or -1 if
- * itemIndex is out of range
+ * @param ec error code; U_INDEX_OUTOFBOUNDS_ERROR if the itemIndex is out of range
+ * @return the length of the string (0 or >= 2), or 0 if the item is a range,
+ * or -1 if the itemIndex is out of range
* @stable ICU 2.4
*/
U_CAPI int32_t U_EXPORT2
* }
* }
* </pre>
+ *
+ * To iterate over only the strings, start with <code>skipToStrings()</code>.
+ *
* @author M. Davis
* @stable ICU 2.4
*/
*/
const UnicodeString& getString();
+#ifndef U_HIDE_DRAFT_API
+ /**
+ * Skips over the remaining code points/ranges, if any.
+ * A following call to next() or nextRange() will yield a string, if there is one.
+ * No-op if next() would return false, or if it would yield a string anyway.
+ *
+ * Returns the iterator itself so that the call can be chained,
+ * for example <code>iter.skipToStrings().next()</code>.
+ *
+ * @return *this
+ * @draft ICU 70
+ * @see UnicodeSet#strings()
+ */
+ inline UnicodeSetIterator &skipToStrings() {
+ // Finish code point/range iteration.
+ range = endRange; // the range index jumps straight to endRange
+ endElement = -1; // with nextElement = 0 below, nextElement > endElement:
+ nextElement = 0; // the window of pending code points is left empty
+ return *this;
+ }
+#endif // U_HIDE_DRAFT_API
+
/**
* Advances the iteration position to the next element in the set,
* which can be either a single code point or a string.
*/
int32_t stringCount;
+ private:
+
/**
* Points to the string to use when the caller asks for a
* string and the current iteration item is a code point, not a string.
- * @internal
*/
UnicodeString *cpString;
+ protected:
+
/** Copy constructor. Disallowed.
* @stable ICU 2.4
*/
};
inline UBool UnicodeSetIterator::isString() const {
- return codepoint == (UChar32)IS_STRING;
+ return codepoint < 0; // any negative codepoint value counts as the string marker, not only IS_STRING
}
inline UChar32 UnicodeSetIterator::getCodepoint() const {
return ((const UnicodeSet*) set)->UnicodeSet::isEmpty();
}
+U_CAPI UBool U_EXPORT2
+uset_hasStrings(const USet* set) { // C API entry point that forwards to UnicodeSet::hasStrings()
+ return ((const UnicodeSet*) set)->UnicodeSet::hasStrings(); // the USet pointer is used as a UnicodeSet; explicitly qualified call
+}
+
U_CAPI UBool U_EXPORT2
uset_contains(const USet* set, UChar32 c) {
return ((const UnicodeSet*) set)->UnicodeSet::contains(c);
};
U_NAMESPACE_END
+U_CAPI int32_t U_EXPORT2
+uset_getRangeCount(const USet *set) { // C API entry point that forwards to UnicodeSet::getRangeCount()
+ return ((const UnicodeSet *)set)->UnicodeSet::getRangeCount(); // the USet pointer is used as a UnicodeSet; explicitly qualified call
+}
+
U_CAPI int32_t U_EXPORT2
uset_getItemCount(const USet* uset) {
const UnicodeSet& set = *(const UnicodeSet*)uset;
}
}
-//U_CAPI int32_t U_EXPORT2
-//uset_getRangeCount(const USet* set) {
-// return ((const UnicodeSet*) set)->getRangeCount();
-//}
-//
//U_CAPI UBool U_EXPORT2
//uset_getRange(const USet* set, int32_t rangeIndex,
// UChar32* pStart, UChar32* pEnd) {
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
#include "unicode/uset.h"
#include "unicode/ustring.h"
#include "cintltst.h"
#include "cmemory.h"
-#include <stdlib.h>
-#include <string.h>
#define TEST(x) addTest(root, &x, "uset/" # x)
/* [ABC] */
set = uset_open(0x0041, 0x0043);
expect(set, "ABC", "DEF{ab}", NULL);
+ if(uset_hasStrings(set)) {
+ log_err("uset_hasStrings([ABC]) = true");
+ }
uset_close(set);
/* [a-c{ab}] */
if(!uset_resemblesPattern(PAT, PAT_LEN, 0)) {
log_err("uset_resemblesPattern of PAT failed\n");
}
+ if(!uset_hasStrings(set)) {
+ log_err("uset_hasStrings([a-c{ab}]) = false");
+ }
expect(set, "abc{ab}", "def{bc}", &ec);
/* [a-d{ab}] */
return;
}
expect(set, "0123456789ABCDEFabcdef", "GHIjkl{bc}", NULL);
+ if (uset_size(set) != 22 || uset_getRangeCount(set) != 3 || uset_getItemCount(set) != 3) {
+ log_err("line %d: uset_size()/uset_getRangeCount()/uset_getItemCount() wrong", __LINE__);
+ }
/* [ab] */
uset_clear(set);
return;
}
expect(set, "abcdef{ch}{sch}", "", NULL);
+ if (uset_size(set) != 8 || uset_getRangeCount(set) != 1 || uset_getItemCount(set) != 3) {
+ log_err("line %d: uset_size()/uset_getRangeCount()/uset_getItemCount() wrong", __LINE__);
+ }
uset_retainString(set, u"sch", 3);
expect(set, "{sch}", "abcdef{ch}", NULL);
char *pat;
UErrorCode ec;
int32_t expectedSize = 0;
+ int32_t rangeCount = uset_getRangeCount(set);
int32_t itemCount = uset_getItemCount(set);
int32_t itemIndex = 0;
UChar32 start = 1, end = 0;
int32_t itemLen = 0, length;
+ bool isString = false;
ec = U_ZERO_ERROR;
length = uset_toPattern(set, ustr, sizeof(ustr), TRUE, &ec);
return;
}
- itemLen = uset_getItem(set, itemIndex, &start, &end,
- itemStr, sizeof(itemStr), &ec);
+ // Pass in NULL pointers where we expect them to be ok.
+ if (itemIndex < rangeCount) {
+ itemLen = uset_getItem(set, itemIndex, &start, &end, NULL, 0, &ec);
+ } else {
+ itemLen = uset_getItem(set, itemIndex, NULL, NULL,
+ itemStr, UPRV_LENGTHOF(itemStr), &ec);
+ isString = true;
+ }
if (U_FAILURE(ec) || itemLen < 0) {
log_err("FAIL: uset_getItem => %s\n", u_errorName(ec));
return;
}
- if (itemLen == 0) {
+ if (!isString) {
log_verbose("Ok: %s item %d is %c-%c\n", pat,
itemIndex, oneUCharToChar(start),
oneUCharToChar(end));
+ if (itemLen != 0) {
+ log_err("FAIL: uset_getItem(%d) => length %d\n", itemIndex, itemLen);
+ }
} else {
itemStr[itemLen] = 0;
u_UCharsToChars(itemStr, buf, itemLen+1);
u_charsToUChars(stringStart, ustr, stringLength);
ustr[stringLength] = 0;
- if (itemLen == 0) {
+ if (!isString) {
log_err("FAIL: for %s expect \"%s\" next, but got a char\n",
pat, strCopy);
return;
u_charsToUChars(p, ustr, 1);
c = ustr[0];
- if (itemLen != 0) {
+ if (isString) {
log_err("FAIL: for %s expect '%c' next, but got a string\n",
pat, *p);
return;
}
- if (c != start++) {
+ if (c != start) {
log_err("FAIL: for %s expect '%c' next\n",
pat, *p);
return;
}
+ ++start;
++p;
}
}
return;
}
- UnicodeSet setOfCharsetNames; // UnicodSets can hold strings.
+ UnicodeSet setOfCharsetNames; // UnicodeSets can hold strings.
int32_t i;
for (i=0; i<matchCount; i++) {
UnicodeString charSetName(ucsdet_getName(matches[i], &status));
return FALSE;
}
-// AbbreviatedUnicodeSetIterator Interface ---------------------------------------------
-//
-// Iterate over a UnicodeSet, only returning a sampling of the contained code points.
-// density is the approximate total number of code points to returned for the entire set.
-//
+namespace {
-class AbbreviatedUnicodeSetIterator : public UnicodeSetIterator {
-public :
-
- AbbreviatedUnicodeSetIterator();
- virtual ~AbbreviatedUnicodeSetIterator();
- void reset(UnicodeSet& set, UBool abb = FALSE, int32_t density = 100);
-
- /**
- * ICU "poor man's RTTI", returns a UClassID for this class.
- */
- static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
-
- /**
- * ICU "poor man's RTTI", returns a UClassID for the actual class.
- */
- virtual inline UClassID getDynamicClassID() const { return getStaticClassID(); }
-
-private :
- UBool abbreviated;
- int32_t perRange; // The maximum number of code points to be returned from each range
- virtual void loadRange(int32_t range);
-
- /**
- * The address of this static class variable serves as this class's ID
- * for ICU "poor man's RTTI".
- */
- static const char fgClassID;
-};
-
-// AbbreviatedUnicodeSetIterator Implementation ---------------------------------------
-
-const char AbbreviatedUnicodeSetIterator::fgClassID=0;
-
-AbbreviatedUnicodeSetIterator::AbbreviatedUnicodeSetIterator() :
- UnicodeSetIterator(), abbreviated(FALSE) {
-}
-
-AbbreviatedUnicodeSetIterator::~AbbreviatedUnicodeSetIterator() {
-}
-
-void AbbreviatedUnicodeSetIterator::reset(UnicodeSet& newSet, UBool abb, int32_t density) {
- UnicodeSetIterator::reset(newSet);
- abbreviated = abb;
- perRange = newSet.getRangeCount();
+/**
+ * If abbreviated=true, returns a set which contains only a sampling of the original code points.
+ * density is the approximate total number of code points to be returned for the entire set.
+ *
+ * Each range keeps at most its first (density / range count) + 1 code points.
+ * If abbreviated=false, or if no range needs trimming, the original set is returned
+ * and copy is left untouched; otherwise copy receives the trimmed set and is returned.
+ *
+ * @param set the set to sample; never modified
+ * @param abbreviated whether to sample at all
+ * @param density approximate number of code points wanted for the whole set
+ * @param copy caller-owned storage for the trimmed set; must outlive the returned reference
+ * @return a reference to either set or copy
+ */
+const UnicodeSet &abbreviateSet(const UnicodeSet &set, bool abbreviated, int density,
+ UnicodeSet &copy) {
+ if (!abbreviated) {
+ return set;
+ }
+ int32_t rangeCount = set.getRangeCount();
+ int32_t perRange = rangeCount;
if (perRange != 0) {
perRange = density / perRange;
}
-}
-
-void AbbreviatedUnicodeSetIterator::loadRange(int32_t myRange) {
- UnicodeSetIterator::loadRange(myRange);
- if (abbreviated && (endElement > nextElement + perRange)) {
- endElement = nextElement + perRange;
+ const UnicodeSet *p = &set;
+ bool unchanged = true;
+ for (int32_t i = 0; i < rangeCount; ++i) {
+ // Ranges are always read from the unmodified input set.
+ int32_t start = set.getRangeStart(i);
+ int32_t end = set.getRangeEnd(i);
+ int32_t newEnd = start + perRange;
+ if (end > newEnd) {
+ if (unchanged) {
+ // Copy lazily, only once the first range needs trimming.
+ copy = set;
+ p = &copy;
+ unchanged = false;
+ }
+ copy.remove(newEnd + 1, end);
+ }
}
+ return *p;
}
+} // namespace
+
//--------------------------------------------------------------------
// RTTest Interface
//--------------------------------------------------------------------
return;
}
- AbbreviatedUnicodeSetIterator usi;
- AbbreviatedUnicodeSetIterator usi2;
+ UnicodeSetIterator usi;
+ UnicodeSetIterator usi2;
parent->logln("Checking that at least one irrelevant character is not NFC'ed");
// string is from NFC_NO in the UCD
UnicodeSet sourceRangeMinusFailures(sourceRange);
sourceRangeMinusFailures.removeAll(failSourceTarg);
-
- usi.reset(sourceRangeMinusFailures, quickRt, density);
+
+ UnicodeSet copy, copy2;
+ usi.reset(abbreviateSet(sourceRangeMinusFailures, quickRt, density, copy));
for (;;) {
if (!usi.next() || usi.isString()) break;
UChar32 c = usi.getCodepoint();
- usi2.reset(sourceRangeMinusFailures, quickRt, density);
+ usi2.reset(abbreviateSet(sourceRangeMinusFailures, quickRt, density, copy2));
for (;;) {
if (!usi2.next() || usi2.isString()) break;
UChar32 d = usi2.getCodepoint();
targetRangeMinusFailures.removeAll(failTargSource);
targetRangeMinusFailures.removeAll(failRound);
- usi.reset(targetRangeMinusFailures, quickRt, density);
+ usi.reset(abbreviateSet(targetRangeMinusFailures, quickRt, density, copy));
UnicodeString targ2;
UnicodeString reverse2;
UnicodeString targD;
return;
}
- usi2.reset(targetRangeMinusFailures, quickRt, density);
+ usi2.reset(abbreviateSet(targetRangeMinusFailures, quickRt, density, copy2));
for (;;) {
if (!usi2.next() || usi2.isString())
break;
TESTCASE_AUTO(TestUnusedCcc);
TESTCASE_AUTO(TestDeepPattern);
TESTCASE_AUTO(TestEmptyString);
+ TESTCASE_AUTO(TestSkipToStrings);
TESTCASE_AUTO_END;
}
if (U_FAILURE(ec)) {
errln("FAIL: couldn't construct test sets");
}
+ assertFalse("[a-c].hasStrings()", testList[0]->hasStrings());
+ assertTrue("[{ll}{ch}a-z].hasStrings()", testList[2]->hasStrings());
for (int32_t i = 0; testList[i] != NULL; i+=2) {
if (U_SUCCESS(ec)) {
}
delete testList[i];
delete testList[i+1];
- }
+ }
}
/**
assertTrue("frozen containsNone", set.containsNone(u"def"));
assertFalse("frozen containsSome", set.containsSome(u"def"));
}
+
+void UnicodeSetTest::assertNext(UnicodeSetIterator &iter, const UnicodeString &expected) {
+ assertTrue(expected + ".next()", iter.next()); // advance; the iterator must report another element
+ assertEquals(expected + ".getString()", expected, iter.getString()); // that element, as a string, must equal expected
+}
+
+void UnicodeSetTest::TestSkipToStrings() {
+ IcuTestErrorCode errorCode(*this, "TestSkipToStrings");
+ UnicodeSet set(u"[0189{}{ch}]", errorCode); // code points 0, 1, 8, 9 plus the empty string and "ch"
+ UnicodeSetIterator iter(set);
+ assertNext(iter.skipToStrings(), u""); // skipping before any next(): the first element is the empty string
+ assertNext(iter, u"ch");
+ assertFalse("no next", iter.next());
+
+ iter.reset(); // full iteration without skipping: code points first, then strings
+ assertNext(iter, u"0");
+ assertNext(iter, u"1");
+ assertNext(iter, u"8");
+ assertNext(iter, u"9");
+ assertNext(iter, u"");
+ assertNext(iter, u"ch");
+ assertFalse("no next", iter.next());
+
+ iter.reset(); // skip after consuming a single code point
+ assertNext(iter, u"0");
+ iter.skipToStrings();
+ assertNext(iter, u"");
+ assertNext(iter, u"ch");
+ assertFalse("no next", iter.next());
+
+ iter.reset(); // skip after one whole range plus one code point ("8", with "9" still pending)
+ iter.nextRange();
+ assertNext(iter, u"8");
+ iter.skipToStrings();
+ assertNext(iter, u"");
+ assertNext(iter, u"ch");
+ assertFalse("no next", iter.next());
+
+ iter.reset(); // three nextRange() calls leave only "ch"; skipToStrings() must then be a no-op
+ iter.nextRange();
+ iter.nextRange();
+ iter.nextRange();
+ iter.skipToStrings();
+ assertNext(iter, u"ch");
+ assertFalse("no next", iter.next());
+}
#include "unicode/unistr.h"
#include "unicode/uniset.h"
#include "unicode/ucnv_err.h"
+#include "unicode/usetiter.h"
#include "intltest.h"
#include "cmemory.h"
void TestDeepPattern();
void TestEmptyString();
+ void assertNext(UnicodeSetIterator &iter, const UnicodeString &expected);
+ void TestSkipToStrings();
+
private:
UBool toPatternAux(UChar32 start, UChar32 end);
* <li>AUTO: 0.90, 1.00, 1.10
* <li>HIDE_IF_WHOLE: 0.90, 1, 1.10
* </ul>
- *
+ *
* @draft ICU 69
- * @provisional This API might change or be removed in a future release.
*/
public static enum TrailingZeroDisplay {
/**
* @draft ICU 69
*/
AUTO,
-
+
/**
* Same as AUTO, but hide trailing zeros after the decimal separator if they are all zero.
*
}
}
- boolean hasStrings() {
- return !strings.isEmpty();
- }
-
/**
* Returns the number of elements in this set (its cardinality)
* Note that the elements of a set may include both individual
return len == 1 && !hasStrings();
}
+ /**
+ * Tells whether this set has any string elements.
+ *
+ * @return true if this set contains multi-character strings or the empty string.
+ * @draft ICU 70
+ */
+ public boolean hasStrings() {
+ return !strings.isEmpty();
+ }
+
/**
* Implementation of UnicodeMatcher API. Returns <tt>true</tt> if
* this set contains any character whose low byte is the given
* UnicodeSetIterator iterates over the contents of a UnicodeSet. It
* iterates over either code points or code point ranges. After all
* code points or ranges have been returned, it returns the
- * multicharacter strings of the UnicodSet, if any.
+ * multicharacter strings of the UnicodeSet, if any.
*
* <p>To iterate over code points and multicharacter strings,
* use a loop like this:
* }
* }
* </pre>
+ *
+ * <p>To iterate over only the strings, start with <code>new UnicodeSetIterator(set).skipToStrings()</code>.
+ *
* <p><b>Warning: </b>For speed, UnicodeSet iteration does not check for concurrent modification.
* Do not alter the UnicodeSet while iterating.
* @author M. Davis
* @stable ICU 2.0
+ * @see UnicodeSet#ranges()
+ * @see UnicodeSet#strings()
+ * @see UnicodeSet#iterator()
*/
public class UnicodeSetIterator {
reset(new UnicodeSet());
}
+ /**
+ * Skips over the remaining code points/ranges, if any.
+ * A following call to next() or nextRange() will yield a string, if there is one.
+ * No-op if next() would return false, or if it would yield a string anyway.
+ *
+ * <p>Returns this iterator so that the call can be chained,
+ * for example <code>new UnicodeSetIterator(set).skipToStrings()</code>.
+ *
+ * @return this
+ * @draft ICU 70
+ * @see UnicodeSet#strings()
+ */
+ public UnicodeSetIterator skipToStrings() {
+ // Finish code point/range iteration.
+ range = endRange; // the range index jumps straight to endRange
+ endElement = -1; // with nextElement = 0 below, nextElement > endElement:
+ nextElement = 0; // the window of pending code points is left empty
+ return this;
+ }
+
/**
* Returns the next element in the set, either a single code point
* or a string. If there are no more elements in the set, return
private int endRange = 0;
private int range = 0;
- /**
- * @internal
- * @deprecated This API is ICU internal only.
- */
- @Deprecated
- public UnicodeSet getSet() {
- return set;
- }
-
- /**
- * @internal
- * @deprecated This API is ICU internal only.
- */
- @Deprecated
- protected int endElement;
- /**
- * @internal
- * @deprecated This API is ICU internal only.
- */
- @Deprecated
- protected int nextElement;
- private Iterator<String> stringIterator = null;
+ private int endElement;
+ private int nextElement;
/**
* Invariant: stringIterator is null when there are no (more) strings remaining
*/
+ private Iterator<String> stringIterator = null;
- /**
- * @internal
- * @deprecated This API is ICU internal only.
- */
- @Deprecated
- protected void loadRange(int aRange) {
+ private void loadRange(int aRange) { // sets nextElement/endElement to the start/end of range number aRange
nextElement = set.getRangeStart(aRange);
endElement = set.getRangeEnd(aRange);
}
{new UnicodeSet('a','z').add('A', 'Z').retain('M','m').complement('X'),
new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]")},
};
+ assertFalse("[a-c].hasStrings()", testList[0][0].hasStrings());
+ assertTrue("[{ll}{ch}a-z].hasStrings()", testList[1][0].hasStrings());
for (int i = 0; i < testList.length; ++i) {
if (!testList[i][0].equals(testList[i][1])) {
return Utility.unescape(s);
}
- /* Test the method public UnicodeSet getSet() */
- @Test
- public void TestGetSet() {
- UnicodeSetIterator us = new UnicodeSetIterator();
- try {
- us.getSet();
- } catch (Exception e) {
- errln("UnicodeSetIterator.getSet() was not suppose to given an " + "an exception.");
- }
- }
-
/* Tests the method public UnicodeSet add(Collection<?> source) */
@Test
public void TestAddCollection() {
assertTrue("frozen containsNone", set.containsNone("def"));
assertFalse("frozen containsSome", set.containsSome("def"));
}
+
+ private void assertNext(UnicodeSetIterator iter, String expected) {
+ assertTrue(expected + ".next()", iter.next()); // advance; the iterator must report another element
+ assertEquals(expected + ".getString()", expected, iter.getString()); // that element, as a string, must equal expected
+ }
+
+ @Test
+ public void TestSkipToStrings() {
+ UnicodeSet set = new UnicodeSet("[0189{}{ch}]"); // code points 0, 1, 8, 9 plus the empty string and "ch"
+ UnicodeSetIterator iter = new UnicodeSetIterator(set).skipToStrings(); // skipping before any next()
+ assertNext(iter, ""); // the first element is then the empty string
+ assertNext(iter, "ch");
+ assertFalse("no next", iter.next());
+
+ iter.reset(); // full iteration without skipping: code points first, then strings
+ assertNext(iter, "0");
+ assertNext(iter, "1");
+ assertNext(iter, "8");
+ assertNext(iter, "9");
+ assertNext(iter, "");
+ assertNext(iter, "ch");
+ assertFalse("no next", iter.next());
+
+ iter.reset(); // skip after consuming a single code point
+ assertNext(iter, "0");
+ iter.skipToStrings();
+ assertNext(iter, "");
+ assertNext(iter, "ch");
+ assertFalse("no next", iter.next());
+
+ iter.reset(); // skip after one whole range plus one code point ("8", with "9" still pending)
+ iter.nextRange();
+ assertNext(iter, "8");
+ iter.skipToStrings();
+ assertNext(iter, "");
+ assertNext(iter, "ch");
+ assertFalse("no next", iter.next());
+
+ iter.reset(); // three nextRange() calls leave only "ch"; skipToStrings() must then be a no-op
+ iter.nextRange();
+ iter.nextRange();
+ iter.nextRange();
+ iter.skipToStrings();
+ assertNext(iter, "ch");
+ assertFalse("no next", iter.next());
+ }
}
* UnicodeSetIterator iterates over the contents of a UnicodeSet. It
* iterates over either code points or code point ranges. After all
* code points or ranges have been returned, it returns the
- * multicharacter strings of the UnicodSet, if any.
+ * multicharacter strings of the UnicodeSet, if any.
*
* <p>To iterate over code points, use a loop like this:
* <pre>
* false. If <tt>codepoint == IS_STRING</tt>, the value is a
* string in the <tt>string</tt> field. Otherwise the value is a
* single code point in the <tt>codepoint</tt> field.
- *
+ *
* <p>The order of iteration is all code points in sorted order,
* followed by all strings in sorted order. <tt>codepointEnd</tt> is
* undefined after calling this method. <tt>string</tt> is
if (stringIterator == null) return false;
codepoint = IS_STRING; // signal that value is actually a string
- string = (String)stringIterator.next();
+ string = stringIterator.next();
if (!stringIterator.hasNext()) stringIterator = null;
return true;
}
* string in the <tt>string</tt> field. Otherwise the value is a
* range of one or more code points from <tt>codepoint</tt> to
* <tt>codepointEnd</tt> inclusive.
- *
+ *
* <p>The order of iteration is all code point ranges in sorted
* order, followed by all strings in sorted order. Ranges are
* disjoint and non-contiguous. <tt>string</tt> is undefined
if (stringIterator == null) return false;
codepoint = IS_STRING; // signal that value is actually a string
- string = (String)stringIterator.next();
+ string = stringIterator.next();
if (!stringIterator.hasNext()) stringIterator = null;
return true;
}
/**
* Resets this iterator to the start of the set.
- * @return
+ * @return
*/
public UnicodeMapIterator<T> reset() {
endRange = map.getRangeCount() - 1;
// both next*() methods will test: if (nextElement <= endElement)
// we set them to fail this test, which will cause them to load the first range
- nextElement = 0;
+ nextElement = 0;
endElement = -1;
range = -1;
static String KATAKANA_ITERATION = "[\u30FD\u30FE]";
static String HIRAGANA_ITERATION = "[\u309D\u309E]";
- //------------------------------------------------------------------
- // AbbreviatedUnicodeSetIterator
- //------------------------------------------------------------------
-
- static class AbbreviatedUnicodeSetIterator extends UnicodeSetIterator {
-
- private boolean abbreviated;
- private int perRange;
-
- public AbbreviatedUnicodeSetIterator() {
- super();
- abbreviated = false;
- }
-
- @Override
- public void reset(UnicodeSet newSet) {
- reset(newSet, false);
- }
-
- public void reset(UnicodeSet newSet, boolean abb) {
- reset(newSet, abb, 100);
+ /**
+ * If abbreviated=true, returns a set which contains only a sampling of the original code points.
+ * density is the approximate total number of code points to be returned for the entire set.
+ *
+ * <p>Each range keeps at most its first (density / range count) + 1 code points.
+ * If abbreviated=false, or if no range needs trimming, the original set itself is returned;
+ * otherwise a thawed clone is trimmed and returned.
+ */
+ private static UnicodeSet abbreviateSet(UnicodeSet set, boolean abbreviated, int density) {
+ if (!abbreviated) {
+ return set;
}
-
- public void reset(UnicodeSet newSet, boolean abb, int density) {
- super.reset(newSet);
- abbreviated = abb;
- perRange = newSet.getRangeCount();
- if (perRange != 0) {
- perRange = density / perRange;
- }
+ int rangeCount = set.getRangeCount();
+ int perRange = rangeCount;
+ if (perRange != 0) {
+ perRange = density / perRange;
}
-
- @Override
- protected void loadRange(int myRange) {
- super.loadRange(myRange);
- if (abbreviated && (endElement > nextElement + perRange)) {
- endElement = nextElement + perRange;
+ boolean unchanged = true;
+ for (int i = 0; i < rangeCount; ++i) {
+ int start = set.getRangeStart(i);
+ int end = set.getRangeEnd(i);
+ int newEnd = start + perRange; // last code point to keep from this range
+ if (end > newEnd) {
+ if (unchanged) {
+ set = set.cloneAsThawed(); // clone lazily, on the first range that needs trimming; from here on only the clone is modified
+ unchanged = false;
+ }
+ set.remove(newEnd + 1, end); // drop the tail of the range, keeping start..newEnd
}
}
+ return set;
}
//--------------------------------------------------------------------
return false;
}
- AbbreviatedUnicodeSetIterator usi = new AbbreviatedUnicodeSetIterator();
- AbbreviatedUnicodeSetIterator usi2 = new AbbreviatedUnicodeSetIterator();
+ UnicodeSetIterator usi = new UnicodeSetIterator();
+ UnicodeSetIterator usi2 = new UnicodeSetIterator();
Transliterator sourceToTarget;
Transliterator targetToSource;
boolean quickRt = TestFmwk.getExhaustiveness() < 10;
- usi.reset(sourceRangeMinusFailures, quickRt, density);
+ usi.reset(abbreviateSet(sourceRangeMinusFailures, quickRt, density));
while (usi.next()) {
int c = usi.codepoint;
if (failSourceTarg.get(d)) continue;
*/
TestFmwk.logln(count + "/" + pairLimit + " Checking starting with " + UTF16.valueOf(c));
- usi2.reset(sourceRangeMinusFailures, quickRt, density);
+ usi2.reset(abbreviateSet(sourceRangeMinusFailures, quickRt, density));
while (usi2.next()) {
int d = usi2.codepoint;
!targetRange.contains(c)) continue;
*/
- usi.reset(targetRangeMinusFailures, quickRt, density);
+ usi.reset(abbreviateSet(targetRangeMinusFailures, quickRt, density));
while (usi.next()) {
int c = usi.codepoint;
!targetRange.contains(d)) continue;
*/
TestFmwk.logln(count + "/" + pairLimit + " Checking starting with " + UTF16.valueOf(c));
- usi2.reset(targetRangeMinusFailures, quickRt, density);
+ usi2.reset(abbreviateSet(targetRangeMinusFailures, quickRt, density));
while (usi2.next()) {