ICU-21525 UnicodeSet.hasStrings(), UnicodeSetIterator.skipToStrings() & C API

author Markus Scherer <markus.icu@gmail.com>

Tue, 29 Jun 2021 17:27:09 +0000 (17:27 +0000)

committer Markus Scherer <markus.icu@gmail.com>

Thu, 1 Jul 2021 00:44:24 +0000 (00:44 +0000)
author Markus Scherer <markus.icu@gmail.com>
Tue, 29 Jun 2021 17:27:09 +0000 (17:27 +0000)
committer Markus Scherer <markus.icu@gmail.com>
Thu, 1 Jul 2021 00:44:24 +0000 (00:44 +0000)
diff --git a/icu4c/source/common/unicode/uniset.h b/icu4c/source/common/unicode/uniset.h

index 8403c4026c3218ac741eacd1d85edfbf8223323c..4ebb5e71a91563d8028c46d686e07bd9307eaddc 100644 (file)
--- a/icu4c/source/common/unicode/uniset.h
+++ b/icu4c/source/common/unicode/uniset.h
@@ -771,8 +771,12 @@ public:
       * Note than the elements of a set may include both individual
       * codepoints and strings.
       *
+     * This is slower than getRangeCount() because
+     * it counts the code points of all ranges.
+     *
       * @return the number of elements in this set (its cardinality).
       * @stable ICU 2.0
+     * @see getRangeCount
       */
      virtual int32_t size(void) const;
  
@@ -784,6 +788,14 @@ public:
       */
      virtual UBool isEmpty(void) const;
  
+#ifndef U_HIDE_DRAFT_API
+    /**
+     * @return true if this set contains multi-character strings or the empty string.
+     * @draft ICU 70
+     */
+    UBool hasStrings() const;
+#endif  // U_HIDE_DRAFT_API
+
      /**
       * Returns true if this set contains the given character.
       * This function works faster with a frozen set.
@@ -1064,8 +1076,14 @@ public:
      /**
       * Returns the character at the given index within this set, where
       * the set is ordered by ascending code point.  If the index is
-     * out of range, return (UChar32)-1.  The inverse of this method is
-     * <code>indexOf()</code>.
+     * out of range for characters, returns (UChar32)-1.
+     * The inverse of this method is <code>indexOf()</code>.
+     *
+     * For iteration, this is slower than UnicodeSetIterator or
+     * getRangeCount()/getRangeStart()/getRangeEnd(),
+     * because for each call it skips linearly over <code>index</code>
+     * characters in the ranges.
+     *
       * @param index an index from 0..size()-1
       * @return the character at the given index, or (UChar32)-1.
       * @stable ICU 2.4
@@ -1567,7 +1585,6 @@ private:
      void swapBuffers(void);
  
      UBool allocateStrings(UErrorCode &status);
-    UBool hasStrings() const;
      int32_t stringsSize() const;
      UBool stringsContains(const UnicodeString &s) const;
  
diff --git a/icu4c/source/common/unicode/uset.h b/icu4c/source/common/unicode/uset.h

index 1d0daf9d09892964b8740d45a76e3cc5287021b1..3621230641c0ce4610d21bdfa408bd8ba5fcf775 100644 (file)
--- a/icu4c/source/common/unicode/uset.h
+++ b/icu4c/source/common/unicode/uset.h
@@ -851,6 +851,16 @@ uset_removeAllStrings(USet* set);
  U_CAPI UBool U_EXPORT2
  uset_isEmpty(const USet* set);
  
+#ifndef U_HIDE_DRAFT_API
+/**
+ * @param set the set
+ * @return true if this set contains multi-character strings or the empty string.
+ * @draft ICU 70
+ */
+U_CAPI UBool U_EXPORT2
+uset_hasStrings(const USet *set);
+#endif  // U_HIDE_DRAFT_API
+
  /**
   * Returns true if the given USet contains the given character.
   * This function works faster with a frozen set.
@@ -901,8 +911,13 @@ uset_indexOf(const USet* set, UChar32 c);
  /**
   * Returns the character at the given index within this set, where
   * the set is ordered by ascending code point.  If the index is
- * out of range, return (UChar32)-1.  The inverse of this method is
- * <code>indexOf()</code>.
+ * out of range for characters, returns (UChar32)-1.
+ * The inverse of this method is <code>indexOf()</code>.
+ *
+ * For iteration, this is slower than uset_getRangeCount()/uset_getItemCount()
+ * with uset_getItem(), because for each call it skips linearly over <code>index</code>
+ * characters in the ranges.
+ *
   * @param set the set
   * @param charIndex an index from 0..size()-1 to obtain the char for
   * @return the character at the given index, or (UChar32)-1.
@@ -912,16 +927,34 @@ U_CAPI UChar32 U_EXPORT2
  uset_charAt(const USet* set, int32_t charIndex);
  
  /**
- * Returns the number of characters and strings contained in the given
- * USet.
+ * Returns the number of characters and strings contained in this set.
+ * The last (uset_getItemCount() - uset_getRangeCount()) items are strings.
+ *
+ * This is slower than uset_getRangeCount() and uset_getItemCount() because
+ * it counts the code points of all ranges.
+ *
   * @param set the set
   * @return a non-negative integer counting the characters and strings
   * contained in set
   * @stable ICU 2.4
+ * @see uset_getRangeCount
   */
  U_CAPI int32_t U_EXPORT2
  uset_size(const USet* set);
  
+#ifndef U_HIDE_DRAFT_API
+/**
+ * @param set the set
+ * @return the number of ranges in this set.
+ * @draft ICU 70
+ * @see uset_getItemCount
+ * @see uset_getItem
+ * @see uset_size
+ */
+U_CAPI int32_t U_EXPORT2
+uset_getRangeCount(const USet *set);
+#endif  // U_HIDE_DRAFT_API
+
  /**
   * Returns the number of items in this set.  An item is either a range
   * of characters or a single multicharacter string.
@@ -935,20 +968,30 @@ uset_getItemCount(const USet* set);
  
  /**
   * Returns an item of this set.  An item is either a range of
- * characters or a single multicharacter string.
+ * characters or a single multicharacter string (which can be the empty string).
+ *
+ * If <code>itemIndex</code> is less than uset_getRangeCount(), then this function returns 0,
+ * and the range is <code>*start</code>..<code>*end</code>.
+ *
+ * If <code>itemIndex</code> is at least uset_getRangeCount() and less than uset_getItemCount(), then
+ * this function copies the string into <code>str[strCapacity]</code> and
+ * returns the length of the string (0 for the empty string).
+ *
+ * If <code>itemIndex</code> is out of range, then this function returns -1.
+ *
+ * Note that 0 is returned for each range as well as for the empty string.
+ *
   * @param set the set
- * @param itemIndex a non-negative integer in the range 0..
- * uset_getItemCount(set)-1
- * @param start pointer to variable to receive first character
- * in range, inclusive
- * @param end pointer to variable to receive last character in range,
- * inclusive
+ * @param itemIndex a non-negative integer in the range 0..uset_getItemCount(set)-1
+ * @param start pointer to variable to receive first character in range, inclusive;
+ *              can be NULL for a string item
+ * @param end pointer to variable to receive last character in range, inclusive;
+ *            can be NULL for a string item
   * @param str buffer to receive the string, may be NULL
   * @param strCapacity capacity of str, or 0 if str is NULL
- * @param ec error code
- * @return the length of the string (>= 2), or 0 if the item is a
- * range, in which case it is the range *start..*end, or -1 if
- * itemIndex is out of range
+ * @param ec error code; U_INDEX_OUTOFBOUNDS_ERROR if the itemIndex is out of range
+ * @return the length of the string (0 or >= 2), or 0 if the item is a range,
+ *         or -1 if the itemIndex is out of range
   * @stable ICU 2.4
   */
  U_CAPI int32_t U_EXPORT2
diff --git a/icu4c/source/common/unicode/usetiter.h b/icu4c/source/common/unicode/usetiter.h

index a817ef72b3ae5ef5108488bb39af1dd2dbb63599..831565415db3cb9f1856ca5c37010cbadd052ec8 100644 (file)
--- a/icu4c/source/common/unicode/usetiter.h
+++ b/icu4c/source/common/unicode/usetiter.h
@@ -60,6 +60,9 @@ class UnicodeString;
   *   }
   * }
   * </pre>
+ *
+ * To iterate over only the strings, start with <code>skipToStrings()</code>.
+ *
   * @author M. Davis
   * @stable ICU 2.4
   */
@@ -170,6 +173,25 @@ class U_COMMON_API UnicodeSetIterator : public UObject {
       */
      const UnicodeString& getString();
  
+#ifndef U_HIDE_DRAFT_API
+    /**
+     * Skips over the remaining code points/ranges, if any.
+     * A following call to next() or nextRange() will yield a string, if there is one.
+     * No-op if next() would return false, or if it would yield a string anyway.
+     *
+     * @return *this
+     * @draft ICU 70
+     * @see UnicodeSet#strings()
+     */
+    inline UnicodeSetIterator &skipToStrings() {
+        // Finish code point/range iteration.
+        range = endRange;
+        endElement = -1;
+        nextElement = 0;
+        return *this;
+    }
+#endif  // U_HIDE_DRAFT_API
+
      /**
       * Advances the iteration position to the next element in the set, 
       * which can be either a single code point or a string.  
@@ -281,13 +303,16 @@ class U_COMMON_API UnicodeSetIterator : public UObject {
       */
      int32_t stringCount;
  
+ private:
+
      /**
       *  Points to the string to use when the caller asks for a
       *  string and the current iteration item is a code point, not a string.
-     *  @internal
       */
      UnicodeString *cpString;
  
+ protected:
+
      /** Copy constructor. Disallowed.
       * @stable ICU 2.4
       */
@@ -306,7 +331,7 @@ class U_COMMON_API UnicodeSetIterator : public UObject {
  };
  
  inline UBool UnicodeSetIterator::isString() const {
-    return codepoint == (UChar32)IS_STRING;
+    return codepoint < 0;
  }
  
  inline UChar32 UnicodeSetIterator::getCodepoint() const {
diff --git a/icu4c/source/common/uset.cpp b/icu4c/source/common/uset.cpp

index a7e3046dbf2b1412e42b86c1add677bb2a674e3c..c131fd91e13e1e8538e935e3aae960439da61393 100644 (file)
--- a/icu4c/source/common/uset.cpp
+++ b/icu4c/source/common/uset.cpp
@@ -196,6 +196,11 @@ uset_isEmpty(const USet* set) {
      return ((const UnicodeSet*) set)->UnicodeSet::isEmpty();
  }
  
+U_CAPI UBool U_EXPORT2
+uset_hasStrings(const USet* set) {
+    return ((const UnicodeSet*) set)->UnicodeSet::hasStrings();
+}
+
  U_CAPI UBool U_EXPORT2
  uset_contains(const USet* set, UChar32 c) {
      return ((const UnicodeSet*) set)->UnicodeSet::contains(c);
@@ -296,6 +301,11 @@ private:
  };
  U_NAMESPACE_END
  
+U_CAPI int32_t U_EXPORT2
+uset_getRangeCount(const USet *set) {
+    return ((const UnicodeSet *)set)->UnicodeSet::getRangeCount();
+}
+
  U_CAPI int32_t U_EXPORT2
  uset_getItemCount(const USet* uset) {
      const UnicodeSet& set = *(const UnicodeSet*)uset;
@@ -330,11 +340,6 @@ uset_getItem(const USet* uset, int32_t itemIndex,
      }
  }
  
-//U_CAPI int32_t U_EXPORT2
-//uset_getRangeCount(const USet* set) {
-//    return ((const UnicodeSet*) set)->getRangeCount();
-//}
-//
  //U_CAPI UBool U_EXPORT2
  //uset_getRange(const USet* set, int32_t rangeIndex,
  //              UChar32* pStart, UChar32* pEnd) {
diff --git a/icu4c/source/test/cintltst/usettest.c b/icu4c/source/test/cintltst/usettest.c

index 9fe2362fa220ba9babb4db754d4d8d29eeb5f082..87f8fc2b38b5af2e185f3165841f6cc619eda984 100644 (file)
--- a/icu4c/source/test/cintltst/usettest.c
+++ b/icu4c/source/test/cintltst/usettest.c
@@ -6,12 +6,15 @@
  * Corporation and others.  All Rights Reserved.
  **********************************************************************
  */
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
  #include "unicode/uset.h"
  #include "unicode/ustring.h"
  #include "cintltst.h"
  #include "cmemory.h"
-#include <stdlib.h>
-#include <string.h>
  
  #define TEST(x) addTest(root, &x, "uset/" # x)
  
@@ -101,6 +104,9 @@ static void TestAPI() {
      /* [ABC] */
      set = uset_open(0x0041, 0x0043);
      expect(set, "ABC", "DEF{ab}", NULL);
+    if(uset_hasStrings(set)) {
+        log_err("uset_hasStrings([ABC]) = true");
+    }
      uset_close(set);
  
      /* [a-c{ab}] */
@@ -113,6 +119,9 @@ static void TestAPI() {
      if(!uset_resemblesPattern(PAT, PAT_LEN, 0)) {
          log_err("uset_resemblesPattern of PAT failed\n");
      }
+    if(!uset_hasStrings(set)) {
+        log_err("uset_hasStrings([a-c{ab}]) = false");
+    }
      expect(set, "abc{ab}", "def{bc}", &ec);
  
      /* [a-d{ab}] */
@@ -167,6 +176,9 @@ static void TestAPI() {
          return;
      }
      expect(set, "0123456789ABCDEFabcdef", "GHIjkl{bc}", NULL);
+    if (uset_size(set) != 22 || uset_getRangeCount(set) != 3 || uset_getItemCount(set) != 3) {
+        log_err("line %d: uset_size()/uset_getRangeCount()/uset_getItemCount() wrong", __LINE__);
+    }
  
      /* [ab] */
      uset_clear(set);
@@ -243,6 +255,9 @@ static void TestAPI() {
          return;
      }
      expect(set, "abcdef{ch}{sch}", "", NULL);
+    if (uset_size(set) != 8 || uset_getRangeCount(set) != 1 || uset_getItemCount(set) != 3) {
+        log_err("line %d: uset_size()/uset_getRangeCount()/uset_getItemCount() wrong", __LINE__);
+    }
  
      uset_retainString(set, u"sch", 3);
      expect(set, "{sch}", "abcdef{ch}", NULL);
@@ -400,10 +415,12 @@ static void expectItems(const USet* set,
      char *pat;
      UErrorCode ec;
      int32_t expectedSize = 0;
+    int32_t rangeCount = uset_getRangeCount(set);
      int32_t itemCount = uset_getItemCount(set);
      int32_t itemIndex = 0;
      UChar32 start = 1, end = 0;
      int32_t itemLen = 0, length;
+    bool isString = false;
  
      ec = U_ZERO_ERROR;
      length = uset_toPattern(set, ustr, sizeof(ustr), TRUE, &ec);
@@ -435,17 +452,26 @@ static void expectItems(const USet* set,
                  return;
              }
  
-            itemLen = uset_getItem(set, itemIndex, &start, &end,
-                                   itemStr, sizeof(itemStr), &ec);
+            // Pass in NULL pointers where we expect them to be ok.
+            if (itemIndex < rangeCount) {
+                itemLen = uset_getItem(set, itemIndex, &start, &end, NULL, 0, &ec);
+            } else {
+                itemLen = uset_getItem(set, itemIndex, NULL, NULL,
+                                       itemStr, UPRV_LENGTHOF(itemStr), &ec);
+                isString = true;
+            }
              if (U_FAILURE(ec) || itemLen < 0) {
                  log_err("FAIL: uset_getItem => %s\n", u_errorName(ec));
                  return;
              }
  
-            if (itemLen == 0) {
+            if (!isString) {
                  log_verbose("Ok: %s item %d is %c-%c\n", pat,
                              itemIndex, oneUCharToChar(start),
                              oneUCharToChar(end));
+                if (itemLen != 0) {
+                    log_err("FAIL: uset_getItem(%d) => length %d\n", itemIndex, itemLen);
+                }
              } else {
                  itemStr[itemLen] = 0;
                  u_UCharsToChars(itemStr, buf, itemLen+1);
@@ -469,7 +495,7 @@ static void expectItems(const USet* set,
              u_charsToUChars(stringStart, ustr, stringLength);
              ustr[stringLength] = 0;
              
-            if (itemLen == 0) {
+            if (!isString) {
                  log_err("FAIL: for %s expect \"%s\" next, but got a char\n",
                          pat, strCopy);
                  return;
@@ -488,18 +514,19 @@ static void expectItems(const USet* set,
              u_charsToUChars(p, ustr, 1);
              c = ustr[0];
  
-            if (itemLen != 0) {
+            if (isString) {
                  log_err("FAIL: for %s expect '%c' next, but got a string\n",
                          pat, *p);
                  return;
              }
  
-            if (c != start++) {
+            if (c != start) {
                  log_err("FAIL: for %s expect '%c' next\n",
                          pat, *p);
                  return;
              }
  
+            ++start;
              ++p;
          }
      }
diff --git a/icu4c/source/test/intltest/csdetest.cpp b/icu4c/source/test/intltest/csdetest.cpp

index d285f0ae0c747c1ccb104fbc215a6f68d4f5188d..69e12de2f7e1342d2a6e0e8ba665bcfaaf5fc8b7 100644 (file)
--- a/icu4c/source/test/intltest/csdetest.cpp
+++ b/icu4c/source/test/intltest/csdetest.cpp
@@ -780,7 +780,7 @@ void CharsetDetectionTest::Ticket6394Test() {
          return;
      }
  
-    UnicodeSet  setOfCharsetNames;    // UnicodSets can hold strings.
+    UnicodeSet  setOfCharsetNames;    // UnicodeSets can hold strings.
      int32_t i;
      for (i=0; i<matchCount; i++) {
          UnicodeString charSetName(ucsdet_getName(matches[i], &status));
diff --git a/icu4c/source/test/intltest/transrt.cpp b/icu4c/source/test/intltest/transrt.cpp

index c1c30d3adcd8dcb7aa0f202c75ceaac0985bd27d..5120fd8cd63983a99c15153f51a94a1803ab6728 100644 (file)
--- a/icu4c/source/test/intltest/transrt.cpp
+++ b/icu4c/source/test/intltest/transrt.cpp
@@ -262,68 +262,42 @@ UBool LegalGreek::isRho(UChar c) {
      return FALSE;
  }
  
-// AbbreviatedUnicodeSetIterator Interface ---------------------------------------------
-//
-//      Iterate over a UnicodeSet, only returning a sampling of the contained code points.
-//        density is the approximate total number of code points to returned for the entire set.
-//
+namespace {
  
-class AbbreviatedUnicodeSetIterator : public UnicodeSetIterator {
-public :
-
-    AbbreviatedUnicodeSetIterator();
-    virtual ~AbbreviatedUnicodeSetIterator();
-    void reset(UnicodeSet& set, UBool abb = FALSE, int32_t density = 100);
-
-    /**
-     * ICU "poor man's RTTI", returns a UClassID for this class.
-     */
-    static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
-
-    /**
-     * ICU "poor man's RTTI", returns a UClassID for the actual class.
-     */
-    virtual inline UClassID getDynamicClassID() const { return getStaticClassID(); }
-
-private :
-    UBool abbreviated;
-    int32_t perRange;           // The maximum number of code points to be returned from each range
-    virtual void loadRange(int32_t range);
-
-    /**
-     * The address of this static class variable serves as this class's ID
-     * for ICU "poor man's RTTI".
-     */
-    static const char fgClassID;
-};
-
-// AbbreviatedUnicodeSetIterator Implementation ---------------------------------------
-
-const char AbbreviatedUnicodeSetIterator::fgClassID=0;
-
-AbbreviatedUnicodeSetIterator::AbbreviatedUnicodeSetIterator() :
-    UnicodeSetIterator(), abbreviated(FALSE) {
-}
-
-AbbreviatedUnicodeSetIterator::~AbbreviatedUnicodeSetIterator() {
-}
-        
-void AbbreviatedUnicodeSetIterator::reset(UnicodeSet& newSet, UBool abb, int32_t density) {
-    UnicodeSetIterator::reset(newSet);
-    abbreviated = abb;
-    perRange = newSet.getRangeCount();
+/**
+ * If abbreviated=true, returns a set which contains only a sampling of the original code points.
+ * density is the approximate total number of code points to be returned for the entire set.
+ */
+const UnicodeSet &abbreviateSet(const UnicodeSet &set, bool abbreviated, int density,
+                                UnicodeSet &copy) {
+    if (!abbreviated) {
+        return set;
+    }
+    int32_t rangeCount = set.getRangeCount();
+    int32_t perRange = rangeCount;
      if (perRange != 0) {
          perRange = density / perRange;
      }
-}
-
-void AbbreviatedUnicodeSetIterator::loadRange(int32_t myRange) {
-    UnicodeSetIterator::loadRange(myRange);
-    if (abbreviated && (endElement > nextElement + perRange)) {
-        endElement = nextElement + perRange;
+    const UnicodeSet *p = &set;
+    bool unchanged = true;
+    for (int32_t i = 0; i < rangeCount; ++i) {
+        int32_t start = set.getRangeStart(i);
+        int32_t end = set.getRangeEnd(i);
+        int32_t newEnd = start + perRange;
+        if (end > newEnd) {
+            if (unchanged) {
+                copy = set;
+                p = &copy;
+                unchanged = false;
+            }
+            copy.remove(newEnd + 1, end);
+        }
      }
+    return *p;
  }
  
+}  // namespace
+
  //--------------------------------------------------------------------
  // RTTest Interface
  //--------------------------------------------------------------------
@@ -587,8 +561,8 @@ void RTTest::test2(UBool quickRt, int32_t density) {
          return;
      }
  
-    AbbreviatedUnicodeSetIterator usi;
-    AbbreviatedUnicodeSetIterator usi2;
+    UnicodeSetIterator usi;
+    UnicodeSetIterator usi2;
  
      parent->logln("Checking that at least one irrelevant character is not NFC'ed");
      // string is from NFC_NO in the UCD
@@ -702,13 +676,14 @@ void RTTest::test2(UBool quickRt, int32_t density) {
  
      UnicodeSet sourceRangeMinusFailures(sourceRange);
      sourceRangeMinusFailures.removeAll(failSourceTarg);
-            
-    usi.reset(sourceRangeMinusFailures, quickRt, density);
+
+    UnicodeSet copy, copy2;
+    usi.reset(abbreviateSet(sourceRangeMinusFailures, quickRt, density, copy));
      for (;;) { 
          if (!usi.next() || usi.isString()) break;
          UChar32 c = usi.getCodepoint();
               
-        usi2.reset(sourceRangeMinusFailures, quickRt, density);
+        usi2.reset(abbreviateSet(sourceRangeMinusFailures, quickRt, density, copy2));
          for (;;) {
              if (!usi2.next() || usi2.isString()) break;
              UChar32 d = usi2.getCodepoint();
@@ -816,7 +791,7 @@ void RTTest::test2(UBool quickRt, int32_t density) {
      targetRangeMinusFailures.removeAll(failTargSource);
      targetRangeMinusFailures.removeAll(failRound);
  
-    usi.reset(targetRangeMinusFailures, quickRt, density);
+    usi.reset(abbreviateSet(targetRangeMinusFailures, quickRt, density, copy));
      UnicodeString targ2;
      UnicodeString reverse2;
      UnicodeString targD;
@@ -830,7 +805,7 @@ void RTTest::test2(UBool quickRt, int32_t density) {
              return;
          }
  
-        usi2.reset(targetRangeMinusFailures, quickRt, density);
+        usi2.reset(abbreviateSet(targetRangeMinusFailures, quickRt, density, copy2));
          for (;;) {
              if (!usi2.next() || usi2.isString())
                  break;
diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp

index a19f229a3a773c9a1d93586dc11f2470b688850b..56bdc6296516139d899c608f83c64d9084f9a85c 100644 (file)
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -99,6 +99,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
      TESTCASE_AUTO(TestUnusedCcc);
      TESTCASE_AUTO(TestDeepPattern);
      TESTCASE_AUTO(TestEmptyString);
+    TESTCASE_AUTO(TestSkipToStrings);
      TESTCASE_AUTO_END;
  }
  
@@ -882,6 +883,8 @@ void UnicodeSetTest::TestStrings() {
      if (U_FAILURE(ec)) {
          errln("FAIL: couldn't construct test sets");
      }
+    assertFalse("[a-c].hasStrings()", testList[0]->hasStrings());
+    assertTrue("[{ll}{ch}a-z].hasStrings()", testList[2]->hasStrings());
  
      for (int32_t i = 0; testList[i] != NULL; i+=2) {
          if (U_SUCCESS(ec)) {
@@ -896,7 +899,7 @@ void UnicodeSetTest::TestStrings() {
          }
          delete testList[i];
          delete testList[i+1];
-    }        
+    }
  }
  
  /**
@@ -4059,3 +4062,49 @@ void UnicodeSetTest::TestEmptyString() {
      assertTrue("frozen containsNone", set.containsNone(u"def"));
      assertFalse("frozen containsSome", set.containsSome(u"def"));
  }
+
+void UnicodeSetTest::assertNext(UnicodeSetIterator &iter, const UnicodeString &expected) {
+    assertTrue(expected + ".next()", iter.next());
+    assertEquals(expected + ".getString()", expected, iter.getString());
+}
+
+void UnicodeSetTest::TestSkipToStrings() {
+    IcuTestErrorCode errorCode(*this, "TestSkipToStrings");
+    UnicodeSet set(u"[0189{}{ch}]", errorCode);
+    UnicodeSetIterator iter(set);
+    assertNext(iter.skipToStrings(), u"");
+    assertNext(iter, u"ch");
+    assertFalse("no next", iter.next());
+
+    iter.reset();
+    assertNext(iter, u"0");
+    assertNext(iter, u"1");
+    assertNext(iter, u"8");
+    assertNext(iter, u"9");
+    assertNext(iter, u"");
+    assertNext(iter, u"ch");
+    assertFalse("no next", iter.next());
+
+    iter.reset();
+    assertNext(iter, u"0");
+    iter.skipToStrings();
+    assertNext(iter, u"");
+    assertNext(iter, u"ch");
+    assertFalse("no next", iter.next());
+
+    iter.reset();
+    iter.nextRange();
+    assertNext(iter, u"8");
+    iter.skipToStrings();
+    assertNext(iter, u"");
+    assertNext(iter, u"ch");
+    assertFalse("no next", iter.next());
+
+    iter.reset();
+    iter.nextRange();
+    iter.nextRange();
+    iter.nextRange();
+    iter.skipToStrings();
+    assertNext(iter, u"ch");
+    assertFalse("no next", iter.next());
+}
diff --git a/icu4c/source/test/intltest/usettest.h b/icu4c/source/test/intltest/usettest.h

index 7d2816a3d990c59020c9adcd066e06628f58ed99..44d1f02dbb68d70d78634f4c426e37413e5d76fe 100644 (file)
--- a/icu4c/source/test/intltest/usettest.h
+++ b/icu4c/source/test/intltest/usettest.h
@@ -19,6 +19,7 @@
  #include "unicode/unistr.h"
  #include "unicode/uniset.h"
  #include "unicode/ucnv_err.h"
+#include "unicode/usetiter.h"
  #include "intltest.h"
  #include "cmemory.h"
  
@@ -96,6 +97,9 @@ private:
      void TestDeepPattern();
      void TestEmptyString();
  
+    void assertNext(UnicodeSetIterator &iter, const UnicodeString &expected);
+    void TestSkipToStrings();
+
  private:
  
      UBool toPatternAux(UChar32 start, UChar32 end);
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/number/NumberFormatter.java b/icu4j/main/classes/core/src/com/ibm/icu/number/NumberFormatter.java

index 7a1f6c0b48e54873ec376cb98e01db52f1592f06..715c0f28ef6b226c5b6147f89aa1ff5a156d4ee5 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/number/NumberFormatter.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/number/NumberFormatter.java
@@ -487,9 +487,8 @@ public final class NumberFormatter {
       * <li>AUTO: 0.90, 1.00, 1.10
       * <li>HIDE_IF_WHOLE: 0.90, 1, 1.10
       * </ul>
-     * 
+     *
       * @draft ICU 69
-     * @provisional This API might change or be removed in a future release.
       */
      public static enum TrailingZeroDisplay {
          /**
@@ -498,7 +497,7 @@ public final class NumberFormatter {
           * @draft ICU 69
           */
          AUTO,
-    
+
          /**
           * Same as AUTO, but hide trailing zeros after the decimal separator if they are all zero.
           *
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java

index d41ff99ea807f23943b68d54da796b703d3cbb97..4053242b87b93bbf7f59d17249fa28c1f924da13 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
@@ -829,10 +829,6 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
          }
      }
  
-    boolean hasStrings() {
-        return !strings.isEmpty();
-    }
-
      /**
       * Returns the number of elements in this set (its cardinality)
       * Note than the elements of a set may include both individual
@@ -860,6 +856,14 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
          return len == 1 && !hasStrings();
      }
  
+    /**
+     * @return true if this set contains multi-character strings or the empty string.
+     * @draft ICU 70
+     */
+    public boolean hasStrings() {
+        return !strings.isEmpty();
+    }
+
      /**
       * Implementation of UnicodeMatcher API.  Returns <tt>true</tt> if
       * this set contains any character whose low byte is the given
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSetIterator.java b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSetIterator.java

index 2780104485edfc1af09090a74eeb63b11c2a7568..88f0c9cd6358aabf17d3b012580752c32494f8a9 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSetIterator.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSetIterator.java
@@ -14,7 +14,7 @@ import java.util.Iterator;
   * UnicodeSetIterator iterates over the contents of a UnicodeSet.  It
   * iterates over either code points or code point ranges.  After all
   * code points or ranges have been returned, it returns the
- * multicharacter strings of the UnicodSet, if any.
+ * multicharacter strings of the UnicodeSet, if any.
   *
   * <p>To iterate over code points and multicharacter strings,
   * use a loop like this:
@@ -34,10 +34,16 @@ import java.util.Iterator;
   *   }
   * }
   * </pre>
+ *
+ * <p>To iterate over only the strings, start with <code>new UnicodeSetIterator(set).skipToStrings()</code>.
+ *
   * <p><b>Warning: </b>For speed, UnicodeSet iteration does not check for concurrent modification.
   * Do not alter the UnicodeSet while iterating.
   * @author M. Davis
   * @stable ICU 2.0
+ * @see UnicodeSet#ranges()
+ * @see UnicodeSet#strings()
+ * @see UnicodeSet#iterator()
   */
  public class UnicodeSetIterator {
  
@@ -94,6 +100,23 @@ public class UnicodeSetIterator {
          reset(new UnicodeSet());
      }
  
+    /**
+     * Skips over the remaining code points/ranges, if any.
+     * A following call to next() or nextRange() will yield a string, if there is one.
+     * No-op if next() would return false, or if it would yield a string anyway.
+     *
+     * @return this
+     * @draft ICU 70
+     * @see UnicodeSet#strings()
+     */
+    public UnicodeSetIterator skipToStrings() {
+        // Finish code point/range iteration.
+        range = endRange;
+        endElement = -1;
+        nextElement = 0;
+        return this;
+    }
+
      /**
       * Returns the next element in the set, either a single code point
       * or a string.  If there are no more elements in the set, return
@@ -234,39 +257,15 @@ public class UnicodeSetIterator {
      private int endRange = 0;
      private int range = 0;
  
-    /**
-     * @internal
-     * @deprecated This API is ICU internal only.
-     */
-    @Deprecated
-    public UnicodeSet getSet() {
-        return set;
-    }
-
-    /**
-     * @internal
-     * @deprecated This API is ICU internal only.
-     */
-    @Deprecated
-    protected int endElement;
-    /**
-     * @internal
-     * @deprecated This API is ICU internal only.
-     */
-    @Deprecated
-    protected int nextElement;
-    private Iterator<String> stringIterator = null;
+    private int endElement;
+    private int nextElement;
  
      /**
       * Invariant: stringIterator is null when there are no (more) strings remaining
       */
+    private Iterator<String> stringIterator = null;
  
-    /**
-     * @internal
-     * @deprecated This API is ICU internal only.
-     */
-    @Deprecated
-    protected void loadRange(int aRange) {
+    private void loadRange(int aRange) {
          nextElement = set.getRangeStart(aRange);
          endElement = set.getRangeEnd(aRange);
      }
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java

index f3c1b0fe08812884f39c5fbbc04fee6c39f0e330..0edc7fd6b06efd4ef26956b8e2f4aac09c96e1ac 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java
@@ -819,6 +819,8 @@ public class UnicodeSetTest extends TestFmwk {
                              {new UnicodeSet('a','z').add('A', 'Z').retain('M','m').complement('X'),
                                  new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]")},
          };
+        assertFalse("[a-c].hasStrings()", testList[0][0].hasStrings());
+        assertTrue("[{ll}{ch}a-z].hasStrings()", testList[1][0].hasStrings());
  
          for (int i = 0; i < testList.length; ++i) {
              if (!testList[i][0].equals(testList[i][1])) {
@@ -2420,17 +2422,6 @@ public class UnicodeSetTest extends TestFmwk {
          return Utility.unescape(s);
      }
  
-    /* Test the method public UnicodeSet getSet() */
-    @Test
-    public void TestGetSet() {
-        UnicodeSetIterator us = new UnicodeSetIterator();
-        try {
-            us.getSet();
-        } catch (Exception e) {
-            errln("UnicodeSetIterator.getSet() was not suppose to given an " + "an exception.");
-        }
-    }
-
      /* Tests the method public UnicodeSet add(Collection<?> source) */
      @Test
      public void TestAddCollection() {
@@ -2840,4 +2831,50 @@ public class UnicodeSetTest extends TestFmwk {
          assertTrue("frozen containsNone", set.containsNone("def"));
          assertFalse("frozen containsSome", set.containsSome("def"));
      }
+
+    private void assertNext(UnicodeSetIterator iter, String expected) {
+        assertTrue(expected + ".next()", iter.next());
+        assertEquals(expected + ".getString()", expected, iter.getString());
+    }
+
+    @Test
+    public void TestSkipToStrings() {
+        UnicodeSet set = new UnicodeSet("[0189{}{ch}]");
+        UnicodeSetIterator iter = new UnicodeSetIterator(set).skipToStrings();
+        assertNext(iter, "");
+        assertNext(iter, "ch");
+        assertFalse("no next", iter.next());
+
+        iter.reset();
+        assertNext(iter, "0");
+        assertNext(iter, "1");
+        assertNext(iter, "8");
+        assertNext(iter, "9");
+        assertNext(iter, "");
+        assertNext(iter, "ch");
+        assertFalse("no next", iter.next());
+
+        iter.reset();
+        assertNext(iter, "0");
+        iter.skipToStrings();
+        assertNext(iter, "");
+        assertNext(iter, "ch");
+        assertFalse("no next", iter.next());
+
+        iter.reset();
+        iter.nextRange();
+        assertNext(iter, "8");
+        iter.skipToStrings();
+        assertNext(iter, "");
+        assertNext(iter, "ch");
+        assertFalse("no next", iter.next());
+
+        iter.reset();
+        iter.nextRange();
+        iter.nextRange();
+        iter.nextRange();
+        iter.skipToStrings();
+        assertNext(iter, "ch");
+        assertFalse("no next", iter.next());
+    }
  }
diff --git a/icu4j/main/tests/framework/src/com/ibm/icu/dev/util/UnicodeMapIterator.java b/icu4j/main/tests/framework/src/com/ibm/icu/dev/util/UnicodeMapIterator.java

index ba32ab007f734a02a3a915d8c73a37c143ecdbd8..657e653db98cbb0bbd0da49b4476ba8a22bd145e 100644 (file)
--- a/icu4j/main/tests/framework/src/com/ibm/icu/dev/util/UnicodeMapIterator.java
+++ b/icu4j/main/tests/framework/src/com/ibm/icu/dev/util/UnicodeMapIterator.java
@@ -17,7 +17,7 @@ import com.ibm.icu.text.UTF16;
   * UnicodeSetIterator iterates over the contents of a UnicodeSet.  It
   * iterates over either code points or code point ranges.  After all
   * code points or ranges have been returned, it returns the
- * multicharacter strings of the UnicodSet, if any.
+ * multicharacter strings of the UnicodeSet, if any.
   *
   * <p>To iterate over code points, use a loop like this:
   * <pre>
@@ -106,7 +106,7 @@ public class UnicodeMapIterator<T> {
       * false.  If <tt>codepoint == IS_STRING</tt>, the value is a
       * string in the <tt>string</tt> field.  Otherwise the value is a
       * single code point in the <tt>codepoint</tt> field.
-     * 
+     *
       * <p>The order of iteration is all code points in sorted order,
       * followed by all strings sorted order.  <tt>codepointEnd</tt> is
       * undefined after calling this method.  <tt>string</tt> is
@@ -135,7 +135,7 @@ public class UnicodeMapIterator<T> {
  
          if (stringIterator == null) return false;
          codepoint = IS_STRING; // signal that value is actually a string
-        string = (String)stringIterator.next();
+        string = stringIterator.next();
          if (!stringIterator.hasNext()) stringIterator = null;
          return true;
      }
@@ -147,7 +147,7 @@ public class UnicodeMapIterator<T> {
       * string in the <tt>string</tt> field.  Otherwise the value is a
       * range of one or more code points from <tt>codepoint</tt> to
       * <tt>codepointeEnd</tt> inclusive.
-     * 
+     *
       * <p>The order of iteration is all code points ranges in sorted
       * order, followed by all strings sorted order.  Ranges are
       * disjoint and non-contiguous.  <tt>string</tt> is undefined
@@ -180,7 +180,7 @@ public class UnicodeMapIterator<T> {
  
          if (stringIterator == null) return false;
          codepoint = IS_STRING; // signal that value is actually a string
-        string = (String)stringIterator.next();
+        string = stringIterator.next();
          if (!stringIterator.hasNext()) stringIterator = null;
          return true;
      }
@@ -198,13 +198,13 @@ public class UnicodeMapIterator<T> {
  
      /**
       * Resets this iterator to the start of the set.
-     * @return 
+     * @return
       */
      public UnicodeMapIterator<T> reset() {
          endRange = map.getRangeCount() - 1;
          // both next*() methods will test: if (nextElement <= endElement)
          // we set them to fail this test, which will cause them to load the first range
-        nextElement = 0; 
+        nextElement = 0;
          endElement = -1;
          range = -1;
  
diff --git a/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/RoundTripTest.java b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/RoundTripTest.java

index 2cbf17310a6f9e0152e4354c396822119f5bb3ee..8284cd09fe7980715b4cfee68700e41cea82539c 100644 (file)
--- a/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/RoundTripTest.java
+++ b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/RoundTripTest.java
@@ -77,45 +77,33 @@ public class RoundTripTest extends TestFmwk {
      static String KATAKANA_ITERATION = "[\u30FD\u30FE]";
      static String HIRAGANA_ITERATION = "[\u309D\u309E]";
  
-    //------------------------------------------------------------------
-    // AbbreviatedUnicodeSetIterator
-    //------------------------------------------------------------------
-
-    static class AbbreviatedUnicodeSetIterator extends UnicodeSetIterator {
-
-        private boolean abbreviated;
-        private int perRange;
-
-        public AbbreviatedUnicodeSetIterator() {
-            super();
-            abbreviated = false;
-        }
-
-        @Override
-        public void reset(UnicodeSet newSet) {
-            reset(newSet, false);
-        }
-
-        public void reset(UnicodeSet newSet, boolean abb) {
-            reset(newSet, abb, 100);
+    /**
+     * If abbreviated=true, returns a set which contains only a sampling of the original code points.
+     * density is the approximate total number of code points to be returned for the entire set.
+     */
+    private static UnicodeSet abbreviateSet(UnicodeSet set, boolean abbreviated, int density) {
+        if (!abbreviated) {
+            return set;
          }
-
-        public void reset(UnicodeSet newSet, boolean abb, int density) {
-            super.reset(newSet);
-            abbreviated = abb;
-            perRange = newSet.getRangeCount();
-            if (perRange != 0) {
-                perRange = density / perRange;
-            }
+        int rangeCount = set.getRangeCount();
+        int perRange = rangeCount;
+        if (perRange != 0) {
+            perRange = density / perRange;
          }
-
-        @Override
-        protected void loadRange(int myRange) {
-            super.loadRange(myRange);
-            if (abbreviated && (endElement > nextElement + perRange)) {
-                endElement = nextElement + perRange;
+        boolean unchanged = true;
+        for (int i = 0; i < rangeCount; ++i) {
+            int start = set.getRangeStart(i);
+            int end = set.getRangeEnd(i);
+            int newEnd = start + perRange;
+            if (end > newEnd) {
+                if (unchanged) {
+                    set = set.cloneAsThawed();
+                    unchanged = false;
+                }
+                set.remove(newEnd + 1, end);
              }
          }
+        return set;
      }
  
      //--------------------------------------------------------------------
@@ -1295,8 +1283,8 @@ public class RoundTripTest extends TestFmwk {
              return false;
          }
  
-        AbbreviatedUnicodeSetIterator usi = new AbbreviatedUnicodeSetIterator();
-        AbbreviatedUnicodeSetIterator usi2 = new AbbreviatedUnicodeSetIterator();
+        UnicodeSetIterator usi = new UnicodeSetIterator();
+        UnicodeSetIterator usi2 = new UnicodeSetIterator();
  
          Transliterator sourceToTarget;
          Transliterator targetToSource;
@@ -1454,7 +1442,7 @@ public class RoundTripTest extends TestFmwk {
  
              boolean quickRt = TestFmwk.getExhaustiveness() < 10;
  
-            usi.reset(sourceRangeMinusFailures, quickRt, density);
+            usi.reset(abbreviateSet(sourceRangeMinusFailures, quickRt, density));
  
              while (usi.next()) {
                  int c = usi.codepoint;
@@ -1466,7 +1454,7 @@ public class RoundTripTest extends TestFmwk {
                      if (failSourceTarg.get(d)) continue;
                   */
                  TestFmwk.logln(count + "/" + pairLimit + " Checking starting with " + UTF16.valueOf(c));
-                usi2.reset(sourceRangeMinusFailures, quickRt, density);
+                usi2.reset(abbreviateSet(sourceRangeMinusFailures, quickRt, density));
  
                  while (usi2.next()) {
                      int d = usi2.codepoint;
@@ -1561,7 +1549,7 @@ public class RoundTripTest extends TestFmwk {
                      !targetRange.contains(c)) continue;
               */
  
-            usi.reset(targetRangeMinusFailures, quickRt, density);
+            usi.reset(abbreviateSet(targetRangeMinusFailures, quickRt, density));
  
              while (usi.next()) {
                  int c = usi.codepoint;
@@ -1574,7 +1562,7 @@ public class RoundTripTest extends TestFmwk {
                          !targetRange.contains(d)) continue;
                   */
                  TestFmwk.logln(count + "/" + pairLimit + " Checking starting with " + UTF16.valueOf(c));
-                usi2.reset(targetRangeMinusFailures, quickRt, density);
+                usi2.reset(abbreviateSet(targetRangeMinusFailures, quickRt, density));
  
                  while (usi2.next()) {
author	Markus Scherer <markus.icu@gmail.com>
	Tue, 29 Jun 2021 17:27:09 +0000 (17:27 +0000)
committer	Markus Scherer <markus.icu@gmail.com>
	Thu, 1 Jul 2021 00:44:24 +0000 (00:44 +0000)
icu4c/source/common/unicode/uniset.h		patch \| blob \| history
icu4c/source/common/unicode/uset.h		patch \| blob \| history
icu4c/source/common/unicode/usetiter.h		patch \| blob \| history
icu4c/source/common/uset.cpp		patch \| blob \| history
icu4c/source/test/cintltst/usettest.c		patch \| blob \| history
icu4c/source/test/intltest/csdetest.cpp		patch \| blob \| history
icu4c/source/test/intltest/transrt.cpp		patch \| blob \| history
icu4c/source/test/intltest/usettest.cpp		patch \| blob \| history
icu4c/source/test/intltest/usettest.h		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/number/NumberFormatter.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSetIterator.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java		patch \| blob \| history
icu4j/main/tests/framework/src/com/ibm/icu/dev/util/UnicodeMapIterator.java		patch \| blob \| history
icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/RoundTripTest.java		patch \| blob \| history