ICU-6065 UnicodeSet::closeOver(simple case folding)

author Markus Scherer <markus.icu@gmail.com>

Thu, 2 Mar 2023 00:25:11 +0000 (00:25 +0000)

committer Markus Scherer <markus.icu@gmail.com>

Thu, 2 Mar 2023 16:12:57 +0000 (08:12 -0800)
author Markus Scherer <markus.icu@gmail.com>
Thu, 2 Mar 2023 00:25:11 +0000 (00:25 +0000)
committer Markus Scherer <markus.icu@gmail.com>
Thu, 2 Mar 2023 16:12:57 +0000 (08:12 -0800)
diff --git a/icu4c/source/common/characterproperties.cpp b/icu4c/source/common/characterproperties.cpp

index 470e050479fe5e924445c14fa869a4a2d91c0b00..978e6761cee32221af44dc2aace9831022b110f3 100644 (file)
--- a/icu4c/source/common/characterproperties.cpp
+++ b/icu4c/source/common/characterproperties.cpp
@@ -377,22 +377,30 @@ UCPMap *makeMap(UProperty property, UErrorCode &errorCode) {
  
  }  // namespace
  
-U_NAMESPACE_USE
+U_NAMESPACE_BEGIN
  
-U_CAPI const USet * U_EXPORT2
-u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode) {
-    if (U_FAILURE(*pErrorCode)) { return nullptr; }
+const UnicodeSet *CharacterProperties::getBinaryPropertySet(UProperty property, UErrorCode &errorCode) {
+    if (U_FAILURE(errorCode)) { return nullptr; }
      if (property < 0 || UCHAR_BINARY_LIMIT <= property) {
-        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
+        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
          return nullptr;
      }
      Mutex m(&cpMutex);
      UnicodeSet *set = sets[property];
      if (set == nullptr) {
-        sets[property] = set = makeSet(property, *pErrorCode);
+        sets[property] = set = makeSet(property, errorCode);
      }
-    if (U_FAILURE(*pErrorCode)) { return nullptr; }
-    return set->toUSet();
+    return set;
+}
+
+U_NAMESPACE_END
+
+U_NAMESPACE_USE
+
+U_CAPI const USet * U_EXPORT2
+u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode) {  // C API wrapper around CharacterProperties::getBinaryPropertySet()
+    const UnicodeSet *set = CharacterProperties::getBinaryPropertySet(property, *pErrorCode);
+    return U_SUCCESS(*pErrorCode) ? set->toUSet() : nullptr;  // on failure, return nullptr without dereferencing set
  }
  
  U_CAPI const UCPMap * U_EXPORT2
diff --git a/icu4c/source/common/ucase.cpp b/icu4c/source/common/ucase.cpp

index de5e046fb0318db7c74705db78508a98f3aedda5..392e1266ae4429c4ef76c9d3e02b67502c9daa3e 100644 (file)
--- a/icu4c/source/common/ucase.cpp
+++ b/icu4c/source/common/ucase.cpp
@@ -205,37 +205,7 @@ static const char16_t iDotTilde[3] = { 0x69, 0x307, 0x303 };
  
  U_CFUNC void U_EXPORT2
  ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
-    uint16_t props;
-
-    /*
-     * Hardcode the case closure of i and its relatives and ignore the
-     * data file data for these characters.
-     * The Turkic dotless i and dotted I with their case mapping conditions
-     * and case folding option make the related characters behave specially.
-     * This code matches their closure behavior to their case folding behavior.
-     */
-
-    switch(c) {
-    case 0x49:
-        /* regular i and I are in one equivalence class */
-        sa->add(sa->set, 0x69);
-        return;
-    case 0x69:
-        sa->add(sa->set, 0x49);
-        return;
-    case 0x130:
-        /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
-        sa->addString(sa->set, iDot, 2);
-        return;
-    case 0x131:
-        /* dotless i is in a class by itself */
-        return;
-    default:
-        /* otherwise use the data file data */
-        break;
-    }
-
-    props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
+    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
      if(!UCASE_HAS_EXCEPTION(props)) {
          if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
              /* add the one simple case mapping, no matter what type it is */
@@ -249,19 +219,42 @@ ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
           * c has exceptions, so there may be multiple simple and/or
           * full case mappings. Add them all.
           */
-        const uint16_t *pe0, *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
-        const char16_t *closure;
+        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
          uint16_t excWord=*pe++;
-        int32_t idx, closureLength, fullLength, length;
-
-        pe0=pe;
+        const uint16_t *pe0=pe;
+
+        // Hardcode the case closure of i and its relatives and ignore the
+        // data file data for these characters.
+        // The Turkic dotless i and dotted I with their case mapping conditions
+        // and case folding option make the related characters behave specially.
+        // This code matches their closure behavior to their case folding behavior.
+        if (excWord&UCASE_EXC_CONDITIONAL_FOLD) {
+            // These characters have Turkic case foldings. Hardcode their closure.
+            if (c == 0x49) {
+                // Regular i and I are in one equivalence class.
+                sa->add(sa->set, 0x69);
+                return;
+            } else if (c == 0x130) {
+                // Dotted I is in a class with <0069 0307>
+                // (for canonical equivalence with <0049 0307>).
+                sa->addString(sa->set, iDot, 2);
+                return;
+            }
+        } else if (c == 0x69) {
+            sa->add(sa->set, 0x49);
+            return;
+        } else if (c == 0x131) {
+            // Dotless i is in a class by itself.
+            return;
+        }
  
          /* add all simple case mappings */
-        for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
+        for(int32_t idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
              if(HAS_SLOT(excWord, idx)) {
                  pe=pe0;
-                GET_SLOT_VALUE(excWord, idx, pe, c);
-                sa->add(sa->set, c);
+                UChar32 mapping;
+                GET_SLOT_VALUE(excWord, idx, pe, mapping);
+                sa->add(sa->set, mapping);
              }
          }
          if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) {
@@ -272,6 +265,8 @@ ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
          }
  
          /* get the closure string pointer & length */
+        const char16_t *closure;
+        int32_t closureLength;
          if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
              pe=pe0;
              GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
@@ -285,6 +280,7 @@ ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
          /* add the full case folding */
          if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
              pe=pe0;
+            int32_t fullLength;
              GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
  
              /* start of full case mapping strings */
@@ -297,7 +293,7 @@ ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
              fullLength>>=4;
  
              /* add the full case folding string */
-            length=fullLength&0xf;
+            int32_t length=fullLength&0xf;
              if(length!=0) {
                  sa->addString(sa->set, (const char16_t *)pe, length);
                  pe+=length;
@@ -313,9 +309,146 @@ ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
          }
  
          /* add each code point in the closure string */
-        for(idx=0; idx<closureLength;) {
-            U16_NEXT_UNSAFE(closure, idx, c);
-            sa->add(sa->set, c);
+        for(int32_t idx=0; idx<closureLength;) {
+            UChar32 mapping;
+            U16_NEXT_UNSAFE(closure, idx, mapping);
+            sa->add(sa->set, mapping);
+        }
+    }
+}
+
+namespace {
+
+/**
+ * Adds t, a candidate simple case closure mapping of c, to the set via sa->add(),
+ * except for the hardcoded pairs below, which have no actual scf relationship.
+ * TODO: Unicode should probably add the corresponding scf mappings.
+ * See https://crbug.com/v8/13377 and Unicode-internal PAG issue #23.
+ * If & when those scf mappings are added, we should be able to remove all of these exceptions.
+ */
+void addOneSimpleCaseClosure(UChar32 c, UChar32 t, const USetAdder *sa) {
+    switch (c) {
+    case 0x0390:  // 0x0390 <-> 0x1FD3 is excluded in both directions (see case 0x1FD3)
+        if (t == 0x1FD3) { return; }
+        break;
+    case 0x03B0:  // likewise 0x03B0 <-> 0x1FE3
+        if (t == 0x1FE3) { return; }
+        break;
+    case 0x1FD3:
+        if (t == 0x0390) { return; }
+        break;
+    case 0x1FE3:
+        if (t == 0x03B0) { return; }
+        break;
+    case 0xFB05:  // likewise 0xFB05 <-> 0xFB06
+        if (t == 0xFB06) { return; }
+        break;
+    case 0xFB06:
+        if (t == 0xFB05) { return; }
+        break;
+    default:
+        break;
+    }
+    sa->add(sa->set, t);  // not one of the excluded pairs: add the mapping
+}
+
+}  // namespace
+
+U_CFUNC void U_EXPORT2
+ucase_addSimpleCaseClosure(UChar32 c, const USetAdder *sa) {  // case closure of c using only scf-relevant mappings; results go to sa
+    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
+    if(!UCASE_HAS_EXCEPTION(props)) {
+        if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
+            /* add the one simple case mapping, no matter what type it is */
+            int32_t delta=UCASE_GET_DELTA(props);
+            if(delta!=0) {
+                sa->add(sa->set, c+delta);
+            }
+        }
+    } else {
+        // c has exceptions. Add the mappings relevant for scf=Simple_Case_Folding.
+        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
+        uint16_t excWord=*pe++;
+        const uint16_t *pe0=pe;  // saved position just past excWord; pe is reset to it before each slot lookup
+
+        // Hardcode the case closure of i and its relatives and ignore the
+        // data file data for these characters, like in ucase_addCaseClosure().
+        if (excWord&UCASE_EXC_CONDITIONAL_FOLD) {
+            // These characters have Turkic case foldings. Hardcode their closure.
+            if (c == 0x49) {
+                // Regular i and I are in one equivalence class.
+                sa->add(sa->set, 0x69);
+                return;
+            } else if (c == 0x130) {
+                // For scf=Simple_Case_Folding, dotted I is in a class by itself.
+                return;
+            }
+        } else if (c == 0x69) {
+            sa->add(sa->set, 0x49);
+            return;
+        } else if (c == 0x131) {
+            // Dotless i is in a class by itself.
+            return;
+        }
+
+        // Add all simple case mappings.
+        for(int32_t idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
+            if(HAS_SLOT(excWord, idx)) {
+                pe=pe0;
+                UChar32 mapping;
+                GET_SLOT_VALUE(excWord, idx, pe, mapping);
+                addOneSimpleCaseClosure(c, mapping, sa);
+            }
+        }
+        if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) {
+            pe=pe0;
+            int32_t delta;
+            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
+            UChar32 mapping = (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;  // the excWord flag selects whether delta is added or subtracted
+            addOneSimpleCaseClosure(c, mapping, sa);
+        }
+
+        /* get the closure string pointer & length */
+        const char16_t *closure;
+        int32_t closureLength;
+        if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
+            pe=pe0;
+            GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
+            closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
+            closure=(const char16_t *)pe+1; /* behind this slot, unless there are full case mappings */
+        } else {
+            closureLength=0;
+            closure=nullptr;
+        }
+
+        // Skip the full case mappings.
+        if(closureLength > 0 && HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {  // only needed to locate a non-empty closure string
+            pe=pe0;
+            int32_t fullLength;
+            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
+
+            /* start of full case mapping strings */
+            ++pe;
+
+            fullLength&=0xffff; /* bits 16 and higher are reserved */
+
+            // Skip all 4 full case mappings.
+            pe+=fullLength&UCASE_FULL_LOWER;
+            fullLength>>=4;
+            pe+=fullLength&0xf;
+            fullLength>>=4;
+            pe+=fullLength&0xf;
+            fullLength>>=4;
+            pe+=fullLength;  // masked to 16 bits above and shifted three times, so only the last length remains
+
+            closure=(const char16_t *)pe; /* behind full case mappings */
+        }
+
+        // Add each code point in the closure string whose scf maps back to c.
+        for(int32_t idx=0; idx<closureLength;) {
+            UChar32 mapping;
+            U16_NEXT_UNSAFE(closure, idx, mapping);
+            addOneSimpleCaseClosure(c, mapping, sa);  // filters out the hardcoded non-scf pairs
          }
      }
  }
diff --git a/icu4c/source/common/ucase.h b/icu4c/source/common/ucase.h

index d1c3183b240e219ce90ce7e8de0249d4aca5e4e9..e03b311870f8cdbc940d5f01552163038ec1dcd7 100644 (file)
--- a/icu4c/source/common/ucase.h
+++ b/icu4c/source/common/ucase.h
@@ -108,6 +108,10 @@ ucase_fold(UChar32 c, uint32_t options);
  U_CFUNC void U_EXPORT2
  ucase_addCaseClosure(UChar32 c, const USetAdder *sa);
  
+/** Case closure with only scf=Simple_Case_Folding. */
+U_CFUNC void U_EXPORT2
+ucase_addSimpleCaseClosure(UChar32 c, const USetAdder *sa);
+
  /**
   * Maps the string to single code points and adds the associated case closure
   * mappings.
diff --git a/icu4c/source/common/unicode/uniset.h b/icu4c/source/common/unicode/uniset.h

index 61435e7ac93561627e178e93bc93d79d15668be7..84774d9f36ecf675bc5d3eaa6713fe4cbd709af7 100644 (file)
--- a/icu4c/source/common/unicode/uniset.h
+++ b/icu4c/source/common/unicode/uniset.h
@@ -430,7 +430,9 @@ public:
       * description for the syntax of the pattern language.
       * @param pattern a string specifying what characters are in the set
       * @param options bitmask for options to apply to the pattern.
-     * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
+     * Valid options are USET_IGNORE_SPACE and
+     * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
+     * These case options are mutually exclusive.
       * @param symbols a symbol table mapping variable names to values
       * and stand-in characters to UnicodeSets; may be nullptr
       * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
@@ -450,7 +452,9 @@ public:
       * @param pos on input, the position in pattern at which to start parsing.
       * On output, the position after the last character parsed.
       * @param options bitmask for options to apply to the pattern.
-     * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
+     * Valid options are USET_IGNORE_SPACE and
+     * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
+     * These case options are mutually exclusive.
       * @param symbols a symbol table mapping variable names to values
       * and stand-in characters to UnicodeSets; may be nullptr
       * @param status input-output error code
@@ -645,7 +649,9 @@ public:
       * A frozen set will not be modified.
       * @param pattern a string specifying what characters are in the set
       * @param options bitmask for options to apply to the pattern.
-     * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
+     * Valid options are USET_IGNORE_SPACE and
+     * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
+     * These case options are mutually exclusive.
       * @param symbols a symbol table mapping variable names to
       * values and stand-ins to UnicodeSets; may be nullptr
       * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
@@ -683,7 +689,9 @@ public:
       * pattern.length() if the closing ']' is the last character of
       * the pattern string.
       * @param options bitmask for options to apply to the pattern.
-     * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
+     * Valid options are USET_IGNORE_SPACE and
+     * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
+     * These case options are mutually exclusive.
       * @param symbols a symbol table mapping variable names to
       * values and stand-ins to UnicodeSets; may be nullptr
       * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
@@ -1390,7 +1398,7 @@ public:
  
      /**
       * Close this set over the given attribute.  For the attribute
-     * USET_CASE, the result is to modify this set so that:
+     * USET_CASE_INSENSITIVE, the result is to modify this set so that:
       *
       * 1. For each character or string 'a' in this set, all strings or
       * characters 'b' such that foldCase(a) == foldCase(b) are added
@@ -1408,8 +1416,10 @@ public:
       * A frozen set will not be modified.
       *
       * @param attribute bitmask for attributes to close over.
-     * Currently only the USET_CASE bit is supported.  Any undefined bits
-     * are ignored.
+     * Valid options:
+     * At most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
+     * These case options are mutually exclusive.
+     * Unrelated options bits are ignored.
       * @return a reference to this set.
       * @stable ICU 4.2
       */
@@ -1579,6 +1589,9 @@ private:
                        int32_t depth,
                        UErrorCode& ec);
  
+    void closeOverCaseInsensitive(bool simple);
+    void closeOverAddCaseMappings();
+
      //----------------------------------------------------------------
      // Implementation: Utility methods
      //----------------------------------------------------------------
diff --git a/icu4c/source/common/unicode/uset.h b/icu4c/source/common/unicode/uset.h

index 5dd890e148d07e9b267a53dbbe8673eb868e06a9..ee4e0036d2289b9075042bc6144bd0c86ada3848 100644 (file)
--- a/icu4c/source/common/unicode/uset.h
+++ b/icu4c/source/common/unicode/uset.h
@@ -53,6 +53,12 @@ typedef struct USet USet;
  /**
   * Bitmask values to be passed to uset_openPatternOptions() or
   * uset_applyPattern() taking an option parameter.
+ *
+ * Use at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
+ * These case options are mutually exclusive.
+ *
+ * Undefined options bits are ignored, and reserved for future use.
+ *
   * @stable ICU 2.4
   */
  enum {
@@ -60,13 +66,13 @@ enum {
       * Ignore white space within patterns unless quoted or escaped.
       * @stable ICU 2.4
       */
-    USET_IGNORE_SPACE = 1,  
+    USET_IGNORE_SPACE = 1,
  
      /**
       * Enable case insensitive matching.  E.g., "[ab]" with this flag
       * will match 'a', 'A', 'b', and 'B'.  "[^ab]" with this flag will
       * match all except 'a', 'A', 'b', and 'B'. This performs a full
-     * closure over case mappings, e.g. U+017F for s.
+     * closure over case mappings, e.g. 'ſ' (U+017F long s) for 's'.
       *
       * The resulting set is a superset of the input for the code points but
       * not for the strings.
@@ -88,17 +94,36 @@ enum {
       *
       * @stable ICU 2.4
       */
-    USET_CASE_INSENSITIVE = 2,  
+    USET_CASE_INSENSITIVE = 2,
  
      /**
-     * Enable case insensitive matching.  E.g., "[ab]" with this flag
-     * will match 'a', 'A', 'b', and 'B'.  "[^ab]" with this flag will
-     * match all except 'a', 'A', 'b', and 'B'. This adds the lower-,
-     * title-, and uppercase mappings as well as the case folding
+     * Adds all case mappings for each element in the set.
+     * This adds the full lower-, title-, and uppercase mappings as well as the full case folding
       * of each existing element in the set.
+     *
+     * Unlike the “case insensitive” options, this does not perform a closure.
+     * For example, it does not add 'ſ' (U+017F long s) for 's',
+     * 'K' (U+212A Kelvin sign) for 'k', or replace set strings by their case-folded versions.
+     *
       * @stable ICU 3.2
       */
-    USET_ADD_CASE_MAPPINGS = 4
+    USET_ADD_CASE_MAPPINGS = 4,
+
+#ifndef U_HIDE_DRAFT_API
+    /**
+     * Enable case insensitive matching.
+     * Same as USET_CASE_INSENSITIVE but using only Simple_Case_Folding (scf) mappings,
+     * which map each code point to one code point,
+     * not full Case_Folding (cf) mappings, which map some code points to multiple code points.
+     *
+     * This is designed for case-insensitive matches, for example in certain
+     * regular expression implementations where only Simple_Case_Folding mappings are used,
+     * such as in ECMAScript (JavaScript) regular expressions.
+     *
+     * @draft ICU 73
+     */
+    USET_SIMPLE_CASE_INSENSITIVE = 6
+#endif  // U_HIDE_DRAFT_API
  };
  
  /**
@@ -299,7 +324,9 @@ uset_openPattern(const UChar* pattern, int32_t patternLength,
   * @param patternLength the length of the pattern, or -1 if null
   * terminated
   * @param options bitmask for options to apply to the pattern.
- * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
+ * Valid options are USET_IGNORE_SPACE and
+ * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
+ * These case options are mutually exclusive.
   * @param ec the error code
   * @stable ICU 2.4
   */
@@ -414,7 +441,10 @@ uset_set(USet* set,
   *                          The character at pattern[0] must be a '['.
   * @param patternLength     The length of the UChar string. -1 if NUL terminated.
   * @param options           A bitmask for options to apply to the pattern.
- *                          Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
+ *                          Valid options are USET_IGNORE_SPACE and
+ *                          at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS,
+ *                          USET_SIMPLE_CASE_INSENSITIVE.
+ *                          These case options are mutually exclusive.
   * @param status            Returns an error if the pattern cannot be parsed.
   * @return                  Upon successful parse, the value is either
   *                          the index of the character after the closing ']' 
@@ -804,7 +834,7 @@ uset_clear(USet* set);
  
  /**
   * Close this set over the given attribute.  For the attribute
- * USET_CASE, the result is to modify this set so that:
+ * USET_CASE_INSENSITIVE, the result is to modify this set so that:
   *
   * 1. For each character or string 'a' in this set, all strings or
   * characters 'b' such that foldCase(a) == foldCase(b) are added
@@ -824,8 +854,10 @@ uset_clear(USet* set);
   * @param set the set
   *
   * @param attributes bitmask for attributes to close over.
- * Currently only the USET_CASE bit is supported.  Any undefined bits
- * are ignored.
+ * Valid options:
+ * At most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
+ * These case options are mutually exclusive.
+ * Unrelated options bits are ignored.
   * @stable ICU 4.2
   */
  U_CAPI void U_EXPORT2
diff --git a/icu4c/source/common/uniset_closure.cpp b/icu4c/source/common/uniset_closure.cpp

index 1f1fbcf9f8e1118cca45f6183ecb97cdd1ee63a5..173a5cbaaef66178c309522da47d4f17741bd001 100644 (file)
--- a/icu4c/source/common/uniset_closure.cpp
+++ b/icu4c/source/common/uniset_closure.cpp
@@ -25,9 +25,11 @@
  #include "unicode/locid.h"
  #include "unicode/parsepos.h"
  #include "unicode/uniset.h"
+#include "unicode/utf16.h"
  #include "cmemory.h"
  #include "ruleiter.h"
  #include "ucase.h"
+#include "uprops.h"
  #include "util.h"
  #include "uvector.h"
  
@@ -149,102 +151,208 @@ addCaseMapping(UnicodeSet &set, int32_t result, const char16_t *full, UnicodeStr
      // see ucase.h
  }
  
+namespace {
+
+/** For case closure on a large set, look only at code points with relevant properties: returns src itself, or subset narrowed to src's Case_Sensitive code points. */
+const UnicodeSet &maybeOnlyCaseSensitive(const UnicodeSet &src, UnicodeSet &subset) {
+    // The subset must have been constructed with all code points,
+    // so that the retainAll() intersection effectively copies all single code points from src.
+    U_ASSERT(subset.contains(0, 0x10ffff));
+    if (src.size() < 30) {  // small set: use src as-is and skip the intersection
+        return src;
+    }
+    // Return the intersection of the src code points with Case_Sensitive ones.
+    UErrorCode errorCode = U_ZERO_ERROR;
+    const UnicodeSet *sensitive =
+        CharacterProperties::getBinaryPropertySet(UCHAR_CASE_SENSITIVE, errorCode);
+    if (U_FAILURE(errorCode)) {
+        return src;  // property set unavailable: fall back to the unfiltered src
+    }
+    // Start by copying the "smaller" set.
+    // (We "copy" by intersecting all Unicode *code points* with the first set,
+    // which omits any strings.)
+    if (src.getRangeCount() > sensitive->getRangeCount()) {  // "smaller" is judged by range count
+        subset.retainAll(*sensitive);
+        subset.retainAll(src);
+    } else {
+        subset.retainAll(src);
+        subset.retainAll(*sensitive);
+    }
+    return subset;  // either order yields the same intersection
+}
+
+// Per-character scf = Simple_Case_Folding of a string.
+// (Normally when we case-fold a string we use full case foldings.)
+bool scfString(const UnicodeString &s, UnicodeString &scf) {  // true: scf receives the folded string; false: nothing changes and scf is left untouched
+    // Iterate over the raw buffer for best performance.
+    const char16_t *p = s.getBuffer();
+    int32_t length = s.length();
+    // Loop while not needing modification.
+    for (int32_t i = 0; i < length;) {
+        UChar32 c;
+        U16_NEXT(p, i, length, c);  // post-increments i
+        UChar32 scfChar = u_foldCase(c, U_FOLD_CASE_DEFAULT);
+        if (scfChar != c) {
+            // Copy the characters before c.
+            scf.setTo(p, i - U16_LENGTH(c));  // i is already past c, so back up by c's UTF-16 length
+            // Loop over the rest of the string and keep case-folding.
+            for (;;) {
+                scf.append(scfChar);
+                if (i == length) {  // whole string consumed
+                    return true;
+                }
+                U16_NEXT(p, i, length, c);  // post-increments i
+                scfChar = u_foldCase(c, U_FOLD_CASE_DEFAULT);
+            }
+        }
+    }
+    return false;  // no code point changed under folding
+}
+
+}  // namespace
+
  UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
      if (isFrozen() || isBogus()) {
          return *this;
      }
-    if (attribute & (USET_CASE_INSENSITIVE | USET_ADD_CASE_MAPPINGS)) {
-        {
-            UnicodeSet foldSet(*this);
-            UnicodeString str;
-            USetAdder sa = {
-                foldSet.toUSet(),
-                _set_add,
-                _set_addRange,
-                _set_addString,
-                nullptr, // don't need remove()
-                nullptr // don't need removeRange()
-            };
-
-            // start with input set to guarantee inclusion
-            // USET_CASE: remove strings because the strings will actually be reduced (folded);
-            //            therefore, start with no strings and add only those needed
-            if ((attribute & USET_CASE_INSENSITIVE) && foldSet.hasStrings()) {
-                foldSet.strings->removeAllElements();
-            }
+    switch (attribute & USET_CASE_MASK) {
+    case 0:
+        break;
+    case USET_CASE_INSENSITIVE:
+        closeOverCaseInsensitive(/* simple= */ false);
+        break;
+    case USET_ADD_CASE_MAPPINGS:
+        closeOverAddCaseMappings();
+        break;
+    case USET_SIMPLE_CASE_INSENSITIVE:
+        closeOverCaseInsensitive(/* simple= */ true);
+        break;
+    default:
+        // bad option (unreachable)
+        break;
+    }
+    return *this;
+}
+
+void UnicodeSet::closeOverCaseInsensitive(bool simple) {
+    // Start with input set to guarantee inclusion.
+    UnicodeSet foldSet(*this);
+    // Full case mappings closure:
+    // Remove strings because the strings will actually be reduced (folded);
+    // therefore, start with no strings and add only those needed.
+    // Do this before processing code points, because they may add strings.
+    if (!simple && foldSet.hasStrings()) {
+        foldSet.strings->removeAllElements();
+    }
+
+    USetAdder sa = {
+        foldSet.toUSet(),
+        _set_add,
+        _set_addRange,
+        _set_addString,
+        nullptr, // don't need remove()
+        nullptr // don't need removeRange()
+    };
+
+    UnicodeSet subset(0, 0x10ffff);
+    const UnicodeSet &codePoints = maybeOnlyCaseSensitive(*this, subset);
  
-            int32_t n = getRangeCount();
-            UChar32 result;
-            const char16_t *full;
-
-            for (int32_t i=0; i<n; ++i) {
-                UChar32 start = getRangeStart(i);
-                UChar32 end   = getRangeEnd(i);
-
-                if (attribute & USET_CASE_INSENSITIVE) {
-                    // full case closure
-                    for (UChar32 cp=start; cp<=end; ++cp) {
-                        ucase_addCaseClosure(cp, &sa);
-                    }
-                } else {
-                    // add case mappings
-                    // (does not add long s for regular s, or Kelvin for k, for example)
-                    for (UChar32 cp=start; cp<=end; ++cp) {
-                        result = ucase_toFullLower(cp, nullptr, nullptr, &full, UCASE_LOC_ROOT);
-                        addCaseMapping(foldSet, result, full, str);
-
-                        result = ucase_toFullTitle(cp, nullptr, nullptr, &full, UCASE_LOC_ROOT);
-                        addCaseMapping(foldSet, result, full, str);
-
-                        result = ucase_toFullUpper(cp, nullptr, nullptr, &full, UCASE_LOC_ROOT);
-                        addCaseMapping(foldSet, result, full, str);
-
-                        result = ucase_toFullFolding(cp, &full, 0);
-                        addCaseMapping(foldSet, result, full, str);
-                    }
+    // Iterate over the ranges of single code points. Nested loop for each code point.
+    int32_t n = codePoints.getRangeCount();
+
+    for (int32_t i=0; i<n; ++i) {
+        UChar32 start = codePoints.getRangeStart(i);
+        UChar32 end   = codePoints.getRangeEnd(i);
+
+        if (simple) {
+            for (UChar32 cp=start; cp<=end; ++cp) {
+                ucase_addSimpleCaseClosure(cp, &sa);
+            }
+        } else {
+            for (UChar32 cp=start; cp<=end; ++cp) {
+                ucase_addCaseClosure(cp, &sa);
+            }
+        }
+    }
+    if (hasStrings()) {
+        UnicodeString str;
+        for (int32_t j=0; j<strings->size(); ++j) {
+            const UnicodeString *pStr = (const UnicodeString *) strings->elementAt(j);
+            if (simple) {
+                if (scfString(*pStr, str)) {
+                    foldSet.remove(*pStr).add(str);
+                }
+            } else {
+                str = *pStr;
+                str.foldCase();
+                if(!ucase_addStringCaseClosure(str.getBuffer(), str.length(), &sa)) {
+                    foldSet.add(str); // does not map to code points: add the folded string itself
                  }
              }
-            if (hasStrings()) {
-                if (attribute & USET_CASE_INSENSITIVE) {
-                    for (int32_t j=0; j<strings->size(); ++j) {
-                        str = *(const UnicodeString *) strings->elementAt(j);
-                        str.foldCase();
-                        if(!ucase_addStringCaseClosure(str.getBuffer(), str.length(), &sa)) {
-                            foldSet.add(str); // does not map to code points: add the folded string itself
-                        }
-                    }
-                } else {
-                    Locale root("");
-#if !UCONFIG_NO_BREAK_ITERATION
-                    UErrorCode status = U_ZERO_ERROR;
-                    BreakIterator *bi = BreakIterator::createWordInstance(root, status);
-                    if (U_SUCCESS(status)) {
-#endif
-                        const UnicodeString *pStr;
+        }
+    }
+    *this = foldSet;
+}
+
+void UnicodeSet::closeOverAddCaseMappings() {
+    // Start with input set to guarantee inclusion.
+    UnicodeSet foldSet(*this);
+
+    UnicodeSet subset(0, 0x10ffff);
+    const UnicodeSet &codePoints = maybeOnlyCaseSensitive(*this, subset);
  
-                        for (int32_t j=0; j<strings->size(); ++j) {
-                            pStr = (const UnicodeString *) strings->elementAt(j);
-                            (str = *pStr).toLower(root);
-                            foldSet.add(str);
+    // Iterate over the ranges of single code points. Nested loop for each code point.
+    int32_t n = codePoints.getRangeCount();
+    UChar32 result;
+    const char16_t *full;
+    UnicodeString str;
+
+    for (int32_t i=0; i<n; ++i) {
+        UChar32 start = codePoints.getRangeStart(i);
+        UChar32 end   = codePoints.getRangeEnd(i);
+
+        // add case mappings
+        // (does not add long s for regular s, or Kelvin for k, for example)
+        for (UChar32 cp=start; cp<=end; ++cp) {
+            result = ucase_toFullLower(cp, nullptr, nullptr, &full, UCASE_LOC_ROOT);
+            addCaseMapping(foldSet, result, full, str);
+
+            result = ucase_toFullTitle(cp, nullptr, nullptr, &full, UCASE_LOC_ROOT);
+            addCaseMapping(foldSet, result, full, str);
+
+            result = ucase_toFullUpper(cp, nullptr, nullptr, &full, UCASE_LOC_ROOT);
+            addCaseMapping(foldSet, result, full, str);
+
+            result = ucase_toFullFolding(cp, &full, 0);
+            addCaseMapping(foldSet, result, full, str);
+        }
+    }
+    if (hasStrings()) {
+        Locale root("");
  #if !UCONFIG_NO_BREAK_ITERATION
-                            (str = *pStr).toTitle(bi, root);
-                            foldSet.add(str);
+        UErrorCode status = U_ZERO_ERROR;
+        BreakIterator *bi = BreakIterator::createWordInstance(root, status);
+        if (U_SUCCESS(status)) {
  #endif
-                            (str = *pStr).toUpper(root);
-                            foldSet.add(str);
-                            (str = *pStr).foldCase();
-                            foldSet.add(str);
-                        }
+            for (int32_t j=0; j<strings->size(); ++j) {
+                const UnicodeString *pStr = (const UnicodeString *) strings->elementAt(j);
+                (str = *pStr).toLower(root);
+                foldSet.add(str);
  #if !UCONFIG_NO_BREAK_ITERATION
-                    }
-                    delete bi;
+                (str = *pStr).toTitle(bi, root);
+                foldSet.add(str);
  #endif
-                }
+                (str = *pStr).toUpper(root);
+                foldSet.add(str);
+                (str = *pStr).foldCase();
+                foldSet.add(str);
              }
-            *this = foldSet;
+#if !UCONFIG_NO_BREAK_ITERATION
          }
+        delete bi;
+#endif
      }
-    return *this;
+    *this = foldSet;
  }
  
  U_NAMESPACE_END
diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp

index 3f6a154f80272c295d7c08cafc9ffcd8cc6d3673..bb6ce27444c871c6ca30fd4f34cb8201a626e4e4 100644 (file)
--- a/icu4c/source/common/uniset_props.cpp
+++ b/icu4c/source/common/uniset_props.cpp
@@ -631,11 +631,8 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
       * to close over case BEFORE COMPLEMENTING.  This makes
       * patterns like /[^abc]/i work.
       */
-    if ((options & USET_CASE_INSENSITIVE) != 0) {
-        (this->*caseClosure)(USET_CASE_INSENSITIVE);
-    }
-    else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {
-        (this->*caseClosure)(USET_ADD_CASE_MAPPINGS);
+    if ((options & USET_CASE_MASK) != 0) {
+        (this->*caseClosure)(options);
      }
      if (invert) {
          complement().removeAllStrings();  // code point complement
diff --git a/icu4c/source/common/uprops.h b/icu4c/source/common/uprops.h

index 2004394db64e1bf799f849a89781cced51c377ce..1e06d0351923444928c1877fb235e51c499534df 100644 (file)
--- a/icu4c/source/common/uprops.h
+++ b/icu4c/source/common/uprops.h
@@ -441,6 +441,7 @@ class CharacterProperties {
  public:
      CharacterProperties() = delete;
      static const UnicodeSet *getInclusionsForProperty(UProperty prop, UErrorCode &errorCode);
+    static const UnicodeSet *getBinaryPropertySet(UProperty property, UErrorCode &errorCode);
  };
  
  // implemented in uniset_props.cpp
diff --git a/icu4c/source/common/uset_imp.h b/icu4c/source/common/uset_imp.h

index 7233b9303c3a1737a508164263dd5d4f57723ea6..77197aaed77e1b06f623ca910c0ecc238f00ec64 100644 (file)
--- a/icu4c/source/common/uset_imp.h
+++ b/icu4c/source/common/uset_imp.h
@@ -58,5 +58,14 @@ typedef struct USetAdder USetAdder;
  
  U_CDECL_END
  
-#endif
+#ifdef __cplusplus
+
+namespace {
+
+constexpr int32_t USET_CASE_MASK = USET_CASE_INSENSITIVE | USET_ADD_CASE_MAPPINGS;  // union of the case option bits; a nonzero (options & USET_CASE_MASK) means some case closure was requested
  
+}  // namespace
+
+#endif  // __cplusplus
+
+#endif
diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp

index 5bf3ab12f49e03b302170660f0336de86dd3069a..2ee67e93a4ac7a01c84e72f4b16aa870103ffcd3 100644 (file)
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -14,6 +14,7 @@
  #include <stdio.h>
  
  #include <string.h>
+#include <unordered_map>
  #include "unicode/utypes.h"
  #include "usettest.h"
  #include "unicode/ucnv.h"
@@ -85,6 +86,8 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
      TESTCASE_AUTO(TestStrings);
      TESTCASE_AUTO(Testj2268);
      TESTCASE_AUTO(TestCloseOver);
+    TESTCASE_AUTO(TestCloseOverSimpleCaseFolding);
+    TESTCASE_AUTO(TestCloseOverLargeSets);
      TESTCASE_AUTO(TestEscapePattern);
      TESTCASE_AUTO(TestInvalidCodePoint);
      TESTCASE_AUTO(TestSymbolTable);
@@ -1243,27 +1246,38 @@ void UnicodeSetTest::TestIndexOf() {
   * Test closure API.
   */
  void UnicodeSetTest::TestCloseOver() {
-    UErrorCode ec = U_ZERO_ERROR;
-
-    char CASE[] = {(char)USET_CASE_INSENSITIVE};
-    char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
-    const char* DATA[] = {
+    static constexpr char CASE[] = {(char)USET_CASE_INSENSITIVE};
+    static constexpr char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
+    static constexpr char SIMPLE_CASE_INSENSITIVE[] = {(char)USET_SIMPLE_CASE_INSENSITIVE};
+    static const char* DATA[] = {
          // selector, input, output
          CASE,
          "[aq\\u00DF{Bc}{bC}{Fi}]",
          "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]",  // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
  
+        SIMPLE_CASE_INSENSITIVE,
+        "[aq\\u00DF{Bc}{bC}{Fi}]",
+        "[aAqQ\\u00DF\\u1E9E{bc}{fi}]",
+
          CASE,
          "[\\u01F1]", // 'DZ'
          "[\\u01F1\\u01F2\\u01F3]",
  
+        SIMPLE_CASE_INSENSITIVE,
+        "[\\u01F1]", // 'DZ'
+        "[\\u01F1\\u01F2\\u01F3]",
+
          CASE,
          "[\\u1FB4]",
          "[\\u1FB4{\\u03AC\\u03B9}]",
  
+        SIMPLE_CASE_INSENSITIVE,
+        "[\\u1FB4]",
+        "[\\u1FB4]",
+
          CASE,
          "[{F\\uFB01}]",
-        "[\\uFB03{ffi}]",            
+        "[\\uFB03{ffi}]",
  
          CASE, // make sure binary search finds limits
          "[a\\uFF3A]",
@@ -1271,6 +1285,10 @@ void UnicodeSetTest::TestCloseOver() {
  
          CASE,
          "[a-z]","[A-Za-z\\u017F\\u212A]",
+
+        SIMPLE_CASE_INSENSITIVE,
+        "[a-z]","[A-Za-z\\u017F\\u212A]",
+
          CASE,
          "[abc]","[A-Ca-c]",
          CASE,
@@ -1311,7 +1329,7 @@ void UnicodeSetTest::TestCloseOver() {
          CASE_MAPPINGS,
          "[\\u01F1]", // 'DZ'
          "[\\u01F1\\u01F2\\u01F3]",
-        
+
          CASE_MAPPINGS,
          "[a-z]",
          "[A-Za-z]",
@@ -1326,6 +1344,8 @@ void UnicodeSetTest::TestCloseOver() {
          int32_t selector = DATA[i][0];
          UnicodeString pat(DATA[i+1], -1, US_INV);
          UnicodeString exp(DATA[i+2], -1, US_INV);
+
+        UErrorCode ec = U_ZERO_ERROR;
          s.applyPattern(pat, ec);
          s.closeOver(selector);
          t.applyPattern(exp, ec);
@@ -1341,68 +1361,8 @@ void UnicodeSetTest::TestCloseOver() {
          }
      }
  
-#if 0
-    /*
-     * Unused test code.
-     * This was used to compare the old implementation (using USET_CASE)
-     * with the new one (using 0x100 temporarily)
-     * while transitioning from hardcoded case closure tables in uniset.cpp
-     * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
-     * and using ucase.c functions for closure.
-     * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
-     *
-     * Note: The old and new implementation never fully matched because
-     * the old implementation turned out to not map U+0130 and U+0131 correctly
-     * (dotted I and dotless i) and because the old implementation's data tables
-     * were outdated compared to Unicode 4.0.1 at the time of the change to the
-     * new implementation. (So sigmas and some other characters were not handled
-     * according to the newer Unicode version.)
-     */
-    UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
-    UnicodeSetIterator si(sens);
-    UnicodeString str, buf2;
-    const UnicodeString *pStr;
-    UChar32 c;
-    while(si.next()) {
-        if(!si.isString()) {
-            c=si.getCodepoint();
-            s.clear();
-            s.add(c);
-
-            str.setTo(c);
-            str.foldCase();
-            sens2.add(str);
-
-            t=s;
-            s.closeOver(USET_CASE);
-            t.closeOver(0x100);
-            if(s!=t) {
-                errln("FAIL: closeOver(U+%04x) differs: ", c);
-                errln((UnicodeString)"old "+s.toPattern(buf, true)+" new: "+t.toPattern(buf2, true));
-            }
-        }
-    }
-    // remove all code points
-    // should contain all full case folding mapping strings
-    sens2.remove(0, 0x10ffff);
-    si.reset(sens2);
-    while(si.next()) {
-        if(si.isString()) {
-            pStr=&si.getString();
-            s.clear();
-            s.add(*pStr);
-            t=s2=s;
-            s.closeOver(USET_CASE);
-            t.closeOver(0x100);
-            if(s!=t) {
-                errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, true)+") differs: ");
-                errln((UnicodeString)"old "+s.toPattern(buf, true)+" new: "+t.toPattern(buf2, true));
-            }
-        }
-    }
-#endif
-
      // Test the pattern API
+    UErrorCode ec = U_ZERO_ERROR;
      s.applyPattern("[abc]", USET_CASE_INSENSITIVE, nullptr, ec);
      if (U_FAILURE(ec)) {
          errln("FAIL: applyPattern failed");
@@ -1423,6 +1383,123 @@ void UnicodeSetTest::TestCloseOver() {
      }
  }
  
+namespace {
+
+void addIfAbsent(const std::unordered_multimap<UChar32, UChar32> &closure, UChar32 c, UChar32 t,
+                 std::unordered_multimap<UChar32, UChar32> &additions) {
+    for (auto it = closure.find(c);; ++it) {
+        if (it == closure.end() || it->first != c) {
+            // absent
+            additions.insert({c, t});
+            break;
+        } else if (it->second == t) {
+            // present
+            break;
+        }
+    }
+}
+
+}  // namespace
+
+void UnicodeSetTest::TestCloseOverSimpleCaseFolding() {
+    IcuTestErrorCode errorCode(*this, "TestCloseOverSimpleCaseFolding");
+    const UnicodeSet *sensitive =
+        UnicodeSet::fromUSet(u_getBinaryPropertySet(UCHAR_CASE_SENSITIVE, errorCode));
+    if (errorCode.errIfFailureAndReset("u_getBinaryPropertySet(UCHAR_CASE_SENSITIVE) failed")) {
+        return;
+    }
+    // Compute the scf=Simple_Case_Folding closure:
+    // For each scf(c)=t, start with mappings c->t and t->c.
+    std::unordered_multimap<UChar32, UChar32> closure;
+    UnicodeSetIterator iter(*sensitive);
+    while (iter.next()) {
+        UChar32 c = iter.getCodepoint();
+        UChar32 scfChar = u_foldCase(c, U_FOLD_CASE_DEFAULT);
+        if (scfChar != c) {
+            closure.insert({c, scfChar});
+            closure.insert({scfChar, c});
+        }
+    }
+    // Complete the closure: Add mappings of mappings.
+    for (;;) {
+        std::unordered_multimap<UChar32, UChar32> additions;
+        // for each mapping c->t
+        for (auto mapping : closure) {
+            UChar32 c = mapping.first;
+            UChar32 t = mapping.second;
+            // enumerate each t->u
+            for (auto it = closure.find(t); it != closure.end() && it->first == t; ++it) {
+                UChar32 u = it->second;
+                if (u != c) {
+                    addIfAbsent(closure, c, u, additions);
+                    addIfAbsent(closure, u, c, additions);
+                }
+            }
+        }
+        if (additions.empty()) {
+            break;  // The closure is complete.
+        }
+        closure.insert(additions.begin(), additions.end());
+    }
+    // Compare closeOver(USET_SIMPLE_CASE_INSENSITIVE) with an unoptimized implementation.
+    // Here we focus on single code points as input.
+    // Other examples, including strings, are tested in TestCloseOver().
+    int32_t errors = 0;
+    iter.reset();
+    UnicodeSet set, expected;
+    while (iter.next()) {
+        UChar32 c = iter.getCodepoint();
+        // closeOver()
+        set.clear().add(c);
+        set.closeOver(USET_SIMPLE_CASE_INSENSITIVE);
+        // From-first-principles implementation.
+        expected.clear().add(c);
+        for (auto it = closure.find(c); it != closure.end() && it->first == c; ++it) {
+            expected.add(it->second);
+        }
+        // compare
+        if (!checkEqual(expected, set, "closeOver() vs. test impl")) {
+            errln("    c=U+%04X", c);
+            if (++errors == 10) {
+                break;
+            }
+        }
+    }
+}
+
+void UnicodeSetTest::TestCloseOverLargeSets() {
+    IcuTestErrorCode errorCode(*this, "TestCloseOverLargeSets");
+    // Check that an optimization for large sets does not change the result.
+
+    // Most code points except ones that are boring for case mappings.
+    UnicodeSet manyCp(u"[^[:C:][:Ideographic:][:Hang:]]", errorCode);
+    // Main Unihan block; used below as padding that turns a one-element set into a large one.
+    constexpr UChar32 LARGE_START = 0x4E00;
+    constexpr UChar32 LARGE_END = 0x9FFF;
+
+    static constexpr int32_t OPTIONS[] = {
+        USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE
+    };
+    UnicodeSet input, small, large;
+    for (int32_t option : OPTIONS) {
+        UnicodeSetIterator iter(manyCp);
+        while (iter.next()) {
+            UChar32 c = iter.getCodepoint();
+            input.clear().add(c);
+            small = input;  // closure of {c} alone
+            small.closeOver(option);
+            large = input;  // closure of {c} plus the padding range, with the padding removed again afterwards
+            large.add(LARGE_START, LARGE_END);
+            large.closeOver(option);
+            large.remove(LARGE_START, LARGE_END);
+            if (!checkEqual(small, large, "small != large")) {
+                errln("    option=%d c=U+%04X", option, c);
+                break;  // stop at the first mismatch for this option
+            }
+        }
+    }
+}
+
  void UnicodeSetTest::TestEscapePattern() {
      const char pattern[] =
          "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
diff --git a/icu4c/source/test/intltest/usettest.h b/icu4c/source/test/intltest/usettest.h

index 9271edbb04e24220bc0b57e44c050f546b2d73fc..3cb5dc14e8a3d6807cdb5dc56bb999ee88af8179 100644 (file)
--- a/icu4c/source/test/intltest/usettest.h
+++ b/icu4c/source/test/intltest/usettest.h
@@ -74,6 +74,8 @@ private:
      void TestExhaustive(void);
  
      void TestCloseOver(void);
+    void TestCloseOverSimpleCaseFolding();
+    void TestCloseOverLargeSets();
  
      void TestEscapePattern(void);
  
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/UCaseProps.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/UCaseProps.java

index fced2b62986b4d71ab1cccb348765b106587f3a5..aee590f49114d95268f18f303560e759ad6a13c3 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/UCaseProps.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/UCaseProps.java
@@ -260,34 +260,6 @@ public final class UCaseProps {
       * - for k include the Kelvin sign
       */
      public final void addCaseClosure(int c, UnicodeSet set) {
-        /*
-         * Hardcode the case closure of i and its relatives and ignore the
-         * data file data for these characters.
-         * The Turkic dotless i and dotted I with their case mapping conditions
-         * and case folding option make the related characters behave specially.
-         * This code matches their closure behavior to their case folding behavior.
-         */
-
-        switch(c) {
-        case 0x49:
-            /* regular i and I are in one equivalence class */
-            set.add(0x69);
-            return;
-        case 0x69:
-            set.add(0x49);
-            return;
-        case 0x130:
-            /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
-            set.add(iDot);
-            return;
-        case 0x131:
-            /* dotless i is in a class by itself */
-            return;
-        default:
-            /* otherwise use the data file data */
-            break;
-        }
-
          int props=trie.get(c);
          if(!propsHasException(props)) {
              if(getTypeFromProps(props)!=NONE) {
@@ -302,19 +274,41 @@ public final class UCaseProps {
               * c has exceptions, so there may be multiple simple and/or
               * full case mappings. Add them all.
               */
-            int excOffset0, excOffset=getExceptionsOffset(props);
-            int closureOffset;
+            int excOffset=getExceptionsOffset(props);
              int excWord=exceptions.charAt(excOffset++);
-            int index, closureLength, fullLength, length;
-
-            excOffset0=excOffset;
+            int excOffset0=excOffset;
+
+            // Hardcode the case closure of i and its relatives and ignore the
+            // data file data for these characters.
+            // The Turkic dotless i and dotted I with their case mapping conditions
+            // and case folding option make the related characters behave specially.
+            // This code matches their closure behavior to their case folding behavior.
+            if ((excWord&EXC_CONDITIONAL_FOLD) != 0) {
+                // These characters have Turkic case foldings. Hardcode their closure.
+                if (c == 0x49) {
+                    // Regular i and I are in one equivalence class.
+                    set.add(0x69);
+                    return;
+                } else if (c == 0x130) {
+                    // Dotted I is in a class with <0069 0307>
+                    // (for canonical equivalence with <0049 0307>).
+                    set.add(iDot);
+                    return;
+                }
+            } else if (c == 0x69) {
+                set.add(0x49);
+                return;
+            } else if (c == 0x131) {
+                // Dotless i is in a class by itself.
+                return;
+            }
  
              /* add all simple case mappings */
-            for(index=EXC_LOWER; index<=EXC_TITLE; ++index) {
+            for(int index=EXC_LOWER; index<=EXC_TITLE; ++index) {
                  if(hasSlot(excWord, index)) {
                      excOffset=excOffset0;
-                    c=getSlotValue(excWord, index, excOffset);
-                    set.add(c);
+                    int mapping=getSlotValue(excWord, index, excOffset);
+                    set.add(mapping);
                  }
              }
              if(hasSlot(excWord, EXC_DELTA)) {
@@ -324,6 +318,7 @@ public final class UCaseProps {
              }
  
              /* get the closure string pointer & length */
+            int closureOffset, closureLength;
              if(hasSlot(excWord, EXC_CLOSURE)) {
                  excOffset=excOffset0;
                  long value=getSlotValueAndOffset(excWord, EXC_CLOSURE, excOffset);
@@ -338,7 +333,7 @@ public final class UCaseProps {
              if(hasSlot(excWord, EXC_FULL_MAPPINGS)) {
                  excOffset=excOffset0;
                  long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);
-                fullLength=(int)value;
+                int fullLength=(int)value;
  
                  /* start of full case mapping strings */
                  excOffset=(int)(value>>32)+1;
@@ -350,7 +345,7 @@ public final class UCaseProps {
                  fullLength>>=4;
  
                  /* add the full case folding string */
-                length=fullLength&0xf;
+                int length=fullLength&0xf;
                  if(length!=0) {
                      set.add(exceptions.substring(excOffset, excOffset+length));
                      excOffset+=length;
@@ -367,9 +362,137 @@ public final class UCaseProps {
  
              /* add each code point in the closure string */
              int limit=closureOffset+closureLength;
-            for(index=closureOffset; index<limit; index+=UTF16.getCharCount(c)) {
-                c=exceptions.codePointAt(index);
-                set.add(c);
+            for(int index=closureOffset, mapping; index<limit; index+=UTF16.getCharCount(mapping)) {
+                mapping=exceptions.codePointAt(index);  // step by the length of the code point just read, not by the length of c
+                set.add(mapping);
+            }
+        }
+    }
+
+    /**
+     * Adds t to the set as a simple case closure mapping of c,
+     * except if there is not actually an scf relationship between the two characters.
+     * The pairs without such a relationship are hardcoded here, each in both directions;
+     * for those pairs the set is left untouched.
+     * TODO: Unicode should probably add the corresponding scf mappings.
+     * See https://crbug.com/v8/13377 and Unicode-internal PAG issue #23.
+     * If & when those scf mappings are added, we should be able to remove all of these exceptions.
+     *
+     * @param c the code point whose closure is being computed
+     * @param t a candidate closure mapping for c
+     * @param set receives t, unless (c, t) is one of the excluded pairs
+     */
+    private static void addOneSimpleCaseClosure(int c, int t, UnicodeSet set) {
+        // Excluded pairs: U+0390/U+1FD3, U+03B0/U+1FE3, U+FB05/U+FB06.
+        if (c == 0x0390) {
+            if (t == 0x1FD3) { return; }
+        } else if (c == 0x1FD3) {
+            if (t == 0x0390) { return; }
+        } else if (c == 0x03B0) {
+            if (t == 0x1FE3) { return; }
+        } else if (c == 0x1FE3) {
+            if (t == 0x03B0) { return; }
+        } else if (c == 0xFB05) {
+            if (t == 0xFB06) { return; }
+        } else if (c == 0xFB06) {
+            if (t == 0xFB05) { return; }
+        }
+        // Not an excluded pair.
+        set.add(t);
+    }
+
+
+    public final void addSimpleCaseClosure(int c, UnicodeSet set) {
+        int props=trie.get(c);
+        if(!propsHasException(props)) {
+            if(getTypeFromProps(props)!=NONE) {
+                /* add the one simple case mapping, no matter what type it is */
+                int delta=getDelta(props);
+                if(delta!=0) {
+                    set.add(c+delta);
+                }
+            }
+        } else {
+            // c has exceptions. Add the mappings relevant for scf=Simple_Case_Folding.
+            int excOffset=getExceptionsOffset(props);
+            int excWord=exceptions.charAt(excOffset++);
+            int excOffset0=excOffset;
+
+            // Hardcode the case closure of i and its relatives and ignore the
+            // data file data for these characters, like in ucase_addCaseClosure().
+            if ((excWord&EXC_CONDITIONAL_FOLD) != 0) {
+                // These characters have Turkic case foldings. Hardcode their closure.
+                if (c == 0x49) {
+                    // Regular i and I are in one equivalence class.
+                    set.add(0x69);
+                    return;
+                } else if (c == 0x130) {
+                    // For scf=Simple_Case_Folding, dotted I is in a class by itself.
+                    return;
+                }
+            } else if (c == 0x69) {
+                set.add(0x49);
+                return;
+            } else if (c == 0x131) {
+                // Dotless i is in a class by itself.
+                return;
+            }
+
+            // Add all simple case mappings.
+            for(int index=EXC_LOWER; index<=EXC_TITLE; ++index) {
+                if(hasSlot(excWord, index)) {
+                    excOffset=excOffset0;
+                    int mapping=getSlotValue(excWord, index, excOffset);
+                    addOneSimpleCaseClosure(c, mapping, set);
+                }
+            }
+            if(hasSlot(excWord, EXC_DELTA)) {
+                excOffset=excOffset0;
+                int delta=getSlotValue(excWord, EXC_DELTA, excOffset);
+                int mapping = (excWord&EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
+                addOneSimpleCaseClosure(c, mapping, set);
+            }
+
+            /* get the closure string pointer & length */
+            int closureOffset, closureLength;
+            if(hasSlot(excWord, EXC_CLOSURE)) {
+                excOffset=excOffset0;
+                long value=getSlotValueAndOffset(excWord, EXC_CLOSURE, excOffset);
+                closureLength=(int)value&CLOSURE_MAX_LENGTH; /* higher bits are reserved */
+                closureOffset=(int)(value>>32)+1; /* behind this slot, unless there are full case mappings */
+            } else {
+                closureLength=0;
+                closureOffset=0;
+            }
+
+            // Skip the full case mappings.
+            if(closureLength > 0 && hasSlot(excWord, EXC_FULL_MAPPINGS)) {
+                excOffset=excOffset0;
+                long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);
+                int fullLength=(int)value;
+
+                /* start of full case mapping strings */
+                excOffset=(int)(value>>32)+1;
+
+                fullLength&=0xffff; /* bits 16 and higher are reserved */
+
+                // Skip all 4 full case mappings.
+                excOffset+=fullLength&FULL_LOWER;
+                fullLength>>=4;
+                excOffset+=fullLength&0xf;
+                fullLength>>=4;
+                excOffset+=fullLength&0xf;
+                fullLength>>=4;
+                excOffset+=fullLength;
+
+                closureOffset=excOffset; /* behind full case mappings */
+            }
+
+            // Add each code point in the closure string whose scf maps back to c.
+            int limit=closureOffset+closureLength;
+            for(int index=closureOffset, mapping; index<limit; index+=UTF16.getCharCount(mapping)) {
+                mapping=exceptions.codePointAt(index);  // step by the length of the code point just read, not by the length of c
+                addOneSimpleCaseClosure(c, mapping, set);
              }
          }
      }
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java

index bf6dd6e46ae575f770282432d901d5baa8f53d3e..68f12aff2a33f1aaeffa540ab69224f0f5c618d6 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
@@ -459,7 +459,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
       * for the syntax of the pattern language.
       * @param pattern a string specifying what characters are in the set
       * @param options a bitmask indicating which options to apply.
-     * Valid options are IGNORE_SPACE and CASE.
+     * Valid options are {@link #IGNORE_SPACE} and
+     * at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS},
+     * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive.
       * @exception java.lang.IllegalArgumentException if the pattern contains
       * a syntax error.
       * @stable ICU 3.8
@@ -495,7 +497,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
       * @param symbols a symbol table mapping variables to char[] arrays
       * and chars to UnicodeSets
       * @param options a bitmask indicating which options to apply.
-     * Valid options are IGNORE_SPACE and CASE.
+     * Valid options are {@link #IGNORE_SPACE} and
+     * at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS},
+     * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive.
       * @exception java.lang.IllegalArgumentException if the pattern
       * contains a syntax error.
       * @stable ICU 3.2
@@ -587,7 +591,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
       * See the class description for the syntax of the pattern language.
       * @param pattern a string specifying what characters are in the set
       * @param options a bitmask indicating which options to apply.
-     * Valid options are IGNORE_SPACE and CASE.
+     * Valid options are {@link #IGNORE_SPACE} and
+     * at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS},
+     * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive.
       * @exception java.lang.IllegalArgumentException if the pattern
       * contains a syntax error.
       * @stable ICU 3.8
@@ -2584,8 +2590,10 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
       * variables, or null if none.
       * @param rebuiltPat the pattern that was parsed, rebuilt or
       * copied from the input pattern, as appropriate.
-     * @param options a bit mask of zero or more of the following:
-     * IGNORE_SPACE, CASE.
+     * @param options a bit mask.
+     * Valid options are {@link #IGNORE_SPACE} and
+     * at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS},
+     * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive.
       */
      private void applyPattern(RuleCharacterIterator chars, SymbolTable symbols,
              Appendable rebuiltPat, int options, int depth) {
@@ -2965,8 +2973,8 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
           * to close over case BEFORE COMPLEMENTING.  This makes
           * patterns like /[^abc]/i work.
           */
-        if ((options & CASE) != 0) {
-            closeOver(CASE);
+        if ((options & CASE_MASK) != 0) {
+            closeOver(options);
          }
          if (invert) {
              complement().removeAllStrings();  // code point complement
@@ -3861,58 +3869,81 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
      public static final int IGNORE_SPACE = 1;
  
      /**
-     * Bitmask for constructor, applyPattern(), and closeOver()
-     * indicating letter case.  This may be ORed together with other
-     * selectors.
+     * Alias for {@link #CASE_INSENSITIVE}.
       *
+     * @stable ICU 3.8
+     */
+    public static final int CASE = 2;
+
+    /**
       * Enable case insensitive matching.  E.g., "[ab]" with this flag
       * will match 'a', 'A', 'b', and 'B'.  "[^ab]" with this flag will
       * match all except 'a', 'A', 'b', and 'B'. This performs a full
-     * closure over case mappings, e.g. U+017F for s.
+     * closure over case mappings, e.g. 'ſ' (U+017F long s) for 's'.
       *
-     * The resulting set is a superset of the input for the code points but
+     * <p>This value is an options bit set value for some
+     * constructors, applyPattern(), and closeOver().
+     * It can be ORed together with other, unrelated options.
+     *
+     * <p>The resulting set is a superset of the input for the code points but
       * not for the strings.
       * It performs a case mapping closure of the code points and adds
       * full case folding strings for the code points, and reduces strings of
       * the original set to their full case folding equivalents.
       *
-     * This is designed for case-insensitive matches, for example
+     * <p>This is designed for case-insensitive matches, for example
       * in regular expressions. The full code point case closure allows checking of
       * an input character directly against the closure set.
       * Strings are matched by comparing the case-folded form from the closure
       * set with an incremental case folding of the string in question.
       *
-     * The closure set will also contain single code points if the original
+     * <p>The closure set will also contain single code points if the original
       * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.).
       * This is not necessary (that is, redundant) for the above matching method
       * but results in the same closure sets regardless of whether the original
       * set contained the code point or a string.
-     * @stable ICU 3.8
-     */
-    public static final int CASE = 2;
-
-    /**
-     * Alias for UnicodeSet.CASE, for ease of porting from C++ where ICU4C
-     * also has both USET_CASE and USET_CASE_INSENSITIVE (see uset.h).
-     * @see #CASE
+     *
       * @stable ICU 3.4
       */
      public static final int CASE_INSENSITIVE = 2;
  
      /**
-     * Bitmask for constructor, applyPattern(), and closeOver()
-     * indicating letter case.  This may be ORed together with other
-     * selectors.
-     *
-     * Enable case insensitive matching.  E.g., "[ab]" with this flag
-     * will match 'a', 'A', 'b', and 'B'.  "[^ab]" with this flag will
-     * match all except 'a', 'A', 'b', and 'B'. This adds the lower-,
-     * title-, and uppercase mappings as well as the case folding
+     * Adds all case mappings for each element in the set.
+     * This adds the full lower-, title-, and uppercase mappings as well as the full case folding
       * of each existing element in the set.
+     *
+     * <p>This value is an options bit set value for some
+     * constructors, applyPattern(), and closeOver().
+     * It can be ORed together with other, unrelated options.
+     *
+     * <p>Unlike the “case insensitive” options, this does not perform a closure.
+     * For example, it does not add 'ſ' (U+017F long s) for 's',
+     * 'K' (U+212A Kelvin sign) for 'k', or replace set strings by their case-folded versions.
+     *
       * @stable ICU 3.4
       */
      public static final int ADD_CASE_MAPPINGS = 4;
  
+    /**
+     * Enable case insensitive matching.
+     * Same as {@link #CASE_INSENSITIVE} but using only Simple_Case_Folding (scf) mappings,
+     * which map each code point to one code point,
+     * not full Case_Folding (cf) mappings, which map some code points to multiple code points.
+     *
+     * <p>This is designed for case-insensitive matches, for example in certain
+     * regular expression implementations where only Simple_Case_Folding mappings are used,
+     * such as in ECMAScript (JavaScript) regular expressions.
+     *
+     * <p>This value is an options bit set value for some
+     * constructors, applyPattern(), and closeOver().
+     * It can be ORed together with other, unrelated options.
+     *
+     * @draft ICU 73
+     */
+    public static final int SIMPLE_CASE_INSENSITIVE = 6;
+
+    private static final int CASE_MASK = CASE_INSENSITIVE | ADD_CASE_MAPPINGS;
+
      //  add the result of a full case mapping to the set
      //  use str as a temporary string to avoid constructing one
      private static final void addCaseMapping(UnicodeSet set, int result, StringBuilder full) {
@@ -3930,97 +3961,191 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
          // see UCaseProps
      }
  
+    /** For case closure on a large set, look only at code points with relevant properties. */
+    UnicodeSet maybeOnlyCaseSensitive(UnicodeSet src) {
+        if (src.size() < 30) {
+            return src;  // small set: returned as is, not a copy
+        }
+        // Large set: return the intersection of the src code points with Case_Sensitive ones.
+        UnicodeSet sensitive = CharacterProperties.getBinaryPropertySet(UProperty.CASE_SENSITIVE);
+        // Start by cloning the "smaller" set. Try not to copy the strings, if there are any in src.
+        if (src.hasStrings() || src.getRangeCount() > sensitive.getRangeCount()) {
+            return sensitive.cloneAsThawed().retainAll(src);
+        } else {
+            return ((UnicodeSet) src.clone()).retainAll(sensitive);
+        }
+    }
+
+    // Per-character scf = Simple_Case_Folding of s. Returns true with the result in scf if it differs
+    // from s, else false with scf untouched. (Normally we case-fold a string with full case foldings.)
+    private static final boolean scfString(CharSequence s, StringBuilder scf) {
+        int length = s.length();
+        // Loop while not needing modification.
+        for (int i = 0; i < length;) {
+            int c = Character.codePointAt(s, i);
+            int scfChar = UCharacter.foldCase(c, UCharacter.FOLD_CASE_DEFAULT);
+            if (scfChar != c) {
+                // Copy the characters before c.
+                scf.setLength(0);
+                scf.append(s, 0, i);
+                // Loop over the rest of the string and keep case-folding.
+                for (;;) {
+                    scf.appendCodePoint(scfChar);
+                    i += Character.charCount(c);
+                    if (i == length) {
+                        return true;  // scf now holds the folded string
+                    }
+                    c = Character.codePointAt(s, i);
+                    scfChar = UCharacter.foldCase(c, UCharacter.FOLD_CASE_DEFAULT);
+                }
+            }
+            i += Character.charCount(c);
+        }
+        return false;  // no code point changed
+    }
+
      /**
       * Close this set over the given attribute.  For the attribute
-     * CASE, the result is to modify this set so that:
+     * {@link #CASE_INSENSITIVE}, the result is to modify this set so that:
       *
-     * 1. For each character or string 'a' in this set, all strings
+     * <ol>
+     * <li>For each character or string 'a' in this set, all strings
       * 'b' such that foldCase(a) == foldCase(b) are added to this set.
       * (For most 'a' that are single characters, 'b' will have
       * b.length() == 1.)
       *
-     * 2. For each string 'e' in the resulting set, if e !=
+     * <li>For each string 'e' in the resulting set, if e !=
       * foldCase(e), 'e' will be removed.
+     * </ol>
       *
-     * Example: [aq\u00DF{Bc}{bC}{Fi}] =&gt; [aAqQ\u00DF\uFB01{ss}{bc}{fi}]
+     * <p>Example: [aq\u00DF{Bc}{bC}{Fi}] =&gt; [aAqQ\u00DF\u1E9E\uFB01{ss}{bc}{fi}]
       *
-     * (Here foldCase(x) refers to the operation
+     * <p>(Here foldCase(x) refers to the operation
       * UCharacter.foldCase(x, true), and a == b actually denotes
       * a.equals(b), not pointer comparison.)
       *
       * @param attribute bitmask for attributes to close over.
-     * Currently only the CASE bit is supported.  Any undefined bits
-     * are ignored.
+     * Valid options:
+     * At most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS},
+     * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive.
+     * Unrelated option bits are ignored.
       * @return a reference to this set.
       * @stable ICU 3.8
       */
      public UnicodeSet closeOver(int attribute) {
          checkFrozen();
-        if ((attribute & (CASE | ADD_CASE_MAPPINGS)) != 0) {
-            UCaseProps csp = UCaseProps.INSTANCE;
-            UnicodeSet foldSet = new UnicodeSet(this);
-            ULocale root = ULocale.ROOT;
-
-            // start with input set to guarantee inclusion
-            // CASE: remove strings because the strings will actually be reduced (folded);
-            //       therefore, start with no strings and add only those needed
-            if((attribute & CASE) != 0 && foldSet.hasStrings()) {
-                foldSet.strings.clear();
-            }
-
-            int n = getRangeCount();
-            int result;
-            StringBuilder full = new StringBuilder();
+        switch (attribute & CASE_MASK) {
+        case 0:  // no case option requested: leave the set unchanged
+            break;
+        case CASE_INSENSITIVE:
+            closeOverCaseInsensitive(/* simple= */ false);
+            break;
+        case ADD_CASE_MAPPINGS:
+            closeOverAddCaseMappings();
+            break;
+        case SIMPLE_CASE_INSENSITIVE:
+            closeOverCaseInsensitive(/* simple= */ true);
+            break;
+        default:
+            // bad option (unreachable)
+            break;
+        }
+        return this;
+    }
  
-            for (int i=0; i<n; ++i) {
-                int start = getRangeStart(i);
-                int end   = getRangeEnd(i);
+    private void closeOverCaseInsensitive(boolean simple) {  // simple=true: simple case foldings only
+        UCaseProps csp = UCaseProps.INSTANCE;
+        // Start with input set to guarantee inclusion.
+        UnicodeSet foldSet = new UnicodeSet(this);
  
-                if((attribute & CASE) != 0) {
-                    // full case closure
-                    for (int cp=start; cp<=end; ++cp) {
-                        csp.addCaseClosure(cp, foldSet);
-                    }
-                } else {
-                    // add case mappings
-                    // (does not add long s for regular s, or Kelvin for k, for example)
-                    for (int cp=start; cp<=end; ++cp) {
-                        result = csp.toFullLower(cp, null, full, UCaseProps.LOC_ROOT);
-                        addCaseMapping(foldSet, result, full);
+        // Full case mappings closure:
+        // Remove strings because the strings will actually be reduced (folded);
+        // therefore, start with no strings and add only those needed.
+        // Do this before processing code points, because they may add strings.
+        if (!simple && foldSet.hasStrings()) {
+            foldSet.strings.clear();
+        }
  
-                        result = csp.toFullTitle(cp, null, full, UCaseProps.LOC_ROOT);
-                        addCaseMapping(foldSet, result, full);
+        UnicodeSet codePoints = maybeOnlyCaseSensitive(this);
  
-                        result = csp.toFullUpper(cp, null, full, UCaseProps.LOC_ROOT);
-                        addCaseMapping(foldSet, result, full);
+        // Iterate over the ranges of single code points. Nested loop for each code point.
+        int n = codePoints.getRangeCount();
+        for (int i=0; i<n; ++i) {
+            int start = codePoints.getRangeStart(i);
+            int end   = codePoints.getRangeEnd(i);
  
-                        result = csp.toFullFolding(cp, full, 0);
-                        addCaseMapping(foldSet, result, full);
-                    }
+            if (simple) {
+                for (int cp=start; cp<=end; ++cp) {
+                    csp.addSimpleCaseClosure(cp, foldSet);
+                }
+            } else {
+                for (int cp=start; cp<=end; ++cp) {
+                    csp.addCaseClosure(cp, foldSet);
                  }
              }
-            if (hasStrings()) {
-                if ((attribute & CASE) != 0) {
-                    for (String s : strings) {
-                        String str = UCharacter.foldCase(s, 0);
-                        if(!csp.addStringCaseClosure(str, foldSet)) {
-                            foldSet.add(str); // does not map to code points: add the folded string itself
-                        }
+        }
+        if (hasStrings()) {
+            StringBuilder sb = simple ? new StringBuilder() : null;  // output buffer for scfString()
+            for (String s : strings) {
+                if (simple) {
+                    if (scfString(s, sb)) {
+                        foldSet.remove(s).add(sb);  // replace s with its simple case folding
                      }
                  } else {
-                    BreakIterator bi = BreakIterator.getWordInstance(root);
-                    for (String str : strings) {
-                        // TODO: call lower-level functions
-                        foldSet.add(UCharacter.toLowerCase(root, str));
-                        foldSet.add(UCharacter.toTitleCase(root, str, bi));
-                        foldSet.add(UCharacter.toUpperCase(root, str));
-                        foldSet.add(UCharacter.foldCase(str, 0));
+                    String str = UCharacter.foldCase(s, 0);
+                    if(!csp.addStringCaseClosure(str, foldSet)) {
+                        foldSet.add(str); // does not map to code points: add the folded string itself
                      }
                  }
              }
-            set(foldSet);
          }
-        return this;
+        set(foldSet);
+    }
+
+    private void closeOverAddCaseMappings() {
+        UCaseProps csp = UCaseProps.INSTANCE;
+        // Start with input set to guarantee inclusion.
+        UnicodeSet foldSet = new UnicodeSet(this);
+
+        UnicodeSet codePoints = maybeOnlyCaseSensitive(this);
+
+        // Iterate over the ranges of single code points. Nested loop for each code point.
+        int n = codePoints.getRangeCount();
+        int result;
+        StringBuilder full = new StringBuilder();  // buffer reused by all of the toFull...() calls
+
+        for (int i=0; i<n; ++i) {
+            int start = codePoints.getRangeStart(i);
+            int end   = codePoints.getRangeEnd(i);
+
+            // Add the full lower-, title-, and uppercase mappings and the full case folding
+            // (does not add long s for regular s, or Kelvin for k, for example)
+            for (int cp=start; cp<=end; ++cp) {
+                result = csp.toFullLower(cp, null, full, UCaseProps.LOC_ROOT);
+                addCaseMapping(foldSet, result, full);
+
+                result = csp.toFullTitle(cp, null, full, UCaseProps.LOC_ROOT);
+                addCaseMapping(foldSet, result, full);
+
+                result = csp.toFullUpper(cp, null, full, UCaseProps.LOC_ROOT);
+                addCaseMapping(foldSet, result, full);
+
+                result = csp.toFullFolding(cp, full, 0);
+                addCaseMapping(foldSet, result, full);
+            }
+        }
+        if (hasStrings()) {
+            ULocale root = ULocale.ROOT;
+            BreakIterator bi = BreakIterator.getWordInstance(root);
+            for (String str : strings) {  // the original strings stay in foldSet; add their mappings
+                // TODO: call lower-level functions
+                foldSet.add(UCharacter.toLowerCase(root, str));
+                foldSet.add(UCharacter.toTitleCase(root, str, bi));
+                foldSet.add(UCharacter.toUpperCase(root, str));
+                foldSet.add(UCharacter.foldCase(str, 0));
+            }
+        }
+        set(foldSet);
      }
  
      /**
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java

index e31d92bcba46603f9a55388eadd76186ba9bc02a..deac1ac3ec8a2341b01237ac874715b44063722c 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java
@@ -20,6 +20,7 @@ import java.util.HashSet;
  import java.util.Iterator;
  import java.util.LinkedHashSet;
  import java.util.List;
+import java.util.Map;
  import java.util.Set;
  import java.util.SortedSet;
  import java.util.TreeSet;
@@ -32,6 +33,7 @@ import com.ibm.icu.dev.test.TestFmwk;
  import com.ibm.icu.dev.util.CollectionUtilities;
  import com.ibm.icu.impl.SortedSetRelation;
  import com.ibm.icu.impl.Utility;
+import com.ibm.icu.lang.CharacterProperties;
  import com.ibm.icu.lang.UCharacter;
  import com.ibm.icu.lang.UCharacterEnums.ECharacterCategory;
  import com.ibm.icu.lang.UProperty;
@@ -1323,38 +1325,98 @@ public class UnicodeSetTest extends TestFmwk {
      @Test
      public void TestCloseOver() {
          String CASE = String.valueOf(UnicodeSet.CASE);
+        String CASE_MAPPINGS = String.valueOf(UnicodeSet.ADD_CASE_MAPPINGS);
+        String SIMPLE_CASE_INSENSITIVE = String.valueOf(UnicodeSet.SIMPLE_CASE_INSENSITIVE);
          String[] DATA = {
                  // selector, input, output
                  CASE,
                  "[aq\u00DF{Bc}{bC}{Fi}]",
                  "[aAqQ\u00DF\u1E9E\uFB01{ss}{bc}{fi}]", // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
  
+                SIMPLE_CASE_INSENSITIVE,
+                "[aq\u00DF{Bc}{bC}{Fi}]",
+                "[aAqQ\u00DF\u1E9E{bc}{fi}]",
+
                  CASE,
                  "[\u01F1]", // 'DZ'
                  "[\u01F1\u01F2\u01F3]",
  
+                SIMPLE_CASE_INSENSITIVE,
+                "[\u01F1]", // 'DZ'
+                "[\u01F1\u01F2\u01F3]",
+
                  CASE,
                  "[\u1FB4]",
                  "[\u1FB4{\u03AC\u03B9}]",
  
+                SIMPLE_CASE_INSENSITIVE,
+                "[\u1FB4]",
+                "[\u1FB4]",
+
                  CASE,
                  "[{F\uFB01}]",
                  "[\uFB03{ffi}]",
  
+                CASE, // make sure binary search finds limits
+                "[a\uFF3A]",
+                "[aA\uFF3A\uFF5A]",
+
                  CASE,
                  "[a-z]","[A-Za-z\u017F\u212A]",
+
+                SIMPLE_CASE_INSENSITIVE,
+                "[a-z]","[A-Za-z\u017F\u212A]",
+
                  CASE,
                  "[abc]","[A-Ca-c]",
                  CASE,
                  "[ABC]","[A-Ca-c]",
+
+                CASE, "[i]", "[iI]",
+
+                CASE, "[\u0130]",          "[\u0130{i\u0307}]", // dotted I
+                CASE, "[{i\u0307}]",       "[\u0130{i\u0307}]", // i with dot
+
+                CASE, "[\u0131]",          "[\u0131]", // dotless i
+
+                CASE, "[\u0390]",          "[\u0390\u1FD3{\u03B9\u0308\u0301}]",
+
+                CASE, "[\u03c2]",          "[\u03a3\u03c2\u03c3]", // sigmas
+
+                CASE, "[\u03f2]",          "[\u03f2\u03f9]", // lunate sigmas
+
+                CASE, "[\u03f7]",          "[\u03f7\u03f8]",
+
+                CASE, "[\u1fe3]",          "[\u03b0\u1fe3{\u03c5\u0308\u0301}]",
+
+                CASE, "[\ufb05]",          "[\ufb05\ufb06{st}]",
+                CASE, "[{st}]",             "[\ufb05\ufb06{st}]",
+
+                CASE, "[\\U0001044F]",      "[\\U00010427\\U0001044F]",
+
+                CASE, "[{a\u02BE}]",       "[\u1E9A{a\u02BE}]", // first in sorted table
+
+                CASE, "[{\u1f7c\u03b9}]", "[\u1ff2{\u1f7c\u03b9}]", // last in sorted table
+
+                CASE_MAPPINGS,
+                "[aq\u00DF{Bc}{bC}{Fi}]",
+                "[aAqQ\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
+
+                CASE_MAPPINGS,
+                "[\u01F1]", // 'DZ'
+                "[\u01F1\u01F2\u01F3]",
+
+                CASE_MAPPINGS,
+                "[a-z]",
+                "[A-Za-z]",
          };
  
          UnicodeSet s = new UnicodeSet();
          UnicodeSet t = new UnicodeSet();
          for (int i=0; i<DATA.length; i+=3) {
              int selector = Integer.parseInt(DATA[i]);
-            String pat = DATA[i+1];
-            String exp = DATA[i+2];
+            String pat = Utility.unescape(DATA[i+1]);
+            String exp = Utility.unescape(DATA[i+2]);
              s.applyPattern(pat);
              s.closeOver(selector);
              t.applyPattern(exp);
@@ -1371,6 +1433,149 @@ public class UnicodeSetTest extends TestFmwk {
          expectContainment(s, "abcABC", "defDEF");
          s = new UnicodeSet("[^abc]", UnicodeSet.CASE);
          expectContainment(s, "defDEF", "abcABC");
+        s = new UnicodeSet("[abck]", UnicodeSet.ADD_CASE_MAPPINGS);
+        expectContainment(s, "abckABCK", "defDEF\u212A");
+    }
+
+    private void add(Map<Integer, Collection<Integer>> closure, Integer c, Integer t) {
+        Collection<Integer> values = closure.get(c);
+        if (values == null) {  // first mapping for c: create its value collection
+            values = new TreeSet<>();
+            closure.put(c, values);
+        }
+        values.add(t);  // record the mapping c->t
+    }
+
+    private void addIfAbsent(Map<Integer, Collection<Integer>> closure, Integer c, Integer t,
+            Map<Integer, Collection<Integer>> additions) {
+        Collection<Integer> values = closure.get(c);
+        if (values == null || !values.contains(t)) {  // c->t is not yet in closure
+            if (additions != closure) {  // separate output map: look up c there instead
+                values = additions.get(c);
+            }
+            if (values == null) {
+                values = new TreeSet<>();
+                additions.put(c, values);
+            }
+            values.add(t);  // record c->t in additions
+        }
+    }
+
+    @Test
+    public void TestCloseOverSimpleCaseFolding() {
+        UnicodeSet sensitive = CharacterProperties.getBinaryPropertySet(UProperty.CASE_SENSITIVE);
+        // Compute the scf=Simple_Case_Folding closure:
+        // For each scf(c)=t, start with mappings c->t and t->c.
+
+        // Poor man's multimap from code points to code points.
+        Map<Integer, Collection<Integer>> closure = new HashMap<>();
+        UnicodeSetIterator iter = new UnicodeSetIterator(sensitive);
+        while (iter.next()) {
+            int c = iter.codepoint;
+            int scfChar = UCharacter.foldCase(c, UCharacter.FOLD_CASE_DEFAULT);
+            if (scfChar != c) {
+                add(closure, c, scfChar);
+                add(closure, scfChar, c);
+            }
+        }
+        // Complete the closure: Add mappings of mappings.
+        Map<Integer, Collection<Integer>> additions = new HashMap<>();
+        for (;;) {
+            // for each mapping c->t
+            for (Map.Entry<Integer, Collection<Integer>> entry : closure.entrySet()) {
+                Integer c = entry.getKey();
+                Collection<Integer> cValues = entry.getValue();
+                for (Integer t : cValues) {
+                    // enumerate each t->u
+                    Collection<Integer> tValues = closure.get(t);
+                    if (tValues != null) {
+                        for (Integer u : tValues) {
+                            if (!u.equals(c)) {
+                                addIfAbsent(closure, c, u, additions);
+                                addIfAbsent(closure, u, c, additions);
+                            }
+                        }
+                    }
+                }
+
+            }
+            if (additions.isEmpty()) {
+                break;  // The closure is complete.
+            }
+            // Add all of the additions back into the closure.
+            for (Map.Entry<Integer, Collection<Integer>> entry : additions.entrySet()) {
+                Integer c = entry.getKey();
+                Collection<Integer> cValues = entry.getValue();
+                Collection<Integer> closureValues = closure.get(c);
+                if (closureValues == null) {
+                    closureValues = new TreeSet<>();
+                    closure.put(c, closureValues);
+                }
+                closureValues.addAll(cValues);
+            }
+            additions.clear();
+        }
+        // Compare closeOver(UnicodeSet.SIMPLE_CASE_INSENSITIVE) with an unoptimized implementation.
+        // Here we focus on single code points as input.
+        // Other examples, including strings, are tested in TestCloseOver().
+        int errors = 0;
+        iter.reset();
+        UnicodeSet set = new UnicodeSet(), expected = new UnicodeSet();
+        while (iter.next()) {
+            int c = iter.codepoint;
+            // closeOver()
+            set.clear().add(c);
+            set.closeOver(UnicodeSet.SIMPLE_CASE_INSENSITIVE);
+            // From-first-principles implementation.
+            expected.clear().add(c);
+            Collection<Integer> values = closure.get(c);
+            if (values != null) {
+                for (Integer t : values) {
+                    expected.add(t);
+                }
+            }
+            // compare
+            if (!checkEqual(expected, set, "closeOver() vs. test impl")) {
+                errln("    c=U+" + Utility.hex(c));
+                if (++errors == 10) {  // stop after 10 mismatches
+                    break;
+                }
+            }
+        }
+    }
+
+    @Test
+    public void TestCloseOverLargeSets() {
+        // Check that an optimization for large sets does not change the result.
+
+        // Most code points except ones that are boring for case mappings.
+        UnicodeSet manyCp = new UnicodeSet("[^[:C:][:Ideographic:][:Hang:]]");
+        // Main Unihan block.
+        int LARGE_START = 0x4E00;
+        int LARGE_END = 0x9FFF;
+
+        int OPTIONS[] = {
+            UnicodeSet.CASE_INSENSITIVE, UnicodeSet.ADD_CASE_MAPPINGS,
+            UnicodeSet.SIMPLE_CASE_INSENSITIVE
+        };
+        UnicodeSet input = new UnicodeSet(), small, large;
+        for (int option : OPTIONS) {
+            UnicodeSetIterator iter = new UnicodeSetIterator(manyCp);
+            while (iter.next()) {
+                int c = iter.codepoint;
+                input.clear().add(c);
+                small = (UnicodeSet) input.clone();
+                small.closeOver(option);
+                large = (UnicodeSet) input.clone();
+                large.add(LARGE_START, LARGE_END);  // pad the input with the Unihan range
+                large.closeOver(option);
+                large.remove(LARGE_START, LARGE_END);  // remove the padding again before comparing
+                if (!checkEqual(small, large, "small != large")) {
+                    errln("    option=" + option + " c=U+" + Utility.hex(c));
+                    break;  // report only the first mismatch for this option
+                }
+            }
+        }
      }
  
      @Test
@@ -1709,8 +1914,8 @@ public class UnicodeSetTest extends TestFmwk {
              test2.add("a" + (max - i)); // add in reverse order
          }
          assertNotEquals("compare iterable test", test1, test2);
-        TreeSet<CharSequence> sortedTest1 = new TreeSet<CharSequence>(test1);
-        TreeSet<CharSequence> sortedTest2 = new TreeSet<CharSequence>(test2);
+        TreeSet<CharSequence> sortedTest1 = new TreeSet<>(test1);
+        TreeSet<CharSequence> sortedTest2 = new TreeSet<>(test2);
          assertEquals("compare iterable test", sortedTest1, sortedTest2);
      }
author	Markus Scherer <markus.icu@gmail.com>
	Thu, 2 Mar 2023 00:25:11 +0000 (00:25 +0000)
committer	Markus Scherer <markus.icu@gmail.com>
	Thu, 2 Mar 2023 16:12:57 +0000 (08:12 -0800)
icu4c/source/common/characterproperties.cpp		patch \| blob \| history
icu4c/source/common/ucase.cpp		patch \| blob \| history
icu4c/source/common/ucase.h		patch \| blob \| history
icu4c/source/common/unicode/uniset.h		patch \| blob \| history
icu4c/source/common/unicode/uset.h		patch \| blob \| history
icu4c/source/common/uniset_closure.cpp		patch \| blob \| history
icu4c/source/common/uniset_props.cpp		patch \| blob \| history
icu4c/source/common/uprops.h		patch \| blob \| history
icu4c/source/common/uset_imp.h		patch \| blob \| history
icu4c/source/test/intltest/usettest.cpp		patch \| blob \| history
icu4c/source/test/intltest/usettest.h		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/UCaseProps.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java		patch \| blob \| history