From: Markus Scherer Date: Thu, 2 Mar 2023 00:25:11 +0000 (+0000) Subject: ICU-6065 UnicodeSet::closeOver(simple case folding) X-Git-Tag: cldr/2023-03-13~13 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=79ab90b5f9373935b8ae1308cf1148fdc1a04d7b;p=icu ICU-6065 UnicodeSet::closeOver(simple case folding) See #2322 --- diff --git a/icu4c/source/common/characterproperties.cpp b/icu4c/source/common/characterproperties.cpp index 470e050479f..978e6761cee 100644 --- a/icu4c/source/common/characterproperties.cpp +++ b/icu4c/source/common/characterproperties.cpp @@ -377,22 +377,30 @@ UCPMap *makeMap(UProperty property, UErrorCode &errorCode) { } // namespace -U_NAMESPACE_USE +U_NAMESPACE_BEGIN -U_CAPI const USet * U_EXPORT2 -u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode) { - if (U_FAILURE(*pErrorCode)) { return nullptr; } +const UnicodeSet *CharacterProperties::getBinaryPropertySet(UProperty property, UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { return nullptr; } if (property < 0 || UCHAR_BINARY_LIMIT <= property) { - *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; + errorCode = U_ILLEGAL_ARGUMENT_ERROR; return nullptr; } Mutex m(&cpMutex); UnicodeSet *set = sets[property]; if (set == nullptr) { - sets[property] = set = makeSet(property, *pErrorCode); + sets[property] = set = makeSet(property, errorCode); } - if (U_FAILURE(*pErrorCode)) { return nullptr; } - return set->toUSet(); + return set; +} + +U_NAMESPACE_END + +U_NAMESPACE_USE + +U_CAPI const USet * U_EXPORT2 +u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode) { + const UnicodeSet *set = CharacterProperties::getBinaryPropertySet(property, *pErrorCode); + return U_SUCCESS(*pErrorCode) ? set->toUSet() : nullptr; } U_CAPI const UCPMap * U_EXPORT2 diff --git a/icu4c/source/common/ucase.cpp b/icu4c/source/common/ucase.cpp index de5e046fb03..392e1266ae4 100644 --- a/icu4c/source/common/ucase.cpp +++ b/icu4c/source/common/ucase.cpp @@ -205,37 +205,7 @@ static const char16_t iDotTilde[3] = { 0x69, 0x307, 0x303 }; U_CFUNC void U_EXPORT2 ucase_addCaseClosure(UChar32 c, const USetAdder *sa) { - uint16_t props; - - /* - * Hardcode the case closure of i and its relatives and ignore the - * data file data for these characters. - * The Turkic dotless i and dotted I with their case mapping conditions - * and case folding option make the related characters behave specially. - * This code matches their closure behavior to their case folding behavior. - */ - - switch(c) { - case 0x49: - /* regular i and I are in one equivalence class */ - sa->add(sa->set, 0x69); - return; - case 0x69: - sa->add(sa->set, 0x49); - return; - case 0x130: - /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */ - sa->addString(sa->set, iDot, 2); - return; - case 0x131: - /* dotless i is in a class by itself */ - return; - default: - /* otherwise use the data file data */ - break; - } - - props=UTRIE2_GET16(&ucase_props_singleton.trie, c); + uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c); if(!UCASE_HAS_EXCEPTION(props)) { if(UCASE_GET_TYPE(props)!=UCASE_NONE) { /* add the one simple case mapping, no matter what type it is */ @@ -249,19 +219,42 @@ ucase_addCaseClosure(UChar32 c, const USetAdder *sa) { * c has exceptions, so there may be multiple simple and/or * full case mappings. Add them all. */ - const uint16_t *pe0, *pe=GET_EXCEPTIONS(&ucase_props_singleton, props); - const char16_t *closure; + const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props); uint16_t excWord=*pe++; - int32_t idx, closureLength, fullLength, length; - - pe0=pe; + const uint16_t *pe0=pe; + + // Hardcode the case closure of i and its relatives and ignore the + // data file data for these characters. + // The Turkic dotless i and dotted I with their case mapping conditions + // and case folding option make the related characters behave specially. + // This code matches their closure behavior to their case folding behavior. + if (excWord&UCASE_EXC_CONDITIONAL_FOLD) { + // These characters have Turkic case foldings. Hardcode their closure. + if (c == 0x49) { + // Regular i and I are in one equivalence class. + sa->add(sa->set, 0x69); + return; + } else if (c == 0x130) { + // Dotted I is in a class with <0069 0307> + // (for canonical equivalence with <0049 0307>). + sa->addString(sa->set, iDot, 2); + return; + } + } else if (c == 0x69) { + sa->add(sa->set, 0x49); + return; + } else if (c == 0x131) { + // Dotless i is in a class by itself. + return; + } /* add all simple case mappings */ - for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) { + for(int32_t idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) { if(HAS_SLOT(excWord, idx)) { pe=pe0; - GET_SLOT_VALUE(excWord, idx, pe, c); - sa->add(sa->set, c); + UChar32 mapping; + GET_SLOT_VALUE(excWord, idx, pe, mapping); + sa->add(sa->set, mapping); } } if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) { @@ -272,6 +265,8 @@ ucase_addCaseClosure(UChar32 c, const USetAdder *sa) { } /* get the closure string pointer & length */ + const char16_t *closure; + int32_t closureLength; if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) { pe=pe0; GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength); @@ -285,6 +280,7 @@ ucase_addCaseClosure(UChar32 c, const USetAdder *sa) { /* add the full case folding */ if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) { pe=pe0; + int32_t fullLength; GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength); /* start of full case mapping strings */ @@ -297,7 +293,7 @@ ucase_addCaseClosure(UChar32 c, const USetAdder *sa) { fullLength>>=4; /* add the full case folding string */ - length=fullLength&0xf; + int32_t length=fullLength&0xf; if(length!=0) { sa->addString(sa->set, (const char16_t *)pe, length); pe+=length; @@ -313,9 +309,146 @@ ucase_addCaseClosure(UChar32 c, const USetAdder *sa) { } /* add each code point in the closure string */ - for(idx=0; idxadd(sa->set, c); + for(int32_t idx=0; idxadd(sa->set, mapping); + } + } +} + +namespace { + +/** + * Add the simple case closure mapping, + * except if there is not actually an scf relationship between the two characters. + * TODO: Unicode should probably add the corresponding scf mappings. + * See https://crbug.com/v8/13377 and Unicode-internal PAG issue #23. + * If & when those scf mappings are added, we should be able to remove all of these exceptions. + */ +void addOneSimpleCaseClosure(UChar32 c, UChar32 t, const USetAdder *sa) { + switch (c) { + case 0x0390: + if (t == 0x1FD3) { return; } + break; + case 0x03B0: + if (t == 0x1FE3) { return; } + break; + case 0x1FD3: + if (t == 0x0390) { return; } + break; + case 0x1FE3: + if (t == 0x03B0) { return; } + break; + case 0xFB05: + if (t == 0xFB06) { return; } + break; + case 0xFB06: + if (t == 0xFB05) { return; } + break; + default: + break; + } + sa->add(sa->set, t); +} + +} // namespace + +U_CFUNC void U_EXPORT2 +ucase_addSimpleCaseClosure(UChar32 c, const USetAdder *sa) { + uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c); + if(!UCASE_HAS_EXCEPTION(props)) { + if(UCASE_GET_TYPE(props)!=UCASE_NONE) { + /* add the one simple case mapping, no matter what type it is */ + int32_t delta=UCASE_GET_DELTA(props); + if(delta!=0) { + sa->add(sa->set, c+delta); + } + } + } else { + // c has exceptions. Add the mappings relevant for scf=Simple_Case_Folding. + const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props); + uint16_t excWord=*pe++; + const uint16_t *pe0=pe; + + // Hardcode the case closure of i and its relatives and ignore the + // data file data for these characters, like in ucase_addCaseClosure(). + if (excWord&UCASE_EXC_CONDITIONAL_FOLD) { + // These characters have Turkic case foldings. Hardcode their closure. + if (c == 0x49) { + // Regular i and I are in one equivalence class. + sa->add(sa->set, 0x69); + return; + } else if (c == 0x130) { + // For scf=Simple_Case_Folding, dotted I is in a class by itself. + return; + } + } else if (c == 0x69) { + sa->add(sa->set, 0x49); + return; + } else if (c == 0x131) { + // Dotless i is in a class by itself. + return; + } + + // Add all simple case mappings. + for(int32_t idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) { + if(HAS_SLOT(excWord, idx)) { + pe=pe0; + UChar32 mapping; + GET_SLOT_VALUE(excWord, idx, pe, mapping); + addOneSimpleCaseClosure(c, mapping, sa); + } + } + if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) { + pe=pe0; + int32_t delta; + GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta); + UChar32 mapping = (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta; + addOneSimpleCaseClosure(c, mapping, sa); + } + + /* get the closure string pointer & length */ + const char16_t *closure; + int32_t closureLength; + if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) { + pe=pe0; + GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength); + closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */ + closure=(const char16_t *)pe+1; /* behind this slot, unless there are full case mappings */ + } else { + closureLength=0; + closure=nullptr; + } + + // Skip the full case mappings. + if(closureLength > 0 && HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) { + pe=pe0; + int32_t fullLength; + GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength); + + /* start of full case mapping strings */ + ++pe; + + fullLength&=0xffff; /* bits 16 and higher are reserved */ + + // Skip all 4 full case mappings. + pe+=fullLength&UCASE_FULL_LOWER; + fullLength>>=4; + pe+=fullLength&0xf; + fullLength>>=4; + pe+=fullLength&0xf; + fullLength>>=4; + pe+=fullLength; + + closure=(const char16_t *)pe; /* behind full case mappings */ + } + + // Add each code point in the closure string whose scf maps back to c. + for(int32_t idx=0; idxU_ILLEGAL_ARGUMENT_ERROR if the pattern @@ -450,7 +452,9 @@ public: * @param pos on input, the position in pattern at which to start parsing. * On output, the position after the last character parsed. * @param options bitmask for options to apply to the pattern. - * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. + * Valid options are USET_IGNORE_SPACE and + * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE. + * These case options are mutually exclusive. * @param symbols a symbol table mapping variable names to values * and stand-in characters to UnicodeSets; may be nullptr * @param status input-output error code @@ -645,7 +649,9 @@ public: * A frozen set will not be modified. * @param pattern a string specifying what characters are in the set * @param options bitmask for options to apply to the pattern. - * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. + * Valid options are USET_IGNORE_SPACE and + * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE. + * These case options are mutually exclusive. * @param symbols a symbol table mapping variable names to * values and stand-ins to UnicodeSets; may be nullptr * @param status returns U_ILLEGAL_ARGUMENT_ERROR if the pattern @@ -683,7 +689,9 @@ public: * pattern.length() if the closing ']' is the last character of * the pattern string. * @param options bitmask for options to apply to the pattern. - * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. + * Valid options are USET_IGNORE_SPACE and + * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE. + * These case options are mutually exclusive. * @param symbols a symbol table mapping variable names to * values and stand-ins to UnicodeSets; may be nullptr * @param status returns U_ILLEGAL_ARGUMENT_ERROR if the pattern @@ -1390,7 +1398,7 @@ public: /** * Close this set over the given attribute. For the attribute - * USET_CASE, the result is to modify this set so that: + * USET_CASE_INSENSITIVE, the result is to modify this set so that: * * 1. For each character or string 'a' in this set, all strings or * characters 'b' such that foldCase(a) == foldCase(b) are added @@ -1408,8 +1416,10 @@ public: * A frozen set will not be modified. * * @param attribute bitmask for attributes to close over. - * Currently only the USET_CASE bit is supported. Any undefined bits - * are ignored. + * Valid options: + * At most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE. + * These case options are mutually exclusive. + * Unrelated options bits are ignored. * @return a reference to this set. * @stable ICU 4.2 */ @@ -1579,6 +1589,9 @@ private: int32_t depth, UErrorCode& ec); + void closeOverCaseInsensitive(bool simple); + void closeOverAddCaseMappings(); + //---------------------------------------------------------------- // Implementation: Utility methods //---------------------------------------------------------------- diff --git a/icu4c/source/common/unicode/uset.h b/icu4c/source/common/unicode/uset.h index 5dd890e148d..ee4e0036d22 100644 --- a/icu4c/source/common/unicode/uset.h +++ b/icu4c/source/common/unicode/uset.h @@ -53,6 +53,12 @@ typedef struct USet USet; /** * Bitmask values to be passed to uset_openPatternOptions() or * uset_applyPattern() taking an option parameter. + * + * Use at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE. + * These case options are mutually exclusive. + * + * Undefined options bits are ignored, and reserved for future use. + * * @stable ICU 2.4 */ enum { @@ -60,13 +66,13 @@ enum { * Ignore white space within patterns unless quoted or escaped. * @stable ICU 2.4 */ - USET_IGNORE_SPACE = 1, + USET_IGNORE_SPACE = 1, /** * Enable case insensitive matching. E.g., "[ab]" with this flag * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will * match all except 'a', 'A', 'b', and 'B'. This performs a full - * closure over case mappings, e.g. U+017F for s. + * closure over case mappings, e.g. 'ſ' (U+017F long s) for 's'. * * The resulting set is a superset of the input for the code points but * not for the strings. @@ -88,17 +94,36 @@ enum { * * @stable ICU 2.4 */ - USET_CASE_INSENSITIVE = 2, + USET_CASE_INSENSITIVE = 2, /** - * Enable case insensitive matching. E.g., "[ab]" with this flag - * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will - * match all except 'a', 'A', 'b', and 'B'. This adds the lower-, - * title-, and uppercase mappings as well as the case folding + * Adds all case mappings for each element in the set. + * This adds the full lower-, title-, and uppercase mappings as well as the full case folding * of each existing element in the set. + * + * Unlike the “case insensitive” options, this does not perform a closure. + * For example, it does not add 'ſ' (U+017F long s) for 's', + * 'K' (U+212A Kelvin sign) for 'k', or replace set strings by their case-folded versions. + * * @stable ICU 3.2 */ - USET_ADD_CASE_MAPPINGS = 4 + USET_ADD_CASE_MAPPINGS = 4, + +#ifndef U_HIDE_DRAFT_API + /** + * Enable case insensitive matching. + * Same as USET_CASE_INSENSITIVE but using only Simple_Case_Folding (scf) mappings, + * which map each code point to one code point, + * not full Case_Folding (cf) mappings, which map some code points to multiple code points. + * + * This is designed for case-insensitive matches, for example in certain + * regular expression implementations where only Simple_Case_Folding mappings are used, + * such as in ECMAScript (JavaScript) regular expressions. + * + * @draft ICU 73 + */ + USET_SIMPLE_CASE_INSENSITIVE = 6 +#endif // U_HIDE_DRAFT_API }; /** @@ -299,7 +324,9 @@ uset_openPattern(const UChar* pattern, int32_t patternLength, * @param patternLength the length of the pattern, or -1 if null * terminated * @param options bitmask for options to apply to the pattern. - * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. + * Valid options are USET_IGNORE_SPACE and + * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE. + * These case options are mutually exclusive. * @param ec the error code * @stable ICU 2.4 */ @@ -414,7 +441,10 @@ uset_set(USet* set, * The character at pattern[0] must be a '['. * @param patternLength The length of the UChar string. -1 if NUL terminated. * @param options A bitmask for options to apply to the pattern. - * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. + * Valid options are USET_IGNORE_SPACE and + * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, + * USET_SIMPLE_CASE_INSENSITIVE. + * These case options are mutually exclusive. * @param status Returns an error if the pattern cannot be parsed. * @return Upon successful parse, the value is either * the index of the character after the closing ']' @@ -804,7 +834,7 @@ uset_clear(USet* set); /** * Close this set over the given attribute. For the attribute - * USET_CASE, the result is to modify this set so that: + * USET_CASE_INSENSITIVE, the result is to modify this set so that: * * 1. For each character or string 'a' in this set, all strings or * characters 'b' such that foldCase(a) == foldCase(b) are added @@ -824,8 +854,10 @@ uset_clear(USet* set); * @param set the set * * @param attributes bitmask for attributes to close over. - * Currently only the USET_CASE bit is supported. Any undefined bits - * are ignored. + * Valid options: + * At most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE. + * These case options are mutually exclusive. + * Unrelated options bits are ignored. * @stable ICU 4.2 */ U_CAPI void U_EXPORT2 diff --git a/icu4c/source/common/uniset_closure.cpp b/icu4c/source/common/uniset_closure.cpp index 1f1fbcf9f8e..173a5cbaaef 100644 --- a/icu4c/source/common/uniset_closure.cpp +++ b/icu4c/source/common/uniset_closure.cpp @@ -25,9 +25,11 @@ #include "unicode/locid.h" #include "unicode/parsepos.h" #include "unicode/uniset.h" +#include "unicode/utf16.h" #include "cmemory.h" #include "ruleiter.h" #include "ucase.h" +#include "uprops.h" #include "util.h" #include "uvector.h" @@ -149,102 +151,208 @@ addCaseMapping(UnicodeSet &set, int32_t result, const char16_t *full, UnicodeStr // see ucase.h } +namespace { + +/** For case closure on a large set, look only at code points with relevant properties. */ +const UnicodeSet &maybeOnlyCaseSensitive(const UnicodeSet &src, UnicodeSet &subset) { + // The subset must have been constructed with all code points, + // so that the retainAll() intersection effectively copies all single code points from src. + U_ASSERT(subset.contains(0, 0x10ffff)); + if (src.size() < 30) { + return src; + } + // Return the intersection of the src code points with Case_Sensitive ones. + UErrorCode errorCode = U_ZERO_ERROR; + const UnicodeSet *sensitive = + CharacterProperties::getBinaryPropertySet(UCHAR_CASE_SENSITIVE, errorCode); + if (U_FAILURE(errorCode)) { + return src; + } + // Start by copying the "smaller" set. + // (We "copy" by intersecting all Unicode *code points* with the first set, + // which omits any strings.) + if (src.getRangeCount() > sensitive->getRangeCount()) { + subset.retainAll(*sensitive); + subset.retainAll(src); + } else { + subset.retainAll(src); + subset.retainAll(*sensitive); + } + return subset; +} + +// Per-character scf = Simple_Case_Folding of a string. +// (Normally when we case-fold a string we use full case foldings.) +bool scfString(const UnicodeString &s, UnicodeString &scf) { + // Iterate over the raw buffer for best performance. + const char16_t *p = s.getBuffer(); + int32_t length = s.length(); + // Loop while not needing modification. + for (int32_t i = 0; i < length;) { + UChar32 c; + U16_NEXT(p, i, length, c); // post-increments i + UChar32 scfChar = u_foldCase(c, U_FOLD_CASE_DEFAULT); + if (scfChar != c) { + // Copy the characters before c. + scf.setTo(p, i - U16_LENGTH(c)); + // Loop over the rest of the string and keep case-folding. + for (;;) { + scf.append(scfChar); + if (i == length) { + return true; + } + U16_NEXT(p, i, length, c); // post-increments i + scfChar = u_foldCase(c, U_FOLD_CASE_DEFAULT); + } + } + } + return false; +} + +} // namespace + UnicodeSet& UnicodeSet::closeOver(int32_t attribute) { if (isFrozen() || isBogus()) { return *this; } - if (attribute & (USET_CASE_INSENSITIVE | USET_ADD_CASE_MAPPINGS)) { - { - UnicodeSet foldSet(*this); - UnicodeString str; - USetAdder sa = { - foldSet.toUSet(), - _set_add, - _set_addRange, - _set_addString, - nullptr, // don't need remove() - nullptr // don't need removeRange() - }; - - // start with input set to guarantee inclusion - // USET_CASE: remove strings because the strings will actually be reduced (folded); - // therefore, start with no strings and add only those needed - if ((attribute & USET_CASE_INSENSITIVE) && foldSet.hasStrings()) { - foldSet.strings->removeAllElements(); - } + switch (attribute & USET_CASE_MASK) { + case 0: + break; + case USET_CASE_INSENSITIVE: + closeOverCaseInsensitive(/* simple= */ false); + break; + case USET_ADD_CASE_MAPPINGS: + closeOverAddCaseMappings(); + break; + case USET_SIMPLE_CASE_INSENSITIVE: + closeOverCaseInsensitive(/* simple= */ true); + break; + default: + // bad option (unreachable) + break; + } + return *this; +} + +void UnicodeSet::closeOverCaseInsensitive(bool simple) { + // Start with input set to guarantee inclusion. + UnicodeSet foldSet(*this); + // Full case mappings closure: + // Remove strings because the strings will actually be reduced (folded); + // therefore, start with no strings and add only those needed. + // Do this before processing code points, because they may add strings. + if (!simple && foldSet.hasStrings()) { + foldSet.strings->removeAllElements(); + } + + USetAdder sa = { + foldSet.toUSet(), + _set_add, + _set_addRange, + _set_addString, + nullptr, // don't need remove() + nullptr // don't need removeRange() + }; + + UnicodeSet subset(0, 0x10ffff); + const UnicodeSet &codePoints = maybeOnlyCaseSensitive(*this, subset); - int32_t n = getRangeCount(); - UChar32 result; - const char16_t *full; - - for (int32_t i=0; isize(); ++j) { + const UnicodeString *pStr = (const UnicodeString *) strings->elementAt(j); + if (simple) { + if (scfString(*pStr, str)) { + foldSet.remove(*pStr).add(str); + } + } else { + str = *pStr; + str.foldCase(); + if(!ucase_addStringCaseClosure(str.getBuffer(), str.length(), &sa)) { + foldSet.add(str); // does not map to code points: add the folded string itself } } - if (hasStrings()) { - if (attribute & USET_CASE_INSENSITIVE) { - for (int32_t j=0; jsize(); ++j) { - str = *(const UnicodeString *) strings->elementAt(j); - str.foldCase(); - if(!ucase_addStringCaseClosure(str.getBuffer(), str.length(), &sa)) { - foldSet.add(str); // does not map to code points: add the folded string itself - } - } - } else { - Locale root(""); -#if !UCONFIG_NO_BREAK_ITERATION - UErrorCode status = U_ZERO_ERROR; - BreakIterator *bi = BreakIterator::createWordInstance(root, status); - if (U_SUCCESS(status)) { -#endif - const UnicodeString *pStr; + } + } + *this = foldSet; +} + +void UnicodeSet::closeOverAddCaseMappings() { + // Start with input set to guarantee inclusion. + UnicodeSet foldSet(*this); + + UnicodeSet subset(0, 0x10ffff); + const UnicodeSet &codePoints = maybeOnlyCaseSensitive(*this, subset); - for (int32_t j=0; jsize(); ++j) { - pStr = (const UnicodeString *) strings->elementAt(j); - (str = *pStr).toLower(root); - foldSet.add(str); + // Iterate over the ranges of single code points. Nested loop for each code point. + int32_t n = codePoints.getRangeCount(); + UChar32 result; + const char16_t *full; + UnicodeString str; + + for (int32_t i=0; isize(); ++j) { + const UnicodeString *pStr = (const UnicodeString *) strings->elementAt(j); + (str = *pStr).toLower(root); + foldSet.add(str); #if !UCONFIG_NO_BREAK_ITERATION - } - delete bi; + (str = *pStr).toTitle(bi, root); + foldSet.add(str); #endif - } + (str = *pStr).toUpper(root); + foldSet.add(str); + (str = *pStr).foldCase(); + foldSet.add(str); } - *this = foldSet; +#if !UCONFIG_NO_BREAK_ITERATION } + delete bi; +#endif } - return *this; + *this = foldSet; } U_NAMESPACE_END diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp index 3f6a154f802..bb6ce27444c 100644 --- a/icu4c/source/common/uniset_props.cpp +++ b/icu4c/source/common/uniset_props.cpp @@ -631,11 +631,8 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars, * to close over case BEFORE COMPLEMENTING. This makes * patterns like /[^abc]/i work. */ - if ((options & USET_CASE_INSENSITIVE) != 0) { - (this->*caseClosure)(USET_CASE_INSENSITIVE); - } - else if ((options & USET_ADD_CASE_MAPPINGS) != 0) { - (this->*caseClosure)(USET_ADD_CASE_MAPPINGS); + if ((options & USET_CASE_MASK) != 0) { + (this->*caseClosure)(options); } if (invert) { complement().removeAllStrings(); // code point complement diff --git a/icu4c/source/common/uprops.h b/icu4c/source/common/uprops.h index 2004394db64..1e06d035192 100644 --- a/icu4c/source/common/uprops.h +++ b/icu4c/source/common/uprops.h @@ -441,6 +441,7 @@ class CharacterProperties { public: CharacterProperties() = delete; static const UnicodeSet *getInclusionsForProperty(UProperty prop, UErrorCode &errorCode); + static const UnicodeSet *getBinaryPropertySet(UProperty property, UErrorCode &errorCode); }; // implemented in uniset_props.cpp diff --git a/icu4c/source/common/uset_imp.h b/icu4c/source/common/uset_imp.h index 7233b9303c3..77197aaed77 100644 --- a/icu4c/source/common/uset_imp.h +++ b/icu4c/source/common/uset_imp.h @@ -58,5 +58,14 @@ typedef struct USetAdder USetAdder; U_CDECL_END -#endif +#ifdef __cplusplus + +namespace { + +constexpr int32_t USET_CASE_MASK = USET_CASE_INSENSITIVE | USET_ADD_CASE_MAPPINGS; +} // namespace + +#endif // __cplusplus + +#endif diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index 5bf3ab12f49..2ee67e93a4a 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -14,6 +14,7 @@ #include #include +#include #include "unicode/utypes.h" #include "usettest.h" #include "unicode/ucnv.h" @@ -85,6 +86,8 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec, TESTCASE_AUTO(TestStrings); TESTCASE_AUTO(Testj2268); TESTCASE_AUTO(TestCloseOver); + TESTCASE_AUTO(TestCloseOverSimpleCaseFolding); + TESTCASE_AUTO(TestCloseOverLargeSets); TESTCASE_AUTO(TestEscapePattern); TESTCASE_AUTO(TestInvalidCodePoint); TESTCASE_AUTO(TestSymbolTable); @@ -1243,27 +1246,38 @@ void UnicodeSetTest::TestIndexOf() { * Test closure API. */ void UnicodeSetTest::TestCloseOver() { - UErrorCode ec = U_ZERO_ERROR; - - char CASE[] = {(char)USET_CASE_INSENSITIVE}; - char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS}; - const char* DATA[] = { + static constexpr char CASE[] = {(char)USET_CASE_INSENSITIVE}; + static constexpr char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS}; + static constexpr char SIMPLE_CASE_INSENSITIVE[] = {(char)USET_SIMPLE_CASE_INSENSITIVE}; + static const char* DATA[] = { // selector, input, output CASE, "[aq\\u00DF{Bc}{bC}{Fi}]", "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]", // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1 + SIMPLE_CASE_INSENSITIVE, + "[aq\\u00DF{Bc}{bC}{Fi}]", + "[aAqQ\\u00DF\\u1E9E{bc}{fi}]", + CASE, "[\\u01F1]", // 'DZ' "[\\u01F1\\u01F2\\u01F3]", + SIMPLE_CASE_INSENSITIVE, + "[\\u01F1]", // 'DZ' + "[\\u01F1\\u01F2\\u01F3]", + CASE, "[\\u1FB4]", "[\\u1FB4{\\u03AC\\u03B9}]", + SIMPLE_CASE_INSENSITIVE, + "[\\u1FB4]", + "[\\u1FB4]", + CASE, "[{F\\uFB01}]", - "[\\uFB03{ffi}]", + "[\\uFB03{ffi}]", CASE, // make sure binary search finds limits "[a\\uFF3A]", @@ -1271,6 +1285,10 @@ void UnicodeSetTest::TestCloseOver() { CASE, "[a-z]","[A-Za-z\\u017F\\u212A]", + + SIMPLE_CASE_INSENSITIVE, + "[a-z]","[A-Za-z\\u017F\\u212A]", + CASE, "[abc]","[A-Ca-c]", CASE, @@ -1311,7 +1329,7 @@ void UnicodeSetTest::TestCloseOver() { CASE_MAPPINGS, "[\\u01F1]", // 'DZ' "[\\u01F1\\u01F2\\u01F3]", - + CASE_MAPPINGS, "[a-z]", "[A-Za-z]", @@ -1326,6 +1344,8 @@ void UnicodeSetTest::TestCloseOver() { int32_t selector = DATA[i][0]; UnicodeString pat(DATA[i+1], -1, US_INV); UnicodeString exp(DATA[i+2], -1, US_INV); + + UErrorCode ec = U_ZERO_ERROR; s.applyPattern(pat, ec); s.closeOver(selector); t.applyPattern(exp, ec); @@ -1341,68 +1361,8 @@ void UnicodeSetTest::TestCloseOver() { } } -#if 0 - /* - * Unused test code. - * This was used to compare the old implementation (using USET_CASE) - * with the new one (using 0x100 temporarily) - * while transitioning from hardcoded case closure tables in uniset.cpp - * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu. - * and using ucase.c functions for closure. - * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file - * - * Note: The old and new implementation never fully matched because - * the old implementation turned out to not map U+0130 and U+0131 correctly - * (dotted I and dotless i) and because the old implementation's data tables - * were outdated compared to Unicode 4.0.1 at the time of the change to the - * new implementation. (So sigmas and some other characters were not handled - * according to the newer Unicode version.) - */ - UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2; - UnicodeSetIterator si(sens); - UnicodeString str, buf2; - const UnicodeString *pStr; - UChar32 c; - while(si.next()) { - if(!si.isString()) { - c=si.getCodepoint(); - s.clear(); - s.add(c); - - str.setTo(c); - str.foldCase(); - sens2.add(str); - - t=s; - s.closeOver(USET_CASE); - t.closeOver(0x100); - if(s!=t) { - errln("FAIL: closeOver(U+%04x) differs: ", c); - errln((UnicodeString)"old "+s.toPattern(buf, true)+" new: "+t.toPattern(buf2, true)); - } - } - } - // remove all code points - // should contain all full case folding mapping strings - sens2.remove(0, 0x10ffff); - si.reset(sens2); - while(si.next()) { - if(si.isString()) { - pStr=&si.getString(); - s.clear(); - s.add(*pStr); - t=s2=s; - s.closeOver(USET_CASE); - t.closeOver(0x100); - if(s!=t) { - errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, true)+") differs: "); - errln((UnicodeString)"old "+s.toPattern(buf, true)+" new: "+t.toPattern(buf2, true)); - } - } - } -#endif - // Test the pattern API + UErrorCode ec = U_ZERO_ERROR; s.applyPattern("[abc]", USET_CASE_INSENSITIVE, nullptr, ec); if (U_FAILURE(ec)) { errln("FAIL: applyPattern failed"); @@ -1423,6 +1383,123 @@ void UnicodeSetTest::TestCloseOver() { } } +namespace { + +void addIfAbsent(const std::unordered_multimap &closure, UChar32 c, UChar32 t, + std::unordered_multimap &additions) { + for (auto it = closure.find(c);; ++it) { + if (it == closure.end() || it->first != c) { + // absent + additions.insert({c, t}); + break; + } else if (it->second == t) { + // present + break; + } + } +} + +} // namespace + +void UnicodeSetTest::TestCloseOverSimpleCaseFolding() { + IcuTestErrorCode errorCode(*this, "TestCloseOverSimpleCaseFolding"); + const UnicodeSet *sensitive = + UnicodeSet::fromUSet(u_getBinaryPropertySet(UCHAR_CASE_SENSITIVE, errorCode)); + if (errorCode.errIfFailureAndReset("u_getBinaryPropertySet(UCHAR_CASE_SENSITIVE) failed")) { + return; + } + // Compute the scf=Simple_Case_Folding closure: + // For each scf(c)=t, start with mappings c->t and t->c. + std::unordered_multimap closure; + UnicodeSetIterator iter(*sensitive); + while (iter.next()) { + UChar32 c = iter.getCodepoint(); + UChar32 scfChar = u_foldCase(c, U_FOLD_CASE_DEFAULT); + if (scfChar != c) { + closure.insert({c, scfChar}); + closure.insert({scfChar, c}); + } + } + // Complete the closure: Add mappings of mappings. + for (;;) { + std::unordered_multimap additions; + // for each mapping c->t + for (auto mapping : closure) { + UChar32 c = mapping.first; + UChar32 t = mapping.second; + // enumerate each t->u + for (auto it = closure.find(t); it != closure.end() && it->first == t; ++it) { + UChar32 u = it->second; + if (u != c) { + addIfAbsent(closure, c, u, additions); + addIfAbsent(closure, u, c, additions); + } + } + } + if (additions.empty()) { + break; // The closure is complete. + } + closure.insert(additions.begin(), additions.end()); + } + // Compare closeOver(USET_SIMPLE_CASE_INSENSITIVE) with an unoptimized implementation. + // Here we focus on single code points as input. + // Other examples, including strings, are tested in TestCloseOver(). + int32_t errors = 0; + iter.reset(); + UnicodeSet set, expected; + while (iter.next()) { + UChar32 c = iter.getCodepoint(); + // closeOver() + set.clear().add(c); + set.closeOver(USET_SIMPLE_CASE_INSENSITIVE); + // From-first-principles implementation. + expected.clear().add(c); + for (auto it = closure.find(c); it != closure.end() && it->first == c; ++it) { + expected.add(it->second); + } + // compare + if (!checkEqual(expected, set, "closeOver() vs. test impl")) { + errln(" c=U+%04X", c); + if (++errors == 10) { + break; + } + } + } +} + +void UnicodeSetTest::TestCloseOverLargeSets() { + IcuTestErrorCode errorCode(*this, "TestCloseOverLargeSets"); + // Check that an optimization for large sets does not change the result. + + // Most code points except ones that are boring for case mappings. + UnicodeSet manyCp(u"[^[:C:][:Ideographic:][:Hang:]]", errorCode); + // Main Unihan block. + constexpr UChar32 LARGE_START = 0x4E00; + constexpr UChar32 LARGE_END = 0x9FFF; + + static constexpr int32_t OPTIONS[] = { + USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE + }; + UnicodeSet input, small, large; + for (int32_t option : OPTIONS) { + UnicodeSetIterator iter(manyCp); + while (iter.next()) { + UChar32 c = iter.getCodepoint(); + input.clear().add(c); + small = input; + small.closeOver(option); + large = input; + large.add(LARGE_START, LARGE_END); + large.closeOver(option); + large.remove(LARGE_START, LARGE_END); + if (!checkEqual(small, large, "small != large")) { + errln(" option=%d c=U+%04X", option, c); + break; + } + } + } +} + void UnicodeSetTest::TestEscapePattern() { const char pattern[] = "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]"; diff --git a/icu4c/source/test/intltest/usettest.h b/icu4c/source/test/intltest/usettest.h index 9271edbb04e..3cb5dc14e8a 100644 --- a/icu4c/source/test/intltest/usettest.h +++ b/icu4c/source/test/intltest/usettest.h @@ -74,6 +74,8 @@ private: void TestExhaustive(void); void TestCloseOver(void); + void TestCloseOverSimpleCaseFolding(); + void TestCloseOverLargeSets(); void TestEscapePattern(void); diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/UCaseProps.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/UCaseProps.java index fced2b62986..aee590f4911 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/UCaseProps.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/UCaseProps.java @@ -260,34 +260,6 @@ public final class UCaseProps { * - for k include the Kelvin sign */ public final void addCaseClosure(int c, UnicodeSet set) { - /* - * Hardcode the case closure of i and its relatives and ignore the - * data file data for these characters. - * The Turkic dotless i and dotted I with their case mapping conditions - * and case folding option make the related characters behave specially. - * This code matches their closure behavior to their case folding behavior. - */ - - switch(c) { - case 0x49: - /* regular i and I are in one equivalence class */ - set.add(0x69); - return; - case 0x69: - set.add(0x49); - return; - case 0x130: - /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */ - set.add(iDot); - return; - case 0x131: - /* dotless i is in a class by itself */ - return; - default: - /* otherwise use the data file data */ - break; - } - int props=trie.get(c); if(!propsHasException(props)) { if(getTypeFromProps(props)!=NONE) { @@ -302,19 +274,41 @@ public final class UCaseProps { * c has exceptions, so there may be multiple simple and/or * full case mappings. Add them all. */ - int excOffset0, excOffset=getExceptionsOffset(props); - int closureOffset; + int excOffset=getExceptionsOffset(props); int excWord=exceptions.charAt(excOffset++); - int index, closureLength, fullLength, length; - - excOffset0=excOffset; + int excOffset0=excOffset; + + // Hardcode the case closure of i and its relatives and ignore the + // data file data for these characters. + // The Turkic dotless i and dotted I with their case mapping conditions + // and case folding option make the related characters behave specially. + // This code matches their closure behavior to their case folding behavior. + if ((excWord&EXC_CONDITIONAL_FOLD) != 0) { + // These characters have Turkic case foldings. Hardcode their closure. + if (c == 0x49) { + // Regular i and I are in one equivalence class. + set.add(0x69); + return; + } else if (c == 0x130) { + // Dotted I is in a class with <0069 0307> + // (for canonical equivalence with <0049 0307>). + set.add(iDot); + return; + } + } else if (c == 0x69) { + set.add(0x49); + return; + } else if (c == 0x131) { + // Dotless i is in a class by itself. + return; + } /* add all simple case mappings */ - for(index=EXC_LOWER; index<=EXC_TITLE; ++index) { + for(int index=EXC_LOWER; index<=EXC_TITLE; ++index) { if(hasSlot(excWord, index)) { excOffset=excOffset0; - c=getSlotValue(excWord, index, excOffset); - set.add(c); + int mapping=getSlotValue(excWord, index, excOffset); + set.add(mapping); } } if(hasSlot(excWord, EXC_DELTA)) { @@ -324,6 +318,7 @@ public final class UCaseProps { } /* get the closure string pointer & length */ + int closureOffset, closureLength; if(hasSlot(excWord, EXC_CLOSURE)) { excOffset=excOffset0; long value=getSlotValueAndOffset(excWord, EXC_CLOSURE, excOffset); @@ -338,7 +333,7 @@ public final class UCaseProps { if(hasSlot(excWord, EXC_FULL_MAPPINGS)) { excOffset=excOffset0; long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset); - fullLength=(int)value; + int fullLength=(int)value; /* start of full case mapping strings */ excOffset=(int)(value>>32)+1; @@ -350,7 +345,7 @@ public final class UCaseProps { fullLength>>=4; /* add the full case folding string */ - length=fullLength&0xf; + int length=fullLength&0xf; if(length!=0) { set.add(exceptions.substring(excOffset, excOffset+length)); excOffset+=length; @@ -367,9 +362,137 @@ public final class UCaseProps { /* add each code point in the closure string */ int limit=closureOffset+closureLength; - for(index=closureOffset; index>32)+1; /* behind this slot, unless there are full case mappings */ + } else { + closureLength=0; + closureOffset=0; + } + + // Skip the full case mappings. + if(closureLength > 0 && hasSlot(excWord, EXC_FULL_MAPPINGS)) { + excOffset=excOffset0; + long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset); + int fullLength=(int)value; + + /* start of full case mapping strings */ + excOffset=(int)(value>>32)+1; + + fullLength&=0xffff; /* bits 16 and higher are reserved */ + + // Skip all 4 full case mappings. + excOffset+=fullLength&FULL_LOWER; + fullLength>>=4; + excOffset+=fullLength&0xf; + fullLength>>=4; + excOffset+=fullLength&0xf; + fullLength>>=4; + excOffset+=fullLength; + + closureOffset=excOffset; /* behind full case mappings */ + } + + // Add each code point in the closure string whose scf maps back to c. + int limit=closureOffset+closureLength; + for(int index=closureOffset; index, Compa * for the syntax of the pattern language. * @param pattern a string specifying what characters are in the set * @param options a bitmask indicating which options to apply. - * Valid options are IGNORE_SPACE and CASE. + * Valid options are {@link #IGNORE_SPACE} and + * at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS}, + * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive. * @exception java.lang.IllegalArgumentException if the pattern contains * a syntax error. * @stable ICU 3.8 @@ -495,7 +497,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa * @param symbols a symbol table mapping variables to char[] arrays * and chars to UnicodeSets * @param options a bitmask indicating which options to apply. - * Valid options are IGNORE_SPACE and CASE. + * Valid options are {@link #IGNORE_SPACE} and + * at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS}, + * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive. * @exception java.lang.IllegalArgumentException if the pattern * contains a syntax error. * @stable ICU 3.2 @@ -587,7 +591,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa * See the class description for the syntax of the pattern language. * @param pattern a string specifying what characters are in the set * @param options a bitmask indicating which options to apply. - * Valid options are IGNORE_SPACE and CASE. + * Valid options are {@link #IGNORE_SPACE} and + * at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS}, + * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive. * @exception java.lang.IllegalArgumentException if the pattern * contains a syntax error. * @stable ICU 3.8 @@ -2584,8 +2590,10 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa * variables, or null if none. * @param rebuiltPat the pattern that was parsed, rebuilt or * copied from the input pattern, as appropriate. - * @param options a bit mask of zero or more of the following: - * IGNORE_SPACE, CASE. + * @param options a bit mask. + * Valid options are {@link #IGNORE_SPACE} and + * at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS}, + * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive. */ private void applyPattern(RuleCharacterIterator chars, SymbolTable symbols, Appendable rebuiltPat, int options, int depth) { @@ -2965,8 +2973,8 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa * to close over case BEFORE COMPLEMENTING. This makes * patterns like /[^abc]/i work. */ - if ((options & CASE) != 0) { - closeOver(CASE); + if ((options & CASE_MASK) != 0) { + closeOver(options); } if (invert) { complement().removeAllStrings(); // code point complement @@ -3861,58 +3869,81 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa public static final int IGNORE_SPACE = 1; /** - * Bitmask for constructor, applyPattern(), and closeOver() - * indicating letter case. This may be ORed together with other - * selectors. + * Alias for {@link #CASE_INSENSITIVE}. * + * @stable ICU 3.8 + */ + public static final int CASE = 2; + + /** * Enable case insensitive matching. E.g., "[ab]" with this flag * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will * match all except 'a', 'A', 'b', and 'B'. This performs a full - * closure over case mappings, e.g. U+017F for s. + * closure over case mappings, e.g. 'ſ' (U+017F long s) for 's'. * - * The resulting set is a superset of the input for the code points but + *

This value is an options bit set value for some + * constructors, applyPattern(), and closeOver(). + * It can be ORed together with other, unrelated options. + * + *

The resulting set is a superset of the input for the code points but * not for the strings. * It performs a case mapping closure of the code points and adds * full case folding strings for the code points, and reduces strings of * the original set to their full case folding equivalents. * - * This is designed for case-insensitive matches, for example + *

This is designed for case-insensitive matches, for example * in regular expressions. The full code point case closure allows checking of * an input character directly against the closure set. * Strings are matched by comparing the case-folded form from the closure * set with an incremental case folding of the string in question. * - * The closure set will also contain single code points if the original + *

The closure set will also contain single code points if the original * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.). * This is not necessary (that is, redundant) for the above matching method * but results in the same closure sets regardless of whether the original * set contained the code point or a string. - * @stable ICU 3.8 - */ - public static final int CASE = 2; - - /** - * Alias for UnicodeSet.CASE, for ease of porting from C++ where ICU4C - * also has both USET_CASE and USET_CASE_INSENSITIVE (see uset.h). - * @see #CASE + * * @stable ICU 3.4 */ public static final int CASE_INSENSITIVE = 2; /** - * Bitmask for constructor, applyPattern(), and closeOver() - * indicating letter case. This may be ORed together with other - * selectors. - * - * Enable case insensitive matching. E.g., "[ab]" with this flag - * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will - * match all except 'a', 'A', 'b', and 'B'. This adds the lower-, - * title-, and uppercase mappings as well as the case folding + * Adds all case mappings for each element in the set. + * This adds the full lower-, title-, and uppercase mappings as well as the full case folding * of each existing element in the set. + * + *

This value is an options bit set value for some + * constructors, applyPattern(), and closeOver(). + * It can be ORed together with other, unrelated options. + * + *

Unlike the “case insensitive” options, this does not perform a closure. + * For example, it does not add 'ſ' (U+017F long s) for 's', + * 'K' (U+212A Kelvin sign) for 'k', or replace set strings by their case-folded versions. + * * @stable ICU 3.4 */ public static final int ADD_CASE_MAPPINGS = 4; + /** + * Enable case insensitive matching. + * Same as {@link #CASE_INSENSITIVE} but using only Simple_Case_Folding (scf) mappings, + * which map each code point to one code point, + * not full Case_Folding (cf) mappings, which map some code points to multiple code points. + * + *

This is designed for case-insensitive matches, for example in certain + * regular expression implementations where only Simple_Case_Folding mappings are used, + * such as in ECMAScript (JavaScript) regular expressions. + * + *

This value is an options bit set value for some + * constructors, applyPattern(), and closeOver(). + * It can be ORed together with other, unrelated options. + * + * @draft ICU 73 + */ + public static final int SIMPLE_CASE_INSENSITIVE = 6; + + private static final int CASE_MASK = CASE_INSENSITIVE | ADD_CASE_MAPPINGS; + // add the result of a full case mapping to the set // use str as a temporary string to avoid constructing one private static final void addCaseMapping(UnicodeSet set, int result, StringBuilder full) { @@ -3930,97 +3961,191 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa // see UCaseProps } + /** For case closure on a large set, look only at code points with relevant properties. */ + UnicodeSet maybeOnlyCaseSensitive(UnicodeSet src) { + if (src.size() < 30) { + return src; + } + // Return the intersection of the src code points with Case_Sensitive ones. + UnicodeSet sensitive = CharacterProperties.getBinaryPropertySet(UProperty.CASE_SENSITIVE); + // Start by cloning the "smaller" set. Try not to copy the strings, if there are any in src. + if (src.hasStrings() || src.getRangeCount() > sensitive.getRangeCount()) { + return sensitive.cloneAsThawed().retainAll(src); + } else { + return ((UnicodeSet) src.clone()).retainAll(sensitive); + } + } + + // Per-character scf = Simple_Case_Folding of a string. + // (Normally when we case-fold a string we use full case foldings.) + private static final boolean scfString(CharSequence s, StringBuilder scf) { + int length = s.length(); + // Loop while not needing modification. + for (int i = 0; i < length;) { + int c = Character.codePointAt(s, i); + int scfChar = UCharacter.foldCase(c, UCharacter.FOLD_CASE_DEFAULT); + if (scfChar != c) { + // Copy the characters before c. + scf.setLength(0); + scf.append(s, 0, i); + // Loop over the rest of the string and keep case-folding. + for (;;) { + scf.appendCodePoint(scfChar); + i += Character.charCount(c); + if (i == length) { + return true; + } + c = Character.codePointAt(s, i); + scfChar = UCharacter.foldCase(c, UCharacter.FOLD_CASE_DEFAULT); + } + } + i += Character.charCount(c); + } + return false; + } + /** * Close this set over the given attribute. For the attribute - * CASE, the result is to modify this set so that: + * {@link #CASE_INSENSITIVE}, the result is to modify this set so that: * - * 1. For each character or string 'a' in this set, all strings + *

    + *
  1. For each character or string 'a' in this set, all strings * 'b' such that foldCase(a) == foldCase(b) are added to this set. * (For most 'a' that are single characters, 'b' will have * b.length() == 1.) * - * 2. For each string 'e' in the resulting set, if e != + *
  2. For each string 'e' in the resulting set, if e != * foldCase(e), 'e' will be removed. + *
* - * Example: [aq\u00DF{Bc}{bC}{Fi}] => [aAqQ\u00DF\uFB01{ss}{bc}{fi}] + *

Example: [aq\u00DF{Bc}{bC}{Fi}] => [aAqQ\u00DF\uFB01{ss}{bc}{fi}] * - * (Here foldCase(x) refers to the operation + *

(Here foldCase(x) refers to the operation * UCharacter.foldCase(x, true), and a == b actually denotes * a.equals(b), not pointer comparison.) * * @param attribute bitmask for attributes to close over. - * Currently only the CASE bit is supported. Any undefined bits - * are ignored. + * Valid options: + * At most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS}, + * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive. + * Unrelated options bits are ignored. * @return a reference to this set. * @stable ICU 3.8 */ public UnicodeSet closeOver(int attribute) { checkFrozen(); - if ((attribute & (CASE | ADD_CASE_MAPPINGS)) != 0) { - UCaseProps csp = UCaseProps.INSTANCE; - UnicodeSet foldSet = new UnicodeSet(this); - ULocale root = ULocale.ROOT; - - // start with input set to guarantee inclusion - // CASE: remove strings because the strings will actually be reduced (folded); - // therefore, start with no strings and add only those needed - if((attribute & CASE) != 0 && foldSet.hasStrings()) { - foldSet.strings.clear(); - } - - int n = getRangeCount(); - int result; - StringBuilder full = new StringBuilder(); + switch (attribute & CASE_MASK) { + case 0: + break; + case CASE_INSENSITIVE: + closeOverCaseInsensitive(/* simple= */ false); + break; + case ADD_CASE_MAPPINGS: + closeOverAddCaseMappings(); + break; + case SIMPLE_CASE_INSENSITIVE: + closeOverCaseInsensitive(/* simple= */ true); + break; + default: + // bad option (unreachable) + break; + } + return this; + } - for (int i=0; i> closure, Integer c, Integer t) { + Collection values = closure.get(c); + if (values == null) { + values = new TreeSet<>(); + closure.put(c, values); + } + values.add(t); + } + + private void addIfAbsent(Map> closure, Integer c, Integer t, + Map> additions) { + Collection values = closure.get(c); + if (values == null || !values.contains(t)) { + if (additions != closure) { + values = additions.get(c); + } + if (values == null) { + values = new TreeSet<>(); + additions.put(c, values); + } + values.add(t); + } + } + + @Test + public void TestCloseOverSimpleCaseFolding() { + UnicodeSet sensitive = CharacterProperties.getBinaryPropertySet(UProperty.CASE_SENSITIVE); + // Compute the scf=Simple_Case_Folding closure: + // For each scf(c)=t, start with mappings c->t and t->c. + + // Poor man's multimap from code points to code points. + Map> closure = new HashMap<>(); + UnicodeSetIterator iter = new UnicodeSetIterator(sensitive); + while (iter.next()) { + int c = iter.codepoint; + int scfChar = UCharacter.foldCase(c, UCharacter.FOLD_CASE_DEFAULT); + if (scfChar != c) { + add(closure, c, scfChar); + add(closure, scfChar, c); + } + } + // Complete the closure: Add mappings of mappings. + Map> additions = new HashMap<>(); + for (;;) { + // for each mapping c->t + for (Map.Entry> entry : closure.entrySet()) { + Integer c = entry.getKey(); + Collection cValues = entry.getValue(); + for (Integer t : cValues) { + // enumerate each t->u + Collection tValues = closure.get(t); + if (tValues != null) { + for (Integer u : tValues) { + if (!u.equals(c)) { + addIfAbsent(closure, c, u, additions); + addIfAbsent(closure, u, c, additions); + } + } + } + } + + } + if (additions.isEmpty()) { + break; // The closure is complete. + } + // Add all of the additions back into the closure. + for (Map.Entry> entry : additions.entrySet()) { + Integer c = entry.getKey(); + Collection cValues = entry.getValue(); + Collection closureValues = closure.get(c); + if (closureValues == null) { + closureValues = new TreeSet<>(); + closure.put(c, closureValues); + } + closureValues.addAll(cValues); + } + additions.clear(); + } + // Compare closeOver(USET_SIMPLE_CASE_INSENSITIVE) with an unoptimized implementation. + // Here we focus on single code points as input. + // Other examples, including strings, are tested in TestCloseOver(). + int errors = 0; + iter.reset(); + UnicodeSet set = new UnicodeSet(), expected = new UnicodeSet(); + while (iter.next()) { + int c = iter.codepoint; + // closeOver() + set.clear().add(c); + set.closeOver(UnicodeSet.SIMPLE_CASE_INSENSITIVE); + // From-first-principles implementation. + expected.clear().add(c); + Collection values = closure.get(c); + if (values != null) { + for (Integer t : values) { + expected.add(t); + } + } + // compare + if (!checkEqual(expected, set, "closeOver() vs. test impl")) { + errln(" c=U+" + Utility.hex(c)); + if (++errors == 10) { + break; + } + } + } + } + + @Test + public void TestCloseOverLargeSets() { + // Check that an optimization for large sets does not change the result. + + // Most code points except ones that are boring for case mappings. + UnicodeSet manyCp = new UnicodeSet("[^[:C:][:Ideographic:][:Hang:]]"); + // Main Unihan block. + int LARGE_START = 0x4E00; + int LARGE_END = 0x9FFF; + + int OPTIONS[] = { + UnicodeSet.CASE_INSENSITIVE, UnicodeSet.ADD_CASE_MAPPINGS, + UnicodeSet.SIMPLE_CASE_INSENSITIVE + }; + UnicodeSet input = new UnicodeSet(), small, large; + for (int option : OPTIONS) { + UnicodeSetIterator iter = new UnicodeSetIterator(manyCp); + while (iter.next()) { + int c = iter.codepoint; + input.clear().add(c); + small = (UnicodeSet) input.clone(); + small.closeOver(option); + large = (UnicodeSet) input.clone(); + large.add(LARGE_START, LARGE_END); + large.closeOver(option); + large.remove(LARGE_START, LARGE_END); + if (!checkEqual(small, large, "small != large")) { + errln(" option=" + option + " c=U+" + Utility.hex(c)); + break; + } + } + } } @Test @@ -1709,8 +1914,8 @@ public class UnicodeSetTest extends TestFmwk { test2.add("a" + (max - i)); // add in reverse order } assertNotEquals("compare iterable test", test1, test2); - TreeSet sortedTest1 = new TreeSet(test1); - TreeSet sortedTest2 = new TreeSet(test2); + TreeSet sortedTest1 = new TreeSet<>(test1); + TreeSet sortedTest2 = new TreeSet<>(test2); assertEquals("compare iterable test", sortedTest1, sortedTest2); }