} // namespace
-U_NAMESPACE_USE
+U_NAMESPACE_BEGIN
-U_CAPI const USet * U_EXPORT2
-u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode) {
- if (U_FAILURE(*pErrorCode)) { return nullptr; }
+// Returns the set stored in sets[property], creating it with makeSet() on first use.
+// Returns nullptr without doing anything if errorCode already indicates a failure.
+// Sets U_ILLEGAL_ARGUMENT_ERROR and returns nullptr if property is outside
+// the range [0, UCHAR_BINARY_LIMIT).
+const UnicodeSet *CharacterProperties::getBinaryPropertySet(UProperty property, UErrorCode &errorCode) {
+ if (U_FAILURE(errorCode)) { return nullptr; }
if (property < 0 || UCHAR_BINARY_LIMIT <= property) {
- *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
+ errorCode = U_ILLEGAL_ARGUMENT_ERROR;
return nullptr;
}
Mutex m(&cpMutex);
UnicodeSet *set = sets[property];
if (set == nullptr) {
- sets[property] = set = makeSet(property, *pErrorCode);
+ sets[property] = set = makeSet(property, errorCode);
}
- if (U_FAILURE(*pErrorCode)) { return nullptr; }
- return set->toUSet();
+ // No failure check here: whatever makeSet() produced is returned unchanged.
+ // Callers should test errorCode before using the returned pointer.
+ return set;
+}
+
+U_NAMESPACE_END
+
+U_NAMESPACE_USE
+
+U_CAPI const USet * U_EXPORT2
+u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode) {
+    // C API wrapper: delegate to the C++ implementation, then convert the result.
+    const UnicodeSet *propertySet =
+        CharacterProperties::getBinaryPropertySet(property, *pErrorCode);
+    if (U_FAILURE(*pErrorCode)) {
+        // Never convert the pointer when the lookup reported an error.
+        return nullptr;
+    }
+    return propertySet->toUSet();
}
U_CAPI const UCPMap * U_EXPORT2
U_CFUNC void U_EXPORT2
ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
- uint16_t props;
-
- /*
- * Hardcode the case closure of i and its relatives and ignore the
- * data file data for these characters.
- * The Turkic dotless i and dotted I with their case mapping conditions
- * and case folding option make the related characters behave specially.
- * This code matches their closure behavior to their case folding behavior.
- */
-
- switch(c) {
- case 0x49:
- /* regular i and I are in one equivalence class */
- sa->add(sa->set, 0x69);
- return;
- case 0x69:
- sa->add(sa->set, 0x49);
- return;
- case 0x130:
- /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
- sa->addString(sa->set, iDot, 2);
- return;
- case 0x131:
- /* dotless i is in a class by itself */
- return;
- default:
- /* otherwise use the data file data */
- break;
- }
-
- props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
+ uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
if(!UCASE_HAS_EXCEPTION(props)) {
if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
/* add the one simple case mapping, no matter what type it is */
* c has exceptions, so there may be multiple simple and/or
* full case mappings. Add them all.
*/
- const uint16_t *pe0, *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
- const char16_t *closure;
+ const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
uint16_t excWord=*pe++;
- int32_t idx, closureLength, fullLength, length;
-
- pe0=pe;
+ const uint16_t *pe0=pe;
+
+ // Hardcode the case closure of i and its relatives and ignore the
+ // data file data for these characters.
+ // The Turkic dotless i and dotted I with their case mapping conditions
+ // and case folding option make the related characters behave specially.
+ // This code matches their closure behavior to their case folding behavior.
+ if (excWord&UCASE_EXC_CONDITIONAL_FOLD) {
+ // These characters have Turkic case foldings. Hardcode their closure.
+ if (c == 0x49) {
+ // Regular i and I are in one equivalence class.
+ sa->add(sa->set, 0x69);
+ return;
+ } else if (c == 0x130) {
+ // Dotted I is in a class with <0069 0307>
+ // (for canonical equivalence with <0049 0307>).
+ sa->addString(sa->set, iDot, 2);
+ return;
+ }
+ } else if (c == 0x69) {
+ sa->add(sa->set, 0x49);
+ return;
+ } else if (c == 0x131) {
+ // Dotless i is in a class by itself.
+ return;
+ }
/* add all simple case mappings */
- for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
+ for(int32_t idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
if(HAS_SLOT(excWord, idx)) {
pe=pe0;
- GET_SLOT_VALUE(excWord, idx, pe, c);
- sa->add(sa->set, c);
+ UChar32 mapping;
+ GET_SLOT_VALUE(excWord, idx, pe, mapping);
+ sa->add(sa->set, mapping);
}
}
if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) {
}
/* get the closure string pointer & length */
+ const char16_t *closure;
+ int32_t closureLength;
if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
pe=pe0;
GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
/* add the full case folding */
if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
pe=pe0;
+ int32_t fullLength;
GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
/* start of full case mapping strings */
fullLength>>=4;
/* add the full case folding string */
- length=fullLength&0xf;
+ int32_t length=fullLength&0xf;
if(length!=0) {
sa->addString(sa->set, (const char16_t *)pe, length);
pe+=length;
}
/* add each code point in the closure string */
- for(idx=0; idx<closureLength;) {
- U16_NEXT_UNSAFE(closure, idx, c);
- sa->add(sa->set, c);
+ for(int32_t idx=0; idx<closureLength;) {
+ UChar32 mapping;
+ U16_NEXT_UNSAFE(closure, idx, mapping);
+ sa->add(sa->set, mapping);
+ }
+ }
+}
+
+namespace {
+
+/**
+ * Add the simple case closure mapping,
+ * except if there is not actually an scf relationship between the two characters.
+ * TODO: Unicode should probably add the corresponding scf mappings.
+ * See https://crbug.com/v8/13377 and Unicode-internal PAG issue #23.
+ * If & when those scf mappings are added, we should be able to remove all of these exceptions.
+ */
+void addOneSimpleCaseClosure(UChar32 c, UChar32 t, const USetAdder *sa) {
+    // Ordered (c, t) pairs for which t must not be added to the closure of c.
+    // Each pair is listed in both directions.
+    static constexpr UChar32 kExcludedPairs[][2] = {
+        {0x0390, 0x1FD3},
+        {0x03B0, 0x1FE3},
+        {0x1FD3, 0x0390},
+        {0x1FE3, 0x03B0},
+        {0xFB05, 0xFB06},
+        {0xFB06, 0xFB05},
+    };
+    for (const auto &excluded : kExcludedPairs) {
+        if (excluded[0] == c && excluded[1] == t) {
+            return;  // suppressed mapping: add nothing
+        }
+    }
+    sa->add(sa->set, t);
+}
+
+} // namespace
+
+/**
+ * Adds to sa the code points that are in the scf=Simple_Case_Folding closure of c.
+ * Only single code points are added (via sa->add / addOneSimpleCaseClosure());
+ * unlike ucase_addCaseClosure(), no strings are added.
+ *
+ * @param c  the code point whose closure is computed
+ * @param sa the adder that receives the related code points
+ */
+U_CFUNC void U_EXPORT2
+ucase_addSimpleCaseClosure(UChar32 c, const USetAdder *sa) {
+ uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
+ if(!UCASE_HAS_EXCEPTION(props)) {
+ if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
+ /* add the one simple case mapping, no matter what type it is */
+ int32_t delta=UCASE_GET_DELTA(props);
+ if(delta!=0) {
+ sa->add(sa->set, c+delta);
+ }
+ }
+ } else {
+ // c has exceptions. Add the mappings relevant for scf=Simple_Case_Folding.
+ const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
+ uint16_t excWord=*pe++;
+ // pe0 remembers the position right after the exception word;
+ // pe is reset to it before each slot lookup below.
+ const uint16_t *pe0=pe;
+
+ // Hardcode the case closure of i and its relatives and ignore the
+ // data file data for these characters, like in ucase_addCaseClosure().
+ if (excWord&UCASE_EXC_CONDITIONAL_FOLD) {
+ // These characters have Turkic case foldings. Hardcode their closure.
+ if (c == 0x49) {
+ // Regular i and I are in one equivalence class.
+ sa->add(sa->set, 0x69);
+ return;
+ } else if (c == 0x130) {
+ // For scf=Simple_Case_Folding, dotted I is in a class by itself.
+ return;
+ }
+ } else if (c == 0x69) {
+ sa->add(sa->set, 0x49);
+ return;
+ } else if (c == 0x131) {
+ // Dotless i is in a class by itself.
+ return;
+ }
+
+ // Add all simple case mappings.
+ for(int32_t idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
+ if(HAS_SLOT(excWord, idx)) {
+ pe=pe0;
+ UChar32 mapping;
+ GET_SLOT_VALUE(excWord, idx, pe, mapping);
+ addOneSimpleCaseClosure(c, mapping, sa);
+ }
+ }
+ if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) {
+ pe=pe0;
+ int32_t delta;
+ GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
+ // The slot value is applied as +delta or -delta depending on the sign flag in excWord.
+ UChar32 mapping = (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
+ addOneSimpleCaseClosure(c, mapping, sa);
+ }
+
+ /* get the closure string pointer & length */
+ const char16_t *closure;
+ int32_t closureLength;
+ if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
+ pe=pe0;
+ GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
+ closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
+ closure=(const char16_t *)pe+1; /* behind this slot, unless there are full case mappings */
+ } else {
+ closureLength=0;
+ closure=nullptr;
+ }
+
+ // Skip the full case mappings.
+ // This is only needed to locate the closure string, so it is not done when the closure is empty.
+ if(closureLength > 0 && HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
+ pe=pe0;
+ int32_t fullLength;
+ GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
+
+ /* start of full case mapping strings */
+ ++pe;
+
+ fullLength&=0xffff; /* bits 16 and higher are reserved */
+
+ // Skip all 4 full case mappings.
+ // fullLength packs four 4-bit string lengths; advance pe past each string in turn.
+ pe+=fullLength&UCASE_FULL_LOWER;
+ fullLength>>=4;
+ pe+=fullLength&0xf;
+ fullLength>>=4;
+ pe+=fullLength&0xf;
+ fullLength>>=4;
+ pe+=fullLength;
+
+ closure=(const char16_t *)pe; /* behind full case mappings */
+ }
+
+ // Add each code point in the closure string whose scf maps back to c.
+ // (The filtering is done inside addOneSimpleCaseClosure().)
+ for(int32_t idx=0; idx<closureLength;) {
+ UChar32 mapping;
+ U16_NEXT_UNSAFE(closure, idx, mapping);
+ addOneSimpleCaseClosure(c, mapping, sa);
}
}
}
U_CFUNC void U_EXPORT2
ucase_addCaseClosure(UChar32 c, const USetAdder *sa);
+/** Case closure with only scf=Simple_Case_Folding. */
+U_CFUNC void U_EXPORT2
+ucase_addSimpleCaseClosure(UChar32 c, const USetAdder *sa);
+
/**
* Maps the string to single code points and adds the associated case closure
* mappings.
* description for the syntax of the pattern language.
* @param pattern a string specifying what characters are in the set
* @param options bitmask for options to apply to the pattern.
- * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
+ * Valid options are USET_IGNORE_SPACE and
+ * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
+ * These case options are mutually exclusive.
* @param symbols a symbol table mapping variable names to values
* and stand-in characters to UnicodeSets; may be nullptr
* @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
* @param pos on input, the position in pattern at which to start parsing.
* On output, the position after the last character parsed.
* @param options bitmask for options to apply to the pattern.
- * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
+ * Valid options are USET_IGNORE_SPACE and
+ * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
+ * These case options are mutually exclusive.
* @param symbols a symbol table mapping variable names to values
* and stand-in characters to UnicodeSets; may be nullptr
* @param status input-output error code
* A frozen set will not be modified.
* @param pattern a string specifying what characters are in the set
* @param options bitmask for options to apply to the pattern.
- * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
+ * Valid options are USET_IGNORE_SPACE and
+ * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
+ * These case options are mutually exclusive.
* @param symbols a symbol table mapping variable names to
* values and stand-ins to UnicodeSets; may be nullptr
* @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
* pattern.length() if the closing ']' is the last character of
* the pattern string.
* @param options bitmask for options to apply to the pattern.
- * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
+ * Valid options are USET_IGNORE_SPACE and
+ * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
+ * These case options are mutually exclusive.
* @param symbols a symbol table mapping variable names to
* values and stand-ins to UnicodeSets; may be nullptr
* @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
/**
* Close this set over the given attribute. For the attribute
- * USET_CASE, the result is to modify this set so that:
+ * USET_CASE_INSENSITIVE, the result is to modify this set so that:
*
* 1. For each character or string 'a' in this set, all strings or
* characters 'b' such that foldCase(a) == foldCase(b) are added
* A frozen set will not be modified.
*
* @param attribute bitmask for attributes to close over.
- * Currently only the USET_CASE bit is supported. Any undefined bits
- * are ignored.
+ * Valid options:
+ * At most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
+ * These case options are mutually exclusive.
+ * Unrelated option bits are ignored.
* @return a reference to this set.
* @stable ICU 4.2
*/
int32_t depth,
UErrorCode& ec);
+ void closeOverCaseInsensitive(bool simple);
+ void closeOverAddCaseMappings();
+
//----------------------------------------------------------------
// Implementation: Utility methods
//----------------------------------------------------------------
/**
* Bitmask values to be passed to uset_openPatternOptions() or
* uset_applyPattern() taking an option parameter.
+ *
+ * Use at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
+ * These case options are mutually exclusive.
+ *
+ * Undefined option bits are ignored and are reserved for future use.
+ *
* @stable ICU 2.4
*/
enum {
* Ignore white space within patterns unless quoted or escaped.
* @stable ICU 2.4
*/
- USET_IGNORE_SPACE = 1,
+ USET_IGNORE_SPACE = 1,
/**
* Enable case insensitive matching. E.g., "[ab]" with this flag
* will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will
* match all except 'a', 'A', 'b', and 'B'. This performs a full
- * closure over case mappings, e.g. U+017F for s.
+ * closure over case mappings, e.g. 'ſ' (U+017F long s) for 's'.
*
* The resulting set is a superset of the input for the code points but
* not for the strings.
*
* @stable ICU 2.4
*/
- USET_CASE_INSENSITIVE = 2,
+ USET_CASE_INSENSITIVE = 2,
/**
- * Enable case insensitive matching. E.g., "[ab]" with this flag
- * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will
- * match all except 'a', 'A', 'b', and 'B'. This adds the lower-,
- * title-, and uppercase mappings as well as the case folding
+ * Adds all case mappings for each element in the set.
+ * This adds the full lower-, title-, and uppercase mappings as well as the full case folding
* of each existing element in the set.
+ *
+ * Unlike the “case insensitive” options, this does not perform a closure.
+ * For example, it does not add 'ſ' (U+017F long s) for 's',
+ * 'K' (U+212A Kelvin sign) for 'k', or replace set strings by their case-folded versions.
+ *
* @stable ICU 3.2
*/
- USET_ADD_CASE_MAPPINGS = 4
+ USET_ADD_CASE_MAPPINGS = 4,
+
+#ifndef U_HIDE_DRAFT_API
+ /**
+ * Enable case insensitive matching.
+ * Same as USET_CASE_INSENSITIVE but using only Simple_Case_Folding (scf) mappings,
+ * which map each code point to one code point,
+ * not full Case_Folding (cf) mappings, which map some code points to multiple code points.
+ *
+ * This is designed for case-insensitive matches, for example in certain
+ * regular expression implementations where only Simple_Case_Folding mappings are used,
+ * such as in ECMAScript (JavaScript) regular expressions.
+ *
+ * @draft ICU 73
+ */
+ USET_SIMPLE_CASE_INSENSITIVE = 6
+#endif // U_HIDE_DRAFT_API
};
/**
* @param patternLength the length of the pattern, or -1 if null
* terminated
* @param options bitmask for options to apply to the pattern.
- * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
+ * Valid options are USET_IGNORE_SPACE and
+ * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
+ * These case options are mutually exclusive.
* @param ec the error code
* @stable ICU 2.4
*/
* The character at pattern[0] must be a '['.
* @param patternLength The length of the UChar string. -1 if NUL terminated.
* @param options A bitmask for options to apply to the pattern.
- * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
+ * Valid options are USET_IGNORE_SPACE and
+ * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS,
+ * USET_SIMPLE_CASE_INSENSITIVE.
+ * These case options are mutually exclusive.
* @param status Returns an error if the pattern cannot be parsed.
* @return Upon successful parse, the value is either
* the index of the character after the closing ']'
/**
* Close this set over the given attribute. For the attribute
- * USET_CASE, the result is to modify this set so that:
+ * USET_CASE_INSENSITIVE, the result is to modify this set so that:
*
* 1. For each character or string 'a' in this set, all strings or
* characters 'b' such that foldCase(a) == foldCase(b) are added
* @param set the set
*
* @param attributes bitmask for attributes to close over.
- * Currently only the USET_CASE bit is supported. Any undefined bits
- * are ignored.
+ * Valid options:
+ * At most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
+ * These case options are mutually exclusive.
+ * Unrelated option bits are ignored.
* @stable ICU 4.2
*/
U_CAPI void U_EXPORT2
#include "unicode/locid.h"
#include "unicode/parsepos.h"
#include "unicode/uniset.h"
+#include "unicode/utf16.h"
#include "cmemory.h"
#include "ruleiter.h"
#include "ucase.h"
+#include "uprops.h"
#include "util.h"
#include "uvector.h"
// see ucase.h
}
+namespace {
+
+/** For case closure on a large set, look only at code points with relevant properties. */
+const UnicodeSet &maybeOnlyCaseSensitive(const UnicodeSet &src, UnicodeSet &subset) {
+    // Precondition: subset holds every code point, so that intersecting it with src
+    // below amounts to copying the single code points of src (strings are left out).
+    U_ASSERT(subset.contains(0, 0x10ffff));
+    if (src.size() >= 30) {
+        UErrorCode status = U_ZERO_ERROR;
+        const UnicodeSet *caseSensitive =
+            CharacterProperties::getBinaryPropertySet(UCHAR_CASE_SENSITIVE, status);
+        if (U_SUCCESS(status)) {
+            // Intersect src with the Case_Sensitive code points.
+            // Apply the set with fewer ranges first, so that the first step copies less.
+            const bool srcHasMoreRanges = src.getRangeCount() > caseSensitive->getRangeCount();
+            const UnicodeSet &smaller = srcHasMoreRanges ? *caseSensitive : src;
+            const UnicodeSet &larger = srcHasMoreRanges ? src : *caseSensitive;
+            subset.retainAll(smaller);
+            subset.retainAll(larger);
+            return subset;
+        }
+    }
+    // Small set, or the property set is unavailable: iterate over src itself.
+    return src;
+}
+
+// Per-character scf = Simple_Case_Folding of a string.
+// (Normally when we case-fold a string we use full case foldings.)
+bool scfString(const UnicodeString &s, UnicodeString &scf) {
+    // Work on the raw UTF-16 buffer rather than going through UnicodeString accessors.
+    const char16_t *text = s.getBuffer();
+    const int32_t textLength = s.length();
+
+    // Phase 1: look for the first code point that folding changes.
+    // Until one is found, scf is not touched.
+    int32_t pos = 0;
+    UChar32 cp = 0;
+    UChar32 folded = 0;
+    bool changed = false;
+    while (pos < textLength) {
+        const int32_t cpStart = pos;
+        U16_NEXT(text, pos, textLength, cp);  // advances pos past cp
+        folded = u_foldCase(cp, U_FOLD_CASE_DEFAULT);
+        if (folded != cp) {
+            // Keep the unchanged prefix that precedes cp.
+            scf.setTo(text, cpStart);
+            changed = true;
+            break;
+        }
+    }
+    if (!changed) {
+        return false;
+    }
+
+    // Phase 2: append the folded form of cp and of every remaining code point.
+    scf.append(folded);
+    while (pos < textLength) {
+        U16_NEXT(text, pos, textLength, cp);  // advances pos past cp
+        scf.append(u_foldCase(cp, U_FOLD_CASE_DEFAULT));
+    }
+    return true;
+}
+
+} // namespace
+
UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
if (isFrozen() || isBogus()) {
return *this;
}
- if (attribute & (USET_CASE_INSENSITIVE | USET_ADD_CASE_MAPPINGS)) {
- {
- UnicodeSet foldSet(*this);
- UnicodeString str;
- USetAdder sa = {
- foldSet.toUSet(),
- _set_add,
- _set_addRange,
- _set_addString,
- nullptr, // don't need remove()
- nullptr // don't need removeRange()
- };
-
- // start with input set to guarantee inclusion
- // USET_CASE: remove strings because the strings will actually be reduced (folded);
- // therefore, start with no strings and add only those needed
- if ((attribute & USET_CASE_INSENSITIVE) && foldSet.hasStrings()) {
- foldSet.strings->removeAllElements();
- }
+ // Only the bits selected by USET_CASE_MASK are examined; other attribute bits are ignored.
+ // The masked value is compared as a whole rather than bit by bit,
+ // and each case option is dispatched to its own helper.
+ switch (attribute & USET_CASE_MASK) {
+ case 0:
+ break;
+ case USET_CASE_INSENSITIVE:
+ closeOverCaseInsensitive(/* simple= */ false);
+ break;
+ case USET_ADD_CASE_MAPPINGS:
+ closeOverAddCaseMappings();
+ break;
+ case USET_SIMPLE_CASE_INSENSITIVE:
+ closeOverCaseInsensitive(/* simple= */ true);
+ break;
+ default:
+ // bad option (unreachable)
+ break;
+ }
+ return *this;
+}
+
+// Replaces this set with its case-insensitive closure.
+// simple=false uses ucase_addCaseClosure() and full case folding of strings;
+// simple=true uses ucase_addSimpleCaseClosure() and per-code point folding (scfString()).
+// The result is built in a copy (foldSet) and assigned back at the end.
+void UnicodeSet::closeOverCaseInsensitive(bool simple) {
+ // Start with input set to guarantee inclusion.
+ UnicodeSet foldSet(*this);
+ // Full case mappings closure:
+ // Remove strings because the strings will actually be reduced (folded);
+ // therefore, start with no strings and add only those needed.
+ // Do this before processing code points, because they may add strings.
+ if (!simple && foldSet.hasStrings()) {
+ foldSet.strings->removeAllElements();
+ }
+
+ USetAdder sa = {
+ foldSet.toUSet(),
+ _set_add,
+ _set_addRange,
+ _set_addString,
+ nullptr, // don't need remove()
+ nullptr // don't need removeRange()
+ };
+
+ // subset must start out containing all code points; maybeOnlyCaseSensitive() asserts this.
+ UnicodeSet subset(0, 0x10ffff);
+ const UnicodeSet &codePoints = maybeOnlyCaseSensitive(*this, subset);
- int32_t n = getRangeCount();
- UChar32 result;
- const char16_t *full;
-
- for (int32_t i=0; i<n; ++i) {
- UChar32 start = getRangeStart(i);
- UChar32 end = getRangeEnd(i);
-
- if (attribute & USET_CASE_INSENSITIVE) {
- // full case closure
- for (UChar32 cp=start; cp<=end; ++cp) {
- ucase_addCaseClosure(cp, &sa);
- }
- } else {
- // add case mappings
- // (does not add long s for regular s, or Kelvin for k, for example)
- for (UChar32 cp=start; cp<=end; ++cp) {
- result = ucase_toFullLower(cp, nullptr, nullptr, &full, UCASE_LOC_ROOT);
- addCaseMapping(foldSet, result, full, str);
-
- result = ucase_toFullTitle(cp, nullptr, nullptr, &full, UCASE_LOC_ROOT);
- addCaseMapping(foldSet, result, full, str);
-
- result = ucase_toFullUpper(cp, nullptr, nullptr, &full, UCASE_LOC_ROOT);
- addCaseMapping(foldSet, result, full, str);
-
- result = ucase_toFullFolding(cp, &full, 0);
- addCaseMapping(foldSet, result, full, str);
- }
+ // Iterate over the ranges of single code points. Nested loop for each code point.
+ int32_t n = codePoints.getRangeCount();
+
+ for (int32_t i=0; i<n; ++i) {
+ UChar32 start = codePoints.getRangeStart(i);
+ UChar32 end = codePoints.getRangeEnd(i);
+
+ if (simple) {
+ for (UChar32 cp=start; cp<=end; ++cp) {
+ ucase_addSimpleCaseClosure(cp, &sa);
+ }
+ } else {
+ for (UChar32 cp=start; cp<=end; ++cp) {
+ ucase_addCaseClosure(cp, &sa);
+ }
+ }
+ }
+ // The strings are read from this set, not from foldSet, which is being modified.
+ if (hasStrings()) {
+ UnicodeString str;
+ for (int32_t j=0; j<strings->size(); ++j) {
+ const UnicodeString *pStr = (const UnicodeString *) strings->elementAt(j);
+ if (simple) {
+ // foldSet still holds the original strings in simple mode.
+ // If folding changes a string, swap the original for its folded form.
+ if (scfString(*pStr, str)) {
+ foldSet.remove(*pStr).add(str);
+ }
+ } else {
+ str = *pStr;
+ str.foldCase();
+ if(!ucase_addStringCaseClosure(str.getBuffer(), str.length(), &sa)) {
+ foldSet.add(str); // does not map to code points: add the folded string itself
}
}
- if (hasStrings()) {
- if (attribute & USET_CASE_INSENSITIVE) {
- for (int32_t j=0; j<strings->size(); ++j) {
- str = *(const UnicodeString *) strings->elementAt(j);
- str.foldCase();
- if(!ucase_addStringCaseClosure(str.getBuffer(), str.length(), &sa)) {
- foldSet.add(str); // does not map to code points: add the folded string itself
- }
- }
- } else {
- Locale root("");
-#if !UCONFIG_NO_BREAK_ITERATION
- UErrorCode status = U_ZERO_ERROR;
- BreakIterator *bi = BreakIterator::createWordInstance(root, status);
- if (U_SUCCESS(status)) {
-#endif
- const UnicodeString *pStr;
+ }
+ }
+ *this = foldSet;
+}
+
+// Adds the full lower-, title- and uppercase mappings and the full case folding
+// of each code point and string in this set. This is not a closure.
+// The result is built in a copy (foldSet) and assigned back at the end.
+void UnicodeSet::closeOverAddCaseMappings() {
+ // Start with input set to guarantee inclusion.
+ UnicodeSet foldSet(*this);
+
+ // subset must start out containing all code points; maybeOnlyCaseSensitive() asserts this.
+ UnicodeSet subset(0, 0x10ffff);
+ const UnicodeSet &codePoints = maybeOnlyCaseSensitive(*this, subset);
- for (int32_t j=0; j<strings->size(); ++j) {
- pStr = (const UnicodeString *) strings->elementAt(j);
- (str = *pStr).toLower(root);
- foldSet.add(str);
+ // Iterate over the ranges of single code points. Nested loop for each code point.
+ int32_t n = codePoints.getRangeCount();
+ UChar32 result;
+ const char16_t *full;
+ UnicodeString str;
+
+ for (int32_t i=0; i<n; ++i) {
+ UChar32 start = codePoints.getRangeStart(i);
+ UChar32 end = codePoints.getRangeEnd(i);
+
+ // add case mappings
+ // (does not add long s for regular s, or Kelvin for k, for example)
+ for (UChar32 cp=start; cp<=end; ++cp) {
+ result = ucase_toFullLower(cp, nullptr, nullptr, &full, UCASE_LOC_ROOT);
+ addCaseMapping(foldSet, result, full, str);
+
+ result = ucase_toFullTitle(cp, nullptr, nullptr, &full, UCASE_LOC_ROOT);
+ addCaseMapping(foldSet, result, full, str);
+
+ result = ucase_toFullUpper(cp, nullptr, nullptr, &full, UCASE_LOC_ROOT);
+ addCaseMapping(foldSet, result, full, str);
+
+ result = ucase_toFullFolding(cp, &full, 0);
+ addCaseMapping(foldSet, result, full, str);
+ }
+ }
+ if (hasStrings()) {
+ Locale root("");
#if !UCONFIG_NO_BREAK_ITERATION
- (str = *pStr).toTitle(bi, root);
- foldSet.add(str);
+ UErrorCode status = U_ZERO_ERROR;
+ BreakIterator *bi = BreakIterator::createWordInstance(root, status);
+ // If the word BreakIterator cannot be created, the whole string loop is skipped:
+ // no string mappings (not even lowercase ones) are added.
+ if (U_SUCCESS(status)) {
#endif
- (str = *pStr).toUpper(root);
- foldSet.add(str);
- (str = *pStr).foldCase();
- foldSet.add(str);
- }
+ for (int32_t j=0; j<strings->size(); ++j) {
+ const UnicodeString *pStr = (const UnicodeString *) strings->elementAt(j);
+ (str = *pStr).toLower(root);
+ foldSet.add(str);
#if !UCONFIG_NO_BREAK_ITERATION
- }
- delete bi;
+ (str = *pStr).toTitle(bi, root);
+ foldSet.add(str);
#endif
- }
+ (str = *pStr).toUpper(root);
+ foldSet.add(str);
+ (str = *pStr).foldCase();
+ foldSet.add(str);
}
- *this = foldSet;
+#if !UCONFIG_NO_BREAK_ITERATION
}
+ delete bi;
+#endif
}
- return *this;
+ *this = foldSet;
}
U_NAMESPACE_END
* to close over case BEFORE COMPLEMENTING. This makes
* patterns like /[^abc]/i work.
*/
- if ((options & USET_CASE_INSENSITIVE) != 0) {
- (this->*caseClosure)(USET_CASE_INSENSITIVE);
- }
- else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {
- (this->*caseClosure)(USET_ADD_CASE_MAPPINGS);
+ if ((options & USET_CASE_MASK) != 0) {
+ (this->*caseClosure)(options);
}
if (invert) {
complement().removeAllStrings(); // code point complement
public:
CharacterProperties() = delete;
static const UnicodeSet *getInclusionsForProperty(UProperty prop, UErrorCode &errorCode);
+ static const UnicodeSet *getBinaryPropertySet(UProperty property, UErrorCode &errorCode);
};
// implemented in uniset_props.cpp
U_CDECL_END
-#endif
+#ifdef __cplusplus
+
+namespace {
+
+// Mask for the case-option bits of a USet options/attribute word.
+// USET_CASE_INSENSITIVE (2) | USET_ADD_CASE_MAPPINGS (4) is 6, which is also the value of
+// USET_SIMPLE_CASE_INSENSITIVE, so this mask covers all three case options.
+constexpr int32_t USET_CASE_MASK = USET_CASE_INSENSITIVE | USET_ADD_CASE_MAPPINGS;
+} // namespace
+
+#endif // __cplusplus
+
+#endif
#include <stdio.h>
#include <string.h>
+#include <unordered_map>
#include "unicode/utypes.h"
#include "usettest.h"
#include "unicode/ucnv.h"
TESTCASE_AUTO(TestStrings);
TESTCASE_AUTO(Testj2268);
TESTCASE_AUTO(TestCloseOver);
+ TESTCASE_AUTO(TestCloseOverSimpleCaseFolding);
+ TESTCASE_AUTO(TestCloseOverLargeSets);
TESTCASE_AUTO(TestEscapePattern);
TESTCASE_AUTO(TestInvalidCodePoint);
TESTCASE_AUTO(TestSymbolTable);
* Test closure API.
*/
void UnicodeSetTest::TestCloseOver() {
- UErrorCode ec = U_ZERO_ERROR;
-
- char CASE[] = {(char)USET_CASE_INSENSITIVE};
- char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
- const char* DATA[] = {
+ static constexpr char CASE[] = {(char)USET_CASE_INSENSITIVE};
+ static constexpr char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
+ static constexpr char SIMPLE_CASE_INSENSITIVE[] = {(char)USET_SIMPLE_CASE_INSENSITIVE};
+ static const char* DATA[] = {
// selector, input, output
CASE,
"[aq\\u00DF{Bc}{bC}{Fi}]",
"[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]", // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
+ SIMPLE_CASE_INSENSITIVE,
+ "[aq\\u00DF{Bc}{bC}{Fi}]",
+ "[aAqQ\\u00DF\\u1E9E{bc}{fi}]",
+
CASE,
"[\\u01F1]", // 'DZ'
"[\\u01F1\\u01F2\\u01F3]",
+ SIMPLE_CASE_INSENSITIVE,
+ "[\\u01F1]", // 'DZ'
+ "[\\u01F1\\u01F2\\u01F3]",
+
CASE,
"[\\u1FB4]",
"[\\u1FB4{\\u03AC\\u03B9}]",
+ SIMPLE_CASE_INSENSITIVE,
+ "[\\u1FB4]",
+ "[\\u1FB4]",
+
CASE,
"[{F\\uFB01}]",
- "[\\uFB03{ffi}]",
+ "[\\uFB03{ffi}]",
CASE, // make sure binary search finds limits
"[a\\uFF3A]",
CASE,
"[a-z]","[A-Za-z\\u017F\\u212A]",
+
+ SIMPLE_CASE_INSENSITIVE,
+ "[a-z]","[A-Za-z\\u017F\\u212A]",
+
CASE,
"[abc]","[A-Ca-c]",
CASE,
CASE_MAPPINGS,
"[\\u01F1]", // 'DZ'
"[\\u01F1\\u01F2\\u01F3]",
-
+
CASE_MAPPINGS,
"[a-z]",
"[A-Za-z]",
int32_t selector = DATA[i][0];
UnicodeString pat(DATA[i+1], -1, US_INV);
UnicodeString exp(DATA[i+2], -1, US_INV);
+
+ UErrorCode ec = U_ZERO_ERROR;
s.applyPattern(pat, ec);
s.closeOver(selector);
t.applyPattern(exp, ec);
}
}
-#if 0
- /*
- * Unused test code.
- * This was used to compare the old implementation (using USET_CASE)
- * with the new one (using 0x100 temporarily)
- * while transitioning from hardcoded case closure tables in uniset.cpp
- * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
- * and using ucase.c functions for closure.
- * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
- *
- * Note: The old and new implementation never fully matched because
- * the old implementation turned out to not map U+0130 and U+0131 correctly
- * (dotted I and dotless i) and because the old implementation's data tables
- * were outdated compared to Unicode 4.0.1 at the time of the change to the
- * new implementation. (So sigmas and some other characters were not handled
- * according to the newer Unicode version.)
- */
- UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
- UnicodeSetIterator si(sens);
- UnicodeString str, buf2;
- const UnicodeString *pStr;
- UChar32 c;
- while(si.next()) {
- if(!si.isString()) {
- c=si.getCodepoint();
- s.clear();
- s.add(c);
-
- str.setTo(c);
- str.foldCase();
- sens2.add(str);
-
- t=s;
- s.closeOver(USET_CASE);
- t.closeOver(0x100);
- if(s!=t) {
- errln("FAIL: closeOver(U+%04x) differs: ", c);
- errln((UnicodeString)"old "+s.toPattern(buf, true)+" new: "+t.toPattern(buf2, true));
- }
- }
- }
- // remove all code points
- // should contain all full case folding mapping strings
- sens2.remove(0, 0x10ffff);
- si.reset(sens2);
- while(si.next()) {
- if(si.isString()) {
- pStr=&si.getString();
- s.clear();
- s.add(*pStr);
- t=s2=s;
- s.closeOver(USET_CASE);
- t.closeOver(0x100);
- if(s!=t) {
- errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, true)+") differs: ");
- errln((UnicodeString)"old "+s.toPattern(buf, true)+" new: "+t.toPattern(buf2, true));
- }
- }
- }
-#endif
-
// Test the pattern API
+ UErrorCode ec = U_ZERO_ERROR;
s.applyPattern("[abc]", USET_CASE_INSENSITIVE, nullptr, ec);
if (U_FAILURE(ec)) {
errln("FAIL: applyPattern failed");
}
}
+namespace {
+
+/**
+ * Records the mapping c->t in additions unless closure already contains c->t.
+ * closure itself is never modified, so this may be called while iterating over it.
+ *
+ * @param closure   the mappings collected so far
+ * @param c         source code point of the candidate mapping
+ * @param t         target code point of the candidate mapping
+ * @param additions receives c->t if closure does not have it yet
+ */
+void addIfAbsent(const std::unordered_multimap<UChar32, UChar32> &closure, UChar32 c, UChar32 t,
+                 std::unordered_multimap<UChar32, UChar32> &additions) {
+    // find() may return any one of several elements with an equal key,
+    // so walking forward from it could miss some of them.
+    // equal_range() covers all elements with key c.
+    const auto range = closure.equal_range(c);
+    for (auto it = range.first; it != range.second; ++it) {
+        if (it->second == t) {
+            return;  // present
+        }
+    }
+    additions.insert({c, t});  // absent
+}
+
+} // namespace
+
+/**
+ * Compares closeOver(USET_SIMPLE_CASE_INSENSITIVE) for every single Case_Sensitive
+ * code point against a closure computed from first principles via u_foldCase().
+ */
+void UnicodeSetTest::TestCloseOverSimpleCaseFolding() {
+    IcuTestErrorCode errorCode(*this, "TestCloseOverSimpleCaseFolding");
+    const UnicodeSet *sensitive =
+        UnicodeSet::fromUSet(u_getBinaryPropertySet(UCHAR_CASE_SENSITIVE, errorCode));
+    if (errorCode.errIfFailureAndReset("u_getBinaryPropertySet(UCHAR_CASE_SENSITIVE) failed")) {
+        return;
+    }
+    // Compute the scf=Simple_Case_Folding closure:
+    // For each scf(c)=t, start with mappings c->t and t->c.
+    std::unordered_multimap<UChar32, UChar32> closure;
+    UnicodeSetIterator iter(*sensitive);
+    while (iter.next()) {
+        UChar32 c = iter.getCodepoint();
+        UChar32 scfChar = u_foldCase(c, U_FOLD_CASE_DEFAULT);
+        if (scfChar != c) {
+            closure.insert({c, scfChar});
+            closure.insert({scfChar, c});
+        }
+    }
+    // Complete the closure: Add mappings of mappings.
+    // Repeat until a full pass finds nothing new.
+    for (;;) {
+        std::unordered_multimap<UChar32, UChar32> additions;
+        // for each mapping c->t
+        for (const auto &mapping : closure) {
+            UChar32 c = mapping.first;
+            UChar32 t = mapping.second;
+            // enumerate each t->u
+            // (equal_range() rather than find(): find() may return any one of the
+            // elements with key t, not necessarily the first one.)
+            const auto tRange = closure.equal_range(t);
+            for (auto it = tRange.first; it != tRange.second; ++it) {
+                UChar32 u = it->second;
+                if (u != c) {
+                    addIfAbsent(closure, c, u, additions);
+                    addIfAbsent(closure, u, c, additions);
+                }
+            }
+        }
+        if (additions.empty()) {
+            break;  // The closure is complete.
+        }
+        closure.insert(additions.begin(), additions.end());
+    }
+    // Compare closeOver(USET_SIMPLE_CASE_INSENSITIVE) with an unoptimized implementation.
+    // Here we focus on single code points as input.
+    // Other examples, including strings, are tested in TestCloseOver().
+    int32_t errors = 0;
+    iter.reset();
+    UnicodeSet set, expected;
+    while (iter.next()) {
+        UChar32 c = iter.getCodepoint();
+        // closeOver()
+        set.clear().add(c);
+        set.closeOver(USET_SIMPLE_CASE_INSENSITIVE);
+        // From-first-principles implementation.
+        expected.clear().add(c);
+        const auto cRange = closure.equal_range(c);
+        for (auto it = cRange.first; it != cRange.second; ++it) {
+            expected.add(it->second);
+        }
+        // compare
+        if (!checkEqual(expected, set, "closeOver() vs. test impl")) {
+            errln(" c=U+%04X", c);
+            // Stop after 10 mismatches to keep the failure output short.
+            if (++errors == 10) {
+                break;
+            }
+        }
+    }
+}
+
+/**
+ * Checks that an optimization for large sets does not change the closeOver() result:
+ * closing over {c} must equal closing over {c} plus a large range with that
+ * range removed again afterwards.
+ */
+void UnicodeSetTest::TestCloseOverLargeSets() {
+    IcuTestErrorCode errorCode(*this, "TestCloseOverLargeSets");
+
+    // Main Unihan block, used to make the input set large.
+    constexpr UChar32 LARGE_START = 0x4E00;
+    constexpr UChar32 LARGE_END = 0x9FFF;
+
+    static constexpr int32_t OPTIONS[] = {
+        USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE
+    };
+
+    // Most code points except ones that are boring for case mappings.
+    UnicodeSet manyCp(u"[^[:C:][:Ideographic:][:Hang:]]", errorCode);
+
+    UnicodeSet single, plain, padded;
+    for (int32_t option : OPTIONS) {
+        for (UnicodeSetIterator iter(manyCp); iter.next();) {
+            const UChar32 c = iter.getCodepoint();
+            single.clear().add(c);
+
+            // Reference result: close over just the one code point.
+            plain = single;
+            plain.closeOver(option);
+
+            // Same input padded with the large range, which is removed again afterwards.
+            padded = single;
+            padded.add(LARGE_START, LARGE_END);
+            padded.closeOver(option);
+            padded.remove(LARGE_START, LARGE_END);
+
+            if (checkEqual(plain, padded, "small != large")) {
+                continue;
+            }
+            errln(" option=%d c=U+%04X", option, c);
+            break;  // one failure per option is enough
+        }
+    }
+}
+
void UnicodeSetTest::TestEscapePattern() {
const char pattern[] =
"[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
void TestExhaustive(void);
void TestCloseOver(void);
+ void TestCloseOverSimpleCaseFolding();
+ void TestCloseOverLargeSets();
void TestEscapePattern(void);
* - for k include the Kelvin sign
*/
public final void addCaseClosure(int c, UnicodeSet set) {
- /*
- * Hardcode the case closure of i and its relatives and ignore the
- * data file data for these characters.
- * The Turkic dotless i and dotted I with their case mapping conditions
- * and case folding option make the related characters behave specially.
- * This code matches their closure behavior to their case folding behavior.
- */
-
- switch(c) {
- case 0x49:
- /* regular i and I are in one equivalence class */
- set.add(0x69);
- return;
- case 0x69:
- set.add(0x49);
- return;
- case 0x130:
- /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
- set.add(iDot);
- return;
- case 0x131:
- /* dotless i is in a class by itself */
- return;
- default:
- /* otherwise use the data file data */
- break;
- }
-
int props=trie.get(c);
if(!propsHasException(props)) {
if(getTypeFromProps(props)!=NONE) {
* c has exceptions, so there may be multiple simple and/or
* full case mappings. Add them all.
*/
- int excOffset0, excOffset=getExceptionsOffset(props);
- int closureOffset;
+ int excOffset=getExceptionsOffset(props);
int excWord=exceptions.charAt(excOffset++);
- int index, closureLength, fullLength, length;
-
- excOffset0=excOffset;
+ int excOffset0=excOffset;
+
+ // Hardcode the case closure of i and its relatives and ignore the
+ // data file data for these characters.
+ // The Turkic dotless i and dotted I with their case mapping conditions
+ // and case folding option make the related characters behave specially.
+ // This code matches their closure behavior to their case folding behavior.
+ if ((excWord&EXC_CONDITIONAL_FOLD) != 0) {
+ // These characters have Turkic case foldings. Hardcode their closure.
+ if (c == 0x49) {
+ // Regular i and I are in one equivalence class.
+ set.add(0x69);
+ return;
+ } else if (c == 0x130) {
+ // Dotted I is in a class with <0069 0307>
+ // (for canonical equivalence with <0049 0307>).
+ set.add(iDot);
+ return;
+ }
+ } else if (c == 0x69) {
+ set.add(0x49);
+ return;
+ } else if (c == 0x131) {
+ // Dotless i is in a class by itself.
+ return;
+ }
/* add all simple case mappings */
- for(index=EXC_LOWER; index<=EXC_TITLE; ++index) {
+ for(int index=EXC_LOWER; index<=EXC_TITLE; ++index) {
if(hasSlot(excWord, index)) {
excOffset=excOffset0;
- c=getSlotValue(excWord, index, excOffset);
- set.add(c);
+ int mapping=getSlotValue(excWord, index, excOffset);
+ set.add(mapping);
}
}
if(hasSlot(excWord, EXC_DELTA)) {
}
/* get the closure string pointer & length */
+ int closureOffset, closureLength;
if(hasSlot(excWord, EXC_CLOSURE)) {
excOffset=excOffset0;
long value=getSlotValueAndOffset(excWord, EXC_CLOSURE, excOffset);
if(hasSlot(excWord, EXC_FULL_MAPPINGS)) {
excOffset=excOffset0;
long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);
- fullLength=(int)value;
+ int fullLength=(int)value;
/* start of full case mapping strings */
excOffset=(int)(value>>32)+1;
fullLength>>=4;
/* add the full case folding string */
- length=fullLength&0xf;
+ int length=fullLength&0xf;
if(length!=0) {
set.add(exceptions.substring(excOffset, excOffset+length));
excOffset+=length;
/* add each code point in the closure string */
int limit=closureOffset+closureLength;
- for(index=closureOffset; index<limit; index+=UTF16.getCharCount(c)) {
- c=exceptions.codePointAt(index);
- set.add(c);
+            // Advance by the length of the code point just read from the closure string,
+            // not by the length of the input code point c.
+            for(int index=closureOffset; index<limit;) {
+                int mapping=exceptions.codePointAt(index);
+                set.add(mapping);
+                index+=UTF16.getCharCount(mapping);
+            }
+ }
+ }
+
+ /**
+ * Add the simple case closure mapping,
+ * except if there is not actually an scf relationship between the two characters.
+ * TODO: Unicode should probably add the corresponding scf mappings.
+ * See https://crbug.com/v8/13377 and Unicode-internal PAG issue #23.
+ * If & when those scf mappings are added, we should be able to remove all of these exceptions.
+ */
+ private static void addOneSimpleCaseClosure(int c, int t, UnicodeSet set) {
+ switch (c) {
+ case 0x0390:
+ if (t == 0x1FD3) { return; }
+ break;
+ case 0x03B0:
+ if (t == 0x1FE3) { return; }
+ break;
+ case 0x1FD3:
+ if (t == 0x0390) { return; }
+ break;
+ case 0x1FE3:
+ if (t == 0x03B0) { return; }
+ break;
+ case 0xFB05:
+ if (t == 0xFB06) { return; }
+ break;
+ case 0xFB06:
+ if (t == 0xFB05) { return; }
+ break;
+ default:
+ break;
+ }
+ set.add(t);
+ }
+
+    /**
+     * Adds to the set the code points related to c for scf=Simple_Case_Folding,
+     * taken from the simple case mappings, the delta mapping and the closure string
+     * of the case properties data. Full case mapping strings are skipped.
+     * The closures of I, i, dotted I and dotless i are hardcoded.
+     * c itself is not added.
+     *
+     * @param c the input code point
+     * @param set the set that receives the related code points
+     */
+    public final void addSimpleCaseClosure(int c, UnicodeSet set) {
+        int props=trie.get(c);
+        if(!propsHasException(props)) {
+            if(getTypeFromProps(props)!=NONE) {
+                /* add the one simple case mapping, no matter what type it is */
+                int delta=getDelta(props);
+                if(delta!=0) {
+                    set.add(c+delta);
+                }
+            }
+        } else {
+            // c has exceptions. Add the mappings relevant for scf=Simple_Case_Folding.
+            int excOffset=getExceptionsOffset(props);
+            int excWord=exceptions.charAt(excOffset++);
+            int excOffset0=excOffset;
+
+            // Hardcode the case closure of i and its relatives and ignore the
+            // data file data for these characters, like in ucase_addCaseClosure().
+            if ((excWord&EXC_CONDITIONAL_FOLD) != 0) {
+                // These characters have Turkic case foldings. Hardcode their closure.
+                if (c == 0x49) {
+                    // Regular i and I are in one equivalence class.
+                    set.add(0x69);
+                    return;
+                } else if (c == 0x130) {
+                    // For scf=Simple_Case_Folding, dotted I is in a class by itself.
+                    return;
+                }
+            } else if (c == 0x69) {
+                set.add(0x49);
+                return;
+            } else if (c == 0x131) {
+                // Dotless i is in a class by itself.
+                return;
+            }
+
+            // Add all simple case mappings.
+            for(int index=EXC_LOWER; index<=EXC_TITLE; ++index) {
+                if(hasSlot(excWord, index)) {
+                    excOffset=excOffset0;
+                    int mapping=getSlotValue(excWord, index, excOffset);
+                    addOneSimpleCaseClosure(c, mapping, set);
+                }
+            }
+            if(hasSlot(excWord, EXC_DELTA)) {
+                excOffset=excOffset0;
+                int delta=getSlotValue(excWord, EXC_DELTA, excOffset);
+                int mapping = (excWord&EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
+                addOneSimpleCaseClosure(c, mapping, set);
+            }
+
+            /* get the closure string pointer & length */
+            int closureOffset, closureLength;
+            if(hasSlot(excWord, EXC_CLOSURE)) {
+                excOffset=excOffset0;
+                long value=getSlotValueAndOffset(excWord, EXC_CLOSURE, excOffset);
+                closureLength=(int)value&CLOSURE_MAX_LENGTH; /* higher bits are reserved */
+                closureOffset=(int)(value>>32)+1; /* behind this slot, unless there are full case mappings */
+            } else {
+                closureLength=0;
+                closureOffset=0;
+            }
+
+            // Skip the full case mappings.
+            if(closureLength > 0 && hasSlot(excWord, EXC_FULL_MAPPINGS)) {
+                excOffset=excOffset0;
+                long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);
+                int fullLength=(int)value;
+
+                /* start of full case mapping strings */
+                excOffset=(int)(value>>32)+1;
+
+                fullLength&=0xffff; /* bits 16 and higher are reserved */
+
+                // Skip all 4 full case mappings.
+                excOffset+=fullLength&FULL_LOWER;
+                fullLength>>=4;
+                excOffset+=fullLength&0xf;
+                fullLength>>=4;
+                excOffset+=fullLength&0xf;
+                fullLength>>=4;
+                excOffset+=fullLength;
+
+                closureOffset=excOffset; /* behind full case mappings */
+            }
+
+            // Add each code point in the closure string whose scf maps back to c.
+            int limit=closureOffset+closureLength;
+            for(int index=closureOffset; index<limit;) {
+                int mapping=exceptions.codePointAt(index);
+                addOneSimpleCaseClosure(c, mapping, set);
+                // Advance by the length of the code point just read,
+                // not by the length of the input code point c.
+                index+=UTF16.getCharCount(mapping);
             }
         }
     }
* for the syntax of the pattern language.
* @param pattern a string specifying what characters are in the set
* @param options a bitmask indicating which options to apply.
- * Valid options are IGNORE_SPACE and CASE.
+ * Valid options are {@link #IGNORE_SPACE} and
+ * at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS},
+ * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive.
* @exception java.lang.IllegalArgumentException if the pattern contains
* a syntax error.
* @stable ICU 3.8
* @param symbols a symbol table mapping variables to char[] arrays
* and chars to UnicodeSets
* @param options a bitmask indicating which options to apply.
- * Valid options are IGNORE_SPACE and CASE.
+ * Valid options are {@link #IGNORE_SPACE} and
+ * at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS},
+ * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive.
* @exception java.lang.IllegalArgumentException if the pattern
* contains a syntax error.
* @stable ICU 3.2
* See the class description for the syntax of the pattern language.
* @param pattern a string specifying what characters are in the set
* @param options a bitmask indicating which options to apply.
- * Valid options are IGNORE_SPACE and CASE.
+ * Valid options are {@link #IGNORE_SPACE} and
+ * at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS},
+ * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive.
* @exception java.lang.IllegalArgumentException if the pattern
* contains a syntax error.
* @stable ICU 3.8
* variables, or null if none.
* @param rebuiltPat the pattern that was parsed, rebuilt or
* copied from the input pattern, as appropriate.
- * @param options a bit mask of zero or more of the following:
- * IGNORE_SPACE, CASE.
+ * @param options a bit mask.
+ * Valid options are {@link #IGNORE_SPACE} and
+ * at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS},
+ * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive.
*/
private void applyPattern(RuleCharacterIterator chars, SymbolTable symbols,
Appendable rebuiltPat, int options, int depth) {
* to close over case BEFORE COMPLEMENTING. This makes
* patterns like /[^abc]/i work.
*/
- if ((options & CASE) != 0) {
- closeOver(CASE);
+ if ((options & CASE_MASK) != 0) {
+ closeOver(options);
}
if (invert) {
complement().removeAllStrings(); // code point complement
public static final int IGNORE_SPACE = 1;
/**
- * Bitmask for constructor, applyPattern(), and closeOver()
- * indicating letter case. This may be ORed together with other
- * selectors.
+ * Alias for {@link #CASE_INSENSITIVE}.
*
+ * @stable ICU 3.8
+ */
+ public static final int CASE = 2;
+
+ /**
* Enable case insensitive matching. E.g., "[ab]" with this flag
* will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will
* match all except 'a', 'A', 'b', and 'B'. This performs a full
- * closure over case mappings, e.g. U+017F for s.
+ * closure over case mappings, e.g. 'ſ' (U+017F long s) for 's'.
*
- * The resulting set is a superset of the input for the code points but
+ * <p>This value is an options bit set value for some
+ * constructors, applyPattern(), and closeOver().
+ * It can be ORed together with other, unrelated options.
+ *
+ * <p>The resulting set is a superset of the input for the code points but
* not for the strings.
* It performs a case mapping closure of the code points and adds
* full case folding strings for the code points, and reduces strings of
* the original set to their full case folding equivalents.
*
- * This is designed for case-insensitive matches, for example
+ * <p>This is designed for case-insensitive matches, for example
* in regular expressions. The full code point case closure allows checking of
* an input character directly against the closure set.
* Strings are matched by comparing the case-folded form from the closure
* set with an incremental case folding of the string in question.
*
- * The closure set will also contain single code points if the original
+ * <p>The closure set will also contain single code points if the original
* set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.).
* This is not necessary (that is, redundant) for the above matching method
* but results in the same closure sets regardless of whether the original
* set contained the code point or a string.
- * @stable ICU 3.8
- */
- public static final int CASE = 2;
-
- /**
- * Alias for UnicodeSet.CASE, for ease of porting from C++ where ICU4C
- * also has both USET_CASE and USET_CASE_INSENSITIVE (see uset.h).
- * @see #CASE
+ *
* @stable ICU 3.4
*/
public static final int CASE_INSENSITIVE = 2;
/**
- * Bitmask for constructor, applyPattern(), and closeOver()
- * indicating letter case. This may be ORed together with other
- * selectors.
- *
- * Enable case insensitive matching. E.g., "[ab]" with this flag
- * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will
- * match all except 'a', 'A', 'b', and 'B'. This adds the lower-,
- * title-, and uppercase mappings as well as the case folding
+ * Adds all case mappings for each element in the set.
+ * This adds the full lower-, title-, and uppercase mappings as well as the full case folding
* of each existing element in the set.
+ *
+ * <p>This value is an options bit set value for some
+ * constructors, applyPattern(), and closeOver().
+ * It can be ORed together with other, unrelated options.
+ *
+ * <p>Unlike the “case insensitive” options, this does not perform a closure.
+ * For example, it does not add 'ſ' (U+017F long s) for 's',
+ * 'K' (U+212A Kelvin sign) for 'k', or replace set strings by their case-folded versions.
+ *
* @stable ICU 3.4
*/
public static final int ADD_CASE_MAPPINGS = 4;
+ /**
+ * Enable case insensitive matching.
+ * Same as {@link #CASE_INSENSITIVE} but using only Simple_Case_Folding (scf) mappings,
+ * which map each code point to one code point,
+ * not full Case_Folding (cf) mappings, which map some code points to multiple code points.
+ *
+ * <p>This is designed for case-insensitive matches, for example in certain
+ * regular expression implementations where only Simple_Case_Folding mappings are used,
+ * such as in ECMAScript (JavaScript) regular expressions.
+ *
+ * <p>This value is an options bit set value for some
+ * constructors, applyPattern(), and closeOver().
+ * It can be ORed together with other, unrelated options.
+ *
+ * @draft ICU 73
+ */
+ public static final int SIMPLE_CASE_INSENSITIVE = 6;
+
+ // Mask of the two option bits used by the case options:
+ // CASE_INSENSITIVE (2), ADD_CASE_MAPPINGS (4), and SIMPLE_CASE_INSENSITIVE (6 = both bits).
+ private static final int CASE_MASK = CASE_INSENSITIVE | ADD_CASE_MAPPINGS;
+
// add the result of a full case mapping to the set
// use str as a temporary string to avoid constructing one
private static final void addCaseMapping(UnicodeSet set, int result, StringBuilder full) {
// see UCaseProps
}
+    /**
+     * For case closure on a large set, look only at code points with relevant properties.
+     *
+     * @param src the set whose code points are to be processed
+     * @return src itself if it has fewer than 30 elements; otherwise a new set with
+     *         the code points of src that are also in the Case_Sensitive property set
+     */
+    UnicodeSet maybeOnlyCaseSensitive(UnicodeSet src) {
+        if (src.size() >= 30) {
+            UnicodeSet caseSensitive =
+                    CharacterProperties.getBinaryPropertySet(UProperty.CASE_SENSITIVE);
+            // Clone the "smaller" operand of the intersection.
+            // Try not to copy the strings, if there are any in src.
+            boolean cloneCaseSensitive =
+                    src.hasStrings() || src.getRangeCount() > caseSensitive.getRangeCount();
+            return cloneCaseSensitive
+                    ? caseSensitive.cloneAsThawed().retainAll(src)
+                    : ((UnicodeSet) src.clone()).retainAll(caseSensitive);
+        }
+        return src;
+    }
+
+    /**
+     * Per-character scf = Simple_Case_Folding of a string.
+     * (Normally when we case-fold a string we use full case foldings.)
+     *
+     * @param s the input string
+     * @param scf receives the folded string, but only if at least one code point
+     *            changes; otherwise it is left untouched
+     * @return true if folding changed the string and scf holds the result,
+     *         false if every code point of s folds to itself
+     */
+    private static final boolean scfString(CharSequence s, StringBuilder scf) {
+        final int length = s.length();
+        // Skip the prefix of code points that fold to themselves.
+        int pos = 0;
+        while (pos < length) {
+            int cp = Character.codePointAt(s, pos);
+            if (UCharacter.foldCase(cp, UCharacter.FOLD_CASE_DEFAULT) != cp) {
+                break;
+            }
+            pos += Character.charCount(cp);
+        }
+        if (pos >= length) {
+            return false;  // Nothing needs modification.
+        }
+        // Copy the unchanged prefix, then fold each remaining code point.
+        scf.setLength(0);
+        scf.append(s, 0, pos);
+        do {
+            int cp = Character.codePointAt(s, pos);
+            scf.appendCodePoint(UCharacter.foldCase(cp, UCharacter.FOLD_CASE_DEFAULT));
+            pos += Character.charCount(cp);
+        } while (pos != length);
+        return true;
+    }
+
/**
* Close this set over the given attribute. For the attribute
- * CASE, the result is to modify this set so that:
+ * {@link #CASE_INSENSITIVE}, the result is to modify this set so that:
*
- * 1. For each character or string 'a' in this set, all strings
+ * <ol>
+ * <li>For each character or string 'a' in this set, all strings
* 'b' such that foldCase(a) == foldCase(b) are added to this set.
* (For most 'a' that are single characters, 'b' will have
* b.length() == 1.)
*
- * 2. For each string 'e' in the resulting set, if e !=
+ * <li>For each string 'e' in the resulting set, if e !=
* foldCase(e), 'e' will be removed.
+ * </ol>
*
- * Example: [aq\u00DF{Bc}{bC}{Fi}] => [aAqQ\u00DF\uFB01{ss}{bc}{fi}]
+ * <p>Example: [aq\u00DF{Bc}{bC}{Fi}] => [aAqQ\u00DF\uFB01{ss}{bc}{fi}]
*
- * (Here foldCase(x) refers to the operation
+ * <p>(Here foldCase(x) refers to the operation
* UCharacter.foldCase(x, true), and a == b actually denotes
* a.equals(b), not pointer comparison.)
*
* @param attribute bitmask for attributes to close over.
- * Currently only the CASE bit is supported. Any undefined bits
- * are ignored.
+ * Valid options:
+ * At most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS},
+ * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive.
+ * Unrelated options bits are ignored.
* @return a reference to this set.
* @stable ICU 3.8
*/
public UnicodeSet closeOver(int attribute) {
checkFrozen();
- if ((attribute & (CASE | ADD_CASE_MAPPINGS)) != 0) {
- UCaseProps csp = UCaseProps.INSTANCE;
- UnicodeSet foldSet = new UnicodeSet(this);
- ULocale root = ULocale.ROOT;
-
- // start with input set to guarantee inclusion
- // CASE: remove strings because the strings will actually be reduced (folded);
- // therefore, start with no strings and add only those needed
- if((attribute & CASE) != 0 && foldSet.hasStrings()) {
- foldSet.strings.clear();
- }
-
- int n = getRangeCount();
- int result;
- StringBuilder full = new StringBuilder();
+ // Only the case option bits select the behavior; all other bits of attribute are ignored.
+ switch (attribute & CASE_MASK) {
+ case 0:
+ // No case option: the set is left unchanged.
+ break;
+ case CASE_INSENSITIVE:
+ closeOverCaseInsensitive(/* simple= */ false);
+ break;
+ case ADD_CASE_MAPPINGS:
+ closeOverAddCaseMappings();
+ break;
+ case SIMPLE_CASE_INSENSITIVE:
+ closeOverCaseInsensitive(/* simple= */ true);
+ break;
+ default:
+ // bad option (unreachable)
+ // CASE_MASK has two bits, and the four cases above cover all of their combinations.
+ break;
+ }
+ return this;
+ }
- for (int i=0; i<n; ++i) {
- int start = getRangeStart(i);
- int end = getRangeEnd(i);
+ /**
+ * Implements closeOver() for CASE_INSENSITIVE (simple=false) and
+ * SIMPLE_CASE_INSENSITIVE (simple=true), replacing the contents of this set with the result.
+ *
+ * @param simple if true, use addSimpleCaseClosure() for code points and scfString() for strings;
+ * if false, use addCaseClosure() for code points and full case folding for strings
+ */
+ private void closeOverCaseInsensitive(boolean simple) {
+ UCaseProps csp = UCaseProps.INSTANCE;
+ // Start with input set to guarantee inclusion.
+ UnicodeSet foldSet = new UnicodeSet(this);
- if((attribute & CASE) != 0) {
- // full case closure
- for (int cp=start; cp<=end; ++cp) {
- csp.addCaseClosure(cp, foldSet);
- }
- } else {
- // add case mappings
- // (does not add long s for regular s, or Kelvin for k, for example)
- for (int cp=start; cp<=end; ++cp) {
- result = csp.toFullLower(cp, null, full, UCaseProps.LOC_ROOT);
- addCaseMapping(foldSet, result, full);
+ // Full case mappings closure:
+ // Remove strings because the strings will actually be reduced (folded);
+ // therefore, start with no strings and add only those needed.
+ // Do this before processing code points, because they may add strings.
+ if (!simple && foldSet.hasStrings()) {
+ foldSet.strings.clear();
+ }
- result = csp.toFullTitle(cp, null, full, UCaseProps.LOC_ROOT);
- addCaseMapping(foldSet, result, full);
+ // For a large set, iterate only over its Case_Sensitive code points.
+ UnicodeSet codePoints = maybeOnlyCaseSensitive(this);
- result = csp.toFullUpper(cp, null, full, UCaseProps.LOC_ROOT);
- addCaseMapping(foldSet, result, full);
+ // Iterate over the ranges of single code points. Nested loop for each code point.
+ int n = codePoints.getRangeCount();
+ for (int i=0; i<n; ++i) {
+ int start = codePoints.getRangeStart(i);
+ int end = codePoints.getRangeEnd(i);
- result = csp.toFullFolding(cp, full, 0);
- addCaseMapping(foldSet, result, full);
- }
+ if (simple) {
+ for (int cp=start; cp<=end; ++cp) {
+ csp.addSimpleCaseClosure(cp, foldSet);
+ }
+ } else {
+ for (int cp=start; cp<=end; ++cp) {
+ csp.addCaseClosure(cp, foldSet);
}
}
- if (hasStrings()) {
- if ((attribute & CASE) != 0) {
- for (String s : strings) {
- String str = UCharacter.foldCase(s, 0);
- if(!csp.addStringCaseClosure(str, foldSet)) {
- foldSet.add(str); // does not map to code points: add the folded string itself
- }
+ }
+ if (hasStrings()) {
+ StringBuilder sb = simple ? new StringBuilder() : null;
+ for (String s : strings) {
+ if (simple) {
+ // scfString() returns true only if folding changed s;
+ // then replace s with its folded form, which is in sb.
+ if (scfString(s, sb)) {
+ foldSet.remove(s).add(sb);
}
} else {
- BreakIterator bi = BreakIterator.getWordInstance(root);
- for (String str : strings) {
- // TODO: call lower-level functions
- foldSet.add(UCharacter.toLowerCase(root, str));
- foldSet.add(UCharacter.toTitleCase(root, str, bi));
- foldSet.add(UCharacter.toUpperCase(root, str));
- foldSet.add(UCharacter.foldCase(str, 0));
+ String str = UCharacter.foldCase(s, 0);
+ if(!csp.addStringCaseClosure(str, foldSet)) {
+ foldSet.add(str); // does not map to code points: add the folded string itself
}
}
}
- set(foldSet);
}
- return this;
+ set(foldSet);
+ }
+
+ private void closeOverAddCaseMappings() {
+ UCaseProps csp = UCaseProps.INSTANCE;
+ // Start with input set to guarantee inclusion.
+ UnicodeSet foldSet = new UnicodeSet(this);
+
+ UnicodeSet codePoints = maybeOnlyCaseSensitive(this);
+
+ // Iterate over the ranges of single code points. Nested loop for each code point.
+ int n = codePoints.getRangeCount();
+ int result;
+ StringBuilder full = new StringBuilder();
+
+ for (int i=0; i<n; ++i) {
+ int start = codePoints.getRangeStart(i);
+ int end = codePoints.getRangeEnd(i);
+
+ // add case mappings
+ // (does not add long s for regular s, or Kelvin for k, for example)
+ for (int cp=start; cp<=end; ++cp) {
+ result = csp.toFullLower(cp, null, full, UCaseProps.LOC_ROOT);
+ addCaseMapping(foldSet, result, full);
+
+ result = csp.toFullTitle(cp, null, full, UCaseProps.LOC_ROOT);
+ addCaseMapping(foldSet, result, full);
+
+ result = csp.toFullUpper(cp, null, full, UCaseProps.LOC_ROOT);
+ addCaseMapping(foldSet, result, full);
+
+ result = csp.toFullFolding(cp, full, 0);
+ addCaseMapping(foldSet, result, full);
+ }
+ }
+ if (hasStrings()) {
+ ULocale root = ULocale.ROOT;
+ BreakIterator bi = BreakIterator.getWordInstance(root);
+ for (String str : strings) {
+ // TODO: call lower-level functions
+ foldSet.add(UCharacter.toLowerCase(root, str));
+ foldSet.add(UCharacter.toTitleCase(root, str, bi));
+ foldSet.add(UCharacter.toUpperCase(root, str));
+ foldSet.add(UCharacter.foldCase(str, 0));
+ }
+ }
+ set(foldSet);
}
/**
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
+import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import com.ibm.icu.dev.util.CollectionUtilities;
import com.ibm.icu.impl.SortedSetRelation;
import com.ibm.icu.impl.Utility;
+import com.ibm.icu.lang.CharacterProperties;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UCharacterEnums.ECharacterCategory;
import com.ibm.icu.lang.UProperty;
@Test
public void TestCloseOver() {
String CASE = String.valueOf(UnicodeSet.CASE);
+ String CASE_MAPPINGS = String.valueOf(UnicodeSet.ADD_CASE_MAPPINGS);
+ String SIMPLE_CASE_INSENSITIVE = String.valueOf(UnicodeSet.SIMPLE_CASE_INSENSITIVE);
String[] DATA = {
// selector, input, output
CASE,
"[aq\u00DF{Bc}{bC}{Fi}]",
"[aAqQ\u00DF\u1E9E\uFB01{ss}{bc}{fi}]", // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
+ SIMPLE_CASE_INSENSITIVE,
+ "[aq\u00DF{Bc}{bC}{Fi}]",
+ "[aAqQ\u00DF\u1E9E{bc}{fi}]",
+
CASE,
"[\u01F1]", // 'DZ'
"[\u01F1\u01F2\u01F3]",
+ SIMPLE_CASE_INSENSITIVE,
+ "[\u01F1]", // 'DZ'
+ "[\u01F1\u01F2\u01F3]",
+
CASE,
"[\u1FB4]",
"[\u1FB4{\u03AC\u03B9}]",
+ SIMPLE_CASE_INSENSITIVE,
+ "[\u1FB4]",
+ "[\u1FB4]",
+
CASE,
"[{F\uFB01}]",
"[\uFB03{ffi}]",
+ CASE, // make sure binary search finds limits
+ "[a\uFF3A]",
+ "[aA\uFF3A\uFF5A]",
+
CASE,
"[a-z]","[A-Za-z\u017F\u212A]",
+
+ SIMPLE_CASE_INSENSITIVE,
+ "[a-z]","[A-Za-z\u017F\u212A]",
+
CASE,
"[abc]","[A-Ca-c]",
CASE,
"[ABC]","[A-Ca-c]",
+
+ CASE, "[i]", "[iI]",
+
+ CASE, "[\u0130]", "[\u0130{i\u0307}]", // dotted I
+ CASE, "[{i\u0307}]", "[\u0130{i\u0307}]", // i with dot
+
+ CASE, "[\u0131]", "[\u0131]", // dotless i
+
+ CASE, "[\u0390]", "[\u0390\u1FD3{\u03B9\u0308\u0301}]",
+
+ CASE, "[\u03c2]", "[\u03a3\u03c2\u03c3]", // sigmas
+
+ CASE, "[\u03f2]", "[\u03f2\u03f9]", // lunate sigmas
+
+ CASE, "[\u03f7]", "[\u03f7\u03f8]",
+
+ CASE, "[\u1fe3]", "[\u03b0\u1fe3{\u03c5\u0308\u0301}]",
+
+ CASE, "[\ufb05]", "[\ufb05\ufb06{st}]",
+ CASE, "[{st}]", "[\ufb05\ufb06{st}]",
+
+ CASE, "[\\U0001044F]", "[\\U00010427\\U0001044F]",
+
+ CASE, "[{a\u02BE}]", "[\u1E9A{a\u02BE}]", // first in sorted table
+
+ CASE, "[{\u1f7c\u03b9}]", "[\u1ff2{\u1f7c\u03b9}]", // last in sorted table
+
+ CASE_MAPPINGS,
+ "[aq\u00DF{Bc}{bC}{Fi}]",
+ "[aAqQ\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
+
+ CASE_MAPPINGS,
+ "[\u01F1]", // 'DZ'
+ "[\u01F1\u01F2\u01F3]",
+
+ CASE_MAPPINGS,
+ "[a-z]",
+ "[A-Za-z]",
};
UnicodeSet s = new UnicodeSet();
UnicodeSet t = new UnicodeSet();
for (int i=0; i<DATA.length; i+=3) {
int selector = Integer.parseInt(DATA[i]);
- String pat = DATA[i+1];
- String exp = DATA[i+2];
+ String pat = Utility.unescape(DATA[i+1]);
+ String exp = Utility.unescape(DATA[i+2]);
s.applyPattern(pat);
s.closeOver(selector);
t.applyPattern(exp);
expectContainment(s, "abcABC", "defDEF");
s = new UnicodeSet("[^abc]", UnicodeSet.CASE);
expectContainment(s, "defDEF", "abcABC");
+ s = new UnicodeSet("[abck]", UnicodeSet.ADD_CASE_MAPPINGS);
+ expectContainment(s, "abckABCK", "defDEF\u212A");
+ }
+
+    /**
+     * Adds the mapping c->t to the poor man's multimap,
+     * creating the value collection for c if there is none yet.
+     */
+    private void add(Map<Integer, Collection<Integer>> closure, Integer c, Integer t) {
+        Collection<Integer> targets = closure.get(c);
+        if (targets != null) {
+            targets.add(t);
+            return;
+        }
+        // First mapping for c.
+        targets = new TreeSet<>();
+        targets.add(t);
+        closure.put(c, targets);
+    }
+
+    /**
+     * Adds the mapping c->t to additions unless closure already contains c->t.
+     * additions may be the same map as closure.
+     */
+    private void addIfAbsent(Map<Integer, Collection<Integer>> closure, Integer c, Integer t,
+            Map<Integer, Collection<Integer>> additions) {
+        Collection<Integer> known = closure.get(c);
+        if (known != null && known.contains(t)) {
+            return;  // present
+        }
+        // absent: find or create the collection for c in additions.
+        Collection<Integer> pending = (additions == closure) ? known : additions.get(c);
+        if (pending == null) {
+            pending = new TreeSet<>();
+            additions.put(c, pending);
+        }
+        pending.add(t);
+    }
+
+    /**
+     * Compares closeOver(SIMPLE_CASE_INSENSITIVE) for every single Case_Sensitive
+     * code point with a closure computed from first principles via UCharacter.foldCase().
+     */
+    @Test
+    public void TestCloseOverSimpleCaseFolding() {
+        UnicodeSet sensitive = CharacterProperties.getBinaryPropertySet(UProperty.CASE_SENSITIVE);
+
+        // Compute the scf=Simple_Case_Folding closure in a
+        // poor man's multimap from code points to code points:
+        // For each scf(c)=t, start with mappings c->t and t->c.
+        Map<Integer, Collection<Integer>> closure = new HashMap<>();
+        UnicodeSetIterator iter = new UnicodeSetIterator(sensitive);
+        while (iter.next()) {
+            int c = iter.codepoint;
+            int folded = UCharacter.foldCase(c, UCharacter.FOLD_CASE_DEFAULT);
+            if (folded == c) {
+                continue;
+            }
+            add(closure, c, folded);
+            add(closure, folded, c);
+        }
+
+        // Complete the closure: Add mappings of mappings until a pass finds nothing new.
+        Map<Integer, Collection<Integer>> pending = new HashMap<>();
+        do {
+            pending.clear();
+            // for each mapping c->t
+            for (Map.Entry<Integer, Collection<Integer>> entry : closure.entrySet()) {
+                Integer c = entry.getKey();
+                for (Integer t : entry.getValue()) {
+                    // enumerate each t->u
+                    Collection<Integer> tTargets = closure.get(t);
+                    if (tTargets == null) {
+                        continue;
+                    }
+                    for (Integer u : tTargets) {
+                        if (!u.equals(c)) {
+                            addIfAbsent(closure, c, u, pending);
+                            addIfAbsent(closure, u, c, pending);
+                        }
+                    }
+                }
+            }
+            // Add all of the new mappings back into the closure.
+            for (Map.Entry<Integer, Collection<Integer>> entry : pending.entrySet()) {
+                Integer c = entry.getKey();
+                for (Integer t : entry.getValue()) {
+                    add(closure, c, t);
+                }
+            }
+        } while (!pending.isEmpty());
+
+        // Compare closeOver(USET_SIMPLE_CASE_INSENSITIVE) with an unoptimized implementation.
+        // Here we focus on single code points as input.
+        // Other examples, including strings, are tested in TestCloseOver().
+        int errors = 0;
+        iter.reset();
+        UnicodeSet actual = new UnicodeSet();
+        UnicodeSet expected = new UnicodeSet();
+        while (iter.next()) {
+            int c = iter.codepoint;
+            // closeOver()
+            actual.clear().add(c);
+            actual.closeOver(UnicodeSet.SIMPLE_CASE_INSENSITIVE);
+            // From-first-principles implementation.
+            expected.clear().add(c);
+            Collection<Integer> targets = closure.get(c);
+            if (targets != null) {
+                for (Integer t : targets) {
+                    expected.add(t);
+                }
+            }
+            // compare
+            if (checkEqual(expected, actual, "closeOver() vs. test impl")) {
+                continue;
+            }
+            errln(" c=U+" + Utility.hex(c));
+            // Stop after 10 mismatches.
+            if (++errors == 10) {
+                break;
+            }
+        }
+    }
+
+    /**
+     * Checks that an optimization for large sets does not change the closeOver() result:
+     * closing over {c} must equal closing over {c} plus a large range with that
+     * range removed again afterwards.
+     */
+    @Test
+    public void TestCloseOverLargeSets() {
+        // Main Unihan block, used to make the input set large.
+        final int largeStart = 0x4E00;
+        final int largeEnd = 0x9FFF;
+
+        final int[] options = {
+            UnicodeSet.CASE_INSENSITIVE, UnicodeSet.ADD_CASE_MAPPINGS,
+            UnicodeSet.SIMPLE_CASE_INSENSITIVE
+        };
+
+        // Most code points except ones that are boring for case mappings.
+        UnicodeSet manyCp = new UnicodeSet("[^[:C:][:Ideographic:][:Hang:]]");
+
+        UnicodeSet single = new UnicodeSet();
+        for (int option : options) {
+            for (UnicodeSetIterator iter = new UnicodeSetIterator(manyCp); iter.next();) {
+                int c = iter.codepoint;
+                single.clear().add(c);
+
+                // Reference result: close over just the one code point.
+                UnicodeSet plain = (UnicodeSet) single.clone();
+                plain.closeOver(option);
+
+                // Same input padded with the large range, which is removed again afterwards.
+                UnicodeSet padded = (UnicodeSet) single.clone();
+                padded.add(largeStart, largeEnd);
+                padded.closeOver(option);
+                padded.remove(largeStart, largeEnd);
+
+                if (checkEqual(plain, padded, "small != large")) {
+                    continue;
+                }
+                errln(" option=" + option + " c=U+" + Utility.hex(c));
+                break;  // one failure per option is enough
+            }
+        }
     }
@Test
test2.add("a" + (max - i)); // add in reverse order
}
assertNotEquals("compare iterable test", test1, test2);
- TreeSet<CharSequence> sortedTest1 = new TreeSet<CharSequence>(test1);
- TreeSet<CharSequence> sortedTest2 = new TreeSet<CharSequence>(test2);
+ TreeSet<CharSequence> sortedTest1 = new TreeSet<>(test1);
+ TreeSet<CharSequence> sortedTest2 = new TreeSet<>(test2);
assertEquals("compare iterable test", sortedTest1, sortedTest2);
}