unistr_case_locale.o ustrcase_locale.o unistr_titlecase_brkiter.o ustr_titlecase_brkiter.o \
normalizer2impl.o normalizer2.o filterednormalizer2.o normlzr.o unorm.o unormcmp.o loadednormalizer2impl.o \
chariter.o schriter.o uchriter.o uiter.o \
-patternprops.o uchar.o uprops.o ucase.o propname.o ubidi_props.o ubidi.o ubidiwrt.o ubidiln.o ushape.o \
+patternprops.o uchar.o uprops.o ucase.o propname.o ubidi_props.o characterproperties.o \
+ubidi.o ubidiwrt.o ubidiln.o ushape.o \
uscript.o uscript_props.o usc_impl.o unames.o \
utrie.o utrie2.o utrie2_builder.o ucptrie.o umutablecptrie.o \
bmpset.o unisetspan.o uset_props.o uniset_props.o uniset_closure.o uset.o uniset.o usetiter.o ruleiter.o caniter.o unifilt.o unifunct.o \
--- /dev/null
+// © 2018 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+// characterproperties.cpp
+// created: 2018sep03 Markus W. Scherer
+
+#include "unicode/utypes.h"
+#include "unicode/localpointer.h"
+#include "unicode/uchar.h"
+#include "unicode/ucpmap.h"
+#include "unicode/ucptrie.h"
+#include "unicode/umutablecptrie.h"
+#include "unicode/uniset.h"
+#include "unicode/uscript.h"
+#include "unicode/uset.h"
+#include "cmemory.h"
+#include "mutex.h"
+#include "normalizer2impl.h"
+#include "uassert.h"
+#include "ubidi_props.h"
+#include "ucase.h"
+#include "ucln_cmn.h"
+#include "umutex.h"
+#include "uprops.h"
+
+using icu::UInitOnce;
+using icu::UnicodeSet;
+
+namespace {
+
+U_CDECL_BEGIN
+
+UBool U_CALLCONV characterproperties_cleanup();
+
+struct Inclusion { // one lazily built inclusion set, one entry per UPropertySource
+ UnicodeSet *fSet; // allocated by initInclusion(); deleted and nulled by characterproperties_cleanup()
+ UInitOnce fInitOnce; // passed to umtx_initOnce() in getInclusionsForSource(); reset() in cleanup
+};
+Inclusion gInclusions[UPROPS_SRC_COUNT]; // cached inclusion sets, indexed by UPropertySource
+
+UnicodeSet *sets[UCHAR_BINARY_LIMIT] = {}; // cache filled by u_getBinaryPropertySet(), indexed by property
+
+UCPMap *maps[UCHAR_INT_LIMIT - UCHAR_INT_START] = {}; // cache filled by u_getIntPropertyMap(), indexed by property - UCHAR_INT_START
+
+UMutex cpMutex = U_MUTEX_INITIALIZER; // locked by u_getBinaryPropertySet()/u_getIntPropertyMap() around sets[]/maps[] access
+
+//----------------------------------------------------------------
+// Inclusions list
+//----------------------------------------------------------------
+
+// USetAdder implementation
+// Does not use uset.h to reduce code dependencies
+void U_CALLCONV
+_set_add(USet *set, UChar32 c) {
+    // USetAdder callback: the opaque USet handle is a UnicodeSet; add one code point.
+    UnicodeSet *target = reinterpret_cast<UnicodeSet *>(set);
+    target->add(c);
+}
+
+void U_CALLCONV
+_set_addRange(USet *set, UChar32 start, UChar32 end) {
+    // USetAdder callback: add the code point range start..end to the underlying UnicodeSet.
+    UnicodeSet *target = reinterpret_cast<UnicodeSet *>(set);
+    target->add(start, end);
+}
+
+void U_CALLCONV
+_set_addString(USet *set, const UChar *str, int32_t length) {
+    // USetAdder callback: add a string to the underlying UnicodeSet.
+    // The first constructor argument flags the text as NUL-terminated when length is negative.
+    const UBool isTerminated = (UBool)(length<0);
+    UnicodeSet *target = reinterpret_cast<UnicodeSet *>(set);
+    target->add(icu::UnicodeString(isTerminated, str, length));
+}
+
+UBool U_CALLCONV characterproperties_cleanup() {
+    // Library cleanup hook (registered by initInclusion()): frees all cached
+    // property data in this file and returns the caches to their initial state.
+
+    // Inclusion sets: delete each set and re-arm its init-once flag.
+    for (int32_t src = 0; src < UPRV_LENGTHOF(gInclusions); ++src) {
+        Inclusion &entry = gInclusions[src];
+        delete entry.fSet;
+        entry.fSet = nullptr;
+        entry.fInitOnce.reset();
+    }
+    // Binary-property sets.
+    for (UnicodeSet *&cachedSet : sets) {
+        delete cachedSet;
+        cachedSet = nullptr;
+    }
+    // Int-property maps; they are closed as UCPTrie objects.
+    for (UCPMap *&cachedMap : maps) {
+        ucptrie_close(reinterpret_cast<UCPTrie *>(cachedMap));
+        cachedMap = nullptr;
+    }
+    return TRUE;
+}
+
+U_CDECL_END
+
+} // namespace
+
+U_NAMESPACE_BEGIN
+
+/*
+Reduce excessive reallocation, and make it easier to detect initialization problems.
+Usually you don't see smaller sets than this for Unicode 5.0.
+CharacterProperties::initInclusion() passes this value to UnicodeSet::ensureCapacity()
+before it adds the property starts.
+*/
+constexpr int32_t DEFAULT_INCLUSION_CAPACITY = 3072;
+
+void U_CALLCONV CharacterProperties::initInclusion(UPropertySource src, UErrorCode &errorCode) {
+    // This function is invoked only via umtx_initOnce().
+    // This function is a friend of class UnicodeSet.
+    //
+    // Builds gInclusions[src].fSet: the set of "property start" code points
+    // contributed by the data source(s) behind src.
+    // On any failure the slot is left as nullptr and errorCode is set:
+    // - U_INTERNAL_PROGRAM_ERROR for UPROPS_SRC_NONE or an unhandled source
+    // - U_MEMORY_ALLOCATION_ERROR if the set cannot be allocated or grown
+    // - whatever error the property-start enumerators report
+
+    U_ASSERT(0 <= src && src < UPROPS_SRC_COUNT);
+    if (src == UPROPS_SRC_NONE) {
+        errorCode = U_INTERNAL_PROGRAM_ERROR;
+        return;
+    }
+    UnicodeSet * &incl = gInclusions[src].fSet;
+    U_ASSERT(incl == nullptr);
+
+    incl = new UnicodeSet();
+    if (incl == nullptr) {
+        errorCode = U_MEMORY_ALLOCATION_ERROR;
+        return;
+    }
+    // Callback table through which the C-level enumerators write into incl.
+    USetAdder sa = {
+        (USet *)incl,
+        _set_add,
+        _set_addRange,
+        _set_addString,
+        nullptr, // don't need remove()
+        nullptr // don't need removeRange()
+    };
+
+    incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, errorCode);
+    switch(src) {
+    case UPROPS_SRC_CHAR:
+        uchar_addPropertyStarts(&sa, &errorCode);
+        break;
+    case UPROPS_SRC_PROPSVEC:
+        upropsvec_addPropertyStarts(&sa, &errorCode);
+        break;
+    case UPROPS_SRC_CHAR_AND_PROPSVEC:
+        uchar_addPropertyStarts(&sa, &errorCode);
+        upropsvec_addPropertyStarts(&sa, &errorCode);
+        break;
+#if !UCONFIG_NO_NORMALIZATION
+    case UPROPS_SRC_CASE_AND_NORM: {
+        const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
+        if(U_SUCCESS(errorCode)) {
+            impl->addPropertyStarts(&sa, errorCode);
+        }
+        ucase_addPropertyStarts(&sa, &errorCode);
+        break;
+    }
+    case UPROPS_SRC_NFC: {
+        const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
+        if(U_SUCCESS(errorCode)) {
+            impl->addPropertyStarts(&sa, errorCode);
+        }
+        break;
+    }
+    case UPROPS_SRC_NFKC: {
+        const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(errorCode);
+        if(U_SUCCESS(errorCode)) {
+            impl->addPropertyStarts(&sa, errorCode);
+        }
+        break;
+    }
+    case UPROPS_SRC_NFKC_CF: {
+        const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(errorCode);
+        if(U_SUCCESS(errorCode)) {
+            impl->addPropertyStarts(&sa, errorCode);
+        }
+        break;
+    }
+    case UPROPS_SRC_NFC_CANON_ITER: {
+        const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
+        if(U_SUCCESS(errorCode)) {
+            impl->addCanonIterPropertyStarts(&sa, errorCode);
+        }
+        break;
+    }
+#endif
+    case UPROPS_SRC_CASE:
+        ucase_addPropertyStarts(&sa, &errorCode);
+        break;
+    case UPROPS_SRC_BIDI:
+        ubidi_addPropertyStarts(&sa, &errorCode);
+        break;
+    case UPROPS_SRC_INPC:
+    case UPROPS_SRC_INSC:
+    case UPROPS_SRC_VO:
+        uprops_addPropertyStarts(src, &sa, &errorCode);
+        break;
+    default:
+        errorCode = U_INTERNAL_PROGRAM_ERROR;
+        break;
+    }
+
+    // The USetAdder callbacks cannot report errors; a failed add() only marks
+    // the set as bogus. Do not cache such a set as if it were complete.
+    if (U_SUCCESS(errorCode) && incl->isBogus()) {
+        errorCode = U_MEMORY_ALLOCATION_ERROR;
+    }
+    if (U_FAILURE(errorCode)) {
+        delete incl;
+        incl = nullptr;
+        return;
+    }
+    // Compact for caching
+    incl->compact();
+    ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
+}
+
+const UnicodeSet *getInclusionsForSource(UPropertySource src, UErrorCode &errorCode) {
+    // Returns the cached inclusion set for src, building it on first use.
+    // Sets U_ILLEGAL_ARGUMENT_ERROR for a source outside 0..UPROPS_SRC_COUNT-1.
+    if (U_FAILURE(errorCode)) {
+        return nullptr;
+    }
+    if (!(0 <= src && src < UPROPS_SRC_COUNT)) {
+        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+        return nullptr;
+    }
+    Inclusion &cached = gInclusions[src];
+    umtx_initOnce(cached.fInitOnce, &CharacterProperties::initInclusion, src, errorCode);
+    return cached.fSet;
+}
+
+const UnicodeSet *CharacterProperties::getInclusionsForProperty(
+        UProperty prop, UErrorCode &errorCode) {
+    // Maps the property to its data source and returns that source's inclusion set.
+    if (U_SUCCESS(errorCode)) {
+        return getInclusionsForSource(uprops_getSource(prop), errorCode);
+    }
+    return nullptr;
+}
+
+U_NAMESPACE_END
+
+namespace {
+
+// Builds a new, frozen UnicodeSet containing every code point for which
+// u_hasBinaryProperty(c, property) is true. The caller owns the result.
+// Returns nullptr and sets errorCode on failure
+// (U_MEMORY_ALLOCATION_ERROR if the set cannot be allocated or grown).
+UnicodeSet *makeSet(UProperty property, UErrorCode &errorCode) {
+    if (U_FAILURE(errorCode)) { return nullptr; }
+    icu::LocalPointer<UnicodeSet> set(new UnicodeSet());
+    if (set.isNull()) {
+        errorCode = U_MEMORY_ALLOCATION_ERROR;
+        return nullptr;
+    }
+    const UnicodeSet *inclusions =
+        icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
+    if (U_FAILURE(errorCode)) { return nullptr; }
+    int32_t numRanges = inclusions->getRangeCount();
+    // Start of the current run of code points that have the property, or -1 if none is open.
+    UChar32 startHasProperty = -1;
+
+    for (int32_t i = 0; i < numRanges; ++i) {
+        UChar32 rangeEnd = inclusions->getRangeEnd(i);
+        for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
+            // TODO: Get a UCharacterProperty.BinaryProperty to avoid the property dispatch.
+            if (u_hasBinaryProperty(c, property)) {
+                if (startHasProperty < 0) {
+                    // Transition from false to true.
+                    startHasProperty = c;
+                }
+            } else if (startHasProperty >= 0) {
+                // Transition from true to false.
+                set->add(startHasProperty, c - 1);
+                startHasProperty = -1;
+            }
+        }
+    }
+    if (startHasProperty >= 0) {
+        // A run that is still open extends to the end of the code point space.
+        set->add(startHasProperty, 0x10FFFF);
+    }
+    // add() reports allocation failure only by making the set bogus;
+    // never freeze and hand out an incomplete set.
+    if (set->isBogus()) {
+        errorCode = U_MEMORY_ALLOCATION_ERROR;
+        return nullptr;
+    }
+    set->freeze();
+    return set.orphan();
+}
+
+// Builds an immutable code point map holding u_getIntPropertyValue(c, property)
+// for all code points. The caller owns the result.
+// Returns nullptr and sets errorCode on failure.
+UCPMap *makeMap(UProperty property, UErrorCode &errorCode) {
+    if (U_FAILURE(errorCode)) { return nullptr; }
+    // The trie's initial/error value; ranges with this value need not be stored.
+    uint32_t nullValue = property == UCHAR_SCRIPT ? USCRIPT_UNKNOWN : 0;
+    icu::LocalUMutableCPTriePointer mutableTrie(
+        umutablecptrie_open(nullValue, nullValue, &errorCode));
+    const UnicodeSet *inclusions =
+        icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
+    if (U_FAILURE(errorCode)) { return nullptr; }
+    int32_t numRanges = inclusions->getRangeCount();
+    // start..(c-1) is the current run of code points sharing 'value'.
+    UChar32 start = 0;
+    uint32_t value = nullValue;
+
+    for (int32_t i = 0; i < numRanges; ++i) {
+        UChar32 rangeEnd = inclusions->getRangeEnd(i);
+        for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
+            // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
+            uint32_t nextValue = u_getIntPropertyValue(c, property);
+            if (value != nextValue) {
+                if (value != nullValue) {
+                    umutablecptrie_setRange(mutableTrie.getAlias(), start, c - 1, value, &errorCode);
+                }
+                start = c;
+                value = nextValue;
+            }
+        }
+    }
+    // Flush the final run. Compare against nullValue (not 0), as in the loop above:
+    // when nullValue is nonzero, a trailing run with value 0 must still be stored,
+    // otherwise it would read back as nullValue.
+    if (value != nullValue) {
+        umutablecptrie_setRange(mutableTrie.getAlias(), start, 0x10FFFF, value, &errorCode);
+    }
+
+    // These two properties get the fast trie type; all others the small type.
+    UCPTrieType type;
+    if (property == UCHAR_BIDI_CLASS || property == UCHAR_GENERAL_CATEGORY) {
+        type = UCPTRIE_TYPE_FAST;
+    } else {
+        type = UCPTRIE_TYPE_SMALL;
+    }
+    // Pick the narrowest value width that can hold the property's maximum value.
+    UCPTrieValueWidth valueWidth;
+    // TODO: UCharacterProperty.IntProperty
+    int32_t max = u_getIntPropertyMaxValue(property);
+    if (max <= 0xff) {
+        valueWidth = UCPTRIE_VALUE_BITS_8;
+    } else if (max <= 0xffff) {
+        valueWidth = UCPTRIE_VALUE_BITS_16;
+    } else {
+        valueWidth = UCPTRIE_VALUE_BITS_32;
+    }
+    return reinterpret_cast<UCPMap *>(
+        umutablecptrie_buildImmutable(mutableTrie.getAlias(), type, valueWidth, &errorCode));
+}
+
+} // namespace
+
+U_NAMESPACE_USE
+
+U_CAPI const USet * U_EXPORT2
+u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode) {
+    if (U_FAILURE(*pErrorCode)) {
+        return nullptr;
+    }
+    if (!(0 <= property && property < UCHAR_BINARY_LIMIT)) {
+        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
+        return nullptr;
+    }
+    // Look up the cached set, building it on first request, while holding cpMutex.
+    Mutex lock(&cpMutex);
+    UnicodeSet *&slot = sets[property];
+    if (slot == nullptr) {
+        slot = makeSet(property, *pErrorCode);
+    }
+    return U_SUCCESS(*pErrorCode) ? slot->toUSet() : nullptr;
+}
+
+U_CAPI const UCPMap * U_EXPORT2
+u_getIntPropertyMap(UProperty property, UErrorCode *pErrorCode) {
+    if (U_FAILURE(*pErrorCode)) {
+        return nullptr;
+    }
+    if (!(UCHAR_INT_START <= property && property < UCHAR_INT_LIMIT)) {
+        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
+        return nullptr;
+    }
+    // Look up the cached map, building it on first request, while holding cpMutex.
+    Mutex lock(&cpMutex);
+    UCPMap *&slot = maps[property - UCHAR_INT_START];
+    if (slot == nullptr) {
+        slot = makeMap(property, *pErrorCode);
+    }
+    return slot;
+}
<ClCompile Include="ruleiter.cpp" />
<ClCompile Include="ucase.cpp" />
<ClCompile Include="uchar.cpp" />
+ <ClCompile Include="characterproperties.cpp" />
<ClCompile Include="unames.cpp" />
<ClCompile Include="unifiedcache.cpp" />
<ClCompile Include="unifilt.cpp" />
<ClCompile Include="bmpset.cpp">
<Filter>properties &amp; sets</Filter>
</ClCompile>
+ <ClCompile Include="characterproperties.cpp">
+ <Filter>properties &amp; sets</Filter>
+ </ClCompile>
<ClCompile Include="propname.cpp">
<Filter>properties &amp; sets</Filter>
</ClCompile>
<ClCompile Include="ruleiter.cpp" />
<ClCompile Include="ucase.cpp" />
<ClCompile Include="uchar.cpp" />
+ <ClCompile Include="characterproperties.cpp" />
<ClCompile Include="unames.cpp" />
<ClCompile Include="unifiedcache.cpp" />
<ClCompile Include="unifilt.cpp" />
// private mutex where possible.
// For example:
-//
-// UMutex myMutex;
-//
+//
+// UMutex myMutex = U_MUTEX_INITIALIZER;
+//
// void Function(int arg1, int arg2)
// {
// static Object* foo; // Shared read-write object
Normalizer2Impl::addLcccChars(UnicodeSet &set) const {
UChar32 start = 0, end;
uint32_t norm16;
- while ((end = ucptrie_getRange(normTrie, start, UCPTRIE_RANGE_FIXED_LEAD_SURROGATES, INERT,
+ while ((end = ucptrie_getRange(normTrie, start, UCPMAP_RANGE_FIXED_LEAD_SURROGATES, INERT,
nullptr, nullptr, &norm16)) >= 0) {
if (norm16 > Normalizer2Impl::MIN_NORMAL_MAYBE_YES &&
norm16 != Normalizer2Impl::JAMO_VT) {
// Add the start code point of each same-value range of the trie.
UChar32 start = 0, end;
uint32_t value;
- while ((end = ucptrie_getRange(normTrie, start, UCPTRIE_RANGE_FIXED_LEAD_SURROGATES, INERT,
+ while ((end = ucptrie_getRange(normTrie, start, UCPMAP_RANGE_FIXED_LEAD_SURROGATES, INERT,
nullptr, nullptr, &value)) >= 0) {
sa->add(sa->set, start);
if (start != end && isAlgorithmicNoNo((uint16_t)value) &&
// Currently only used for the SEGMENT_STARTER property.
UChar32 start = 0, end;
uint32_t value;
- while ((end = ucptrie_getRange(fCanonIterData->trie, start, UCPTRIE_RANGE_NORMAL, 0,
+ while ((end = ucptrie_getRange(fCanonIterData->trie, start, UCPMAP_RANGE_NORMAL, 0,
segmentStarterMapper, nullptr, &value)) >= 0) {
sa->add(sa->set, start);
start = end + 1;
UChar32 start = 0, end;
uint32_t value;
while ((end = ucptrie_getRange(impl->normTrie, start,
- UCPTRIE_RANGE_FIXED_LEAD_SURROGATES, Normalizer2Impl::INERT,
+ UCPMAP_RANGE_FIXED_LEAD_SURROGATES, Normalizer2Impl::INERT,
nullptr, nullptr, &value)) >= 0) {
// Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
if (value != Normalizer2Impl::INERT) {
#include "unicode/utf.h"
#include "unicode/utf16.h"
#include "mutex.h"
+#include "udataswp.h"
#include "uset_imp.h"
// When the nfc.nrm data is *not* hardcoded into the common library
UCLN_COMMON_CURRENCY,
UCLN_COMMON_LOADED_NORMALIZER2,
UCLN_COMMON_NORMALIZER2,
+ UCLN_COMMON_CHARACTERPROPERTIES,
UCLN_COMMON_USET,
UCLN_COMMON_UNAMES,
UCLN_COMMON_UPROPS,
constexpr int32_t MAX_UNICODE = 0x10ffff;
inline uint32_t maybeFilterValue(uint32_t value, uint32_t trieNullValue, uint32_t nullValue,
- UCPTrieValueFilter *filter, const void *context) {
+ UCPMapValueFilter *filter, const void *context) {
if (value == trieNullValue) {
value = nullValue;
} else if (filter != nullptr) {
}
UChar32 getRange(const void *t, UChar32 start,
- UCPTrieValueFilter *filter, const void *context, uint32_t *pValue) {
+ UCPMapValueFilter *filter, const void *context, uint32_t *pValue) {
if ((uint32_t)start > MAX_UNICODE) {
return U_SENTINEL;
}
U_CFUNC UChar32
ucptrie_internalGetRange(UCPTrieGetRange *getRange,
const void *trie, UChar32 start,
- UCPTrieRangeOption option, uint32_t surrogateValue,
- UCPTrieValueFilter *filter, const void *context, uint32_t *pValue) {
- if (option == UCPTRIE_RANGE_NORMAL) {
+ UCPMapRangeOption option, uint32_t surrogateValue,
+ UCPMapValueFilter *filter, const void *context, uint32_t *pValue) {
+ if (option == UCPMAP_RANGE_NORMAL) {
return getRange(trie, start, filter, context, pValue);
}
uint32_t value;
// We need to examine the range value even if the caller does not want it.
pValue = &value;
}
- UChar32 surrEnd = option == UCPTRIE_RANGE_FIXED_ALL_SURROGATES ? 0xdfff : 0xdbff;
+ UChar32 surrEnd = option == UCPMAP_RANGE_FIXED_ALL_SURROGATES ? 0xdfff : 0xdbff;
UChar32 end = getRange(trie, start, filter, context, pValue);
if (end < 0xd7ff || start > surrEnd) {
return end;
U_CAPI UChar32 U_EXPORT2
ucptrie_getRange(const UCPTrie *trie, UChar32 start,
- UCPTrieRangeOption option, uint32_t surrogateValue,
- UCPTrieValueFilter *filter, const void *context, uint32_t *pValue) {
+ UCPMapRangeOption option, uint32_t surrogateValue,
+ UCPMapValueFilter *filter, const void *context, uint32_t *pValue) {
return ucptrie_internalGetRange(getRange, trie, start,
option, surrogateValue,
filter, context, pValue);
#endif
} // namespace
+
+// UCPMap ----
+// Initially, this is the same as UCPTrie. This may well change.
+
+U_CAPI uint32_t U_EXPORT2
+ucpmap_get(const UCPMap *map, UChar32 c) {
+    // A UCPMap is currently implemented as a UCPTrie; forward to the trie lookup.
+    const UCPTrie *trie = reinterpret_cast<const UCPTrie *>(map);
+    return ucptrie_get(trie, c);
+}
+
+U_CAPI UChar32 U_EXPORT2
+ucpmap_getRange(const UCPMap *map, UChar32 start,
+                UCPMapRangeOption option, uint32_t surrogateValue,
+                UCPMapValueFilter *filter, const void *context, uint32_t *pValue) {
+    // A UCPMap is currently implemented as a UCPTrie; forward all arguments unchanged.
+    const UCPTrie *trie = reinterpret_cast<const UCPTrie *>(map);
+    return ucptrie_getRange(trie, start, option, surrogateValue, filter, context, pValue);
+}
typedef UChar32
UCPTrieGetRange(const void *trie, UChar32 start,
- UCPTrieValueFilter *filter, const void *context, uint32_t *pValue);
+ UCPMapValueFilter *filter, const void *context, uint32_t *pValue);
U_CFUNC UChar32
ucptrie_internalGetRange(UCPTrieGetRange *getRange,
const void *trie, UChar32 start,
- UCPTrieRangeOption option, uint32_t surrogateValue,
- UCPTrieValueFilter *filter, const void *context, uint32_t *pValue);
+ UCPMapRangeOption option, uint32_t surrogateValue,
+ UCPMapValueFilter *filter, const void *context, uint32_t *pValue);
#ifdef UCPTRIE_DEBUG
U_CFUNC void
MutableCodePointTrie &operator=(const MutableCodePointTrie &other) = delete;
+ static MutableCodePointTrie *fromUCPMap(const UCPMap *map, UErrorCode &errorCode);
static MutableCodePointTrie *fromUCPTrie(const UCPTrie *trie, UErrorCode &errorCode);
uint32_t get(UChar32 c) const;
- int32_t getRange(UChar32 start, UCPTrieValueFilter *filter, const void *context,
+ int32_t getRange(UChar32 start, UCPMapValueFilter *filter, const void *context,
uint32_t *pValue) const;
void set(UChar32 c, uint32_t value, UErrorCode &errorCode);
uprv_free(index16);
}
+MutableCodePointTrie *MutableCodePointTrie::fromUCPMap(const UCPMap *map, UErrorCode &errorCode) {
+    // Use the highValue as the initialValue to reduce the highStart.
+    const uint32_t errorValue = ucpmap_get(map, -1);
+    const uint32_t initialValue = ucpmap_get(map, 0x10ffff);
+    LocalPointer<MutableCodePointTrie> result(
+        new MutableCodePointTrie(initialValue, errorValue, errorCode),
+        errorCode);
+    if (U_FAILURE(errorCode)) {
+        return nullptr;
+    }
+    // Copy every same-value range of the map that differs from the initial value.
+    uint32_t value;
+    UChar32 end;
+    for (UChar32 start = 0;
+            (end = ucpmap_getRange(map, start, UCPMAP_RANGE_NORMAL, 0,
+                                   nullptr, nullptr, &value)) >= 0;
+            start = end + 1) {
+        if (value == initialValue) {
+            continue;
+        }
+        if (start != end) {
+            result->setRange(start, end, value, errorCode);
+        } else {
+            result->set(start, value, errorCode);
+        }
+    }
+    return U_SUCCESS(errorCode) ? result.orphan() : nullptr;
+}
+
MutableCodePointTrie *MutableCodePointTrie::fromUCPTrie(const UCPTrie *trie, UErrorCode &errorCode) {
// Use the highValue as the initialValue to reduce the highStart.
uint32_t errorValue;
}
UChar32 start = 0, end;
uint32_t value;
- while ((end = ucptrie_getRange(trie, start, UCPTRIE_RANGE_NORMAL, 0,
+ while ((end = ucptrie_getRange(trie, start, UCPMAP_RANGE_NORMAL, 0,
nullptr, nullptr, &value)) >= 0) {
if (value != initialValue) {
if (start == end) {
}
inline uint32_t maybeFilterValue(uint32_t value, uint32_t initialValue, uint32_t nullValue,
- UCPTrieValueFilter *filter, const void *context) {
+ UCPMapValueFilter *filter, const void *context) {
if (value == initialValue) {
value = nullValue;
} else if (filter != nullptr) {
}
UChar32 MutableCodePointTrie::getRange(
- UChar32 start, UCPTrieValueFilter *filter, const void *context,
+ UChar32 start, UCPMapValueFilter *filter, const void *context,
uint32_t *pValue) const {
if ((uint32_t)start > MAX_UNICODE) {
return U_SENTINEL;
delete reinterpret_cast<MutableCodePointTrie *>(trie);
}
+U_CAPI UMutableCPTrie * U_EXPORT2
+umutablecptrie_fromUCPMap(const UCPMap *map, UErrorCode *pErrorCode) {
+    // C API wrapper: validate the arguments, then delegate to the C++ factory.
+    if (U_FAILURE(*pErrorCode)) { return nullptr; }
+    if (map == nullptr) {
+        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
+        return nullptr;
+    }
+    MutableCodePointTrie *built = MutableCodePointTrie::fromUCPMap(map, *pErrorCode);
+    return reinterpret_cast<UMutableCPTrie *>(built);
+}
+
U_CAPI UMutableCPTrie * U_EXPORT2
umutablecptrie_fromUCPTrie(const UCPTrie *trie, UErrorCode *pErrorCode) {
if (U_FAILURE(*pErrorCode)) {
namespace {
UChar32 getRange(const void *trie, UChar32 start,
- UCPTrieValueFilter *filter, const void *context, uint32_t *pValue) {
+ UCPMapValueFilter *filter, const void *context, uint32_t *pValue) {
return reinterpret_cast<const MutableCodePointTrie *>(trie)->
getRange(start, filter, context, pValue);
}
U_CAPI UChar32 U_EXPORT2
umutablecptrie_getRange(const UMutableCPTrie *trie, UChar32 start,
- UCPTrieRangeOption option, uint32_t surrogateValue,
- UCPTrieValueFilter *filter, const void *context, uint32_t *pValue) {
+ UCPMapRangeOption option, uint32_t surrogateValue,
+ UCPMapValueFilter *filter, const void *context, uint32_t *pValue) {
return ucptrie_internalGetRange(getRange, trie, start,
option, surrogateValue,
filter, context, pValue);
#include "unicode/utypes.h"
#include "unicode/stringoptions.h"
+#include "unicode/ucpmap.h"
+
+#if !defined(USET_DEFINED) && !defined(U_IN_DOXYGEN)
+
+#define USET_DEFINED
+
+/**
+ * USet is the C API type corresponding to C++ class UnicodeSet.
+ * It is forward-declared here to avoid including unicode/uset.h file if related
+ * APIs are not used.
+ *
+ * @see ucnv_getUnicodeSet
+ * @stable ICU 2.4
+ */
+typedef struct USet USet;
+
+#endif
+
U_CDECL_BEGIN
* "About the Unicode Character Database" (http://www.unicode.org/ucd/)
* and the ICU User Guide chapter on Properties (http://icu-project.org/userguide/properties.html).
*
+ * Many properties are accessible via generic functions that take a UProperty selector.
+ * - u_hasBinaryProperty() returns a binary value (TRUE/FALSE) per property and code point.
+ * - u_getIntPropertyValue() returns an integer value per property and code point.
+ * For each supported enumerated or catalog property, there is
+ * an enum type for all of the property's values, and
+ * u_getIntPropertyValue() returns the numeric values of those constants.
+ * - u_getBinaryPropertySet() returns a set for each ICU-supported binary property with
+ * all code points for which the property is true.
+ * - u_getIntPropertyMap() returns a map for each
+ * ICU-supported enumerated/catalog/int-valued property which
+ * maps all Unicode code points to their values for that property.
+ *
* Many functions are designed to match java.lang.Character functions.
* See the individual function documentation,
* and see the JDK 1.4 java.lang.Character documentation
* does not have data for the property at all, or not for this code point.
*
* @see UProperty
+ * @see u_getBinaryPropertySet
* @see u_getIntPropertyValue
* @see u_getUnicodeVersion
* @stable ICU 2.1
U_STABLE UBool U_EXPORT2
u_hasBinaryProperty(UChar32 c, UProperty which);
+#ifndef U_HIDE_DRAFT_API
+
+/**
+ * Returns a frozen USet for a binary property.
+ * The library retains ownership over the returned object.
+ * Sets an error code if the property number is not one for a binary property.
+ *
+ * The returned set contains all code points for which the property is true.
+ *
+ * @param property UCHAR_BINARY_START..UCHAR_BINARY_LIMIT-1
+ * @param pErrorCode an in/out ICU UErrorCode
+ * @return the property as a set
+ * @see UProperty
+ * @see u_hasBinaryProperty
+ * @see UnicodeSet::fromUSet
+ * @draft ICU 63
+ */
+U_CAPI const USet * U_EXPORT2
+u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode);
+
+#endif // U_HIDE_DRAFT_API
+
/**
* Check if a code point has the Alphabetic Unicode property.
* Same as u_hasBinaryProperty(c, UCHAR_ALPHABETIC).
* @see u_hasBinaryProperty
* @see u_getIntPropertyMinValue
* @see u_getIntPropertyMaxValue
+ * @see u_getIntPropertyMap
* @see u_getUnicodeVersion
* @stable ICU 2.2
*/
U_STABLE int32_t U_EXPORT2
u_getIntPropertyMaxValue(UProperty which);
+#ifndef U_HIDE_DRAFT_API
+
+/**
+ * Returns an immutable UCPMap for an enumerated/catalog/int-valued property.
+ * The library retains ownership over the returned object.
+ * Sets an error code if the property number is not one for an "int property".
+ *
+ * The returned object maps all Unicode code points to their values for that property.
+ * For documentation of the integer values see u_getIntPropertyValue().
+ *
+ * @param property UCHAR_INT_START..UCHAR_INT_LIMIT-1
+ * @param pErrorCode an in/out ICU UErrorCode
+ * @return the property as a map
+ * @see UProperty
+ * @see u_getIntPropertyValue
+ * @draft ICU 63
+ */
+U_CAPI const UCPMap * U_EXPORT2
+u_getIntPropertyMap(UProperty property, UErrorCode *pErrorCode);
+
+#endif // U_HIDE_DRAFT_API
+
/**
* Get the numeric value for a Unicode code point as defined in the
* Unicode Character Database.
#include "unicode/uenum.h"
#include "unicode/localpointer.h"
-#ifndef __USET_H__
+#if !defined(USET_DEFINED) && !defined(U_IN_DOXYGEN)
+
+#define USET_DEFINED
/**
- * USet is the C API type for Unicode sets.
- * It is forward-declared here to avoid including the header file if related
+ * USet is the C API type corresponding to C++ class UnicodeSet.
+ * It is forward-declared here to avoid including unicode/uset.h file if related
* conversion APIs are not used.
- * See unicode/uset.h
*
* @see ucnv_getUnicodeSet
- * @stable ICU 2.6
+ * @stable ICU 2.4
*/
-struct USet;
-/** @stable ICU 2.6 */
typedef struct USet USet;
#endif
--- /dev/null
+// © 2018 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+// ucpmap.h
+// created: 2018sep03 Markus W. Scherer
+
+#ifndef __UCPMAP_H__
+#define __UCPMAP_H__
+
+#include "unicode/utypes.h"
+
+#ifndef U_HIDE_DRAFT_API
+
+U_CDECL_BEGIN
+
+/**
+ * \file
+ *
+ * This file defines an abstract map from Unicode code points to integer values.
+ *
+ * @see UCPMap
+ * @see UCPTrie
+ * @see UMutableCPTrie
+ */
+
+/**
+ * Abstract map from Unicode code points (U+0000..U+10FFFF) to integer values.
+ *
+ * @see UCPTrie
+ * @see UMutableCPTrie
+ * @draft ICU 63
+ */
+typedef struct UCPMap UCPMap;
+
+/**
+ * Selectors for how ucpmap_getRange() etc. should report value ranges overlapping with surrogates.
+ * Most users should use UCPMAP_RANGE_NORMAL.
+ *
+ * @see ucpmap_getRange
+ * @see ucptrie_getRange
+ * @see umutablecptrie_getRange
+ * @draft ICU 63
+ */
+enum UCPMapRangeOption {
+ /**
+ * ucpmap_getRange() enumerates all same-value ranges as stored in the map.
+ * Most users should use this option.
+ */
+ UCPMAP_RANGE_NORMAL,
+ /**
+ * ucpmap_getRange() enumerates all same-value ranges as stored in the map,
+ * except that lead surrogates (U+D800..U+DBFF) are treated as having the
+ * surrogateValue, which is passed to getRange() as a separate parameter.
+ * The surrogateValue is not transformed via filter().
+ * See U_IS_LEAD(c).
+ *
+ * Most users should use UCPMAP_RANGE_NORMAL instead.
+ *
+ * This option is useful for maps that map surrogate code *units* to
+ * special values optimized for UTF-16 string processing
+ * or for special error behavior for unpaired surrogates,
+ * but those values are not to be associated with the lead surrogate code *points*.
+ */
+ UCPMAP_RANGE_FIXED_LEAD_SURROGATES,
+ /**
+ * ucpmap_getRange() enumerates all same-value ranges as stored in the map,
+ * except that all surrogates (U+D800..U+DFFF) are treated as having the
+ * surrogateValue, which is passed to getRange() as a separate parameter.
+ * The surrogateValue is not transformed via filter().
+ * See U_IS_SURROGATE(c).
+ *
+ * Most users should use UCPMAP_RANGE_NORMAL instead.
+ *
+ * This option is useful for maps that map surrogate code *units* to
+ * special values optimized for UTF-16 string processing
+ * or for special error behavior for unpaired surrogates,
+ * but those values are not to be associated with the surrogate code *points*.
+ */
+ UCPMAP_RANGE_FIXED_ALL_SURROGATES
+};
+#ifndef U_IN_DOXYGEN
+typedef enum UCPMapRangeOption UCPMapRangeOption;
+#endif
+
+/**
+ * Returns the value for a code point as stored in the map, with range checking.
+ * Returns an implementation-defined error value if c is not in the range 0..U+10FFFF.
+ *
+ * @param map the map
+ * @param c the code point
+ * @return the map value,
+ * or an implementation-defined error value if the code point is not in the range 0..U+10FFFF
+ * @draft ICU 63
+ */
+U_CAPI uint32_t U_EXPORT2
+ucpmap_get(const UCPMap *map, UChar32 c);
+
+/**
+ * Callback function type: Modifies a map value.
+ * Optionally called by ucpmap_getRange()/ucptrie_getRange()/umutablecptrie_getRange().
+ * The modified value will be returned by the getRange function.
+ *
+ * Can be used to ignore some of the value bits,
+ * make a filter for one of several values,
+ * return a value index computed from the map value, etc.
+ *
+ * @param context an opaque pointer, as passed into the getRange function
+ * @param value a value from the map
+ * @return the modified value
+ * @draft ICU 63
+ */
+typedef uint32_t U_CALLCONV
+UCPMapValueFilter(const void *context, uint32_t value);
+
+/**
+ * Returns the last code point such that all those from start to there have the same value.
+ * Can be used to efficiently iterate over all same-value ranges in a map.
+ * (This is normally faster than iterating over code points and get()ting each value,
+ * but much slower than a data structure that stores ranges directly.)
+ *
+ * If the UCPMapValueFilter function pointer is not NULL, then
+ * the value to be delivered is passed through that function, and the return value is the end
+ * of the range where all values are modified to the same actual value.
+ * The value is unchanged if that function pointer is NULL.
+ *
+ * Example:
+ * \code
+ * UChar32 start = 0, end;
+ * uint32_t value;
+ * while ((end = ucpmap_getRange(map, start, UCPMAP_RANGE_NORMAL, 0,
+ * NULL, NULL, &value)) >= 0) {
+ * // Work with the range start..end and its value.
+ * start = end + 1;
+ * }
+ * \endcode
+ *
+ * @param map the map
+ * @param start range start
+ * @param option defines whether surrogates are treated normally,
+ * or as having the surrogateValue; usually UCPMAP_RANGE_NORMAL
+ * @param surrogateValue value for surrogates; ignored if option==UCPMAP_RANGE_NORMAL
+ * @param filter a pointer to a function that may modify the map data value,
+ * or NULL if the values from the map are to be used unmodified
+ * @param context an opaque pointer that is passed on to the filter function
+ * @param pValue if not NULL, receives the value that every code point start..end has;
+ * may have been modified by filter(context, map value)
+ * if that function pointer is not NULL
+ * @return the range end code point, or -1 if start is not a valid code point
+ * @draft ICU 63
+ */
+U_CAPI UChar32 U_EXPORT2
+ucpmap_getRange(const UCPMap *map, UChar32 start,
+ UCPMapRangeOption option, uint32_t surrogateValue,
+ UCPMapValueFilter *filter, const void *context, uint32_t *pValue);
+
+U_CDECL_END
+
+#endif // U_HIDE_DRAFT_API
+#endif
#define __UCPTRIE_H__
#include "unicode/utypes.h"
+
+#ifndef U_HIDE_DRAFT_API
+
#include "unicode/localpointer.h"
+#include "unicode/ucpmap.h"
#include "unicode/utf8.h"
-#include "putilimp.h"
-#include "udataswp.h"
U_CDECL_BEGIN
typedef enum UCPTrieValueWidth UCPTrieValueWidth;
#endif
-/**
- * Selectors for how ucptrie_getRange() should report value ranges overlapping with surrogates.
- * Most users should use UCPTRIE_RANGE_NORMAL.
- *
- * @see ucptrie_getRange
- * @draft ICU 63
- */
-enum UCPTrieRangeOption {
- /**
- * ucptrie_getRange() enumerates all same-value ranges as stored in the trie.
- * Most users should use this option.
- */
- UCPTRIE_RANGE_NORMAL,
- /**
- * ucptrie_getRange() enumerates all same-value ranges as stored in the trie,
- * except that lead surrogates (U+D800..U+DBFF) are treated as having the
- * surrogateValue, which is passed to getRange() as a separate parameter.
- * The surrogateValue is not transformed via filter().
- * See U_IS_LEAD(c).
- *
- * Most users should use UCPTRIE_RANGE_NORMAL instead.
- *
- * This option is useful for tries that map surrogate code *units* to
- * special values optimized for UTF-16 string processing
- * or for special error behavior for unpaired surrogates,
- * but those values are not to be associated with the lead surrogate code *points*.
- */
- UCPTRIE_RANGE_FIXED_LEAD_SURROGATES,
- /**
- * ucptrie_getRange() enumerates all same-value ranges as stored in the trie,
- * except that all surrogates (U+D800..U+DFFF) are treated as having the
- * surrogateValue, which is passed to getRange() as a separate parameter.
- * The surrogateValue is not transformed via filter().
- * See U_IS_SURROGATE(c).
- *
- * Most users should use UCPTRIE_RANGE_NORMAL instead.
- *
- * This option is useful for tries that map surrogate code *units* to
- * special values optimized for UTF-16 string processing
- * or for special error behavior for unpaired surrogates,
- * but those values are not to be associated with the lead surrogate code *points*.
- */
- UCPTRIE_RANGE_FIXED_ALL_SURROGATES
-};
-#ifndef U_IN_DOXYGEN
-typedef enum UCPTrieRangeOption UCPTrieRangeOption;
-#endif
-
/**
* Opens a trie from its binary form, stored in 32-bit-aligned memory.
* Inverse of ucptrie_toBinary().
U_CAPI uint32_t U_EXPORT2
ucptrie_get(const UCPTrie *trie, UChar32 c);
-/**
- * Callback function type: Modifies a trie value.
- * Optionally called by ucptrie_getRange() or umutablecptrie_getRange().
- * The modified value will be returned by the getRange function.
- *
- * Can be used to ignore some of the value bits,
- * make a filter for one of several values,
- * return a value index computed from the trie value, etc.
- *
- * @param context an opaque pointer, as passed into the getRange function
- * @param value a value from the trie
- * @return the modified value
- * @draft ICU 63
- */
-typedef uint32_t U_CALLCONV
-UCPTrieValueFilter(const void *context, uint32_t value);
-
/**
* Returns the last code point such that all those from start to there have the same value.
* Can be used to efficiently iterate over all same-value ranges in a trie.
* (This is normally faster than iterating over code points and get()ting each value,
* but much slower than a data structure that stores ranges directly.)
*
- * If the UCPTrieValueFilter function pointer is not NULL, then
+ * If the UCPMapValueFilter function pointer is not NULL, then
* the value to be delivered is passed through that function, and the return value is the end
* of the range where all values are modified to the same actual value.
* The value is unchanged if that function pointer is NULL.
* \code
* UChar32 start = 0, end;
* uint32_t value;
- * while ((end = ucptrie_getRange(trie, start, UCPTRIE_RANGE_NORMAL, 0,
+ * while ((end = ucptrie_getRange(trie, start, UCPMAP_RANGE_NORMAL, 0,
* NULL, NULL, &value)) >= 0) {
* // Work with the range start..end and its value.
* start = end + 1;
* @param trie the trie
* @param start range start
* @param option defines whether surrogates are treated normally,
- * or as having the surrogateValue; usually UCPTRIE_RANGE_NORMAL
- * @param surrogateValue value for surrogates; ignored if option==UCPTRIE_RANGE_NORMAL
+ * or as having the surrogateValue; usually UCPMAP_RANGE_NORMAL
+ * @param surrogateValue value for surrogates; ignored if option==UCPMAP_RANGE_NORMAL
* @param filter a pointer to a function that may modify the trie data value,
* or NULL if the values from the trie are to be used unmodified
* @param context an opaque pointer that is passed on to the filter function
*/
U_CAPI UChar32 U_EXPORT2
ucptrie_getRange(const UCPTrie *trie, UChar32 start,
- UCPTrieRangeOption option, uint32_t surrogateValue,
- UCPTrieValueFilter *filter, const void *context, uint32_t *pValue);
+ UCPMapRangeOption option, uint32_t surrogateValue,
+ UCPMapValueFilter *filter, const void *context, uint32_t *pValue);
/**
* Writes a memory-mappable form of the trie into 32-bit aligned memory.
U_CDECL_END
#endif // U_IN_DOXYGEN
+#endif // U_HIDE_DRAFT_API
#endif
#define __UMUTABLECPTRIE_H__
#include "unicode/utypes.h"
+
+#ifndef U_HIDE_DRAFT_API
+
#include "unicode/localpointer.h"
+#include "unicode/ucpmap.h"
#include "unicode/ucptrie.h"
#include "unicode/utf8.h"
-#include "putilimp.h"
-#include "udataswp.h"
U_CDECL_BEGIN
#endif
+/**
+ * Creates a mutable trie with the same contents as the UCPMap.
+ * You must umutablecptrie_close() the mutable trie once you are done using it.
+ *
+ * @param map the source map
+ * @param pErrorCode an in/out ICU UErrorCode
+ * @return the mutable trie
+ * @draft ICU 63
+ */
+U_CAPI UMutableCPTrie * U_EXPORT2
+umutablecptrie_fromUCPMap(const UCPMap *map, UErrorCode *pErrorCode);
+
/**
* Creates a mutable trie with the same contents as the immutable one.
* You must umutablecptrie_close() the mutable trie once you are done using it.
*
* The trie can be modified between calls to this function.
*
- * If the UCPTrieValueFilter function pointer is not NULL, then
+ * If the UCPMapValueFilter function pointer is not NULL, then
* the value to be delivered is passed through that function, and the return value is the end
* of the range where all values are modified to the same actual value.
* The value is unchanged if that function pointer is NULL.
* @param trie the trie
* @param start range start
* @param option defines whether surrogates are treated normally,
- * or as having the surrogateValue; usually UCPTRIE_RANGE_NORMAL
- * @param surrogateValue value for surrogates; ignored if option==UCPTRIE_RANGE_NORMAL
+ * or as having the surrogateValue; usually UCPMAP_RANGE_NORMAL
+ * @param surrogateValue value for surrogates; ignored if option==UCPMAP_RANGE_NORMAL
* @param filter a pointer to a function that may modify the trie data value,
* or NULL if the values from the trie are to be used unmodified
* @param context an opaque pointer that is passed on to the filter function
*/
U_CAPI UChar32 U_EXPORT2
umutablecptrie_getRange(const UMutableCPTrie *trie, UChar32 start,
- UCPTrieRangeOption option, uint32_t surrogateValue,
- UCPTrieValueFilter *filter, const void *context, uint32_t *pValue);
+ UCPMapRangeOption option, uint32_t surrogateValue,
+ UCPMapValueFilter *filter, const void *context, uint32_t *pValue);
/**
* Sets a value for a code point.
U_CDECL_END
+#endif // U_HIDE_DRAFT_API
#endif
#ifndef UNICODESET_H
#define UNICODESET_H
+#include "unicode/ucpmap.h"
#include "unicode/unifilt.h"
#include "unicode/unistr.h"
#include "unicode/uset.h"
U_NAMESPACE_BEGIN
// Forward Declarations.
-void U_CALLCONV UnicodeSet_initInclusion(int32_t src, UErrorCode &status); /**< @internal */
-
class BMPSet;
+class CharacterProperties;
class ParsePosition;
class RBBIRuleScanner;
class SymbolTable;
//----------------------------------------------------------------
/**
- * Make this object represent the range <code>start - end</code>.
- * If <code>end > start</code> then this object is set to an
- * an empty range.
+ * Make this object represent the range `start - end`.
+ * If `start > end` then this object is set to an empty range.
* A frozen set will not be modified.
*
* @param start first character in the set, inclusive
//----------------------------------------------------------------
UnicodeSet(const UnicodeSet& o, UBool /* asThawed */);
+ UnicodeSet& copyFrom(const UnicodeSet& o, UBool asThawed);
//----------------------------------------------------------------
// Implementation: Pattern parsing
UnicodeString& rebuiltPat,
UErrorCode& ec);
- friend void U_CALLCONV UnicodeSet_initInclusion(int32_t src, UErrorCode &status);
+ friend class CharacterProperties;
static const UnicodeSet* getInclusions(int32_t src, UErrorCode &status);
/**
*/
void applyFilter(Filter filter,
void* context,
- int32_t src,
+ const UnicodeSet* inclusions,
UErrorCode &status);
+ void applyIntPropertyValue(const UCPMap *map,
+ UCPMapValueFilter *filter, const void *context,
+ UErrorCode &errorCode);
+
/**
* Set the new pattern to cache.
*/
#include "unicode/uchar.h"
#include "unicode/localpointer.h"
-#ifndef UCNV_H
-struct USet;
+#ifndef USET_DEFINED
+
+#ifndef U_IN_DOXYGEN
+#define USET_DEFINED
+#endif
/**
- * A UnicodeSet. Use the uset_* API to manipulate. Create with
+ * USet is the C API type corresponding to C++ class UnicodeSet.
+ * Use the uset_* API to manipulate. Create with
* uset_open*, and destroy with uset_close.
* @stable ICU 2.4
*/
* Assigns this object to be a copy of another.
*/
UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
+ return copyFrom(o, FALSE);
+}
+
+UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) {
if (this == &o) {
return *this;
}
}
len = o.len;
uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
- if (o.bmpSet == NULL) {
+ if (o.bmpSet == NULL || asThawed) {
bmpSet = NULL;
} else {
bmpSet = new BMPSet(*o.bmpSet, list, len);
setToBogus();
return *this;
}
- if (o.stringSpan == NULL) {
+ if (o.stringSpan == NULL || asThawed) {
stringSpan = NULL;
} else {
stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings);
#include "uprops.h"
#include "propname.h"
#include "normalizer2impl.h"
-#include "ucase.h"
-#include "ubidi_props.h"
#include "uinvchar.h"
#include "uprops.h"
#include "charstr.h"
U_CDECL_BEGIN
static UBool U_CALLCONV uset_cleanup();
-struct Inclusion {
- UnicodeSet *fSet;
- UInitOnce fInitOnce;
-};
-static Inclusion gInclusions[UPROPS_SRC_COUNT]; // cached getInclusions()
-
static UnicodeSet *uni32Singleton;
static icu::UInitOnce uni32InitOnce = U_INITONCE_INITIALIZER;
-//----------------------------------------------------------------
-// Inclusions list
-//----------------------------------------------------------------
-
-// USetAdder implementation
-// Does not use uset.h to reduce code dependencies
-static void U_CALLCONV
-_set_add(USet *set, UChar32 c) {
- ((UnicodeSet *)set)->add(c);
-}
-
-static void U_CALLCONV
-_set_addRange(USet *set, UChar32 start, UChar32 end) {
- ((UnicodeSet *)set)->add(start, end);
-}
-
-static void U_CALLCONV
-_set_addString(USet *set, const UChar *str, int32_t length) {
- ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length));
-}
-
/**
* Cleanup function for UnicodeSet
*/
static UBool U_CALLCONV uset_cleanup(void) {
- for(int32_t i = UPROPS_SRC_NONE; i < UPROPS_SRC_COUNT; ++i) {
- Inclusion &in = gInclusions[i];
- delete in.fSet;
- in.fSet = NULL;
- in.fInitOnce.reset();
- }
-
delete uni32Singleton;
uni32Singleton = NULL;
uni32InitOnce.reset();
U_NAMESPACE_BEGIN
-/*
-Reduce excessive reallocation, and make it easier to detect initialization problems.
-Usually you don't see smaller sets than this for Unicode 5.0.
-*/
-#define DEFAULT_INCLUSION_CAPACITY 3072
-
-void U_CALLCONV UnicodeSet_initInclusion(int32_t src, UErrorCode &status) {
- // This function is invoked only via umtx_initOnce().
- // This function is a friend of class UnicodeSet.
-
- U_ASSERT(src >=0 && src<UPROPS_SRC_COUNT);
- UnicodeSet * &incl = gInclusions[src].fSet;
- U_ASSERT(incl == NULL);
-
- incl = new UnicodeSet();
- if (incl == NULL) {
- status = U_MEMORY_ALLOCATION_ERROR;
- return;
- }
- USetAdder sa = {
- (USet *)incl,
- _set_add,
- _set_addRange,
- _set_addString,
- NULL, // don't need remove()
- NULL // don't need removeRange()
- };
-
- incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, status);
- switch(src) {
- case UPROPS_SRC_CHAR:
- uchar_addPropertyStarts(&sa, &status);
- break;
- case UPROPS_SRC_PROPSVEC:
- upropsvec_addPropertyStarts(&sa, &status);
- break;
- case UPROPS_SRC_CHAR_AND_PROPSVEC:
- uchar_addPropertyStarts(&sa, &status);
- upropsvec_addPropertyStarts(&sa, &status);
- break;
-#if !UCONFIG_NO_NORMALIZATION
- case UPROPS_SRC_CASE_AND_NORM: {
- const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status);
- if(U_SUCCESS(status)) {
- impl->addPropertyStarts(&sa, status);
- }
- ucase_addPropertyStarts(&sa, &status);
- break;
- }
- case UPROPS_SRC_NFC: {
- const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status);
- if(U_SUCCESS(status)) {
- impl->addPropertyStarts(&sa, status);
- }
- break;
- }
- case UPROPS_SRC_NFKC: {
- const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(status);
- if(U_SUCCESS(status)) {
- impl->addPropertyStarts(&sa, status);
- }
- break;
- }
- case UPROPS_SRC_NFKC_CF: {
- const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(status);
- if(U_SUCCESS(status)) {
- impl->addPropertyStarts(&sa, status);
- }
- break;
- }
- case UPROPS_SRC_NFC_CANON_ITER: {
- const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status);
- if(U_SUCCESS(status)) {
- impl->addCanonIterPropertyStarts(&sa, status);
- }
- break;
- }
-#endif
- case UPROPS_SRC_CASE:
- ucase_addPropertyStarts(&sa, &status);
- break;
- case UPROPS_SRC_BIDI:
- ubidi_addPropertyStarts(&sa, &status);
- break;
- case UPROPS_SRC_INPC:
- case UPROPS_SRC_INSC:
- case UPROPS_SRC_VO:
- uprops_addPropertyStarts((UPropertySource)src, &sa, &status);
- break;
- default:
- status = U_INTERNAL_PROGRAM_ERROR;
- break;
- }
-
- if (U_FAILURE(status)) {
- delete incl;
- incl = NULL;
- return;
- }
- // Compact for caching
- incl->compact();
- ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
-}
-
-
-
-const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) {
- U_ASSERT(src >=0 && src<UPROPS_SRC_COUNT);
- Inclusion &i = gInclusions[src];
- umtx_initOnce(i.fInitOnce, &UnicodeSet_initInclusion, src, status);
- return i.fSet;
-}
-
namespace {
// Cache some sets for other services -------------------------------------- ***
return u_getNumericValue(ch) == *(double*)context;
}
-static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
- int32_t value = *(int32_t*)context;
- return (U_GET_GC_MASK((UChar32) ch) & value) != 0;
-}
-
static UBool versionFilter(UChar32 ch, void* context) {
static const UVersionInfo none = { 0, 0, 0, 0 };
UVersionInfo v;
return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
}
-typedef struct {
- UProperty prop;
- int32_t value;
-} IntPropertyContext;
-
-static UBool intPropertyFilter(UChar32 ch, void* context) {
- IntPropertyContext* c = (IntPropertyContext*)context;
- return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;
-}
-
static UBool scriptExtensionsFilter(UChar32 ch, void* context) {
return uscript_hasScript(ch, *(UScriptCode*)context);
}
*/
void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
void* context,
- int32_t src,
+ const UnicodeSet* inclusions,
UErrorCode &status) {
if (U_FAILURE(status)) return;
// To improve performance, use an inclusions set which
// encodes information about character ranges that are known
// to have identical properties.
- // getInclusions(src) contains exactly the first characters of
- // same-value ranges for the given properties "source".
- const UnicodeSet* inclusions = getInclusions(src, status);
- if (U_FAILURE(status)) {
- return;
- }
+ // inclusions contains the first characters of
+ // same-value ranges for the given property.
clear();
namespace {
+/**
+ * UCPMapValueFilter: treats the map value as a bit number and yields 1 when
+ * that bit is set in the uint32_t mask that context points to, otherwise 0.
+ */
+uint32_t U_CALLCONV generalCategoryMaskFilter(const void *context, uint32_t value) {
+    const uint32_t selected = *(const uint32_t *)context;
+    return (U_MASK(value) & selected) != 0 ? 1 : 0;
+}
+
+/**
+ * UCPMapValueFilter: yields 1 for the single map value equal to the uint32_t
+ * that context points to, and 0 for every other value.
+ */
+uint32_t U_CALLCONV intValueFilter(const void *context, uint32_t value) {
+    const uint32_t wanted = *(const uint32_t *)context;
+    if (value == wanted) {
+        return 1;
+    }
+    return 0;
+}
+
+} // namespace
+
+/**
+ * Replaces the contents of this set with every code point range of the map
+ * whose (filtered) value is nonzero.
+ *
+ * @param map       the code point map to enumerate
+ * @param filter    value filter handed to ucpmap_getRange()
+ * @param context   opaque pointer handed to the filter
+ * @param errorCode in/out ICU error code; nothing is done if it already indicates
+ *                  a failure, and it is set to U_MEMORY_ALLOCATION_ERROR if this
+ *                  set is bogus after the ranges have been added
+ */
+void UnicodeSet::applyIntPropertyValue(const UCPMap *map,
+                                       UCPMapValueFilter *filter, const void *context,
+                                       UErrorCode &errorCode) {
+    if (U_FAILURE(errorCode)) { return; }
+    clear();
+    uint32_t filtered;
+    for (UChar32 rangeStart = 0;;) {
+        // ucpmap_getRange() returns a negative value once rangeStart is no longer valid.
+        const UChar32 rangeEnd = ucpmap_getRange(map, rangeStart, UCPMAP_RANGE_NORMAL, 0,
+                                                 filter, context, &filtered);
+        if (rangeEnd < 0) { break; }
+        if (filtered != 0) {
+            add(rangeStart, rangeEnd);
+        }
+        rangeStart = rangeEnd + 1;
+    }
+    if (isBogus()) {
+        errorCode = U_MEMORY_ALLOCATION_ERROR;
+    }
+}
+
+namespace {
+
static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
/* Note: we use ' ' in compiler code page */
int32_t j = 0;
+// Sets this set according to one property/value pair and returns *this.
+// Dispatches on prop: the general-category mask and the int properties scan a
+// UCPMap with a value filter, Script_Extensions goes through applyFilter() with
+// the property's inclusions set, binary properties copy the property's set, and
+// any other property sets U_ILLEGAL_ARGUMENT_ERROR.
UnicodeSet&
UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
- if (U_FAILURE(ec) || isFrozen()) return *this;
-
+ if (U_FAILURE(ec)) { return *this; }
+ // All of the following check isFrozen() before modifying this set.
if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
- applyFilter(generalCategoryMaskFilter, &value, UPROPS_SRC_CHAR, ec);
+ // value is a mask here: scan the General_Category map with the mask filter.
+ const UCPMap *map = u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY, &ec);
+ applyIntPropertyValue(map, generalCategoryMaskFilter, &value, ec);
} else if (prop == UCHAR_SCRIPT_EXTENSIONS) {
+ const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
UScriptCode script = (UScriptCode)value;
- applyFilter(scriptExtensionsFilter, &script, UPROPS_SRC_PROPSVEC, ec);
+ applyFilter(scriptExtensionsFilter, &script, inclusions, ec);
+ } else if (0 <= prop && prop < UCHAR_BINARY_LIMIT) {
+ if (value == 0 || value == 1) {
+ // Copy the property's set with asThawed=TRUE, then complement it for value 0.
+ const USet *set = u_getBinaryPropertySet(prop, &ec);
+ if (U_FAILURE(ec)) { return *this; }
+ copyFrom(*UnicodeSet::fromUSet(set), TRUE);
+ if (value == 0) {
+ complement();
+ }
+ } else {
+ // Neither 0 nor 1: the result is the empty set.
+ clear();
+ }
+ } else if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
+ // Int property: keep the ranges whose map value equals value.
+ const UCPMap *map = u_getIntPropertyMap(prop, &ec);
+ applyIntPropertyValue(map, intValueFilter, &value, ec);
} else {
- IntPropertyContext c = {prop, value};
- applyFilter(intPropertyFilter, &c, uprops_getSource(prop), ec);
+ // This code used to always call getInclusions(property source)
+ // which sets an error for an unsupported property.
+ ec = U_ILLEGAL_ARGUMENT_ERROR;
+ // Otherwise we would just clear() this set because
+ // getIntPropertyValue(c, prop) returns 0 for all code points.
}
return *this;
}
if (*end != 0) {
FAIL(ec);
}
- applyFilter(numericValueFilter, &val, UPROPS_SRC_CHAR, ec);
+ applyFilter(numericValueFilter, &val,
+ CharacterProperties::getInclusionsForProperty(p, ec), ec);
return *this;
}
case UCHAR_NAME:
if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
UVersionInfo version;
u_versionFromString(version, buf);
- applyFilter(versionFilter, &version, UPROPS_SRC_PROPSVEC, ec);
+ applyFilter(versionFilter, &version,
+ CharacterProperties::getInclusionsForProperty(p, ec), ec);
return *this;
}
case UCHAR_SCRIPT_EXTENSIONS:
// Add the start code point of each same-value range of the trie.
UChar32 start = 0, end;
- while ((end = ucptrie_getRange(trie, start, UCPTRIE_RANGE_NORMAL, 0,
+ while ((end = ucptrie_getRange(trie, start, UCPMAP_RANGE_NORMAL, 0,
nullptr, nullptr, nullptr)) >= 0) {
sa->add(sa->set, start);
start = end + 1;
class UnicodeSet;
+/**
+ * Static-only helper class (its constructor is deleted) for property inclusions sets.
+ * UnicodeSet declares this class as a friend.
+ */
+class CharacterProperties {
+public:
+ CharacterProperties() = delete;
+ // NOTE(review): presumably builds the inclusions set for one property source;
+ // the implementation is not visible here - confirm in characterproperties.cpp.
+ static void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode);
+ // Returns the inclusions set for prop. Callers pass the result to
+ // UnicodeSet::applyFilter(), which uses it as the first code points of
+ // same-value ranges for that property.
+ static const UnicodeSet *getInclusionsForProperty(UProperty prop, UErrorCode &errorCode);
+};
+
// implemented in uniset_props.cpp
U_CFUNC UnicodeSet *
uniset_getUnicode32Instance(UErrorCode &errorCode);
static void TestPropertyValues(void);
static void TestConsistency(void);
static void TestCaseFolding(void);
+static void TestBinaryCharacterPropertiesAPI(void);
+static void TestIntCharacterPropertiesAPI(void);
/* internal methods used */
static int32_t MakeProp(char* str);
addTest(root, &TestPropertyValues, "tsutil/cucdtst/TestPropertyValues");
addTest(root, &TestConsistency, "tsutil/cucdtst/TestConsistency");
addTest(root, &TestCaseFolding, "tsutil/cucdtst/TestCaseFolding");
+ addTest(root, &TestBinaryCharacterPropertiesAPI,
+ "tsutil/cucdtst/TestBinaryCharacterPropertiesAPI");
+ addTest(root, &TestIntCharacterPropertiesAPI,
+ "tsutil/cucdtst/TestIntCharacterPropertiesAPI");
}
/*==================================================== */
uset_close(data.notSeen);
}
+
+/*
+ * API test for u_getBinaryPropertySet(): out-of-range properties must fail,
+ * and the White_Space set must contain U+0020 but not U+0061.
+ */
+static void TestBinaryCharacterPropertiesAPI(void) {
+    // API test only. See intltest/ucdtest.cpp for functional test.
+    UErrorCode errorCode = U_ZERO_ERROR;
+    const USet *set = u_getBinaryPropertySet(-1, &errorCode);
+    if (U_SUCCESS(errorCode)) {
+        log_err("u_getBinaryPropertySet(-1) did not fail\n");
+    }
+    errorCode = U_ZERO_ERROR;
+    set = u_getBinaryPropertySet(UCHAR_BINARY_LIMIT, &errorCode);
+    if (U_SUCCESS(errorCode)) {
+        log_err("u_getBinaryPropertySet(UCHAR_BINARY_LIMIT) did not fail\n");
+    }
+    errorCode = U_ZERO_ERROR;
+    set = u_getBinaryPropertySet(UCHAR_WHITE_SPACE, &errorCode);
+    if (U_FAILURE(errorCode)) {
+        // Do not query a set that could not be created.
+        log_err("u_getBinaryPropertySet(UCHAR_WHITE_SPACE) failed - %s\n",
+                u_errorName(errorCode));
+        return;
+    }
+    if (!uset_contains(set, 0x20) || uset_contains(set, 0x61)) {
+        log_err("u_getBinaryPropertySet(UCHAR_WHITE_SPACE) wrong contents\n");
+    }
+}
+
+/*
+ * API test for u_getIntPropertyMap(): out-of-range properties must fail,
+ * and the General_Category map must return the expected values for two
+ * sample code points.
+ */
+static void TestIntCharacterPropertiesAPI(void) {
+    // API test only. See intltest/ucdtest.cpp for functional test.
+    UErrorCode errorCode = U_ZERO_ERROR;
+    const UCPMap *map = u_getIntPropertyMap(UCHAR_INT_START - 1, &errorCode);
+    if (U_SUCCESS(errorCode)) {
+        log_err("u_getIntPropertyMap(UCHAR_INT_START - 1) did not fail\n");
+    }
+    errorCode = U_ZERO_ERROR;
+    map = u_getIntPropertyMap(UCHAR_INT_LIMIT, &errorCode);
+    if (U_SUCCESS(errorCode)) {
+        log_err("u_getIntPropertyMap(UCHAR_INT_LIMIT) did not fail\n");
+    }
+    errorCode = U_ZERO_ERROR;
+    map = u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY, &errorCode);
+    if (U_FAILURE(errorCode)) {
+        // Do not look up values in a map that could not be created.
+        log_err("u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY) failed - %s\n",
+                u_errorName(errorCode));
+        return;
+    }
+    if (ucpmap_get(map, 0x20) != U_SPACE_SEPARATOR || ucpmap_get(map, 0x23456) != U_OTHER_LETTER) {
+        log_err("u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY) wrong contents\n");
+    }
+}
static void
testTrieGetRanges(const char *testName, const UCPTrie *trie, const UMutableCPTrie *mutableTrie,
- UCPTrieRangeOption option, uint32_t surrValue,
+ UCPMapRangeOption option, uint32_t surrValue,
const CheckRange checkRanges[], int32_t countCheckRanges) {
const char *const typeName = trie == NULL ? "mutableTrie" : "trie";
- const char *const optionName = option == UCPTRIE_RANGE_NORMAL ? "normal" :
- option == UCPTRIE_RANGE_FIXED_LEAD_SURROGATES ? "fixedLeadSurr" : "fixedAllSurr";
+ const char *const optionName = option == UCPMAP_RANGE_NORMAL ? "normal" :
+ option == UCPMAP_RANGE_FIXED_LEAD_SURROGATES ? "fixedLeadSurr" : "fixedAllSurr";
char name[80];
int32_t s;
for (s = 0; s < UPRV_LENGTHOF(iterStarts); ++s) {
UCPTrieType type, UCPTrieValueWidth valueWidth,
const CheckRange checkRanges[], int32_t countCheckRanges) {
testTrieGetters(testName, trie, type, valueWidth, checkRanges, countCheckRanges);
- testTrieGetRanges(testName, trie, NULL, UCPTRIE_RANGE_NORMAL, 0, checkRanges, countCheckRanges);
+ testTrieGetRanges(testName, trie, NULL, UCPMAP_RANGE_NORMAL, 0, checkRanges, countCheckRanges);
if (type == UCPTRIE_TYPE_FAST) {
testTrieUTF16(testName, trie, valueWidth, checkRanges, countCheckRanges);
testTrieUTF8(testName, trie, valueWidth, checkRanges, countCheckRanges);
testBuilder(const char *testName, const UMutableCPTrie *mutableTrie,
const CheckRange checkRanges[], int32_t countCheckRanges) {
testBuilderGetters(testName, mutableTrie, checkRanges, countCheckRanges);
- testTrieGetRanges(testName, NULL, mutableTrie, UCPTRIE_RANGE_NORMAL, 0, checkRanges, countCheckRanges);
+ testTrieGetRanges(testName, NULL, mutableTrie, UCPMAP_RANGE_NORMAL, 0, checkRanges, countCheckRanges);
}
static uint32_t storage[120000];
}
static void testGetRangesFixedSurr(const char *testName, const UMutableCPTrie *mutableTrie,
- UCPTrieRangeOption option,
+ UCPMapRangeOption option,
const CheckRange checkRanges[], int32_t countCheckRanges) {
testTrieGetRanges(testName, NULL, mutableTrie, option, 5, checkRanges, countCheckRanges);
UErrorCode errorCode = U_ZERO_ERROR;
if (mutableTrie == NULL) {
return;
}
- testGetRangesFixedSurr("fixedLeadSurr1", mutableTrie, UCPTRIE_RANGE_FIXED_LEAD_SURROGATES,
+ testGetRangesFixedSurr("fixedLeadSurr1", mutableTrie, UCPMAP_RANGE_FIXED_LEAD_SURROGATES,
checkRangesFixedLeadSurr1, UPRV_LENGTHOF(checkRangesFixedLeadSurr1));
- testGetRangesFixedSurr("fixedAllSurr1", mutableTrie, UCPTRIE_RANGE_FIXED_ALL_SURROGATES,
+ testGetRangesFixedSurr("fixedAllSurr1", mutableTrie, UCPMAP_RANGE_FIXED_ALL_SURROGATES,
checkRangesFixedAllSurr1, UPRV_LENGTHOF(checkRangesFixedAllSurr1));
// Setting a range in the middle of lead surrogates makes no difference.
umutablecptrie_setRange(mutableTrie, 0xd844, 0xd899, 5, &errorCode);
umutablecptrie_close(mutableTrie);
return;
}
- testGetRangesFixedSurr("fixedLeadSurr2", mutableTrie, UCPTRIE_RANGE_FIXED_LEAD_SURROGATES,
+ testGetRangesFixedSurr("fixedLeadSurr2", mutableTrie, UCPMAP_RANGE_FIXED_LEAD_SURROGATES,
checkRangesFixedLeadSurr1, UPRV_LENGTHOF(checkRangesFixedLeadSurr1));
// Bridge the gap before the lead surrogates.
umutablecptrie_set(mutableTrie, 0xd7ff, 5, &errorCode);
umutablecptrie_close(mutableTrie);
return;
}
- testGetRangesFixedSurr("fixedLeadSurr3", mutableTrie, UCPTRIE_RANGE_FIXED_LEAD_SURROGATES,
+ testGetRangesFixedSurr("fixedLeadSurr3", mutableTrie, UCPMAP_RANGE_FIXED_LEAD_SURROGATES,
checkRangesFixedLeadSurr3, UPRV_LENGTHOF(checkRangesFixedLeadSurr3));
- testGetRangesFixedSurr("fixedAllSurr3", mutableTrie, UCPTRIE_RANGE_FIXED_ALL_SURROGATES,
+ testGetRangesFixedSurr("fixedAllSurr3", mutableTrie, UCPMAP_RANGE_FIXED_ALL_SURROGATES,
checkRangesFixedAllSurr3, UPRV_LENGTHOF(checkRangesFixedAllSurr3));
// Bridge the gap after the trail surrogates.
umutablecptrie_set(mutableTrie, 0xe000, 5, &errorCode);
umutablecptrie_close(mutableTrie);
return;
}
- testGetRangesFixedSurr("fixedSurr4", mutableTrie, UCPTRIE_RANGE_FIXED_ALL_SURROGATES,
+ testGetRangesFixedSurr("fixedSurr4", mutableTrie, UCPMAP_RANGE_FIXED_ALL_SURROGATES,
checkRangesFixedSurr4, UPRV_LENGTHOF(checkRangesFixedSurr4));
umutablecptrie_close(mutableTrie);
}
#include "unicode/ustring.h"
#include "unicode/uchar.h"
+#include "unicode/ucpmap.h"
#include "unicode/uniset.h"
#include "unicode/putil.h"
#include "unicode/uscript.h"
+#include "unicode/uset.h"
#include "cstring.h"
#include "hash.h"
#include "patternprops.h"
#include "normalizer2impl.h"
+#include "testutil.h"
#include "uparse.h"
#include "ucdtest.h"
TESTCASE_AUTO(TestVerticalOrientation);
TESTCASE_AUTO(TestDefaultScriptExtensions);
TESTCASE_AUTO(TestInvalidCodePointFolding);
+ TESTCASE_AUTO(TestBinaryCharacterProperties);
+ TESTCASE_AUTO(TestIntCharacterProperties);
TESTCASE_AUTO_END;
}
cp, u_foldCase(cp, U_FOLD_CASE_EXCLUDE_SPECIAL_I));
}
}
+
+/**
+ * For every binary property, compares the edges of the set returned by
+ * u_getBinaryPropertySet() with u_hasBinaryProperty(): the first and last
+ * members must have the property and their outer neighbors must not.
+ * For an empty set, three sample code points must not have the property.
+ */
+void UnicodeTest::TestBinaryCharacterProperties() {
+    IcuTestErrorCode errorCode(*this, "TestBinaryCharacterProperties()");
+    // Spot-check getBinaryPropertySet() vs. hasBinaryProperty().
+    for (int32_t prop = 0; prop < UCHAR_BINARY_LIMIT; ++prop) {
+        const UProperty which = (UProperty)prop;
+        const USet *uset = u_getBinaryPropertySet(which, errorCode);
+        if (errorCode.errIfFailureAndReset("u_getBinaryPropertySet(%d)", (int)prop)) {
+            continue;
+        }
+        const UnicodeSet &set = *UnicodeSet::fromUSet(uset);
+        const int32_t size = set.size();
+        if (size != 0) {
+            // Lower edge: the code point just below the first member, then the first member.
+            const UChar32 first = set.charAt(0);
+            if (first > 0) {
+                assertFalse(
+                    UnicodeString("!hasBinaryProperty(") + TestUtility::hex(first - 1) +
+                        u", " + prop + u")",
+                    u_hasBinaryProperty(first - 1, which));
+            }
+            assertTrue(
+                UnicodeString("hasBinaryProperty(") + TestUtility::hex(first) +
+                    u", " + prop + u")",
+                u_hasBinaryProperty(first, which));
+            // Upper edge: the last member, then the code point just above it.
+            const UChar32 last = set.charAt(size - 1);
+            assertTrue(
+                UnicodeString("hasBinaryProperty(") + TestUtility::hex(last) +
+                    u", " + prop + u")",
+                u_hasBinaryProperty(last, which));
+            if (last < 0x10ffff) {
+                assertFalse(
+                    UnicodeString("!hasBinaryProperty(") + TestUtility::hex(last + 1) +
+                        u", " + prop + u")",
+                    u_hasBinaryProperty(last + 1, which));
+            }
+            continue;
+        }
+        // Empty set: sample code points must not have the property.
+        assertFalse(UnicodeString("!hasBinaryProperty(U+0020, ") + prop + u")",
+                    u_hasBinaryProperty(0x20, which));
+        assertFalse(UnicodeString("!hasBinaryProperty(U+0061, ") + prop + u")",
+                    u_hasBinaryProperty(0x61, which));
+        assertFalse(UnicodeString("!hasBinaryProperty(U+4E00, ") + prop + u")",
+                    u_hasBinaryProperty(0x4e00, which));
+    }
+}
+
+/**
+ * For every int property, compares values from the map returned by
+ * u_getIntPropertyMap() with u_getIntPropertyValue(): the middle of the first
+ * range, the end of the range containing U+5000, and U+0061 via ucpmap_get().
+ */
+void UnicodeTest::TestIntCharacterProperties() {
+    IcuTestErrorCode errorCode(*this, "TestIntCharacterProperties()");
+    // Spot-check getIntPropertyMap() vs. getIntPropertyValue().
+    for (int32_t prop = UCHAR_INT_START; prop < UCHAR_INT_LIMIT; ++prop) {
+        const UCPMap *map = u_getIntPropertyMap((UProperty)prop, errorCode);
+        if (errorCode.errIfFailureAndReset("u_getIntPropertyMap(%d)", (int)prop)) {
+            continue;
+        }
+        uint32_t value;
+        UChar32 end = ucpmap_getRange(map, 0, UCPMAP_RANGE_NORMAL, 0, nullptr, nullptr, &value);
+        assertTrue("int property first range", end >= 0);
+        // Any code point inside the first range must have the range's value.
+        UChar32 c = end / 2;
+        assertEquals(UnicodeString("int property first range value at ") + TestUtility::hex(c),
+                     u_getIntPropertyValue(c, (UProperty)prop), value);
+        end = ucpmap_getRange(map, 0x5000, UCPMAP_RANGE_NORMAL, 0, nullptr, nullptr, &value);
+        assertTrue("int property later range", end >= 0);
+        assertEquals(UnicodeString("int property later range value at ") + TestUtility::hex(end),
+                     u_getIntPropertyValue(end, (UProperty)prop), value);
+        // ucpmap_get() API coverage
+        // TODO: move to cucdtst.c
+        assertEquals(
+            "int property ucpmap_get(U+0061)",
+            u_getIntPropertyValue(0x61, (UProperty)prop), ucpmap_get(map, 0x61));
+    }
+}
void TestVerticalOrientation();
void TestDefaultScriptExtensions();
void TestInvalidCodePointFolding();
+ void TestBinaryCharacterProperties();
+ void TestIntCharacterProperties();
private:
// First check that surrogate code *points* are inert.
// The parser should have rejected values/mappings for them.
uint32_t value;
- UChar32 end = umutablecptrie_getRange(norm16Trie, 0xd800, UCPTRIE_RANGE_NORMAL, 0,
+ UChar32 end = umutablecptrie_getRange(norm16Trie, 0xd800, UCPMAP_RANGE_NORMAL, 0,
nullptr, nullptr, &value);
if (value != Normalizer2Impl::INERT || end < 0xdfff) {
fprintf(stderr,
end = 0;
for (UChar32 start = 0x10000;;) {
if (start > end) {
- end = umutablecptrie_getRange(norm16Trie, start, UCPTRIE_RANGE_NORMAL, 0,
+ end = umutablecptrie_getRange(norm16Trie, start, UCPMAP_RANGE_NORMAL, 0,
nullptr, nullptr, &value);
if (end < 0) { break; }
}
void Norms::enumRanges(Enumerator &e) {
UChar32 start = 0, end;
uint32_t i;
- while ((end = umutablecptrie_getRange(normTrie, start, UCPTRIE_RANGE_NORMAL, 0,
+ while ((end = umutablecptrie_getRange(normTrie, start, UCPMAP_RANGE_NORMAL, 0,
nullptr, nullptr, &i)) >= 0) {
if (i > 0) {
e.rangeHandler(start, end, norms[i]);
--- /dev/null
+// © 2018 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package com.ibm.icu.impl;
+
+import com.ibm.icu.text.UnicodeSet;
+
+/**
+ * Implementation-level property helpers.
+ * Sits above class UCharacterProperty
+ * but below class CharacterProperties and class UnicodeSet.
+ */
+public final class CharacterPropertiesImpl {
+    /**
+     * Lazily filled cache of "inclusions" sets, indexed by property source
+     * (UCharacterProperty.SRC_*).
+     * Each entry is a set of all characters _except_ the second through last
+     * characters of certain ranges. These ranges are ranges of characters whose
+     * properties are all exactly alike, e.g. CJK Ideographs from
+     * U+4E00 to U+9FA5.
+     */
+    private static final UnicodeSet inclusions[] = new UnicodeSet[UCharacterProperty.SRC_COUNT];
+
+    /**
+     * Drops every cached inclusions set; each one is rebuilt on its next use.
+     * For {@link UnicodeSet#setDefaultXSymbolTable}.
+     */
+    public static synchronized void clear() {
+        int i = inclusions.length;
+        while (i > 0) {
+            inclusions[--i] = null;
+        }
+    }
+
+    /**
+     * Adds the property start code points of one property source to a set.
+     * Only called from the synchronized getInclusionsForSource().
+     *
+     * @param src one of the UCharacterProperty.SRC_* constants
+     * @param starts the set that receives the start code points
+     * @throws IllegalStateException if src is not one of the handled sources
+     */
+    private static void addStartsForSource(int src, UnicodeSet starts) {
+        switch (src) {
+        case UCharacterProperty.SRC_CHAR:
+            UCharacterProperty.INSTANCE.addPropertyStarts(starts);
+            return;
+        case UCharacterProperty.SRC_PROPSVEC:
+            UCharacterProperty.INSTANCE.upropsvec_addPropertyStarts(starts);
+            return;
+        case UCharacterProperty.SRC_CHAR_AND_PROPSVEC:
+            UCharacterProperty.INSTANCE.addPropertyStarts(starts);
+            UCharacterProperty.INSTANCE.upropsvec_addPropertyStarts(starts);
+            return;
+        case UCharacterProperty.SRC_CASE_AND_NORM:
+            Norm2AllModes.getNFCInstance().impl.addPropertyStarts(starts);
+            UCaseProps.INSTANCE.addPropertyStarts(starts);
+            return;
+        case UCharacterProperty.SRC_NFC:
+            Norm2AllModes.getNFCInstance().impl.addPropertyStarts(starts);
+            return;
+        case UCharacterProperty.SRC_NFKC:
+            Norm2AllModes.getNFKCInstance().impl.addPropertyStarts(starts);
+            return;
+        case UCharacterProperty.SRC_NFKC_CF:
+            Norm2AllModes.getNFKC_CFInstance().impl.addPropertyStarts(starts);
+            return;
+        case UCharacterProperty.SRC_NFC_CANON_ITER:
+            Norm2AllModes.getNFCInstance().impl.addCanonIterPropertyStarts(starts);
+            return;
+        case UCharacterProperty.SRC_CASE:
+            UCaseProps.INSTANCE.addPropertyStarts(starts);
+            return;
+        case UCharacterProperty.SRC_BIDI:
+            UBiDiProps.INSTANCE.addPropertyStarts(starts);
+            return;
+        case UCharacterProperty.SRC_INPC:
+        case UCharacterProperty.SRC_INSC:
+        case UCharacterProperty.SRC_VO:
+            UCharacterProperty.INSTANCE.ulayout_addPropertyStarts(src, starts);
+            return;
+        default:
+            throw new IllegalStateException("getInclusions(unknown src " + src + ")");
+        }
+    }
+
+    /**
+     * Returns the cached inclusions set for a property source, building it on first use.
+     * A set is cached only after it has been filled successfully.
+     */
+    private static synchronized UnicodeSet getInclusionsForSource(int src) {
+        UnicodeSet cached = inclusions[src];
+        if (cached != null) {
+            return cached;
+        }
+        // The set is deliberately not frozen: callers only iterate over it
+        // and never test contains(), so the extra time and memory that
+        // freeze() spends on optimizing lookups would be wasted.
+        UnicodeSet starts = new UnicodeSet();
+        addStartsForSource(src, starts);
+        inclusions[src] = starts;
+        return starts;
+    }
+
+    /**
+     * Returns the inclusions set for the data source of the given property.
+     * The returned UnicodeSet is shared and mutable -- do not modify!
+     *
+     * @param prop a UProperty constant
+     * @return the shared inclusions set for the property's source
+     */
+    public static UnicodeSet getInclusionsForProperty(int prop) {
+        return getInclusionsForSource(UCharacterProperty.INSTANCE.getSource(prop));
+    }
+}
return -1; // undefined
}
- public final int getSource(int which) {
+ final int getSource(int which) {
if(which<UProperty.BINARY_START) {
return SRC_NONE; /* undefined */
} else if(which<UProperty.BINARY_LIMIT) {
--- /dev/null
+// © 2018 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+
+package com.ibm.icu.lang;
+
+import com.ibm.icu.impl.CharacterPropertiesImpl;
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.util.CodePointMap;
+import com.ibm.icu.util.CodePointTrie;
+import com.ibm.icu.util.MutableCodePointTrie;
+
+/**
+ * Sets and maps for Unicode properties.
+ * The methods here return an object per property:
+ * A set for each ICU-supported binary property with all code points for which the property is true.
+ * A map for each ICU-supported enumerated/catalog/int-valued property
+ * which maps all Unicode code points to their values for that property.
+ *
+ * <p>For details see the method descriptions.
+ * For lookup of property values by code point see class {@link UCharacter}.
+ *
+ * @draft ICU 63
+ * @provisional This API might change or be removed in a future release.
+ */
+public final class CharacterProperties {
+    private CharacterProperties() {}  // all-static
+
+    /** Cache of binary-property sets, indexed by property; accessed under synchronized(sets). */
+    private static final UnicodeSet sets[] = new UnicodeSet[UProperty.BINARY_LIMIT];
+    /** Cache of int-property maps, indexed by (property - INT_START); accessed under synchronized(maps). */
+    private static final CodePointMap maps[] = new CodePointMap[UProperty.INT_LIMIT - UProperty.INT_START];
+
+    /**
+     * Builds the frozen set of all code points that have the given binary property.
+     * Only the code points in the property's inclusions set are tested;
+     * a run of "true" extends until the next tested code point that is "false".
+     */
+    private static UnicodeSet makeSet(int property) {
+        UnicodeSet set = new UnicodeSet();
+        UnicodeSet inclusions = CharacterPropertiesImpl.getInclusionsForProperty(property);
+        int numRanges = inclusions.getRangeCount();
+        int startHasProperty = -1;  // start of the current "true" run, or -1 if none is open
+
+        for (int i = 0; i < numRanges; ++i) {
+            int rangeEnd = inclusions.getRangeEnd(i);
+            for (int c = inclusions.getRangeStart(i); c <= rangeEnd; ++c) {
+                // TODO: Get a UCharacterProperty.BinaryProperty to avoid the property dispatch.
+                if (UCharacter.hasBinaryProperty(c, property)) {
+                    if (startHasProperty < 0) {
+                        // Transition from false to true.
+                        startHasProperty = c;
+                    }
+                } else if (startHasProperty >= 0) {
+                    // Transition from true to false.
+                    set.add(startHasProperty, c - 1);
+                    startHasProperty = -1;
+                }
+            }
+        }
+        if (startHasProperty >= 0) {
+            // The last run is still open: it extends to the end of the code point space.
+            set.add(startHasProperty, 0x10FFFF);
+        }
+
+        return set.freeze();
+    }
+
+    /**
+     * Builds an immutable map from code points to the values of the given int property.
+     * Only the code points in the property's inclusions set are looked up;
+     * each value extends until the next looked-up code point with a different value.
+     */
+    private static CodePointMap makeMap(int property) {
+        // The trie starts out with nullValue everywhere,
+        // so runs that have this value need not be written.
+        int nullValue = property == UProperty.SCRIPT ? UScript.UNKNOWN : 0;
+        MutableCodePointTrie mutableTrie = new MutableCodePointTrie(nullValue, nullValue);
+        UnicodeSet inclusions = CharacterPropertiesImpl.getInclusionsForProperty(property);
+        int numRanges = inclusions.getRangeCount();
+        int start = 0;
+        int value = nullValue;
+
+        for (int i = 0; i < numRanges; ++i) {
+            int rangeEnd = inclusions.getRangeEnd(i);
+            for (int c = inclusions.getRangeStart(i); c <= rangeEnd; ++c) {
+                // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
+                int nextValue = UCharacter.getIntPropertyValue(c, property);
+                if (value != nextValue) {
+                    if (value != nullValue) {
+                        mutableTrie.setRange(start, c - 1, value);
+                    }
+                    start = c;
+                    value = nextValue;
+                }
+            }
+        }
+        // Compare with nullValue, not with 0, exactly as inside the loop:
+        // When nullValue is not 0, a final run with value 0 must still be written
+        // (otherwise it would keep the trie's initial nullValue),
+        // and a final run with nullValue is already covered by the initial value.
+        if (value != nullValue) {
+            mutableTrie.setRange(start, 0x10FFFF, value);
+        }
+
+        CodePointTrie.Type type;
+        if (property == UProperty.BIDI_CLASS || property == UProperty.GENERAL_CATEGORY) {
+            type = CodePointTrie.Type.FAST;
+        } else {
+            type = CodePointTrie.Type.SMALL;
+        }
+        // Use the narrowest value width that can hold the property's maximum value.
+        CodePointTrie.ValueWidth valueWidth;
+        // TODO: UCharacterProperty.IntProperty
+        int max = UCharacter.getIntPropertyMaxValue(property);
+        if (max <= 0xff) {
+            valueWidth = CodePointTrie.ValueWidth.BITS_8;
+        } else if (max <= 0xffff) {
+            valueWidth = CodePointTrie.ValueWidth.BITS_16;
+        } else {
+            valueWidth = CodePointTrie.ValueWidth.BITS_32;
+        }
+        return mutableTrie.buildImmutable(type, valueWidth);
+    }
+
+    /**
+     * Returns a frozen UnicodeSet for a binary property.
+     * Throws an exception if the property number is not one for a binary property.
+     *
+     * <p>The returned set contains all code points for which the property is true.
+     * The set is built on first use and then cached.
+     *
+     * @param property {@link UProperty#BINARY_START}..{@link UProperty#BINARY_LIMIT}-1
+     * @return the property as a set
+     * @throws IllegalArgumentException if the property number is out of range
+     * @see UProperty
+     * @see UCharacter#hasBinaryProperty
+     * @draft ICU 63
+     * @provisional This API might change or be removed in a future release.
+     */
+    public static final UnicodeSet getBinaryPropertySet(int property) {
+        if (property < 0 || UProperty.BINARY_LIMIT <= property) {
+            throw new IllegalArgumentException("" + property +
+                    " is not a constant for a UProperty binary property");
+        }
+        synchronized(sets) {
+            UnicodeSet set = sets[property];
+            if (set == null) {
+                sets[property] = set = makeSet(property);
+            }
+            return set;
+        }
+    }
+
+    /**
+     * Returns an immutable CodePointMap for an enumerated/catalog/int-valued property.
+     * Throws an exception if the property number is not one for an "int property".
+     *
+     * <p>The returned object maps all Unicode code points to their values for that property.
+     * For documentation of the integer values see {@link UCharacter#getIntPropertyValue(int, int)}.
+     *
+     * <p>The actual type of the returned object differs between properties
+     * and may change over time.
+     * The map is built on first use and then cached.
+     *
+     * @param property {@link UProperty#INT_START}..{@link UProperty#INT_LIMIT}-1
+     * @return the property as a map
+     * @throws IllegalArgumentException if the property number is out of range
+     * @see UProperty
+     * @see UCharacter#getIntPropertyValue
+     * @draft ICU 63
+     * @provisional This API might change or be removed in a future release.
+     */
+    public static final CodePointMap getIntPropertyMap(int property) {
+        if (property < UProperty.INT_START || UProperty.INT_LIMIT <= property) {
+            throw new IllegalArgumentException("" + property +
+                    " is not a constant for a UProperty int property");
+        }
+        synchronized(maps) {
+            CodePointMap map = maps[property - UProperty.INT_START];
+            if (map == null) {
+                maps[property - UProperty.INT_START] = map = makeMap(property);
+            }
+            return map;
+        }
+    }
+}
}
/**
- * {@icu} <p>Check a binary Unicode property for a code point.
+ * {@icu} Check a binary Unicode property for a code point.
* <p>Unicode, especially in version 3.2, defines many more properties
* than the original set in UnicodeData.txt.
* <p>This API is intended to reflect Unicode properties as defined in
* Unicode version does not have data for the property at all, or
* not for this code point.
* @see com.ibm.icu.lang.UProperty
+ * @see CharacterProperties#getBinaryPropertySet(int)
* @stable ICU 2.6
*/
public static boolean hasBinaryProperty(int ch, int property)
}
/**
- * {@icu} <p>Returns the property value for an Unicode property type of a code point.
+ * {@icu} Returns the property value for a Unicode property type of a code point.
* Also returns binary and mask property values.
* <p>Unicode, especially in version 3.2, defines many more properties than
* the original set in UnicodeData.txt.
* UProperty.MASK_START <= type < UProperty.MASK_LIMIT.
* @return numeric value that is directly the property value or,
* for enumerated properties, corresponds to the numeric value of
- * the enumerated constant of the respective property value
- * enumeration type (cast to enum type if necessary).
+ * the enumerated constant of the respective property value type
+ * ({@link ECharacterCategory}, {@link ECharacterDirection},
+ * {@link DecompositionType}, etc.).
* Returns 0 or 1 (for false / true) for binary Unicode properties.
* Returns a bit-mask for mask properties.
* Returns 0 if 'type' is out of bounds or if the Unicode version
* @see #hasBinaryProperty
* @see #getIntPropertyMinValue
* @see #getIntPropertyMaxValue
+ * @see CharacterProperties#getIntPropertyMap(int)
* @see #getUnicodeVersion
* @stable ICU 2.4
*/
import java.util.TreeSet;
import com.ibm.icu.impl.BMPSet;
-import com.ibm.icu.impl.Norm2AllModes;
+import com.ibm.icu.impl.CharacterPropertiesImpl;
import com.ibm.icu.impl.PatternProps;
import com.ibm.icu.impl.RuleCharacterIterator;
import com.ibm.icu.impl.SortedSetRelation;
import com.ibm.icu.impl.StringRange;
-import com.ibm.icu.impl.UBiDiProps;
import com.ibm.icu.impl.UCaseProps;
-import com.ibm.icu.impl.UCharacterProperty;
import com.ibm.icu.impl.UPropertyAliases;
import com.ibm.icu.impl.UnicodeSetStringSpan;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.CharSequences;
+import com.ibm.icu.lang.CharacterProperties;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
+import com.ibm.icu.util.CodePointMap;
import com.ibm.icu.util.Freezable;
import com.ibm.icu.util.ICUUncheckedIOException;
import com.ibm.icu.util.OutputInt;
private static final String ASCII_ID = "ASCII"; // [\u0000-\u007F]
private static final String ASSIGNED = "Assigned"; // [:^Cn:]
- /**
- * A set of all characters _except_ the second through last characters of
- * certain ranges. These ranges are ranges of characters whose
- * properties are all exactly alike, e.g. CJK Ideographs from
- * U+4E00 to U+9FA5.
- */
- private static UnicodeSet INCLUSIONS[] = null;
-
private volatile BMPSet bmpSet; // The set is frozen if bmpSet or stringSpan is not null.
private volatile UnicodeSetStringSpan stringSpan;
//----------------------------------------------------------------
/**
* Make this object represent the range <code>start - end</code>.
- * If <code>end > start</code> then this object is set to an
- * an empty range.
+ * If <code>start > end</code> then this object is set to an empty range.
*
* @param start first character in the set, inclusive
* @param end last character in the set, inclusive
boolean contains(int codePoint);
}
- private static class NumericValueFilter implements Filter {
+ private static final class NumericValueFilter implements Filter {
double value;
NumericValueFilter(double value) { this.value = value; }
@Override
}
}
- private static class GeneralCategoryMaskFilter implements Filter {
- int mask;
- GeneralCategoryMaskFilter(int mask) { this.mask = mask; }
- @Override
- public boolean contains(int ch) {
- return ((1 << UCharacter.getType(ch)) & mask) != 0;
- }
- }
-
- private static class IntPropertyFilter implements Filter {
- int prop;
- int value;
- IntPropertyFilter(int prop, int value) {
- this.prop = prop;
- this.value = value;
- }
- @Override
- public boolean contains(int ch) {
- return UCharacter.getIntPropertyValue(ch, prop) == value;
- }
- }
-
- private static class ScriptExtensionsFilter implements Filter {
+ private static final class ScriptExtensionsFilter implements Filter {
int script;
ScriptExtensionsFilter(int script) { this.script = script; }
@Override
// VersionInfo for unassigned characters
private static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0);
- private static class VersionFilter implements Filter {
+ private static final class VersionFilter implements Filter {
VersionInfo version;
VersionFilter(VersionInfo version) { this.version = version; }
@Override
}
}
- private static synchronized UnicodeSet getInclusions(int src) {
- if (INCLUSIONS == null) {
- INCLUSIONS = new UnicodeSet[UCharacterProperty.SRC_COUNT];
- }
- if(INCLUSIONS[src] == null) {
- UnicodeSet incl = new UnicodeSet();
- switch(src) {
- case UCharacterProperty.SRC_CHAR:
- UCharacterProperty.INSTANCE.addPropertyStarts(incl);
- break;
- case UCharacterProperty.SRC_PROPSVEC:
- UCharacterProperty.INSTANCE.upropsvec_addPropertyStarts(incl);
- break;
- case UCharacterProperty.SRC_CHAR_AND_PROPSVEC:
- UCharacterProperty.INSTANCE.addPropertyStarts(incl);
- UCharacterProperty.INSTANCE.upropsvec_addPropertyStarts(incl);
- break;
- case UCharacterProperty.SRC_CASE_AND_NORM:
- Norm2AllModes.getNFCInstance().impl.addPropertyStarts(incl);
- UCaseProps.INSTANCE.addPropertyStarts(incl);
- break;
- case UCharacterProperty.SRC_NFC:
- Norm2AllModes.getNFCInstance().impl.addPropertyStarts(incl);
- break;
- case UCharacterProperty.SRC_NFKC:
- Norm2AllModes.getNFKCInstance().impl.addPropertyStarts(incl);
- break;
- case UCharacterProperty.SRC_NFKC_CF:
- Norm2AllModes.getNFKC_CFInstance().impl.addPropertyStarts(incl);
- break;
- case UCharacterProperty.SRC_NFC_CANON_ITER:
- Norm2AllModes.getNFCInstance().impl.addCanonIterPropertyStarts(incl);
- break;
- case UCharacterProperty.SRC_CASE:
- UCaseProps.INSTANCE.addPropertyStarts(incl);
- break;
- case UCharacterProperty.SRC_BIDI:
- UBiDiProps.INSTANCE.addPropertyStarts(incl);
- break;
- case UCharacterProperty.SRC_INPC:
- case UCharacterProperty.SRC_INSC:
- case UCharacterProperty.SRC_VO:
- UCharacterProperty.INSTANCE.ulayout_addPropertyStarts(src, incl);
- break;
- default:
- throw new IllegalStateException("UnicodeSet.getInclusions(unknown src "+src+")");
- }
- INCLUSIONS[src] = incl;
- }
- return INCLUSIONS[src];
- }
-
/**
* Generic filter-based scanning code for UCD property UnicodeSets.
*/
- private UnicodeSet applyFilter(Filter filter, int src) {
+ private void applyFilter(Filter filter, UnicodeSet inclusions) {
// Logically, walk through all Unicode characters, noting the start
// and end of each range for which filter.contain(c) is
// true. Add each range to a set.
// To improve performance, use an inclusions set which
// encodes information about character ranges that are known
// to have identical properties.
- // getInclusions(src) contains exactly the first characters of
- // same-value ranges for the given properties "source".
+ // inclusions contains the first characters of
+ // same-value ranges for the given property.
clear();
int startHasProperty = -1;
- UnicodeSet inclusions = getInclusions(src);
int limitRange = inclusions.getRangeCount();
for (int j=0; j<limitRange; ++j) {
if (startHasProperty >= 0) {
add_unchecked(startHasProperty, 0x10FFFF);
}
+ }
- return this;
+ /**
+  * Value filter for a bit mask of map values:
+  * yields 1 when the mask has the bit numbered by the map value set, otherwise 0.
+  */
+ private static final class GeneralCategoryMaskFilter implements CodePointMap.ValueFilter {
+     int mask;
+     GeneralCategoryMaskFilter(int mask) { this.mask = mask; }
+     @Override
+     public int apply(int value) {
+         return ((1 << value) & mask) != 0 ? 1 : 0;
+     }
 }
+ /** Value filter that yields 1 for exactly one map value and 0 for every other value. */
+ private static final class IntValueFilter implements CodePointMap.ValueFilter {
+     int v;
+     IntValueFilter(int value) { v = value; }
+     @Override
+     public int apply(int value) {
+         if (value == v) {
+             return 1;
+         }
+         return 0;
+     }
+ }
+
+ /**
+  * Replaces the contents of this set with the code points
+  * whose filtered map value is not 0.
+  * Walks the map range by range, starting at code point 0,
+  * until getRange() reports that there is no further range.
+  */
+ private void applyIntPropertyValue(CodePointMap map, CodePointMap.ValueFilter filter) {
+     clear();
+     CodePointMap.Range span = new CodePointMap.Range();
+     int first = 0;
+     while (map.getRange(first, filter, span)) {
+         int last = span.getEnd();
+         if (span.getValue() != 0) {
+             add_unchecked(first, last);
+         }
+         // Continue right after the range that was just handled.
+         first = last + 1;
+     }
+ }
/**
* Remove leading and trailing Pattern_White_Space and compress
* @stable ICU 2.4
*/
 public UnicodeSet applyIntPropertyValue(int prop, int value) {
- checkFrozen();
+ // All of the following include checkFrozen() before modifying this set.
 if (prop == UProperty.GENERAL_CATEGORY_MASK) {
- applyFilter(new GeneralCategoryMaskFilter(value), UCharacterProperty.SRC_CHAR);
+ // value is a bit mask: select the code points whose General_Category
+ // map value has its bit set in the mask.
+ CodePointMap map = CharacterProperties.getIntPropertyMap(UProperty.GENERAL_CATEGORY);
+ applyIntPropertyValue(map, new GeneralCategoryMaskFilter(value));
 } else if (prop == UProperty.SCRIPT_EXTENSIONS) {
- applyFilter(new ScriptExtensionsFilter(value), UCharacterProperty.SRC_PROPSVEC);
+ // No per-property map is used here: scan the inclusions with a filter.
+ UnicodeSet inclusions = CharacterPropertiesImpl.getInclusionsForProperty(prop);
+ applyFilter(new ScriptExtensionsFilter(value), inclusions);
+ } else if (0 <= prop && prop < UProperty.BINARY_LIMIT) {
+ // Binary property: 1 selects the property's set, 0 selects its complement,
+ // and any other value leaves this set empty.
+ if (value == 0 || value == 1) {
+ set(CharacterProperties.getBinaryPropertySet(prop));
+ if (value == 0) {
+ complement();
+ }
+ } else {
+ clear();
+ }
+ } else if (UProperty.INT_START <= prop && prop < UProperty.INT_LIMIT) {
+ // Int property: select the code points whose map value equals value.
+ CodePointMap map = CharacterProperties.getIntPropertyMap(prop);
+ applyIntPropertyValue(map, new IntValueFilter(value));
 } else {
- applyFilter(new IntPropertyFilter(prop, value), UCharacterProperty.INSTANCE.getSource(prop));
+ // This code used to always call getInclusions(property source)
+ // which throws an exception for an unsupported property.
+ throw new IllegalArgumentException("unsupported property " + prop);
+ // Otherwise we would just clear() this set because
+ // getIntPropertyValue(c, prop) returns 0 for all code points.
 }
 return this;
 }
case UProperty.NUMERIC_VALUE:
{
double value = Double.parseDouble(PatternProps.trimWhiteSpace(valueAlias));
- applyFilter(new NumericValueFilter(value), UCharacterProperty.SRC_CHAR);
+ applyFilter(new NumericValueFilter(value),
+ CharacterPropertiesImpl.getInclusionsForProperty(p));
return this;
}
case UProperty.NAME:
// VersionInfo.getInstance() does not do
// 'loose' matching.
VersionInfo version = VersionInfo.getInstance(mungeCharName(valueAlias));
- applyFilter(new VersionFilter(version), UCharacterProperty.SRC_PROPSVEC);
+ applyFilter(new VersionFilter(version),
+ CharacterPropertiesImpl.getInclusionsForProperty(p));
return this;
}
case UProperty.SCRIPT_EXTENSIONS:
* of UnicodeSets.
* <p>
* WARNING: If this function is used with a UnicodeProperty, and the
- * Unassigned characters (gc=Cn) are different than in ICU other than in ICU, you MUST call
+ * Unassigned characters (gc=Cn) are different than in ICU, you MUST call
* {@code UnicodeProperty.ResetCacheProperties} afterwards. If you then call {@code UnicodeSet.setDefaultXSymbolTable}
* with null to clear the value, you MUST also call {@code UnicodeProperty.ResetCacheProperties}.
*
*/
 @Deprecated
 public static void setDefaultXSymbolTable(XSymbolTable xSymbolTable) {
- INCLUSIONS = null; // If the properties override inclusions, these have to be regenerated.
+ // If the properties override inclusions, these have to be regenerated.
+ // CharacterPropertiesImpl.clear() drops all cached inclusions sets;
+ // each one is rebuilt the next time it is requested.
+ // NOTE(review): the per-property sets and maps cached in CharacterProperties
+ // are not reset here -- confirm whether that is intended.
+ // TODO: Check if the Unicode Tools or Unicode Utilities really need this.
+ CharacterPropertiesImpl.clear();
 XSYMBOL_TABLE = xSymbolTable;
 }
}
import com.ibm.icu.impl.PatternProps;
import com.ibm.icu.impl.UCharacterName;
import com.ibm.icu.impl.Utility;
+import com.ibm.icu.lang.CharacterProperties;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UCharacterCategory;
import com.ibm.icu.lang.UCharacterDirection;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
+import com.ibm.icu.util.CodePointMap;
import com.ibm.icu.util.RangeValueIterator;
import com.ibm.icu.util.ULocale;
import com.ibm.icu.util.ValueIterator;
int output = UCharacter.getCharFromNameAlias(alias);
assertEquals("alias for '" + input + "'", input, output);
}
+
+ /**
+  * Checks that getBinaryPropertySet() rejects out-of-range property numbers,
+  * and spot-checks each binary property's set against hasBinaryProperty().
+  */
+ @Test
+ public void TestBinaryCharacterProperties() {
+     // One try block per invalid argument: with both calls in a single block,
+     // the expected exception from the first call would skip the second check.
+     try {
+         CharacterProperties.getBinaryPropertySet(-1);
+         fail("getBinaryPropertySet(-1) did not throw an exception");
+     } catch(IllegalArgumentException expected) {
+     }
+     try {
+         CharacterProperties.getBinaryPropertySet(UProperty.BINARY_LIMIT);
+         fail("getBinaryPropertySet(BINARY_LIMIT) did not throw an exception");
+     } catch(IllegalArgumentException expected) {
+     }
+     // Spot-check getBinaryPropertySet() vs. hasBinaryProperty().
+     for (int prop = 0; prop < UProperty.BINARY_LIMIT; ++prop) {
+         UnicodeSet set = CharacterProperties.getBinaryPropertySet(prop);
+         int size = set.size();
+         if (size == 0) {
+             // Empty set: a few sample code points must not have the property.
+             assertFalse("!hasBinaryProperty(U+0020, " + prop + ')',
+                     UCharacter.hasBinaryProperty(0x20, prop));
+             assertFalse("!hasBinaryProperty(U+0061, " + prop + ')',
+                     UCharacter.hasBinaryProperty(0x61, prop));
+             assertFalse("!hasBinaryProperty(U+4E00, " + prop + ')',
+                     UCharacter.hasBinaryProperty(0x4e00, prop));
+         } else {
+             // Check the first and last code points of the set,
+             // and their outside neighbors if there are any.
+             int c = set.charAt(0);
+             if (c > 0) {
+                 assertFalse("!hasBinaryProperty(" + Utility.hex(c - 1) + ", " + prop + ')',
+                         UCharacter.hasBinaryProperty(c - 1, prop));
+             }
+             assertTrue("hasBinaryProperty(" + Utility.hex(c) + ", " + prop + ')',
+                     UCharacter.hasBinaryProperty(c, prop));
+             c = set.charAt(size - 1);
+             assertTrue("hasBinaryProperty(" + Utility.hex(c) + ", " + prop + ')',
+                     UCharacter.hasBinaryProperty(c, prop));
+             if (c < 0x10ffff) {
+                 assertFalse("!hasBinaryProperty(" + Utility.hex(c + 1) + ", " + prop + ')',
+                         UCharacter.hasBinaryProperty(c + 1, prop));
+             }
+         }
+     }
+ }
+
+ /**
+  * Checks that getIntPropertyMap() rejects out-of-range property numbers,
+  * and spot-checks each int property's map against getIntPropertyValue().
+  */
+ @Test
+ public void TestIntCharacterProperties() {
+     // One try block per invalid argument: with both calls in a single block,
+     // the expected exception from the first call would skip the second check.
+     try {
+         CharacterProperties.getIntPropertyMap(UProperty.INT_START - 1);
+         fail("getIntPropertyMap(INT_START-1) did not throw an exception");
+     } catch(IllegalArgumentException expected) {
+     }
+     try {
+         CharacterProperties.getIntPropertyMap(UProperty.INT_LIMIT);
+         fail("getIntPropertyMap(INT_LIMIT) did not throw an exception");
+     } catch(IllegalArgumentException expected) {
+     }
+     // Spot-check getIntPropertyMap() vs. getIntPropertyValue().
+     CodePointMap.Range range = new CodePointMap.Range();
+     for (int prop = UProperty.INT_START; prop < UProperty.INT_LIMIT; ++prop) {
+         CodePointMap map = CharacterProperties.getIntPropertyMap(prop);
+         // First range: compare at a code point in the middle of the range.
+         assertTrue("int property first range", map.getRange(0, null, range));
+         int c = (range.getStart() + range.getEnd()) / 2;
+         assertEquals("int property first range value at " + Utility.hex(c),
+                 UCharacter.getIntPropertyValue(c, prop), range.getValue());
+         // A later range, starting at U+5000: compare at the end of the range.
+         assertTrue("int property later range", map.getRange(0x5000, null, range));
+         int end = range.getEnd();
+         assertEquals("int property later range value at " + Utility.hex(end),
+                 UCharacter.getIntPropertyValue(end, prop), range.getValue());
+     }
+ }
}