ICU-20086 C++ sets & maps for Unicode properties (#93)

author Markus Scherer <markus.icu@gmail.com>

Fri, 7 Sep 2018 20:40:19 +0000 (13:40 -0700)

committer Shane Carr <shane@unicode.org>

Thu, 27 Sep 2018 21:27:39 +0000 (14:27 -0700)
author Markus Scherer <markus.icu@gmail.com>
Fri, 7 Sep 2018 20:40:19 +0000 (13:40 -0700)
committer Shane Carr <shane@unicode.org>
Thu, 27 Sep 2018 21:27:39 +0000 (14:27 -0700)
diff --git a/icu4c/source/common/Makefile.in b/icu4c/source/common/Makefile.in

index bb63e5bde70b2c2249cebe6545ee2da1496877cc..e10d3a27d3ac800c55dc00854d6b10da8680264f 100644 (file)
--- a/icu4c/source/common/Makefile.in
+++ b/icu4c/source/common/Makefile.in
@@ -100,7 +100,8 @@ utf_impl.o ustring.o ustrcase.o ucasemap.o ucasemap_titlecase_brkiter.o cstring.
  unistr_case_locale.o ustrcase_locale.o unistr_titlecase_brkiter.o ustr_titlecase_brkiter.o \
  normalizer2impl.o normalizer2.o filterednormalizer2.o normlzr.o unorm.o unormcmp.o loadednormalizer2impl.o \
  chariter.o schriter.o uchriter.o uiter.o \
-patternprops.o uchar.o uprops.o ucase.o propname.o ubidi_props.o ubidi.o ubidiwrt.o ubidiln.o ushape.o \
+patternprops.o uchar.o uprops.o ucase.o propname.o ubidi_props.o characterproperties.o \
+ubidi.o ubidiwrt.o ubidiln.o ushape.o \
  uscript.o uscript_props.o usc_impl.o unames.o \
  utrie.o utrie2.o utrie2_builder.o ucptrie.o umutablecptrie.o \
  bmpset.o unisetspan.o uset_props.o uniset_props.o uniset_closure.o uset.o uniset.o usetiter.o ruleiter.o caniter.o unifilt.o unifunct.o \
diff --git a/icu4c/source/common/characterproperties.cpp b/icu4c/source/common/characterproperties.cpp

new file mode 100644 (file)

index 0000000..5336748
--- /dev/null
+++ b/icu4c/source/common/characterproperties.cpp
@@ -0,0 +1,340 @@
+// © 2018 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+// characterproperties.cpp
+// created: 2018sep03 Markus W. Scherer
+
+#include "unicode/utypes.h"
+#include "unicode/localpointer.h"
+#include "unicode/uchar.h"
+#include "unicode/ucpmap.h"
+#include "unicode/ucptrie.h"
+#include "unicode/umutablecptrie.h"
+#include "unicode/uniset.h"
+#include "unicode/uscript.h"
+#include "unicode/uset.h"
+#include "cmemory.h"
+#include "mutex.h"
+#include "normalizer2impl.h"
+#include "uassert.h"
+#include "ubidi_props.h"
+#include "ucase.h"
+#include "ucln_cmn.h"
+#include "umutex.h"
+#include "uprops.h"
+
+using icu::UInitOnce;
+using icu::UnicodeSet;
+
+namespace {
+
+U_CDECL_BEGIN
+
+UBool U_CALLCONV characterproperties_cleanup();
+
+// One cache slot per UPropertySource: the inclusions set built by
+// CharacterProperties::initInclusion() plus the init-once guard it runs under.
+struct Inclusion {
+    UnicodeSet  *fSet;       // assigned in initInclusion(); deleted and nulled in characterproperties_cleanup()
+    UInitOnce    fInitOnce;  // passed to umtx_initOnce(); reset in characterproperties_cleanup()
+};
+Inclusion gInclusions[UPROPS_SRC_COUNT]; // cached getInclusions()
+
+// Cache for u_getBinaryPropertySet(), indexed directly by the property number.
+// Entries are filled on first request while cpMutex is held.
+UnicodeSet *sets[UCHAR_BINARY_LIMIT] = {};
+
+// Cache for u_getIntPropertyMap(), indexed by (property - UCHAR_INT_START).
+// Entries are filled on first request while cpMutex is held.
+UCPMap *maps[UCHAR_INT_LIMIT - UCHAR_INT_START] = {};
+
+// Locked by u_getBinaryPropertySet() and u_getIntPropertyMap() around their cache access.
+UMutex cpMutex = U_MUTEX_INITIALIZER;
+
+//----------------------------------------------------------------
+// Inclusions list
+//----------------------------------------------------------------
+
+// USetAdder implementation
+// Does not use uset.h to reduce code dependencies
+void U_CALLCONV
+_set_add(USet *set, UChar32 c) {
+    // The USetAdder's opaque USet pointer is really a UnicodeSet.
+    UnicodeSet *target = reinterpret_cast<UnicodeSet *>(set);
+    target->add(c);
+}
+
+void U_CALLCONV
+_set_addRange(USet *set, UChar32 start, UChar32 end) {
+    // The USetAdder's opaque USet pointer is really a UnicodeSet.
+    UnicodeSet *target = reinterpret_cast<UnicodeSet *>(set);
+    target->add(start, end);
+}
+
+void U_CALLCONV
+_set_addString(USet *set, const UChar *str, int32_t length) {
+    // Wrap the caller's buffer in a UnicodeString; the first constructor
+    // argument is true exactly when the length is negative.
+    const UBool lengthIsNegative = (UBool)(length < 0);
+    const icu::UnicodeString text(lengthIsNegative, str, length);
+    reinterpret_cast<UnicodeSet *>(set)->add(text);
+}
+
+// Library cleanup hook: frees every cached object in this file and
+// returns the caches to their initial (empty) state.
+UBool U_CALLCONV characterproperties_cleanup() {
+    // Inclusions sets: free the set and re-arm its init-once guard.
+    for (Inclusion &entry : gInclusions) {
+        delete entry.fSet;
+        entry.fSet = nullptr;
+        entry.fInitOnce.reset();
+    }
+    // Binary-property sets.
+    for (UnicodeSet *&cachedSet : sets) {
+        delete cachedSet;
+        cachedSet = nullptr;
+    }
+    // Int-property maps; they are closed as UCPTrie objects.
+    for (UCPMap *&cachedMap : maps) {
+        ucptrie_close(reinterpret_cast<UCPTrie *>(cachedMap));
+        cachedMap = nullptr;
+    }
+    return TRUE;
+}
+
+U_CDECL_END
+
+}  // namespace
+
+U_NAMESPACE_BEGIN
+
+/*
+Reduce excessive reallocation, and make it easier to detect initialization problems.
+Usually you don't see smaller sets than this for Unicode 5.0.
+*/
+constexpr int32_t DEFAULT_INCLUSION_CAPACITY = 3072;
+
+// Builds the inclusions set for one property source and stores it in
+// gInclusions[src].fSet.
+//
+// The set is filled through a USetAdder by the *_addPropertyStarts()
+// function(s) that belong to the given source.
+//
+// @param src        the property source; UPROPS_SRC_NONE and unknown values
+//                   yield U_INTERNAL_PROGRAM_ERROR
+// @param errorCode  in/out error code; on failure the partially built set is
+//                   deleted and gInclusions[src].fSet is left null
+void U_CALLCONV CharacterProperties::initInclusion(UPropertySource src, UErrorCode &errorCode) {
+    // This function is invoked only via umtx_initOnce().
+    // This function is a friend of class UnicodeSet.
+
+    U_ASSERT(0 <= src && src < UPROPS_SRC_COUNT);
+    if (src == UPROPS_SRC_NONE) {
+        errorCode = U_INTERNAL_PROGRAM_ERROR;
+        return;
+    }
+    // Reference to the cache slot, so that assignments below update the cache directly.
+    UnicodeSet * &incl = gInclusions[src].fSet;
+    U_ASSERT(incl == nullptr);
+
+    incl = new UnicodeSet();
+    if (incl == nullptr) {
+        errorCode = U_MEMORY_ALLOCATION_ERROR;
+        return;
+    }
+    // Callback table that routes the C-style "add" calls into incl.
+    USetAdder sa = {
+        (USet *)incl,
+        _set_add,
+        _set_addRange,
+        _set_addString,
+        nullptr, // don't need remove()
+        nullptr // don't need removeRange()
+    };
+
+    incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, errorCode);
+    // Collect the property starts from the data that backs this source.
+    switch(src) {
+    case UPROPS_SRC_CHAR:
+        uchar_addPropertyStarts(&sa, &errorCode);
+        break;
+    case UPROPS_SRC_PROPSVEC:
+        upropsvec_addPropertyStarts(&sa, &errorCode);
+        break;
+    case UPROPS_SRC_CHAR_AND_PROPSVEC:
+        uchar_addPropertyStarts(&sa, &errorCode);
+        upropsvec_addPropertyStarts(&sa, &errorCode);
+        break;
+#if !UCONFIG_NO_NORMALIZATION
+    case UPROPS_SRC_CASE_AND_NORM: {
+        const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
+        if(U_SUCCESS(errorCode)) {
+            impl->addPropertyStarts(&sa, errorCode);
+        }
+        ucase_addPropertyStarts(&sa, &errorCode);
+        break;
+    }
+    case UPROPS_SRC_NFC: {
+        const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
+        if(U_SUCCESS(errorCode)) {
+            impl->addPropertyStarts(&sa, errorCode);
+        }
+        break;
+    }
+    case UPROPS_SRC_NFKC: {
+        const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(errorCode);
+        if(U_SUCCESS(errorCode)) {
+            impl->addPropertyStarts(&sa, errorCode);
+        }
+        break;
+    }
+    case UPROPS_SRC_NFKC_CF: {
+        const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(errorCode);
+        if(U_SUCCESS(errorCode)) {
+            impl->addPropertyStarts(&sa, errorCode);
+        }
+        break;
+    }
+    case UPROPS_SRC_NFC_CANON_ITER: {
+        const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
+        if(U_SUCCESS(errorCode)) {
+            impl->addCanonIterPropertyStarts(&sa, errorCode);
+        }
+        break;
+    }
+#endif
+    case UPROPS_SRC_CASE:
+        ucase_addPropertyStarts(&sa, &errorCode);
+        break;
+    case UPROPS_SRC_BIDI:
+        ubidi_addPropertyStarts(&sa, &errorCode);
+        break;
+    case UPROPS_SRC_INPC:
+    case UPROPS_SRC_INSC:
+    case UPROPS_SRC_VO:
+        uprops_addPropertyStarts((UPropertySource)src, &sa, &errorCode);
+        break;
+    default:
+        errorCode = U_INTERNAL_PROGRAM_ERROR;
+        break;
+    }
+
+    // On any failure, discard the partial set so that the cache slot stays null.
+    if (U_FAILURE(errorCode)) {
+        delete incl;
+        incl = nullptr;
+        return;
+    }
+    // Compact for caching
+    incl->compact();
+    // Registered only after a set was successfully cached, so that it gets freed at library cleanup.
+    ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
+}
+
+// Returns the cached inclusions set for a property source, building it on
+// first use. Returns nullptr and sets U_ILLEGAL_ARGUMENT_ERROR when src is
+// outside 0..UPROPS_SRC_COUNT-1; returns nullptr if errorCode already failed.
+const UnicodeSet *getInclusionsForSource(UPropertySource src, UErrorCode &errorCode) {
+    if (U_FAILURE(errorCode)) {
+        return nullptr;
+    }
+    if (!(0 <= src && src < UPROPS_SRC_COUNT)) {
+        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+        return nullptr;
+    }
+    Inclusion &slot = gInclusions[src];
+    // initInclusion() leaves slot.fSet null when it fails.
+    umtx_initOnce(slot.fInitOnce, &CharacterProperties::initInclusion, src, errorCode);
+    return slot.fSet;
+}
+
+// Looks up the data source of the property and returns that source's
+// cached inclusions set; nullptr if errorCode is (or becomes) a failure.
+const UnicodeSet *CharacterProperties::getInclusionsForProperty(
+        UProperty prop, UErrorCode &errorCode) {
+    return U_FAILURE(errorCode)
+        ? nullptr
+        : getInclusionsForSource(uprops_getSource(prop), errorCode);
+}
+
+U_NAMESPACE_END
+
+namespace {
+
+// Builds a frozen UnicodeSet with all code points for which the binary
+// property is true. The caller owns the result.
+// Returns nullptr and sets errorCode on allocation failure or when the
+// inclusions set for the property cannot be obtained.
+UnicodeSet *makeSet(UProperty property, UErrorCode &errorCode) {
+    if (U_FAILURE(errorCode)) { return nullptr; }
+    icu::LocalPointer<UnicodeSet> result(new UnicodeSet());
+    if (result.isNull()) {
+        errorCode = U_MEMORY_ALLOCATION_ERROR;
+        return nullptr;
+    }
+    const UnicodeSet *starts =
+        icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
+    if (U_FAILURE(errorCode)) { return nullptr; }
+
+    // First code point of the run of "true" values that is currently open,
+    // or -1 while no run is open.
+    UChar32 runStart = -1;
+    const int32_t rangeCount = starts->getRangeCount();
+
+    for (int32_t r = 0; r < rangeCount; ++r) {
+        const UChar32 last = starts->getRangeEnd(r);
+        for (UChar32 c = starts->getRangeStart(r); c <= last; ++c) {
+            // TODO: Get a UCharacterProperty.BinaryProperty to avoid the property dispatch.
+            const bool hasProperty = u_hasBinaryProperty(c, property);
+            const bool runIsOpen = runStart >= 0;
+            if (hasProperty == runIsOpen) {
+                // No transition at this code point.
+                continue;
+            }
+            if (hasProperty) {
+                // false -> true: open a new run.
+                runStart = c;
+            } else {
+                // true -> false: the run ended just before c.
+                result->add(runStart, c - 1);
+                runStart = -1;
+            }
+        }
+    }
+    // A run that is still open extends to the end of the code point space.
+    if (runStart >= 0) {
+        result->add(runStart, 0x10FFFF);
+    }
+    result->freeze();
+    return result.orphan();
+}
+
+// Builds an immutable code point map for an enumerated/catalog/int-valued
+// property. The caller owns the result.
+//
+// The property's inclusions set is walked code point by code point; each run
+// of equal values is written into a mutable trie, which is then compacted
+// into an immutable one.
+//
+// nullValue (USCRIPT_UNKNOWN for UCHAR_SCRIPT, otherwise 0) is used as the
+// trie's initial and error value, so runs with that value are never written.
+//
+// Returns nullptr if errorCode is already a failure or the inclusions set
+// cannot be obtained; otherwise returns the result of
+// umutablecptrie_buildImmutable().
+UCPMap *makeMap(UProperty property, UErrorCode &errorCode) {
+    if (U_FAILURE(errorCode)) { return nullptr; }
+    uint32_t nullValue = property == UCHAR_SCRIPT ? USCRIPT_UNKNOWN : 0;
+    icu::LocalUMutableCPTriePointer mutableTrie(
+        umutablecptrie_open(nullValue, nullValue, &errorCode));
+    const UnicodeSet *inclusions =
+        icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
+    // Also covers a failed umutablecptrie_open(): the call above returns
+    // nullptr without changing an errorCode that already indicates failure.
+    if (U_FAILURE(errorCode)) { return nullptr; }
+    int32_t numRanges = inclusions->getRangeCount();
+    UChar32 start = 0;
+    uint32_t value = nullValue;
+
+    for (int32_t i = 0; i < numRanges; ++i) {
+        UChar32 rangeEnd = inclusions->getRangeEnd(i);
+        for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
+            // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
+            uint32_t nextValue = u_getIntPropertyValue(c, property);
+            if (value != nextValue) {
+                // The previous run ended just before c; store it unless the
+                // trie already yields that value by default.
+                if (value != nullValue) {
+                    umutablecptrie_setRange(mutableTrie.getAlias(), start, c - 1, value, &errorCode);
+                }
+                start = c;
+                value = nextValue;
+            }
+        }
+    }
+    // Flush the last run, which extends to the end of the code point space.
+    // Compare with nullValue (not 0), consistent with the loop above: when
+    // nullValue is nonzero, a trailing run with value 0 must still be stored.
+    if (value != nullValue) {
+        umutablecptrie_setRange(mutableTrie.getAlias(), start, 0x10FFFF, value, &errorCode);
+    }
+
+    // Only these two properties get the "fast" trie type; all others get "small".
+    UCPTrieType type;
+    if (property == UCHAR_BIDI_CLASS || property == UCHAR_GENERAL_CATEGORY) {
+        type = UCPTRIE_TYPE_FAST;
+    } else {
+        type = UCPTRIE_TYPE_SMALL;
+    }
+    // Pick the narrowest value width that can hold the property's maximum value.
+    UCPTrieValueWidth valueWidth;
+    // TODO: UCharacterProperty.IntProperty
+    int32_t max = u_getIntPropertyMaxValue(property);
+    if (max <= 0xff) {
+        valueWidth = UCPTRIE_VALUE_BITS_8;
+    } else if (max <= 0xffff) {
+        valueWidth = UCPTRIE_VALUE_BITS_16;
+    } else {
+        valueWidth = UCPTRIE_VALUE_BITS_32;
+    }
+    return reinterpret_cast<UCPMap *>(
+        umutablecptrie_buildImmutable(mutableTrie.getAlias(), type, valueWidth, &errorCode));
+}
+
+}  // namespace
+
+U_NAMESPACE_USE
+
+// Returns the cached set for a binary property, building it on first use.
+// The cache keeps ownership of the returned object.
+U_CAPI const USet * U_EXPORT2
+u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode) {
+    if (U_FAILURE(*pErrorCode)) { return nullptr; }
+    if (!(0 <= property && property < UCHAR_BINARY_LIMIT)) {
+        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
+        return nullptr;
+    }
+    // Cache lookup and lazy construction both happen under the lock.
+    Mutex lock(&cpMutex);
+    UnicodeSet *&slot = sets[property];
+    if (slot == nullptr) {
+        slot = makeSet(property, *pErrorCode);
+    }
+    // makeSet() returns nullptr on failure, so the slot must not be dereferenced then.
+    if (U_FAILURE(*pErrorCode)) { return nullptr; }
+    return slot->toUSet();
+}
+
+// Returns the cached map for an int-valued property, building it on first use.
+// The cache keeps ownership of the returned object.
+U_CAPI const UCPMap * U_EXPORT2
+u_getIntPropertyMap(UProperty property, UErrorCode *pErrorCode) {
+    if (U_FAILURE(*pErrorCode)) { return nullptr; }
+    if (!(UCHAR_INT_START <= property && property < UCHAR_INT_LIMIT)) {
+        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
+        return nullptr;
+    }
+    // Cache lookup and lazy construction both happen under the lock.
+    Mutex lock(&cpMutex);
+    UCPMap *&slot = maps[property - UCHAR_INT_START];
+    if (slot == nullptr) {
+        slot = makeMap(property, *pErrorCode);
+    }
+    return slot;
+}
diff --git a/icu4c/source/common/common.vcxproj b/icu4c/source/common/common.vcxproj

index 9d4f1e94988d581c09bf84109972fc897b9d2091..2495a89563928bff60a9c397efbc04b2e1b65303 100644 (file)
--- a/icu4c/source/common/common.vcxproj
+++ b/icu4c/source/common/common.vcxproj
@@ -268,6 +268,7 @@
      <ClCompile Include="ruleiter.cpp" />
      <ClCompile Include="ucase.cpp" />
      <ClCompile Include="uchar.cpp" />
+    <ClCompile Include="characterproperties.cpp" />
      <ClCompile Include="unames.cpp" />
      <ClCompile Include="unifiedcache.cpp" />
      <ClCompile Include="unifilt.cpp" />
diff --git a/icu4c/source/common/common.vcxproj.filters b/icu4c/source/common/common.vcxproj.filters

index 52034549f1f6af00e3e16ac113f1a918891d30d9..99ed94fbd7056eeee5c4401a93de3865cbcaa1a9 100644 (file)
--- a/icu4c/source/common/common.vcxproj.filters
+++ b/icu4c/source/common/common.vcxproj.filters
@@ -388,6 +388,9 @@
      <ClCompile Include="bmpset.cpp">
        <Filter>properties &amp; sets</Filter>
      </ClCompile>
+    <ClCompile Include="characterproperties.cpp">
+      <Filter>properties &amp; sets</Filter>
+    </ClCompile>
      <ClCompile Include="propname.cpp">
        <Filter>properties &amp; sets</Filter>
      </ClCompile>
diff --git a/icu4c/source/common/common_uwp.vcxproj b/icu4c/source/common/common_uwp.vcxproj

index b9207e1ebdf83b8074f665a13adfaf812b9b345c..a3801840669460741fd7e04ede8fc7ac8935a2d9 100644 (file)
--- a/icu4c/source/common/common_uwp.vcxproj
+++ b/icu4c/source/common/common_uwp.vcxproj
@@ -393,6 +393,7 @@
      <ClCompile Include="ruleiter.cpp" />
      <ClCompile Include="ucase.cpp" />
      <ClCompile Include="uchar.cpp" />
+    <ClCompile Include="characterproperties.cpp" />
      <ClCompile Include="unames.cpp" />
      <ClCompile Include="unifiedcache.cpp" />
      <ClCompile Include="unifilt.cpp" />
diff --git a/icu4c/source/common/mutex.h b/icu4c/source/common/mutex.h

index bb45e7df83cf93b79735212414b7274946d4b656..47f5e080f820a5534cfd6555d726674098e6094b 100644 (file)
--- a/icu4c/source/common/mutex.h
+++ b/icu4c/source/common/mutex.h
@@ -34,9 +34,9 @@ U_NAMESPACE_BEGIN
  // private mutex where possible.
  
  // For example:
-// 
-// UMutex myMutex;
-// 
+//
+// UMutex myMutex = U_MUTEX_INITIALIZER;
+//
  // void Function(int arg1, int arg2)
  // {
  //    static Object* foo;     // Shared read-write object
diff --git a/icu4c/source/common/normalizer2impl.cpp b/icu4c/source/common/normalizer2impl.cpp

index 6816ddc853ab5500964458776de9ffb3b0850d97..e7ae646c41ae6d6192bce599a9c036d71f1d7f17 100644 (file)
--- a/icu4c/source/common/normalizer2impl.cpp
+++ b/icu4c/source/common/normalizer2impl.cpp
@@ -466,7 +466,7 @@ void
  Normalizer2Impl::addLcccChars(UnicodeSet &set) const {
      UChar32 start = 0, end;
      uint32_t norm16;
-    while ((end = ucptrie_getRange(normTrie, start, UCPTRIE_RANGE_FIXED_LEAD_SURROGATES, INERT,
+    while ((end = ucptrie_getRange(normTrie, start, UCPMAP_RANGE_FIXED_LEAD_SURROGATES, INERT,
                                     nullptr, nullptr, &norm16)) >= 0) {
          if (norm16 > Normalizer2Impl::MIN_NORMAL_MAYBE_YES &&
                  norm16 != Normalizer2Impl::JAMO_VT) {
@@ -484,7 +484,7 @@ Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode
      // Add the start code point of each same-value range of the trie.
      UChar32 start = 0, end;
      uint32_t value;
-    while ((end = ucptrie_getRange(normTrie, start, UCPTRIE_RANGE_FIXED_LEAD_SURROGATES, INERT,
+    while ((end = ucptrie_getRange(normTrie, start, UCPMAP_RANGE_FIXED_LEAD_SURROGATES, INERT,
                                     nullptr, nullptr, &value)) >= 0) {
          sa->add(sa->set, start);
          if (start != end && isAlgorithmicNoNo((uint16_t)value) &&
@@ -518,7 +518,7 @@ Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &err
      // Currently only used for the SEGMENT_STARTER property.
      UChar32 start = 0, end;
      uint32_t value;
-    while ((end = ucptrie_getRange(fCanonIterData->trie, start, UCPTRIE_RANGE_NORMAL, 0,
+    while ((end = ucptrie_getRange(fCanonIterData->trie, start, UCPMAP_RANGE_NORMAL, 0,
                                     segmentStarterMapper, nullptr, &value)) >= 0) {
          sa->add(sa->set, start);
          start = end + 1;
@@ -2398,7 +2398,7 @@ void InitCanonIterData::doInit(Normalizer2Impl *impl, UErrorCode &errorCode) {
          UChar32 start = 0, end;
          uint32_t value;
          while ((end = ucptrie_getRange(impl->normTrie, start,
-                                       UCPTRIE_RANGE_FIXED_LEAD_SURROGATES, Normalizer2Impl::INERT,
+                                       UCPMAP_RANGE_FIXED_LEAD_SURROGATES, Normalizer2Impl::INERT,
                                         nullptr, nullptr, &value)) >= 0) {
              // Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
              if (value != Normalizer2Impl::INERT) {
diff --git a/icu4c/source/common/normalizer2impl.h b/icu4c/source/common/normalizer2impl.h

index 2231110bbc503ba6ed02b7956297d2443f45f9af..2e6aff308819c51bbf249f504cf467066026a8ce 100644 (file)
--- a/icu4c/source/common/normalizer2impl.h
+++ b/icu4c/source/common/normalizer2impl.h
@@ -30,6 +30,7 @@
  #include "unicode/utf.h"
  #include "unicode/utf16.h"
  #include "mutex.h"
+#include "udataswp.h"
  #include "uset_imp.h"
  
  // When the nfc.nrm data is *not* hardcoded into the common library
diff --git a/icu4c/source/common/ucln_cmn.h b/icu4c/source/common/ucln_cmn.h

index d1971b998d9f1883e8afcd57d7eb0f2f132a95d8..0ca911b47d9875a7dcabadd0dad6b6d8ab611fcd 100644 (file)
--- a/icu4c/source/common/ucln_cmn.h
+++ b/icu4c/source/common/ucln_cmn.h
@@ -45,6 +45,7 @@ typedef enum ECleanupCommonType {
      UCLN_COMMON_CURRENCY,
      UCLN_COMMON_LOADED_NORMALIZER2,
      UCLN_COMMON_NORMALIZER2,
+    UCLN_COMMON_CHARACTERPROPERTIES,
      UCLN_COMMON_USET,
      UCLN_COMMON_UNAMES,
      UCLN_COMMON_UPROPS,
diff --git a/icu4c/source/common/ucptrie.cpp b/icu4c/source/common/ucptrie.cpp

index 09ac38a705be76a3d9ea49ca4ad3bd840bb76d3f..13496ad56c5e586de497233a3a5b3b6eb2b71240 100644 (file)
--- a/icu4c/source/common/ucptrie.cpp
+++ b/icu4c/source/common/ucptrie.cpp
@@ -247,7 +247,7 @@ namespace {
  constexpr int32_t MAX_UNICODE = 0x10ffff;
  
  inline uint32_t maybeFilterValue(uint32_t value, uint32_t trieNullValue, uint32_t nullValue,
-                                 UCPTrieValueFilter *filter, const void *context) {
+                                 UCPMapValueFilter *filter, const void *context) {
      if (value == trieNullValue) {
          value = nullValue;
      } else if (filter != nullptr) {
@@ -257,7 +257,7 @@ inline uint32_t maybeFilterValue(uint32_t value, uint32_t trieNullValue, uint32_
  }
  
  UChar32 getRange(const void *t, UChar32 start,
-                 UCPTrieValueFilter *filter, const void *context, uint32_t *pValue) {
+                 UCPMapValueFilter *filter, const void *context, uint32_t *pValue) {
      if ((uint32_t)start > MAX_UNICODE) {
          return U_SENTINEL;
      }
@@ -403,9 +403,9 @@ UChar32 getRange(const void *t, UChar32 start,
  U_CFUNC UChar32
  ucptrie_internalGetRange(UCPTrieGetRange *getRange,
                           const void *trie, UChar32 start,
-                         UCPTrieRangeOption option, uint32_t surrogateValue,
-                         UCPTrieValueFilter *filter, const void *context, uint32_t *pValue) {
-    if (option == UCPTRIE_RANGE_NORMAL) {
+                         UCPMapRangeOption option, uint32_t surrogateValue,
+                         UCPMapValueFilter *filter, const void *context, uint32_t *pValue) {
+    if (option == UCPMAP_RANGE_NORMAL) {
          return getRange(trie, start, filter, context, pValue);
      }
      uint32_t value;
@@ -413,7 +413,7 @@ ucptrie_internalGetRange(UCPTrieGetRange *getRange,
          // We need to examine the range value even if the caller does not want it.
          pValue = &value;
      }
-    UChar32 surrEnd = option == UCPTRIE_RANGE_FIXED_ALL_SURROGATES ? 0xdfff : 0xdbff;
+    UChar32 surrEnd = option == UCPMAP_RANGE_FIXED_ALL_SURROGATES ? 0xdfff : 0xdbff;
      UChar32 end = getRange(trie, start, filter, context, pValue);
      if (end < 0xd7ff || start > surrEnd) {
          return end;
@@ -448,8 +448,8 @@ ucptrie_internalGetRange(UCPTrieGetRange *getRange,
  
  U_CAPI UChar32 U_EXPORT2
  ucptrie_getRange(const UCPTrie *trie, UChar32 start,
-                 UCPTrieRangeOption option, uint32_t surrogateValue,
-                 UCPTrieValueFilter *filter, const void *context, uint32_t *pValue) {
+                 UCPMapRangeOption option, uint32_t surrogateValue,
+                 UCPMapValueFilter *filter, const void *context, uint32_t *pValue) {
      return ucptrie_internalGetRange(getRange, trie, start,
                                      option, surrogateValue,
                                      filter, context, pValue);
@@ -571,3 +571,20 @@ ucptrie_printLengths(const UCPTrie *trie, const char *which) {
  #endif
  
  }  // namespace
+
+// UCPMap ----
+// Initially, this is the same as UCPTrie. This may well change.
+
+U_CAPI uint32_t U_EXPORT2
+ucpmap_get(const UCPMap *map, UChar32 c) {
+    // The map is handled as a UCPTrie; just forward the lookup.
+    const UCPTrie *trie = reinterpret_cast<const UCPTrie *>(map);
+    return ucptrie_get(trie, c);
+}
+
+U_CAPI UChar32 U_EXPORT2
+ucpmap_getRange(const UCPMap *map, UChar32 start,
+                UCPMapRangeOption option, uint32_t surrogateValue,
+                UCPMapValueFilter *filter, const void *context, uint32_t *pValue) {
+    // The map is handled as a UCPTrie; forward all arguments unchanged.
+    const UCPTrie *trie = reinterpret_cast<const UCPTrie *>(map);
+    return ucptrie_getRange(trie, start, option, surrogateValue, filter, context, pValue);
+}
diff --git a/icu4c/source/common/ucptrie_impl.h b/icu4c/source/common/ucptrie_impl.h

index 8202628afafbf9d2ae52a8bf9ed6bf126f7a6cfc..1fe6a18ac5319e97ea2e3ff11342b5ca4d7ee2ed 100644 (file)
--- a/icu4c/source/common/ucptrie_impl.h
+++ b/icu4c/source/common/ucptrie_impl.h
@@ -131,13 +131,13 @@ enum {
  
  typedef UChar32
  UCPTrieGetRange(const void *trie, UChar32 start,
-                UCPTrieValueFilter *filter, const void *context, uint32_t *pValue);
+                UCPMapValueFilter *filter, const void *context, uint32_t *pValue);
  
  U_CFUNC UChar32
  ucptrie_internalGetRange(UCPTrieGetRange *getRange,
                           const void *trie, UChar32 start,
-                         UCPTrieRangeOption option, uint32_t surrogateValue,
-                         UCPTrieValueFilter *filter, const void *context, uint32_t *pValue);
+                         UCPMapRangeOption option, uint32_t surrogateValue,
+                         UCPMapValueFilter *filter, const void *context, uint32_t *pValue);
  
  #ifdef UCPTRIE_DEBUG
  U_CFUNC void
diff --git a/icu4c/source/common/umutablecptrie.cpp b/icu4c/source/common/umutablecptrie.cpp

index f23b5e19261479d09dcb6c5be51beba7ab72b95a..40af4b6c16a163a8ab7f777403d873ad9f38c4a7 100644 (file)
--- a/icu4c/source/common/umutablecptrie.cpp
+++ b/icu4c/source/common/umutablecptrie.cpp
@@ -70,10 +70,11 @@ public:
  
      MutableCodePointTrie &operator=(const MutableCodePointTrie &other) = delete;
  
+    static MutableCodePointTrie *fromUCPMap(const UCPMap *map, UErrorCode &errorCode);
      static MutableCodePointTrie *fromUCPTrie(const UCPTrie *trie, UErrorCode &errorCode);
  
      uint32_t get(UChar32 c) const;
-    int32_t getRange(UChar32 start, UCPTrieValueFilter *filter, const void *context,
+    int32_t getRange(UChar32 start, UCPMapValueFilter *filter, const void *context,
                       uint32_t *pValue) const;
  
      void set(UChar32 c, uint32_t value, UErrorCode &errorCode);
@@ -171,6 +172,36 @@ MutableCodePointTrie::~MutableCodePointTrie() {
      uprv_free(index16);
  }
  
+// Creates a mutable trie with the same contents as the given map.
+// Returns nullptr if errorCode indicates a failure.
+MutableCodePointTrie *MutableCodePointTrie::fromUCPMap(const UCPMap *map, UErrorCode &errorCode) {
+    // Use the highValue as the initialValue to reduce the highStart.
+    const uint32_t errorValue = ucpmap_get(map, -1);
+    const uint32_t initialValue = ucpmap_get(map, 0x10ffff);
+    LocalPointer<MutableCodePointTrie> builder(
+        new MutableCodePointTrie(initialValue, errorValue, errorCode),
+        errorCode);
+    if (U_FAILURE(errorCode)) {
+        return nullptr;
+    }
+    // Copy every range whose value differs from the initial value.
+    UChar32 rangeEnd;
+    uint32_t rangeValue;
+    for (UChar32 rangeStart = 0;
+            (rangeEnd = ucpmap_getRange(map, rangeStart, UCPMAP_RANGE_NORMAL, 0,
+                                        nullptr, nullptr, &rangeValue)) >= 0;
+            rangeStart = rangeEnd + 1) {
+        if (rangeValue == initialValue) {
+            continue;
+        }
+        if (rangeStart == rangeEnd) {
+            builder->set(rangeStart, rangeValue, errorCode);
+        } else {
+            builder->setRange(rangeStart, rangeEnd, rangeValue, errorCode);
+        }
+    }
+    // Release ownership only on success; otherwise the LocalPointer deletes the trie.
+    return U_SUCCESS(errorCode) ? builder.orphan() : nullptr;
+}
+
  MutableCodePointTrie *MutableCodePointTrie::fromUCPTrie(const UCPTrie *trie, UErrorCode &errorCode) {
      // Use the highValue as the initialValue to reduce the highStart.
      uint32_t errorValue;
@@ -201,7 +232,7 @@ MutableCodePointTrie *MutableCodePointTrie::fromUCPTrie(const UCPTrie *trie, UEr
      }
      UChar32 start = 0, end;
      uint32_t value;
-    while ((end = ucptrie_getRange(trie, start, UCPTRIE_RANGE_NORMAL, 0,
+    while ((end = ucptrie_getRange(trie, start, UCPMAP_RANGE_NORMAL, 0,
                                     nullptr, nullptr, &value)) >= 0) {
          if (value != initialValue) {
              if (start == end) {
@@ -244,7 +275,7 @@ uint32_t MutableCodePointTrie::get(UChar32 c) const {
  }
  
  inline uint32_t maybeFilterValue(uint32_t value, uint32_t initialValue, uint32_t nullValue,
-                                 UCPTrieValueFilter *filter, const void *context) {
+                                 UCPMapValueFilter *filter, const void *context) {
      if (value == initialValue) {
          value = nullValue;
      } else if (filter != nullptr) {
@@ -254,7 +285,7 @@ inline uint32_t maybeFilterValue(uint32_t value, uint32_t initialValue, uint32_t
  }
  
  UChar32 MutableCodePointTrie::getRange(
-        UChar32 start, UCPTrieValueFilter *filter, const void *context,
+        UChar32 start, UCPMapValueFilter *filter, const void *context,
          uint32_t *pValue) const {
      if ((uint32_t)start > MAX_UNICODE) {
          return U_SENTINEL;
@@ -1565,6 +1596,18 @@ umutablecptrie_close(UMutableCPTrie *trie) {
      delete reinterpret_cast<MutableCodePointTrie *>(trie);
  }
  
+// C API wrapper: validates the arguments, then delegates to
+// MutableCodePointTrie::fromUCPMap().
+U_CAPI UMutableCPTrie * U_EXPORT2
+umutablecptrie_fromUCPMap(const UCPMap *map, UErrorCode *pErrorCode) {
+    if (U_FAILURE(*pErrorCode)) { return nullptr; }
+    if (map == nullptr) {
+        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
+        return nullptr;
+    }
+    MutableCodePointTrie *built = MutableCodePointTrie::fromUCPMap(map, *pErrorCode);
+    return reinterpret_cast<UMutableCPTrie *>(built);
+}
+
  U_CAPI UMutableCPTrie * U_EXPORT2
  umutablecptrie_fromUCPTrie(const UCPTrie *trie, UErrorCode *pErrorCode) {
      if (U_FAILURE(*pErrorCode)) {
@@ -1585,7 +1628,7 @@ umutablecptrie_get(const UMutableCPTrie *trie, UChar32 c) {
  namespace {
  
  UChar32 getRange(const void *trie, UChar32 start,
-                 UCPTrieValueFilter *filter, const void *context, uint32_t *pValue) {
+                 UCPMapValueFilter *filter, const void *context, uint32_t *pValue) {
      return reinterpret_cast<const MutableCodePointTrie *>(trie)->
          getRange(start, filter, context, pValue);
  }
@@ -1594,8 +1637,8 @@ UChar32 getRange(const void *trie, UChar32 start,
  
  U_CAPI UChar32 U_EXPORT2
  umutablecptrie_getRange(const UMutableCPTrie *trie, UChar32 start,
-                        UCPTrieRangeOption option, uint32_t surrogateValue,
-                        UCPTrieValueFilter *filter, const void *context, uint32_t *pValue) {
+                        UCPMapRangeOption option, uint32_t surrogateValue,
+                        UCPMapValueFilter *filter, const void *context, uint32_t *pValue) {
      return ucptrie_internalGetRange(getRange, trie, start,
                                      option, surrogateValue,
                                      filter, context, pValue);
diff --git a/icu4c/source/common/unicode/uchar.h b/icu4c/source/common/unicode/uchar.h

index 29ec68fe7e1bc0b6d0fb99ee05681982db24b9ff..d9342626bc5a0c0e15f41fb57c8be681f3b7bfe2 100644 (file)
--- a/icu4c/source/common/unicode/uchar.h
+++ b/icu4c/source/common/unicode/uchar.h
@@ -27,6 +27,24 @@
  
  #include "unicode/utypes.h"
  #include "unicode/stringoptions.h"
+#include "unicode/ucpmap.h"
+
+#if !defined(USET_DEFINED) && !defined(U_IN_DOXYGEN)
+
+#define USET_DEFINED
+
+/**
+ * USet is the C API type corresponding to C++ class UnicodeSet.
+ * It is forward-declared here to avoid including unicode/uset.h file if related
+ * APIs are not used.
+ *
+ * @see ucnv_getUnicodeSet
+ * @stable ICU 2.4
+ */
+typedef struct USet USet;
+
+#endif
+
  
  U_CDECL_BEGIN
  
@@ -61,6 +79,18 @@ U_CDECL_BEGIN
   * "About the Unicode Character Database" (http://www.unicode.org/ucd/)
   * and the ICU User Guide chapter on Properties (http://icu-project.org/userguide/properties.html).
   *
+ * Many properties are accessible via generic functions that take a UProperty selector.
+ * - u_hasBinaryProperty() returns a binary value (TRUE/FALSE) per property and code point.
+ * - u_getIntPropertyValue() returns an integer value per property and code point.
+ *   For each supported enumerated or catalog property, there is
+ *   an enum type for all of the property's values, and
+ *   u_getIntPropertyValue() returns the numeric values of those constants.
+ * - u_getBinaryPropertySet() returns a set for each ICU-supported binary property with
+ *   all code points for which the property is true.
+ * - u_getIntPropertyMap() returns a map for each
+ *   ICU-supported enumerated/catalog/int-valued property which
+ *   maps all Unicode code points to their values for that property.
+ *
   * Many functions are designed to match java.lang.Character functions.
   * See the individual function documentation,
   * and see the JDK 1.4 java.lang.Character documentation
@@ -2519,6 +2549,7 @@ typedef enum UVerticalOrientation {
   *         does not have data for the property at all, or not for this code point.
   *
   * @see UProperty
+ * @see u_getBinaryPropertySet
   * @see u_getIntPropertyValue
   * @see u_getUnicodeVersion
   * @stable ICU 2.1
@@ -2526,6 +2557,27 @@ typedef enum UVerticalOrientation {
  U_STABLE UBool U_EXPORT2
  u_hasBinaryProperty(UChar32 c, UProperty which);
  
+#ifndef U_HIDE_DRAFT_API
+
+/**
+ * Returns a frozen USet for a binary property.
+ * The library retains ownership over the returned object.
+ * Sets an error code if the property number is not one for a binary property.
+ *
+ * The returned set contains all code points for which the property is true.
+ *
+ * @param property UCHAR_BINARY_START..UCHAR_BINARY_LIMIT-1
+ * @param pErrorCode an in/out ICU UErrorCode
+ * @return the property as a set
+ * @see UProperty
+ * @see u_hasBinaryProperty
+ * @see UnicodeSet::fromUSet
+ */
+U_CAPI const USet * U_EXPORT2
+u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode);
+
+#endif  // U_HIDE_DRAFT_API
+
  /**
   * Check if a code point has the Alphabetic Unicode property.
   * Same as u_hasBinaryProperty(c, UCHAR_ALPHABETIC).
@@ -2626,6 +2678,7 @@ u_isUWhiteSpace(UChar32 c);
   * @see u_hasBinaryProperty
   * @see u_getIntPropertyMinValue
   * @see u_getIntPropertyMaxValue
+ * @see u_getIntPropertyMap
   * @see u_getUnicodeVersion
   * @stable ICU 2.2
   */
@@ -2682,6 +2735,27 @@ u_getIntPropertyMinValue(UProperty which);
  U_STABLE int32_t U_EXPORT2
  u_getIntPropertyMaxValue(UProperty which);
  
+#ifndef U_HIDE_DRAFT_API
+
+/**
+ * Returns an immutable UCPMap for an enumerated/catalog/int-valued property.
+ * The library retains ownership over the returned object.
+ * Sets an error code if the property number is not one for an "int property".
+ *
+ * The returned object maps all Unicode code points to their values for that property.
+ * For documentation of the integer values see u_getIntPropertyValue().
+ *
+ * @param property UCHAR_INT_START..UCHAR_INT_LIMIT-1
+ * @param pErrorCode an in/out ICU UErrorCode
+ * @return the property as a map
+ * @see UProperty
+ * @see u_getIntPropertyValue
+ */
+U_CAPI const UCPMap * U_EXPORT2
+u_getIntPropertyMap(UProperty property, UErrorCode *pErrorCode);
+
+#endif  // U_HIDE_DRAFT_API
+
  /**
   * Get the numeric value for a Unicode code point as defined in the
   * Unicode Character Database.
diff --git a/icu4c/source/common/unicode/ucnv.h b/icu4c/source/common/unicode/ucnv.h

index 53b4c6f0733aca65f5c6e121d01a8021c63b9038..ec7c5f350b497378298874dd869a837137964352 100644 (file)
--- a/icu4c/source/common/unicode/ucnv.h
+++ b/icu4c/source/common/unicode/ucnv.h
@@ -53,19 +53,18 @@
  #include "unicode/uenum.h"
  #include "unicode/localpointer.h"
  
-#ifndef __USET_H__
+#if !defined(USET_DEFINED) && !defined(U_IN_DOXYGEN)
+
+#define USET_DEFINED
  
  /**
- * USet is the C API type for Unicode sets.
- * It is forward-declared here to avoid including the header file if related
+ * USet is the C API type corresponding to C++ class UnicodeSet.
+ * It is forward-declared here to avoid including the unicode/uset.h file if related
   * conversion APIs are not used.
- * See unicode/uset.h
   *
   * @see ucnv_getUnicodeSet
- * @stable ICU 2.6
+ * @stable ICU 2.4
   */
-struct USet;
-/** @stable ICU 2.6 */
  typedef struct USet USet;
  
  #endif
diff --git a/icu4c/source/common/unicode/ucpmap.h b/icu4c/source/common/unicode/ucpmap.h

new file mode 100644 (file)

index 0000000..58fed20
--- /dev/null
+++ b/icu4c/source/common/unicode/ucpmap.h
@@ -0,0 +1,159 @@
+// © 2018 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+// ucpmap.h
+// created: 2018sep03 Markus W. Scherer
+
+#ifndef __UCPMAP_H__
+#define __UCPMAP_H__
+
+#include "unicode/utypes.h"
+
+#ifndef U_HIDE_DRAFT_API
+
+U_CDECL_BEGIN
+
+/**
+ * \file
+ *
+ * This file defines an abstract map from Unicode code points to integer values.
+ *
+ * @see UCPMap
+ * @see UCPTrie
+ * @see UMutableCPTrie
+ */
+
+/**
+ * Abstract map from Unicode code points (U+0000..U+10FFFF) to integer values.
+ *
+ * @see UCPTrie
+ * @see UMutableCPTrie
+ * @draft ICU 63
+ */
+typedef struct UCPMap UCPMap;
+
+/**
+ * Selectors for how ucpmap_getRange() etc. should report value ranges overlapping with surrogates.
+ * Most users should use UCPMAP_RANGE_NORMAL.
+ *
+ * @see ucpmap_getRange
+ * @see ucptrie_getRange
+ * @see umutablecptrie_getRange
+ * @draft ICU 63
+ */
+enum UCPMapRangeOption {
+    /**
+     * ucpmap_getRange() enumerates all same-value ranges as stored in the map.
+     * Most users should use this option.
+     */
+    UCPMAP_RANGE_NORMAL,
+    /**
+     * ucpmap_getRange() enumerates all same-value ranges as stored in the map,
+     * except that lead surrogates (U+D800..U+DBFF) are treated as having the
+     * surrogateValue, which is passed to getRange() as a separate parameter.
+     * The surrogateValue is not transformed via filter().
+     * See U_IS_LEAD(c).
+     *
+     * Most users should use UCPMAP_RANGE_NORMAL instead.
+     *
+     * This option is useful for maps that map surrogate code *units* to
+     * special values optimized for UTF-16 string processing
+     * or for special error behavior for unpaired surrogates,
+     * but those values are not to be associated with the lead surrogate code *points*.
+     */
+    UCPMAP_RANGE_FIXED_LEAD_SURROGATES,
+    /**
+     * ucpmap_getRange() enumerates all same-value ranges as stored in the map,
+     * except that all surrogates (U+D800..U+DFFF) are treated as having the
+     * surrogateValue, which is passed to getRange() as a separate parameter.
+     * The surrogateValue is not transformed via filter().
+     * See U_IS_SURROGATE(c).
+     *
+     * Most users should use UCPMAP_RANGE_NORMAL instead.
+     *
+     * This option is useful for maps that map surrogate code *units* to
+     * special values optimized for UTF-16 string processing
+     * or for special error behavior for unpaired surrogates,
+     * but those values are not to be associated with the surrogate code *points*.
+     */
+    UCPMAP_RANGE_FIXED_ALL_SURROGATES
+};
+#ifndef U_IN_DOXYGEN
+typedef enum UCPMapRangeOption UCPMapRangeOption;
+#endif
+
+/**
+ * Returns the value for a code point as stored in the map, with range checking.
+ * Returns an implementation-defined error value if c is not in the range 0..U+10FFFF.
+ *
+ * @param map the map
+ * @param c the code point
+ * @return the map value,
+ *         or an implementation-defined error value if the code point is not in the range 0..U+10FFFF
+ * @draft ICU 63
+ */
+U_CAPI uint32_t U_EXPORT2
+ucpmap_get(const UCPMap *map, UChar32 c);
+
+/**
+ * Callback function type: Modifies a map value.
+ * Optionally called by ucpmap_getRange()/ucptrie_getRange()/umutablecptrie_getRange().
+ * The modified value will be returned by the getRange function.
+ *
+ * Can be used to ignore some of the value bits,
+ * make a filter for one of several values,
+ * return a value index computed from the map value, etc.
+ *
+ * @param context an opaque pointer, as passed into the getRange function
+ * @param value a value from the map
+ * @return the modified value
+ * @draft ICU 63
+ */
+typedef uint32_t U_CALLCONV
+UCPMapValueFilter(const void *context, uint32_t value);
+
+/**
+ * Returns the last code point such that all those from start to there have the same value.
+ * Can be used to efficiently iterate over all same-value ranges in a map.
+ * (This is normally faster than iterating over code points and get()ting each value,
+ * but much slower than a data structure that stores ranges directly.)
+ *
+ * If the UCPMapValueFilter function pointer is not NULL, then
+ * the value to be delivered is passed through that function, and the return value is the end
+ * of the range where all values are modified to the same actual value.
+ * The value is unchanged if that function pointer is NULL.
+ *
+ * Example:
+ * \code
+ * UChar32 start = 0, end;
+ * uint32_t value;
+ * while ((end = ucpmap_getRange(map, start, UCPMAP_RANGE_NORMAL, 0,
+ *                               NULL, NULL, &value)) >= 0) {
+ *     // Work with the range start..end and its value.
+ *     start = end + 1;
+ * }
+ * \endcode
+ *
+ * @param map the map
+ * @param start range start
+ * @param option defines whether surrogates are treated normally,
+ *               or as having the surrogateValue; usually UCPMAP_RANGE_NORMAL
+ * @param surrogateValue value for surrogates; ignored if option==UCPMAP_RANGE_NORMAL
+ * @param filter a pointer to a function that may modify the map data value,
+ *     or NULL if the values from the map are to be used unmodified
+ * @param context an opaque pointer that is passed on to the filter function
+ * @param pValue if not NULL, receives the value that every code point start..end has;
+ *     may have been modified by filter(context, map value)
+ *     if that function pointer is not NULL
+ * @return the range end code point, or -1 if start is not a valid code point
+ * @draft ICU 63
+ */
+U_CAPI UChar32 U_EXPORT2
+ucpmap_getRange(const UCPMap *map, UChar32 start,
+                UCPMapRangeOption option, uint32_t surrogateValue,
+                UCPMapValueFilter *filter, const void *context, uint32_t *pValue);
+
+U_CDECL_END
+
+#endif  // U_HIDE_DRAFT_API
+#endif
diff --git a/icu4c/source/common/unicode/ucptrie.h b/icu4c/source/common/unicode/ucptrie.h

index 505995b3a733f0bfbac8c2947b5d900e0cbe6527..461c47a4f2eae73a1197755a0b0f25a34d75ab5e 100644 (file)
--- a/icu4c/source/common/unicode/ucptrie.h
+++ b/icu4c/source/common/unicode/ucptrie.h
@@ -8,10 +8,12 @@
  #define __UCPTRIE_H__
  
  #include "unicode/utypes.h"
+
+#ifndef U_HIDE_DRAFT_API
+
  #include "unicode/localpointer.h"
+#include "unicode/ucpmap.h"
  #include "unicode/utf8.h"
-#include "putilimp.h"
-#include "udataswp.h"
  
  U_CDECL_BEGIN
  
@@ -174,54 +176,6 @@ enum UCPTrieValueWidth {
  typedef enum UCPTrieValueWidth UCPTrieValueWidth;
  #endif
  
-/**
- * Selectors for how ucptrie_getRange() should report value ranges overlapping with surrogates.
- * Most users should use UCPTRIE_RANGE_NORMAL.
- *
- * @see ucptrie_getRange
- * @draft ICU 63
- */
-enum UCPTrieRangeOption {
-    /**
-     * ucptrie_getRange() enumerates all same-value ranges as stored in the trie.
-     * Most users should use this option.
-     */
-    UCPTRIE_RANGE_NORMAL,
-    /**
-     * ucptrie_getRange() enumerates all same-value ranges as stored in the trie,
-     * except that lead surrogates (U+D800..U+DBFF) are treated as having the
-     * surrogateValue, which is passed to getRange() as a separate parameter.
-     * The surrogateValue is not transformed via filter().
-     * See U_IS_LEAD(c).
-     *
-     * Most users should use UCPTRIE_RANGE_NORMAL instead.
-     *
-     * This option is useful for tries that map surrogate code *units* to
-     * special values optimized for UTF-16 string processing
-     * or for special error behavior for unpaired surrogates,
-     * but those values are not to be associated with the lead surrogate code *points*.
-     */
-    UCPTRIE_RANGE_FIXED_LEAD_SURROGATES,
-    /**
-     * ucptrie_getRange() enumerates all same-value ranges as stored in the trie,
-     * except that all surrogates (U+D800..U+DFFF) are treated as having the
-     * surrogateValue, which is passed to getRange() as a separate parameter.
-     * The surrogateValue is not transformed via filter().
-     * See U_IS_SURROGATE(c).
-     *
-     * Most users should use UCPTRIE_RANGE_NORMAL instead.
-     *
-     * This option is useful for tries that map surrogate code *units* to
-     * special values optimized for UTF-16 string processing
-     * or for special error behavior for unpaired surrogates,
-     * but those values are not to be associated with the lead surrogate code *points*.
-     */
-    UCPTRIE_RANGE_FIXED_ALL_SURROGATES
-};
-#ifndef U_IN_DOXYGEN
-typedef enum UCPTrieRangeOption UCPTrieRangeOption;
-#endif
-
  /**
   * Opens a trie from its binary form, stored in 32-bit-aligned memory.
   * Inverse of ucptrie_toBinary().
@@ -322,30 +276,13 @@ ucptrie_getValueWidth(const UCPTrie *trie);
  U_CAPI uint32_t U_EXPORT2
  ucptrie_get(const UCPTrie *trie, UChar32 c);
  
-/**
- * Callback function type: Modifies a trie value.
- * Optionally called by ucptrie_getRange() or umutablecptrie_getRange().
- * The modified value will be returned by the getRange function.
- *
- * Can be used to ignore some of the value bits,
- * make a filter for one of several values,
- * return a value index computed from the trie value, etc.
- *
- * @param context an opaque pointer, as passed into the getRange function
- * @param value a value from the trie
- * @return the modified value
- * @draft ICU 63
- */
-typedef uint32_t U_CALLCONV
-UCPTrieValueFilter(const void *context, uint32_t value);
-
  /**
   * Returns the last code point such that all those from start to there have the same value.
   * Can be used to efficiently iterate over all same-value ranges in a trie.
   * (This is normally faster than iterating over code points and get()ting each value,
   * but much slower than a data structure that stores ranges directly.)
   *
- * If the UCPTrieValueFilter function pointer is not NULL, then
+ * If the UCPMapValueFilter function pointer is not NULL, then
   * the value to be delivered is passed through that function, and the return value is the end
   * of the range where all values are modified to the same actual value.
   * The value is unchanged if that function pointer is NULL.
@@ -354,7 +291,7 @@ UCPTrieValueFilter(const void *context, uint32_t value);
   * \code
   * UChar32 start = 0, end;
   * uint32_t value;
- * while ((end = ucptrie_getRange(trie, start, UCPTRIE_RANGE_NORMAL, 0,
+ * while ((end = ucptrie_getRange(trie, start, UCPMAP_RANGE_NORMAL, 0,
   *                                NULL, NULL, &value)) >= 0) {
   *     // Work with the range start..end and its value.
   *     start = end + 1;
@@ -364,8 +301,8 @@ UCPTrieValueFilter(const void *context, uint32_t value);
   * @param trie the trie
   * @param start range start
   * @param option defines whether surrogates are treated normally,
- *               or as having the surrogateValue; usually UCPTRIE_RANGE_NORMAL
- * @param surrogateValue value for surrogates; ignored if option==UCPTRIE_RANGE_NORMAL
+ *               or as having the surrogateValue; usually UCPMAP_RANGE_NORMAL
+ * @param surrogateValue value for surrogates; ignored if option==UCPMAP_RANGE_NORMAL
   * @param filter a pointer to a function that may modify the trie data value,
   *     or NULL if the values from the trie are to be used unmodified
   * @param context an opaque pointer that is passed on to the filter function
@@ -377,8 +314,8 @@ UCPTrieValueFilter(const void *context, uint32_t value);
   */
  U_CAPI UChar32 U_EXPORT2
  ucptrie_getRange(const UCPTrie *trie, UChar32 start,
-                 UCPTrieRangeOption option, uint32_t surrogateValue,
-                 UCPTrieValueFilter *filter, const void *context, uint32_t *pValue);
+                 UCPMapRangeOption option, uint32_t surrogateValue,
+                 UCPMapValueFilter *filter, const void *context, uint32_t *pValue);
  
  /**
   * Writes a memory-mappable form of the trie into 32-bit aligned memory.
@@ -704,4 +641,5 @@ ucptrie_internalU8PrevIndex(const UCPTrie *trie, UChar32 c,
  U_CDECL_END
  
  #endif  // U_IN_DOXYGEN
+#endif  // U_HIDE_DRAFT_API
  #endif
diff --git a/icu4c/source/common/unicode/umutablecptrie.h b/icu4c/source/common/unicode/umutablecptrie.h

index 31d10f4d74f95feb117fbb913e52a3cbdcdf77bc..e75191a4495209fb4f98278fd5ee353ae8f84bd5 100644 (file)
--- a/icu4c/source/common/unicode/umutablecptrie.h
+++ b/icu4c/source/common/unicode/umutablecptrie.h
@@ -8,11 +8,13 @@
  #define __UMUTABLECPTRIE_H__
  
  #include "unicode/utypes.h"
+
+#ifndef U_HIDE_DRAFT_API
+
  #include "unicode/localpointer.h"
+#include "unicode/ucpmap.h"
  #include "unicode/ucptrie.h"
  #include "unicode/utf8.h"
-#include "putilimp.h"
-#include "udataswp.h"
  
  U_CDECL_BEGIN
  
@@ -102,6 +104,18 @@ U_NAMESPACE_END
  
  #endif
  
+/**
+ * Creates a mutable trie with the same contents as the UCPMap.
+ * You must umutablecptrie_close() the mutable trie once you are done using it.
+ *
+ * @param map the source map
+ * @param pErrorCode an in/out ICU UErrorCode
+ * @return the mutable trie
+ * @draft ICU 63
+ */
+U_CAPI UMutableCPTrie * U_EXPORT2
+umutablecptrie_fromUCPMap(const UCPMap *map, UErrorCode *pErrorCode);
+
  /**
   * Creates a mutable trie with the same contents as the immutable one.
   * You must umutablecptrie_close() the mutable trie once you are done using it.
@@ -133,7 +147,7 @@ umutablecptrie_get(const UMutableCPTrie *trie, UChar32 c);
   *
   * The trie can be modified between calls to this function.
   *
- * If the UCPTrieValueFilter function pointer is not NULL, then
+ * If the UCPMapValueFilter function pointer is not NULL, then
   * the value to be delivered is passed through that function, and the return value is the end
   * of the range where all values are modified to the same actual value.
   * The value is unchanged if that function pointer is NULL.
@@ -143,8 +157,8 @@ umutablecptrie_get(const UMutableCPTrie *trie, UChar32 c);
   * @param trie the trie
   * @param start range start
   * @param option defines whether surrogates are treated normally,
- *               or as having the surrogateValue; usually UCPTRIE_RANGE_NORMAL
- * @param surrogateValue value for surrogates; ignored if option==UCPTRIE_RANGE_NORMAL
+ *               or as having the surrogateValue; usually UCPMAP_RANGE_NORMAL
+ * @param surrogateValue value for surrogates; ignored if option==UCPMAP_RANGE_NORMAL
   * @param filter a pointer to a function that may modify the trie data value,
   *     or NULL if the values from the trie are to be used unmodified
   * @param context an opaque pointer that is passed on to the filter function
@@ -156,8 +170,8 @@ umutablecptrie_get(const UMutableCPTrie *trie, UChar32 c);
   */
  U_CAPI UChar32 U_EXPORT2
  umutablecptrie_getRange(const UMutableCPTrie *trie, UChar32 start,
-                        UCPTrieRangeOption option, uint32_t surrogateValue,
-                        UCPTrieValueFilter *filter, const void *context, uint32_t *pValue);
+                        UCPMapRangeOption option, uint32_t surrogateValue,
+                        UCPMapValueFilter *filter, const void *context, uint32_t *pValue);
  
  /**
   * Sets a value for a code point.
@@ -223,4 +237,5 @@ umutablecptrie_buildImmutable(UMutableCPTrie *trie, UCPTrieType type, UCPTrieVal
  
  U_CDECL_END
  
+#endif  // U_HIDE_DRAFT_API
  #endif
diff --git a/icu4c/source/common/unicode/uniset.h b/icu4c/source/common/unicode/uniset.h

index ed9a3eb72ff3aea0b49cd36e6c755a89c967e6b2..bd9aa5600df7d209f0b7c996d2fb3f5e3ba393be 100644 (file)
--- a/icu4c/source/common/unicode/uniset.h
+++ b/icu4c/source/common/unicode/uniset.h
@@ -13,6 +13,7 @@
  #ifndef UNICODESET_H
  #define UNICODESET_H
  
+#include "unicode/ucpmap.h"
  #include "unicode/unifilt.h"
  #include "unicode/unistr.h"
  #include "unicode/uset.h"
@@ -25,9 +26,8 @@
  U_NAMESPACE_BEGIN
  
  // Forward Declarations.
-void U_CALLCONV UnicodeSet_initInclusion(int32_t src, UErrorCode &status); /**< @internal */
-
  class BMPSet;
+class CharacterProperties;
  class ParsePosition;
  class RBBIRuleScanner;
  class SymbolTable;
@@ -584,9 +584,8 @@ public:
      //----------------------------------------------------------------
  
      /**
-     * Make this object represent the range <code>start - end</code>.
-     * If <code>end > start</code> then this object is set to an
-     * an empty range.
+     * Make this object represent the range `start - end`.
+     * If `start > end` then this object is set to an empty range.
       * A frozen set will not be modified.
       *
       * @param start first character in the set, inclusive
@@ -1506,6 +1505,7 @@ private:
      //----------------------------------------------------------------
  
      UnicodeSet(const UnicodeSet& o, UBool /* asThawed */);
+    UnicodeSet& copyFrom(const UnicodeSet& o, UBool asThawed);
  
      //----------------------------------------------------------------
      // Implementation: Pattern parsing
@@ -1614,7 +1614,7 @@ private:
                                UnicodeString& rebuiltPat,
                                UErrorCode& ec);
  
-    friend void U_CALLCONV UnicodeSet_initInclusion(int32_t src, UErrorCode &status);
+    friend class CharacterProperties;
      static const UnicodeSet* getInclusions(int32_t src, UErrorCode &status);
  
      /**
@@ -1634,9 +1634,13 @@ private:
       */
      void applyFilter(Filter filter,
                       void* context,
-                     int32_t src,
+                     const UnicodeSet* inclusions,
                       UErrorCode &status);
  
+    void applyIntPropertyValue(const UCPMap *map,
+                               UCPMapValueFilter *filter, const void *context,
+                               UErrorCode &errorCode);
+
      /**
       * Set the new pattern to cache.
       */
diff --git a/icu4c/source/common/unicode/uset.h b/icu4c/source/common/unicode/uset.h

index 59f46507d548aa48658f8478af79d5fd43a9f2cf..18482c10e738443f46eeaf47df65381dc7d8abb5 100644 (file)
--- a/icu4c/source/common/unicode/uset.h
+++ b/icu4c/source/common/unicode/uset.h
@@ -33,10 +33,14 @@
  #include "unicode/uchar.h"
  #include "unicode/localpointer.h"
  
-#ifndef UCNV_H
-struct USet;
+#ifndef USET_DEFINED
+
+#ifndef U_IN_DOXYGEN
+#define USET_DEFINED
+#endif
  /**
- * A UnicodeSet.  Use the uset_* API to manipulate.  Create with
+ * USet is the C API type corresponding to C++ class UnicodeSet.
+ * Use the uset_* API to manipulate.  Create with
   * uset_open*, and destroy with uset_close.
   * @stable ICU 2.4
   */
diff --git a/icu4c/source/common/uniset.cpp b/icu4c/source/common/uniset.cpp

index 7206e63e8873a3ee0864efcd2fd35678b38237ab..e8378e0a223f2bfe805626b726392c6b829a37af 100644 (file)
--- a/icu4c/source/common/uniset.cpp
+++ b/icu4c/source/common/uniset.cpp
@@ -276,6 +276,10 @@ UnicodeSet::~UnicodeSet() {
   * Assigns this object to be a copy of another.
   */
  UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
+    return copyFrom(o, FALSE);
+}
+
+UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) {
      if (this == &o) {
          return *this;
      }
@@ -294,7 +298,7 @@ UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
      }
      len = o.len;
      uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
-    if (o.bmpSet == NULL) {
+    if (o.bmpSet == NULL || asThawed) {
          bmpSet = NULL;
      } else {
          bmpSet = new BMPSet(*o.bmpSet, list, len);
@@ -309,7 +313,7 @@ UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
          setToBogus();
          return *this;
      }
-    if (o.stringSpan == NULL) {
+    if (o.stringSpan == NULL || asThawed) {
          stringSpan = NULL;
      } else {
          stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings);
diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp

index 7e5b132b69bc52ecb728d0ff3df98f672aa06d08..6cfd80a705b8fcb4911b94ab29c44d6c315e7653 100644 (file)
--- a/icu4c/source/common/uniset_props.cpp
+++ b/icu4c/source/common/uniset_props.cpp
@@ -36,8 +36,6 @@
  #include "uprops.h"
  #include "propname.h"
  #include "normalizer2impl.h"
-#include "ucase.h"
-#include "ubidi_props.h"
  #include "uinvchar.h"
  #include "uprops.h"
  #include "charstr.h"
@@ -98,47 +96,13 @@ static const char ASSIGNED[] = "Assigned"; // [:^Cn:]
  U_CDECL_BEGIN
  static UBool U_CALLCONV uset_cleanup();
  
-struct Inclusion {
-    UnicodeSet  *fSet;
-    UInitOnce    fInitOnce;
-};
-static Inclusion gInclusions[UPROPS_SRC_COUNT]; // cached getInclusions()
-
  static UnicodeSet *uni32Singleton;
  static icu::UInitOnce uni32InitOnce = U_INITONCE_INITIALIZER;
  
-//----------------------------------------------------------------
-// Inclusions list
-//----------------------------------------------------------------
-
-// USetAdder implementation
-// Does not use uset.h to reduce code dependencies
-static void U_CALLCONV
-_set_add(USet *set, UChar32 c) {
-    ((UnicodeSet *)set)->add(c);
-}
-
-static void U_CALLCONV
-_set_addRange(USet *set, UChar32 start, UChar32 end) {
-    ((UnicodeSet *)set)->add(start, end);
-}
-
-static void U_CALLCONV
-_set_addString(USet *set, const UChar *str, int32_t length) {
-    ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length));
-}
-
  /**
   * Cleanup function for UnicodeSet
   */
  static UBool U_CALLCONV uset_cleanup(void) {
-    for(int32_t i = UPROPS_SRC_NONE; i < UPROPS_SRC_COUNT; ++i) {
-        Inclusion &in = gInclusions[i];
-        delete in.fSet;
-        in.fSet = NULL;
-        in.fInitOnce.reset();
-    }
-
      delete uni32Singleton;
      uni32Singleton = NULL;
      uni32InitOnce.reset();
@@ -149,119 +113,6 @@ U_CDECL_END
  
  U_NAMESPACE_BEGIN
  
-/*
-Reduce excessive reallocation, and make it easier to detect initialization problems.
-Usually you don't see smaller sets than this for Unicode 5.0.
-*/
-#define DEFAULT_INCLUSION_CAPACITY 3072
-
-void U_CALLCONV UnicodeSet_initInclusion(int32_t src, UErrorCode &status) {
-    // This function is invoked only via umtx_initOnce().
-    // This function is a friend of class UnicodeSet.
-
-    U_ASSERT(src >=0 && src<UPROPS_SRC_COUNT);
-    UnicodeSet * &incl = gInclusions[src].fSet;
-    U_ASSERT(incl == NULL);
-
-    incl = new UnicodeSet();
-    if (incl == NULL) {
-        status = U_MEMORY_ALLOCATION_ERROR;
-        return;
-    }
-    USetAdder sa = {
-        (USet *)incl,
-        _set_add,
-        _set_addRange,
-        _set_addString,
-        NULL, // don't need remove()
-        NULL // don't need removeRange()
-    };
-
-    incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, status);
-    switch(src) {
-    case UPROPS_SRC_CHAR:
-        uchar_addPropertyStarts(&sa, &status);
-        break;
-    case UPROPS_SRC_PROPSVEC:
-        upropsvec_addPropertyStarts(&sa, &status);
-        break;
-    case UPROPS_SRC_CHAR_AND_PROPSVEC:
-        uchar_addPropertyStarts(&sa, &status);
-        upropsvec_addPropertyStarts(&sa, &status);
-        break;
-#if !UCONFIG_NO_NORMALIZATION
-    case UPROPS_SRC_CASE_AND_NORM: {
-        const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status);
-        if(U_SUCCESS(status)) {
-            impl->addPropertyStarts(&sa, status);
-        }
-        ucase_addPropertyStarts(&sa, &status);
-        break;
-    }
-    case UPROPS_SRC_NFC: {
-        const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status);
-        if(U_SUCCESS(status)) {
-            impl->addPropertyStarts(&sa, status);
-        }
-        break;
-    }
-    case UPROPS_SRC_NFKC: {
-        const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(status);
-        if(U_SUCCESS(status)) {
-            impl->addPropertyStarts(&sa, status);
-        }
-        break;
-    }
-    case UPROPS_SRC_NFKC_CF: {
-        const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(status);
-        if(U_SUCCESS(status)) {
-            impl->addPropertyStarts(&sa, status);
-        }
-        break;
-    }
-    case UPROPS_SRC_NFC_CANON_ITER: {
-        const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status);
-        if(U_SUCCESS(status)) {
-            impl->addCanonIterPropertyStarts(&sa, status);
-        }
-        break;
-    }
-#endif
-    case UPROPS_SRC_CASE:
-        ucase_addPropertyStarts(&sa, &status);
-        break;
-    case UPROPS_SRC_BIDI:
-        ubidi_addPropertyStarts(&sa, &status);
-        break;
-    case UPROPS_SRC_INPC:
-    case UPROPS_SRC_INSC:
-    case UPROPS_SRC_VO:
-        uprops_addPropertyStarts((UPropertySource)src, &sa, &status);
-        break;
-    default:
-        status = U_INTERNAL_PROGRAM_ERROR;
-        break;
-    }
-
-    if (U_FAILURE(status)) {
-        delete incl;
-        incl = NULL;
-        return;
-    }
-    // Compact for caching
-    incl->compact();
-    ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
-}
-
-
-
-const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) {
-    U_ASSERT(src >=0 && src<UPROPS_SRC_COUNT);
-    Inclusion &i = gInclusions[src];
-    umtx_initOnce(i.fInitOnce, &UnicodeSet_initInclusion, src, status);
-    return i.fSet;
-}
-
  namespace {
  
  // Cache some sets for other services -------------------------------------- ***
@@ -862,11 +713,6 @@ static UBool numericValueFilter(UChar32 ch, void* context) {
      return u_getNumericValue(ch) == *(double*)context;
  }
  
-static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
-    int32_t value = *(int32_t*)context;
-    return (U_GET_GC_MASK((UChar32) ch) & value) != 0;
-}
-
  static UBool versionFilter(UChar32 ch, void* context) {
      static const UVersionInfo none = { 0, 0, 0, 0 };
      UVersionInfo v;
@@ -875,16 +721,6 @@ static UBool versionFilter(UChar32 ch, void* context) {
      return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
  }
  
-typedef struct {
-    UProperty prop;
-    int32_t value;
-} IntPropertyContext;
-
-static UBool intPropertyFilter(UChar32 ch, void* context) {
-    IntPropertyContext* c = (IntPropertyContext*)context;
-    return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;
-}
-
  static UBool scriptExtensionsFilter(UChar32 ch, void* context) {
      return uscript_hasScript(ch, *(UScriptCode*)context);
  }
@@ -896,7 +732,7 @@ static UBool scriptExtensionsFilter(UChar32 ch, void* context) {
   */
  void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
                               void* context,
-                             int32_t src,
+                             const UnicodeSet* inclusions,
                               UErrorCode &status) {
      if (U_FAILURE(status)) return;
  
@@ -907,12 +743,8 @@ void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
      // To improve performance, use an inclusions set which
      // encodes information about character ranges that are known
      // to have identical properties.
-    // getInclusions(src) contains exactly the first characters of
-    // same-value ranges for the given properties "source".
-    const UnicodeSet* inclusions = getInclusions(src, status);
-    if (U_FAILURE(status)) {
-        return;
-    }
+    // inclusions contains the first characters of
+    // same-value ranges for the given property.
  
      clear();
  
@@ -949,6 +781,43 @@ void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
  
  namespace {
  
+/** Maps map values to 1 if the mask contains their value'th bit, all others to 0. */
+uint32_t U_CALLCONV generalCategoryMaskFilter(const void *context, uint32_t value) {
+    uint32_t mask = *(const uint32_t *)context;
+    value = U_MASK(value) & mask;
+    if (value != 0) { value = 1; }
+    return value;
+}
+
+/** Maps one map value to 1, all others to 0. */
+uint32_t U_CALLCONV intValueFilter(const void *context, uint32_t value) {
+    uint32_t v = *(const uint32_t *)context;
+    return value == v ? 1 : 0;
+}
+
+}  // namespace
+
+void UnicodeSet::applyIntPropertyValue(const UCPMap *map,
+                                       UCPMapValueFilter *filter, const void *context,
+                                       UErrorCode &errorCode) {
+    if (U_FAILURE(errorCode)) { return; }
+    clear();
+    UChar32 start = 0, end;
+    uint32_t value;
+    while ((end = ucpmap_getRange(map, start, UCPMAP_RANGE_NORMAL, 0,
+                                  filter, context, &value)) >= 0) {
+        if (value != 0) {
+            add(start, end);
+        }
+        start = end + 1;
+    }
+    if (isBogus()) {
+        errorCode = U_MEMORY_ALLOCATION_ERROR;
+    }
+}
+
+namespace {
+
  static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
      /* Note: we use ' ' in compiler code page */
      int32_t j = 0;
@@ -976,16 +845,35 @@ static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
  
  UnicodeSet&
  UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
-    if (U_FAILURE(ec) || isFrozen()) return *this;
-
+    if (U_FAILURE(ec)) { return *this; }
+    // All of the following check isFrozen() before modifying this set.
      if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
-        applyFilter(generalCategoryMaskFilter, &value, UPROPS_SRC_CHAR, ec);
+        const UCPMap *map = u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY, &ec);
+        applyIntPropertyValue(map, generalCategoryMaskFilter, &value, ec);
      } else if (prop == UCHAR_SCRIPT_EXTENSIONS) {
+        const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
          UScriptCode script = (UScriptCode)value;
-        applyFilter(scriptExtensionsFilter, &script, UPROPS_SRC_PROPSVEC, ec);
+        applyFilter(scriptExtensionsFilter, &script, inclusions, ec);
+    } else if (0 <= prop && prop < UCHAR_BINARY_LIMIT) {
+        if (value == 0 || value == 1) {
+            const USet *set = u_getBinaryPropertySet(prop, &ec);
+            if (U_FAILURE(ec)) { return *this; }
+            copyFrom(*UnicodeSet::fromUSet(set), TRUE);
+            if (value == 0) {
+                complement();
+            }
+        } else {
+            clear();
+        }
+    } else if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
+        const UCPMap *map = u_getIntPropertyMap(prop, &ec);
+        applyIntPropertyValue(map, intValueFilter, &value, ec);
      } else {
-        IntPropertyContext c = {prop, value};
-        applyFilter(intPropertyFilter, &c, uprops_getSource(prop), ec);
+        // This code used to always call getInclusions(property source)
+        // which sets an error for an unsupported property.
+        ec = U_ILLEGAL_ARGUMENT_ERROR;
+        // Otherwise we would just clear() this set because
+        // getIntPropertyValue(c, prop) returns 0 for all code points.
      }
      return *this;
  }
@@ -1061,7 +949,8 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
                      if (*end != 0) {
                          FAIL(ec);
                      }
-                    applyFilter(numericValueFilter, &val, UPROPS_SRC_CHAR, ec);
+                    applyFilter(numericValueFilter, &val,
+                                CharacterProperties::getInclusionsForProperty(p, ec), ec);
                      return *this;
                  }
              case UCHAR_NAME:
@@ -1090,7 +979,8 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
                      if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
                      UVersionInfo version;
                      u_versionFromString(version, buf);
-                    applyFilter(versionFilter, &version, UPROPS_SRC_PROPSVEC, ec);
+                    applyFilter(versionFilter, &version,
+                                CharacterProperties::getInclusionsForProperty(p, ec), ec);
                      return *this;
                  }
              case UCHAR_SCRIPT_EXTENSIONS:
diff --git a/icu4c/source/common/uprops.cpp b/icu4c/source/common/uprops.cpp

index 9738881b1dc81be13b89d8c5340e6313bf338a23..2421c15d2bd0b60babc4a01be327790f0a803dd3 100644 (file)
--- a/icu4c/source/common/uprops.cpp
+++ b/icu4c/source/common/uprops.cpp
@@ -605,7 +605,7 @@ uprops_addPropertyStarts(UPropertySource src, const USetAdder *sa, UErrorCode *p
  
      // Add the start code point of each same-value range of the trie.
      UChar32 start = 0, end;
-    while ((end = ucptrie_getRange(trie, start, UCPTRIE_RANGE_NORMAL, 0,
+    while ((end = ucptrie_getRange(trie, start, UCPMAP_RANGE_NORMAL, 0,
                                     nullptr, nullptr, nullptr)) >= 0) {
          sa->add(sa->set, start);
          start = end + 1;
diff --git a/icu4c/source/common/uprops.h b/icu4c/source/common/uprops.h

index 0896973da323c1b7fe32607a0c5332f08fcef830..1a8e4e84f7445d2669193faaa2773b933f15d110 100644 (file)
--- a/icu4c/source/common/uprops.h
+++ b/icu4c/source/common/uprops.h
@@ -459,6 +459,13 @@ U_NAMESPACE_BEGIN
  
  class UnicodeSet;
  
+class CharacterProperties {  // static members only
+public:
+    CharacterProperties() = delete;  // not instantiable
+    static void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode);  // takes a property source, not a property
+    static const UnicodeSet *getInclusionsForProperty(UProperty prop, UErrorCode &errorCode);  // inclusions set for prop, as passed to UnicodeSet::applyFilter()
+};
+
  // implemented in uniset_props.cpp
  U_CFUNC UnicodeSet *
  uniset_getUnicode32Instance(UErrorCode &errorCode);
diff --git a/icu4c/source/test/cintltst/cucdtst.c b/icu4c/source/test/cintltst/cucdtst.c

index 4e18cfa002577ac408c8e50e3c6d2a84907f59c1..059bd72adad6b65d8efa0a887d106c8901fe4777 100644 (file)
--- a/icu4c/source/test/cintltst/cucdtst.c
+++ b/icu4c/source/test/cintltst/cucdtst.c
@@ -61,6 +61,8 @@ static void TestPropertyNames(void);
  static void TestPropertyValues(void);
  static void TestConsistency(void);
  static void TestCaseFolding(void);
+static void TestBinaryCharacterPropertiesAPI(void);
+static void TestIntCharacterPropertiesAPI(void);
  
  /* internal methods used */
  static int32_t MakeProp(char* str);
@@ -196,6 +198,10 @@ void addUnicodeTest(TestNode** root)
      addTest(root, &TestPropertyValues, "tsutil/cucdtst/TestPropertyValues");
      addTest(root, &TestConsistency, "tsutil/cucdtst/TestConsistency");
      addTest(root, &TestCaseFolding, "tsutil/cucdtst/TestCaseFolding");
+    addTest(root, &TestBinaryCharacterPropertiesAPI,
+            "tsutil/cucdtst/TestBinaryCharacterPropertiesAPI");
+    addTest(root, &TestIntCharacterPropertiesAPI,
+            "tsutil/cucdtst/TestIntCharacterPropertiesAPI");
  }
  
  /*==================================================== */
@@ -3522,3 +3528,41 @@ TestCaseFolding() {
  
      uset_close(data.notSeen);
  }
+
+static void TestBinaryCharacterPropertiesAPI(void) {
+    // API test only. See intltest/ucdtest.cpp for functional test.
+    UErrorCode errorCode = U_ZERO_ERROR;
+    const USet *set = u_getBinaryPropertySet(-1, &errorCode);  // below the binary property range
+    if (U_SUCCESS(errorCode)) {
+        log_err("u_getBinaryPropertySet(-1) did not fail\n");
+    }
+    errorCode = U_ZERO_ERROR;
+    set = u_getBinaryPropertySet(UCHAR_BINARY_LIMIT, &errorCode);  // first value past the range
+    if (U_SUCCESS(errorCode)) {
+        log_err("u_getBinaryPropertySet(UCHAR_BINARY_LIMIT) did not fail\n");
+    }
+    errorCode = U_ZERO_ERROR;
+    set = u_getBinaryPropertySet(UCHAR_WHITE_SPACE, &errorCode);  // valid property: expected to succeed
+    if (U_FAILURE(errorCode) || !uset_contains(set, 0x20) || uset_contains(set, 0x61)) {
+        log_err("u_getBinaryPropertySet(UCHAR_WHITE_SPACE) failed or wrong contents\n");
+    }
+}
+
+static void TestIntCharacterPropertiesAPI(void) {
+    // API test only. See intltest/ucdtest.cpp for functional test.
+    UErrorCode errorCode = U_ZERO_ERROR;
+    const UCPMap *map = u_getIntPropertyMap(UCHAR_INT_START - 1, &errorCode);  // just below the int property range
+    if (U_SUCCESS(errorCode)) {
+        log_err("u_getIntPropertyMap(UCHAR_INT_START - 1) did not fail\n");
+    }
+    errorCode = U_ZERO_ERROR;
+    map = u_getIntPropertyMap(UCHAR_INT_LIMIT, &errorCode);  // first value past the int property range
+    if (U_SUCCESS(errorCode)) {
+        log_err("u_getIntPropertyMap(UCHAR_INT_LIMIT) did not fail\n");
+    }
+    errorCode = U_ZERO_ERROR;
+    map = u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY, &errorCode);  // valid property: expected to succeed
+    if (U_FAILURE(errorCode) || ucpmap_get(map, 0x20) != U_SPACE_SEPARATOR || ucpmap_get(map, 0x23456) != U_OTHER_LETTER) {
+        log_err("u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY) failed or wrong contents\n");
+    }
+}
diff --git a/icu4c/source/test/cintltst/ucptrietest.c b/icu4c/source/test/cintltst/ucptrietest.c

index 9969a62937a5c745498efe7964a40819ab9a73f9..299ef900b834d10c5615c8a4834e6fe35b5ce166 100644 (file)
--- a/icu4c/source/test/cintltst/ucptrietest.c
+++ b/icu4c/source/test/cintltst/ucptrietest.c
@@ -107,11 +107,11 @@ static UChar32 iterStarts[] = {
  
  static void
  testTrieGetRanges(const char *testName, const UCPTrie *trie, const UMutableCPTrie *mutableTrie,
-                  UCPTrieRangeOption option, uint32_t surrValue,
+                  UCPMapRangeOption option, uint32_t surrValue,
                    const CheckRange checkRanges[], int32_t countCheckRanges) {
      const char *const typeName = trie == NULL ? "mutableTrie" : "trie";
-    const char *const optionName = option == UCPTRIE_RANGE_NORMAL ? "normal" :
-        option == UCPTRIE_RANGE_FIXED_LEAD_SURROGATES ? "fixedLeadSurr" : "fixedAllSurr";
+    const char *const optionName = option == UCPMAP_RANGE_NORMAL ? "normal" :
+        option == UCPMAP_RANGE_FIXED_LEAD_SURROGATES ? "fixedLeadSurr" : "fixedAllSurr";
      char name[80];
      int32_t s;
      for (s = 0; s < UPRV_LENGTHOF(iterStarts); ++s) {
@@ -690,7 +690,7 @@ testTrie(const char *testName, const UCPTrie *trie,
           UCPTrieType type, UCPTrieValueWidth valueWidth,
           const CheckRange checkRanges[], int32_t countCheckRanges) {
      testTrieGetters(testName, trie, type, valueWidth, checkRanges, countCheckRanges);
-    testTrieGetRanges(testName, trie, NULL, UCPTRIE_RANGE_NORMAL, 0, checkRanges, countCheckRanges);
+    testTrieGetRanges(testName, trie, NULL, UCPMAP_RANGE_NORMAL, 0, checkRanges, countCheckRanges);
      if (type == UCPTRIE_TYPE_FAST) {
          testTrieUTF16(testName, trie, valueWidth, checkRanges, countCheckRanges);
          testTrieUTF8(testName, trie, valueWidth, checkRanges, countCheckRanges);
@@ -701,7 +701,7 @@ static void
  testBuilder(const char *testName, const UMutableCPTrie *mutableTrie,
              const CheckRange checkRanges[], int32_t countCheckRanges) {
      testBuilderGetters(testName, mutableTrie, checkRanges, countCheckRanges);
-    testTrieGetRanges(testName, NULL, mutableTrie, UCPTRIE_RANGE_NORMAL, 0, checkRanges, countCheckRanges);
+    testTrieGetRanges(testName, NULL, mutableTrie, UCPMAP_RANGE_NORMAL, 0, checkRanges, countCheckRanges);
  }
  
  static uint32_t storage[120000];
@@ -1366,7 +1366,7 @@ MuchDataTest(void) {
  }
  
  static void testGetRangesFixedSurr(const char *testName, const UMutableCPTrie *mutableTrie,
-                                   UCPTrieRangeOption option,
+                                   UCPMapRangeOption option,
                                     const CheckRange checkRanges[], int32_t countCheckRanges) {
      testTrieGetRanges(testName, NULL, mutableTrie, option, 5, checkRanges, countCheckRanges);
      UErrorCode errorCode = U_ZERO_ERROR;
@@ -1454,9 +1454,9 @@ TrieTestGetRangesFixedSurr(void) {
      if (mutableTrie == NULL) {
          return;
      }
-    testGetRangesFixedSurr("fixedLeadSurr1", mutableTrie, UCPTRIE_RANGE_FIXED_LEAD_SURROGATES,
+    testGetRangesFixedSurr("fixedLeadSurr1", mutableTrie, UCPMAP_RANGE_FIXED_LEAD_SURROGATES,
                             checkRangesFixedLeadSurr1, UPRV_LENGTHOF(checkRangesFixedLeadSurr1));
-    testGetRangesFixedSurr("fixedAllSurr1", mutableTrie, UCPTRIE_RANGE_FIXED_ALL_SURROGATES,
+    testGetRangesFixedSurr("fixedAllSurr1", mutableTrie, UCPMAP_RANGE_FIXED_ALL_SURROGATES,
                             checkRangesFixedAllSurr1, UPRV_LENGTHOF(checkRangesFixedAllSurr1));
      // Setting a range in the middle of lead surrogates makes no difference.
      umutablecptrie_setRange(mutableTrie, 0xd844, 0xd899, 5, &errorCode);
@@ -1465,7 +1465,7 @@ TrieTestGetRangesFixedSurr(void) {
          umutablecptrie_close(mutableTrie);
          return;
      }
-    testGetRangesFixedSurr("fixedLeadSurr2", mutableTrie, UCPTRIE_RANGE_FIXED_LEAD_SURROGATES,
+    testGetRangesFixedSurr("fixedLeadSurr2", mutableTrie, UCPMAP_RANGE_FIXED_LEAD_SURROGATES,
                             checkRangesFixedLeadSurr1, UPRV_LENGTHOF(checkRangesFixedLeadSurr1));
      // Bridge the gap before the lead surrogates.
      umutablecptrie_set(mutableTrie, 0xd7ff, 5, &errorCode);
@@ -1474,9 +1474,9 @@ TrieTestGetRangesFixedSurr(void) {
          umutablecptrie_close(mutableTrie);
          return;
      }
-    testGetRangesFixedSurr("fixedLeadSurr3", mutableTrie, UCPTRIE_RANGE_FIXED_LEAD_SURROGATES,
+    testGetRangesFixedSurr("fixedLeadSurr3", mutableTrie, UCPMAP_RANGE_FIXED_LEAD_SURROGATES,
                             checkRangesFixedLeadSurr3, UPRV_LENGTHOF(checkRangesFixedLeadSurr3));
-    testGetRangesFixedSurr("fixedAllSurr3", mutableTrie, UCPTRIE_RANGE_FIXED_ALL_SURROGATES,
+    testGetRangesFixedSurr("fixedAllSurr3", mutableTrie, UCPMAP_RANGE_FIXED_ALL_SURROGATES,
                             checkRangesFixedAllSurr3, UPRV_LENGTHOF(checkRangesFixedAllSurr3));
      // Bridge the gap after the trail surrogates.
      umutablecptrie_set(mutableTrie, 0xe000, 5, &errorCode);
@@ -1485,7 +1485,7 @@ TrieTestGetRangesFixedSurr(void) {
          umutablecptrie_close(mutableTrie);
          return;
      }
-    testGetRangesFixedSurr("fixedSurr4", mutableTrie, UCPTRIE_RANGE_FIXED_ALL_SURROGATES,
+    testGetRangesFixedSurr("fixedSurr4", mutableTrie, UCPMAP_RANGE_FIXED_ALL_SURROGATES,
                             checkRangesFixedSurr4, UPRV_LENGTHOF(checkRangesFixedSurr4));
      umutablecptrie_close(mutableTrie);
  }
diff --git a/icu4c/source/test/intltest/ucdtest.cpp b/icu4c/source/test/intltest/ucdtest.cpp

index 26a1d23ab67861df154e11e52aef6ba75b81d7af..cdad0ae7c987b50917783a9dafccc0de75315480 100644 (file)
--- a/icu4c/source/test/intltest/ucdtest.cpp
+++ b/icu4c/source/test/intltest/ucdtest.cpp
@@ -7,13 +7,16 @@
  
  #include "unicode/ustring.h"
  #include "unicode/uchar.h"
+#include "unicode/ucpmap.h"
  #include "unicode/uniset.h"
  #include "unicode/putil.h"
  #include "unicode/uscript.h"
+#include "unicode/uset.h"
  #include "cstring.h"
  #include "hash.h"
  #include "patternprops.h"
  #include "normalizer2impl.h"
+#include "testutil.h"
  #include "uparse.h"
  #include "ucdtest.h"
  
@@ -67,6 +70,8 @@ void UnicodeTest::runIndexedTest( int32_t index, UBool exec, const char* &name,
      TESTCASE_AUTO(TestVerticalOrientation);
      TESTCASE_AUTO(TestDefaultScriptExtensions);
      TESTCASE_AUTO(TestInvalidCodePointFolding);
+    TESTCASE_AUTO(TestBinaryCharacterProperties);
+    TESTCASE_AUTO(TestIntCharacterProperties);
      TESTCASE_AUTO_END;
  }
  
@@ -615,3 +620,73 @@ void UnicodeTest::TestInvalidCodePointFolding(void) {
                  cp, u_foldCase(cp, U_FOLD_CASE_EXCLUDE_SPECIAL_I));
      }
  }
+
+void UnicodeTest::TestBinaryCharacterProperties() {
+    IcuTestErrorCode errorCode(*this, "TestBinaryCharacterProperties()");
+    // Spot-check u_getBinaryPropertySet() vs. u_hasBinaryProperty() for every binary property.
+    for (int32_t prop = 0; prop < UCHAR_BINARY_LIMIT; ++prop) {
+        const USet *uset = u_getBinaryPropertySet((UProperty)prop, errorCode);
+        if (errorCode.errIfFailureAndReset("u_getBinaryPropertySet(%d)", (int)prop)) {
+            continue;  // failure already reported; move on to the next property
+        }
+        const UnicodeSet &set = *UnicodeSet::fromUSet(uset);
+        int32_t size = set.size();
+        if (size == 0) {  // empty set: probe three sample code points, none may have the property
+            assertFalse(UnicodeString("!hasBinaryProperty(U+0020, ") + prop + u")",
+                u_hasBinaryProperty(0x20, (UProperty)prop));
+            assertFalse(UnicodeString("!hasBinaryProperty(U+0061, ") + prop + u")",
+                u_hasBinaryProperty(0x61, (UProperty)prop));
+            assertFalse(UnicodeString("!hasBinaryProperty(U+4E00, ") + prop + u")",
+                u_hasBinaryProperty(0x4e00, (UProperty)prop));
+        } else {  // non-empty set: check both ends of the set and their outside neighbors
+            UChar32 c = set.charAt(0);  // element at index 0
+            if (c > 0) {  // c - 1 exists only above U+0000
+                assertFalse(
+                    UnicodeString("!hasBinaryProperty(") + TestUtility::hex(c - 1) +
+                        u", " + prop + u")",
+                    u_hasBinaryProperty(c - 1, (UProperty)prop));
+            }
+            assertTrue(
+                UnicodeString("hasBinaryProperty(") + TestUtility::hex(c) +
+                    u", " + prop + u")",
+                u_hasBinaryProperty(c, (UProperty)prop));
+            c = set.charAt(size - 1);  // element at the last index
+            assertTrue(
+                UnicodeString("hasBinaryProperty(") + TestUtility::hex(c) +
+                    u", " + prop + u")",
+                u_hasBinaryProperty(c, (UProperty)prop));
+            if (c < 0x10ffff) {  // c + 1 exists only below U+10FFFF
+                assertFalse(
+                    UnicodeString("!hasBinaryProperty(") + TestUtility::hex(c + 1) +
+                        u", " + prop + u")",
+                    u_hasBinaryProperty(c + 1, (UProperty)prop));
+            }
+        }
+    }
+}
+
+void UnicodeTest::TestIntCharacterProperties() {
+    IcuTestErrorCode errorCode(*this, "TestIntCharacterProperties()");
+    // Spot-check u_getIntPropertyMap() vs. u_getIntPropertyValue() for every int property.
+    for (int32_t prop = UCHAR_INT_START; prop < UCHAR_INT_LIMIT; ++prop) {
+        const UCPMap *map = u_getIntPropertyMap((UProperty)prop, errorCode);
+        if (errorCode.errIfFailureAndReset("u_getIntPropertyMap(%d)", (int)prop)) {
+            continue;  // failure already reported; move on to the next property
+        }
+        uint32_t value;
+        UChar32 end = ucpmap_getRange(map, 0, UCPMAP_RANGE_NORMAL, 0, nullptr, nullptr, &value);
+        assertTrue("int property first range", end >= 0);
+        UChar32 c = end / 2;  // a code point inside the first range
+        assertEquals(UnicodeString("int property first range value at ") + TestUtility::hex(c),
+            u_getIntPropertyValue(c, (UProperty)prop), value);
+        end = ucpmap_getRange(map, 0x5000, UCPMAP_RANGE_NORMAL, 0, nullptr, nullptr, &value);
+        assertTrue("int property later range", end >= 0);
+        assertEquals(UnicodeString("int property later range value at ") + TestUtility::hex(end),
+            u_getIntPropertyValue(end, (UProperty)prop), value);
+        // ucpmap_get() API coverage
+        // TODO: move to cucdtst.c
+        assertEquals(
+            "int property ucpmap_get(U+0061)",
+            u_getIntPropertyValue(0x61, (UProperty)prop), ucpmap_get(map, 0x61));
+    }
+}
diff --git a/icu4c/source/test/intltest/ucdtest.h b/icu4c/source/test/intltest/ucdtest.h

index 1fe75e7eb9391fa45ed5a177c4109bbb5c6df8c7..2ed1395b50f4e9268d268a22da61a7e602d1b254 100644 (file)
--- a/icu4c/source/test/intltest/ucdtest.h
+++ b/icu4c/source/test/intltest/ucdtest.h
@@ -46,6 +46,8 @@ public:
      void TestVerticalOrientation();
      void TestDefaultScriptExtensions();
      void TestInvalidCodePointFolding();
+    void TestBinaryCharacterProperties();
+    void TestIntCharacterProperties();
  
  private:
  
diff --git a/icu4c/source/tools/gennorm2/n2builder.cpp b/icu4c/source/tools/gennorm2/n2builder.cpp

index f3b7cafc39e681684968963c500af6f918130dff..1d3888523121ef645c5721ca9dca23bd488d2b2e 100644 (file)
--- a/icu4c/source/tools/gennorm2/n2builder.cpp
+++ b/icu4c/source/tools/gennorm2/n2builder.cpp
@@ -650,7 +650,7 @@ LocalUCPTriePointer Normalizer2DataBuilder::processData() {
      // First check that surrogate code *points* are inert.
      // The parser should have rejected values/mappings for them.
      uint32_t value;
-    UChar32 end = umutablecptrie_getRange(norm16Trie, 0xd800, UCPTRIE_RANGE_NORMAL, 0,
+    UChar32 end = umutablecptrie_getRange(norm16Trie, 0xd800, UCPMAP_RANGE_NORMAL, 0,
                                            nullptr, nullptr, &value);
      if (value != Normalizer2Impl::INERT || end < 0xdfff) {
          fprintf(stderr,
@@ -665,7 +665,7 @@ LocalUCPTriePointer Normalizer2DataBuilder::processData() {
      end = 0;
      for (UChar32 start = 0x10000;;) {
          if (start > end) {
-            end = umutablecptrie_getRange(norm16Trie, start, UCPTRIE_RANGE_NORMAL, 0,
+            end = umutablecptrie_getRange(norm16Trie, start, UCPMAP_RANGE_NORMAL, 0,
                                            nullptr, nullptr, &value);
              if (end < 0) { break; }
          }
diff --git a/icu4c/source/tools/gennorm2/norms.cpp b/icu4c/source/tools/gennorm2/norms.cpp

index da7e2a80917d82d0517e49c84a86bb9b64c767cf..96692f233cc09f163dc876aa9b3a4128256504aa 100644 (file)
--- a/icu4c/source/tools/gennorm2/norms.cpp
+++ b/icu4c/source/tools/gennorm2/norms.cpp
@@ -156,7 +156,7 @@ UBool Norms::combinesWithCCBetween(const Norm &norm, uint8_t lowCC, int32_t high
  void Norms::enumRanges(Enumerator &e) {
      UChar32 start = 0, end;
      uint32_t i;
-    while ((end = umutablecptrie_getRange(normTrie, start, UCPTRIE_RANGE_NORMAL, 0,
+    while ((end = umutablecptrie_getRange(normTrie, start, UCPMAP_RANGE_NORMAL, 0,
                                            nullptr, nullptr, &i)) >= 0) {
          if (i > 0) {
              e.rangeHandler(start, end, norms[i]);
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/CharacterPropertiesImpl.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/CharacterPropertiesImpl.java

new file mode 100644 (file)

index 0000000..41b005b
--- /dev/null
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/CharacterPropertiesImpl.java
@@ -0,0 +1,86 @@
+// © 2018 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+package com.ibm.icu.impl;
+
+import com.ibm.icu.text.UnicodeSet;
+
+/**
+ * Properties functionality above class UCharacterProperty
+ * but below class CharacterProperties and class UnicodeSet.
+ */
+public final class CharacterPropertiesImpl {
+    /**
+     * A set of all characters _except_ the second through last characters of
+     * certain ranges. These ranges are ranges of characters whose
+     * properties are all exactly alike, e.g. CJK Ideographs from
+     * U+4E00 to U+9FA5.
+     */
+    private static final UnicodeSet inclusions[] = new UnicodeSet[UCharacterProperty.SRC_COUNT];
+
+    /**
+     * Forgets all cached inclusions sets. For {@link UnicodeSet#setDefaultXSymbolTable}.
+     */
+    public static synchronized void clear() {
+        java.util.Arrays.fill(inclusions, null);
+    }
+
+    private static synchronized UnicodeSet getInclusionsForSource(int src) {
+        UnicodeSet cached = inclusions[src];
+        if (cached != null) { return cached; }
+        UnicodeSet starts = new UnicodeSet();
+        switch(src) {
+        case UCharacterProperty.SRC_CHAR:
+            UCharacterProperty.INSTANCE.addPropertyStarts(starts);
+            break;
+        case UCharacterProperty.SRC_PROPSVEC:
+            UCharacterProperty.INSTANCE.upropsvec_addPropertyStarts(starts);
+            break;
+        case UCharacterProperty.SRC_CHAR_AND_PROPSVEC:
+            UCharacterProperty.INSTANCE.addPropertyStarts(starts);
+            UCharacterProperty.INSTANCE.upropsvec_addPropertyStarts(starts);
+            break;
+        case UCharacterProperty.SRC_CASE_AND_NORM:
+            Norm2AllModes.getNFCInstance().impl.addPropertyStarts(starts);
+            UCaseProps.INSTANCE.addPropertyStarts(starts);
+            break;
+        case UCharacterProperty.SRC_NFC:
+            Norm2AllModes.getNFCInstance().impl.addPropertyStarts(starts);
+            break;
+        case UCharacterProperty.SRC_NFKC:
+            Norm2AllModes.getNFKCInstance().impl.addPropertyStarts(starts);
+            break;
+        case UCharacterProperty.SRC_NFKC_CF:
+            Norm2AllModes.getNFKC_CFInstance().impl.addPropertyStarts(starts);
+            break;
+        case UCharacterProperty.SRC_NFC_CANON_ITER:
+            Norm2AllModes.getNFCInstance().impl.addCanonIterPropertyStarts(starts);
+            break;
+        case UCharacterProperty.SRC_CASE:
+            UCaseProps.INSTANCE.addPropertyStarts(starts);
+            break;
+        case UCharacterProperty.SRC_BIDI:
+            UBiDiProps.INSTANCE.addPropertyStarts(starts);
+            break;
+        case UCharacterProperty.SRC_INPC:
+        case UCharacterProperty.SRC_INSC:
+        case UCharacterProperty.SRC_VO:
+            UCharacterProperty.INSTANCE.ulayout_addPropertyStarts(src, starts);
+            break;
+        default:
+            throw new IllegalStateException("getInclusions(unknown src " + src + ")");
+        }
+        // The set is deliberately left unfrozen: callers only iterate over it
+        // and never test contains(), so the extra time and memory that
+        // freezing spends on optimizing lookups would be wasted.
+        inclusions[src] = starts;
+        return starts;
+    }
+
+    /**
+     * Returns the inclusions set for the data source of prop.
+     * The returned UnicodeSet is mutable and shared -- do not modify!
+     */
+    public static UnicodeSet getInclusionsForProperty(int prop) {
+        return getInclusionsForSource(UCharacterProperty.INSTANCE.getSource(prop));
+    }
+}
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/UCharacterProperty.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/UCharacterProperty.java

index dad93aaed1a51d994aab8317a91d84e4e4c6a225..12c53d6e0030d357f86e8fe32f9ee20a712b808b 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/UCharacterProperty.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/UCharacterProperty.java
@@ -1535,7 +1535,7 @@ public final class UCharacterProperty
          return -1; // undefined
      }
  
-    public final int getSource(int which) {
+    final int getSource(int which) {
          if(which<UProperty.BINARY_START) {
              return SRC_NONE; /* undefined */
          } else if(which<UProperty.BINARY_LIMIT) {
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/lang/CharacterProperties.java b/icu4j/main/classes/core/src/com/ibm/icu/lang/CharacterProperties.java

new file mode 100644 (file)

index 0000000..ea597a7
--- /dev/null
+++ b/icu4j/main/classes/core/src/com/ibm/icu/lang/CharacterProperties.java
@@ -0,0 +1,158 @@
+// © 2018 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+
+package com.ibm.icu.lang;
+
+import com.ibm.icu.impl.CharacterPropertiesImpl;
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.util.CodePointMap;
+import com.ibm.icu.util.CodePointTrie;
+import com.ibm.icu.util.MutableCodePointTrie;
+
+/**
+ * Sets and maps for Unicode properties.
+ * The methods here return an object per property:
+ * A set for each ICU-supported binary property with all code points for which the property is true.
+ * A map for each ICU-supported enumerated/catalog/int-valued property
+ * which maps all Unicode code points to their values for that property.
+ *
+ * <p>For details see the method descriptions.
+ * For lookup of property values by code point see class {@link UCharacter}.
+ *
+ * @draft ICU 63
+ * @provisional This API might change or be removed in a future release.
+ */
+public final class CharacterProperties {
+    private CharacterProperties() {}  // all-static
+
+    private static final UnicodeSet sets[] = new UnicodeSet[UProperty.BINARY_LIMIT];
+    private static final CodePointMap maps[] = new CodePointMap[UProperty.INT_LIMIT - UProperty.INT_START];
+
+    private static UnicodeSet makeSet(int property) {  // builds the frozen set of code points that have this binary property
+        UnicodeSet set = new UnicodeSet();
+        UnicodeSet inclusions = CharacterPropertiesImpl.getInclusionsForProperty(property);
+        int numRanges = inclusions.getRangeCount();
+        int startHasProperty = -1;  // start of the currently open run of "true" code points, or -1 if none
+        // Only code points in the inclusions set are tested; an open run stays open across the gaps between them.
+        for (int i = 0; i < numRanges; ++i) {
+            int rangeEnd = inclusions.getRangeEnd(i);
+            for (int c = inclusions.getRangeStart(i); c <= rangeEnd; ++c) {
+                // TODO: Get a UCharacterProperty.BinaryProperty to avoid the property dispatch.
+                if (UCharacter.hasBinaryProperty(c, property)) {
+                    if (startHasProperty < 0) {
+                        // Transition from false to true.
+                        startHasProperty = c;
+                    }
+                } else if (startHasProperty >= 0) {
+                    // Transition from true to false.
+                    set.add(startHasProperty, c - 1);
+                    startHasProperty = -1;
+                }
+            }
+        }
+        if (startHasProperty >= 0) {  // a run that is still open extends to the last code point
+            set.add(startHasProperty, 0x10FFFF);
+        }
+        // Freeze the result before returning it.
+        return set.freeze();
+    }
+
+    private static CodePointMap makeMap(int property) {  // builds the immutable code point -> value map for an int property
+        int nullValue = property == UProperty.SCRIPT ? UScript.UNKNOWN : 0;
+        MutableCodePointTrie mutableTrie = new MutableCodePointTrie(nullValue, nullValue);
+        UnicodeSet inclusions = CharacterPropertiesImpl.getInclusionsForProperty(property);
+        int numRanges = inclusions.getRangeCount();
+        int start = 0;
+        int value = nullValue;
+        // Only inclusions code points are tested; runs of nullValue are not stored (the trie was constructed with nullValue).
+        for (int i = 0; i < numRanges; ++i) {
+            int rangeEnd = inclusions.getRangeEnd(i);
+            for (int c = inclusions.getRangeStart(i); c <= rangeEnd; ++c) {
+                // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
+                int nextValue = UCharacter.getIntPropertyValue(c, property);
+                if (value != nextValue) {
+                    if (value != nullValue) {
+                        mutableTrie.setRange(start, c - 1, value);
+                    }
+                    start = c;
+                    value = nextValue;
+                }
+            }
+        }
+        if (value != nullValue) {  // same test as in the loop: for SCRIPT, nullValue is not 0 and a final run of 0 must be stored
+            mutableTrie.setRange(start, 0x10FFFF, value);
+        }
+        // Choose the trie type, then the narrowest value width that holds the property's maximum value.
+        CodePointTrie.Type type;
+        if (property == UProperty.BIDI_CLASS || property == UProperty.GENERAL_CATEGORY) {
+            type = CodePointTrie.Type.FAST;
+        } else {
+            type = CodePointTrie.Type.SMALL;
+        }
+        CodePointTrie.ValueWidth valueWidth;
+        // TODO: UCharacterProperty.IntProperty
+        int max = UCharacter.getIntPropertyMaxValue(property);
+        if (max <= 0xff) {
+            valueWidth = CodePointTrie.ValueWidth.BITS_8;
+        } else if (max <= 0xffff) {
+            valueWidth = CodePointTrie.ValueWidth.BITS_16;
+        } else {
+            valueWidth = CodePointTrie.ValueWidth.BITS_32;
+        }
+        return mutableTrie.buildImmutable(type, valueWidth);
+    }
+
+    /**
+     * Returns the set for a binary property: a frozen UnicodeSet
+     * that contains all code points for which the property is true.
+     * The same set object is returned on every call for a given property.
+     *
+     * @param property {@link UProperty#BINARY_START}..{@link UProperty#BINARY_LIMIT}-1
+     * @return the property as a set
+     * @throws IllegalArgumentException if property is not a binary property constant
+     * @see UProperty
+     * @see UCharacter#hasBinaryProperty
+     */
+    public static final UnicodeSet getBinaryPropertySet(int property) {
+        boolean isBinary = 0 <= property && property < UProperty.BINARY_LIMIT;
+        if (!isBinary) {
+            throw new IllegalArgumentException(
+                    property + " is not a constant for a UProperty binary property");
+        }
+        synchronized(sets) {
+            if (sets[property] == null) {
+                sets[property] = makeSet(property);
+            }
+            return sets[property];
+        }
+    }
+
+    /**
+     * Returns the map for an enumerated/catalog/int-valued property:
+     * an immutable CodePointMap from every Unicode code point to its value for that property.
+     * The same map object is returned on every call for a given property.
+     *
+     * <p>For documentation of the integer values see
+     * {@link UCharacter#getIntPropertyValue(int, int)}.
+     *
+     * <p>The actual type of the returned object differs between properties and may change over time.
+     *
+     * @param property {@link UProperty#INT_START}..{@link UProperty#INT_LIMIT}-1
+     * @return the property as a map
+     * @throws IllegalArgumentException if property is not an "int property" constant
+     * @see UCharacter#getIntPropertyValue
+     */
+    public static final CodePointMap getIntPropertyMap(int property) {
+        if (!(UProperty.INT_START <= property && property < UProperty.INT_LIMIT)) {
+            throw new IllegalArgumentException(
+                    property + " is not a constant for a UProperty int property");
+        }
+        synchronized(maps) {
+            int index = property - UProperty.INT_START;
+            if (maps[index] == null) {
+                maps[index] = makeMap(property);
+            }
+            return maps[index];
+        }
+    }
+}
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java b/icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java

index 184528b151ccf34451ca5bb9939b2164f5021872..a738359fb92be84ddae7da5587f6e358d53fedb1 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java
@@ -5698,7 +5698,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
      }
  
      /**
-     * {@icu} <p>Check a binary Unicode property for a code point.
+     * {@icu} Check a binary Unicode property for a code point.
       * <p>Unicode, especially in version 3.2, defines many more properties
       * than the original set in UnicodeData.txt.
       * <p>This API is intended to reflect Unicode properties as defined in
@@ -5720,6 +5720,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
       *         Unicode version does not have data for the property at all, or
       *         not for this code point.
       * @see com.ibm.icu.lang.UProperty
+     * @see CharacterProperties#getBinaryPropertySet(int)
       * @stable ICU 2.6
       */
      public static boolean hasBinaryProperty(int ch, int property)
@@ -5777,7 +5778,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
      }
  
      /**
-     * {@icu} <p>Returns the property value for an Unicode property type of a code point.
+     * {@icu} Returns the property value for a Unicode property type of a code point.
       * Also returns binary and mask property values.
       * <p>Unicode, especially in version 3.2, defines many more properties than
       * the original set in UnicodeData.txt.
@@ -5801,8 +5802,9 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
       *        UProperty.MASK_START &lt;= type &lt; UProperty.MASK_LIMIT.
       * @return numeric value that is directly the property value or,
       *         for enumerated properties, corresponds to the numeric value of
-     *         the enumerated constant of the respective property value
-     *         enumeration type (cast to enum type if necessary).
+     *         the enumerated constant of the respective property value type
+     *         ({@link ECharacterCategory}, {@link ECharacterDirection},
+     *         {@link DecompositionType}, etc.).
       *         Returns 0 or 1 (for false / true) for binary Unicode properties.
       *         Returns a bit-mask for mask properties.
       *         Returns 0 if 'type' is out of bounds or if the Unicode version
@@ -5812,6 +5814,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
       * @see #hasBinaryProperty
       * @see #getIntPropertyMinValue
       * @see #getIntPropertyMaxValue
+     * @see CharacterProperties#getIntPropertyMap(int)
       * @see #getUnicodeVersion
       * @stable ICU 2.4
       */
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java

index a5afcbfb3f70e2961bf73c74f8c959bcbed86351..91143499660c99373fe69ffba4d138a49820b6a0 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
@@ -18,21 +18,21 @@ import java.util.NoSuchElementException;
  import java.util.TreeSet;
  
  import com.ibm.icu.impl.BMPSet;
-import com.ibm.icu.impl.Norm2AllModes;
+import com.ibm.icu.impl.CharacterPropertiesImpl;
  import com.ibm.icu.impl.PatternProps;
  import com.ibm.icu.impl.RuleCharacterIterator;
  import com.ibm.icu.impl.SortedSetRelation;
  import com.ibm.icu.impl.StringRange;
-import com.ibm.icu.impl.UBiDiProps;
  import com.ibm.icu.impl.UCaseProps;
-import com.ibm.icu.impl.UCharacterProperty;
  import com.ibm.icu.impl.UPropertyAliases;
  import com.ibm.icu.impl.UnicodeSetStringSpan;
  import com.ibm.icu.impl.Utility;
  import com.ibm.icu.lang.CharSequences;
+import com.ibm.icu.lang.CharacterProperties;
  import com.ibm.icu.lang.UCharacter;
  import com.ibm.icu.lang.UProperty;
  import com.ibm.icu.lang.UScript;
+import com.ibm.icu.util.CodePointMap;
  import com.ibm.icu.util.Freezable;
  import com.ibm.icu.util.ICUUncheckedIOException;
  import com.ibm.icu.util.OutputInt;
@@ -346,14 +346,6 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
      private static final String ASCII_ID = "ASCII"; // [\u0000-\u007F]
      private static final String ASSIGNED = "Assigned"; // [:^Cn:]
  
-    /**
-     * A set of all characters _except_ the second through last characters of
-     * certain ranges.  These ranges are ranges of characters whose
-     * properties are all exactly alike, e.g. CJK Ideographs from
-     * U+4E00 to U+9FA5.
-     */
-    private static UnicodeSet INCLUSIONS[] = null;
-
      private volatile BMPSet bmpSet; // The set is frozen if bmpSet or stringSpan is not null.
      private volatile UnicodeSetStringSpan stringSpan;
      //----------------------------------------------------------------
@@ -520,8 +512,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
  
      /**
       * Make this object represent the range <code>start - end</code>.
-     * If <code>end &gt; start</code> then this object is set to an
-     * an empty range.
+     * If <code>start &gt; end</code> then this object is set to an empty range.
       *
       * @param start first character in the set, inclusive
       * @param end last character in the set, inclusive
@@ -3186,7 +3177,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
          boolean contains(int codePoint);
      }
  
-    private static class NumericValueFilter implements Filter {
+    private static final class NumericValueFilter implements Filter {
          double value;
          NumericValueFilter(double value) { this.value = value; }
          @Override
@@ -3195,29 +3186,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
          }
      }
  
-    private static class GeneralCategoryMaskFilter implements Filter {
-        int mask;
-        GeneralCategoryMaskFilter(int mask) { this.mask = mask; }
-        @Override
-        public boolean contains(int ch) {
-            return ((1 << UCharacter.getType(ch)) & mask) != 0;
-        }
-    }
-
-    private static class IntPropertyFilter implements Filter {
-        int prop;
-        int value;
-        IntPropertyFilter(int prop, int value) {
-            this.prop = prop;
-            this.value = value;
-        }
-        @Override
-        public boolean contains(int ch) {
-            return UCharacter.getIntPropertyValue(ch, prop) == value;
-        }
-    }
-
-    private static class ScriptExtensionsFilter implements Filter {
+    private static final class ScriptExtensionsFilter implements Filter {
          int script;
          ScriptExtensionsFilter(int script) { this.script = script; }
          @Override
@@ -3229,7 +3198,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
      // VersionInfo for unassigned characters
      private static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0);
  
-    private static class VersionFilter implements Filter {
+    private static final class VersionFilter implements Filter {
          VersionInfo version;
          VersionFilter(VersionInfo version) { this.version = version; }
          @Override
@@ -3242,62 +3211,10 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
          }
      }
  
-    private static synchronized UnicodeSet getInclusions(int src) {
-        if (INCLUSIONS == null) {
-            INCLUSIONS = new UnicodeSet[UCharacterProperty.SRC_COUNT];
-        }
-        if(INCLUSIONS[src] == null) {
-            UnicodeSet incl = new UnicodeSet();
-            switch(src) {
-            case UCharacterProperty.SRC_CHAR:
-                UCharacterProperty.INSTANCE.addPropertyStarts(incl);
-                break;
-            case UCharacterProperty.SRC_PROPSVEC:
-                UCharacterProperty.INSTANCE.upropsvec_addPropertyStarts(incl);
-                break;
-            case UCharacterProperty.SRC_CHAR_AND_PROPSVEC:
-                UCharacterProperty.INSTANCE.addPropertyStarts(incl);
-                UCharacterProperty.INSTANCE.upropsvec_addPropertyStarts(incl);
-                break;
-            case UCharacterProperty.SRC_CASE_AND_NORM:
-                Norm2AllModes.getNFCInstance().impl.addPropertyStarts(incl);
-                UCaseProps.INSTANCE.addPropertyStarts(incl);
-                break;
-            case UCharacterProperty.SRC_NFC:
-                Norm2AllModes.getNFCInstance().impl.addPropertyStarts(incl);
-                break;
-            case UCharacterProperty.SRC_NFKC:
-                Norm2AllModes.getNFKCInstance().impl.addPropertyStarts(incl);
-                break;
-            case UCharacterProperty.SRC_NFKC_CF:
-                Norm2AllModes.getNFKC_CFInstance().impl.addPropertyStarts(incl);
-                break;
-            case UCharacterProperty.SRC_NFC_CANON_ITER:
-                Norm2AllModes.getNFCInstance().impl.addCanonIterPropertyStarts(incl);
-                break;
-            case UCharacterProperty.SRC_CASE:
-                UCaseProps.INSTANCE.addPropertyStarts(incl);
-                break;
-            case UCharacterProperty.SRC_BIDI:
-                UBiDiProps.INSTANCE.addPropertyStarts(incl);
-                break;
-            case UCharacterProperty.SRC_INPC:
-            case UCharacterProperty.SRC_INSC:
-            case UCharacterProperty.SRC_VO:
-                UCharacterProperty.INSTANCE.ulayout_addPropertyStarts(src, incl);
-                break;
-            default:
-                throw new IllegalStateException("UnicodeSet.getInclusions(unknown src "+src+")");
-            }
-            INCLUSIONS[src] = incl;
-        }
-        return INCLUSIONS[src];
-    }
-
      /**
       * Generic filter-based scanning code for UCD property UnicodeSets.
       */
-    private UnicodeSet applyFilter(Filter filter, int src) {
+    private void applyFilter(Filter filter, UnicodeSet inclusions) {
          // Logically, walk through all Unicode characters, noting the start
          // and end of each range for which filter.contain(c) is
          // true.  Add each range to a set.
@@ -3305,13 +3222,12 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
          // To improve performance, use an inclusions set which
          // encodes information about character ranges that are known
          // to have identical properties.
-        // getInclusions(src) contains exactly the first characters of
-        // same-value ranges for the given properties "source".
+        // inclusions contains the first characters of
+        // same-value ranges for the given property.
  
          clear();
  
          int startHasProperty = -1;
-        UnicodeSet inclusions = getInclusions(src);
          int limitRange = inclusions.getRangeCount();
  
          for (int j=0; j<limitRange; ++j) {
@@ -3336,10 +3252,39 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
          if (startHasProperty >= 0) {
              add_unchecked(startHasProperty, 0x10FFFF);
          }
+    }
  
-        return this;
+    /** Value filter: yields 1 for a map value v whose bit (1 << v) is set in the mask, else 0. */
+    private static final class GeneralCategoryMaskFilter implements CodePointMap.ValueFilter {
+        int mask;
+        GeneralCategoryMaskFilter(int mask) { this.mask = mask; }
+        @Override
+        public int apply(int value) {
+            // Test the value'th bit of the mask and report the outcome as 1 or 0.
+            boolean selected = ((1 << value) & mask) != 0;
+            return selected ? 1 : 0;
+        }
      }
  
+    /** Value filter: yields 1 for the single map value given at construction, 0 for any other. */
+    private static final class IntValueFilter implements CodePointMap.ValueFilter {
+        int v;
+        IntValueFilter(int value) { this.v = value; }
+        @Override
+        public int apply(int value) { return (value != v) ? 0 : 1; }
+    }
+
+    private void applyIntPropertyValue(CodePointMap map, CodePointMap.ValueFilter filter) {
+        clear();
+        // Walk the map range by range and add each range whose filtered value is nonzero.
+        CodePointMap.Range r = new CodePointMap.Range();
+        int start = 0;
+        while (map.getRange(start, filter, r)) {
+            int end = r.getEnd();
+            if (r.getValue() != 0) { add_unchecked(start, end); }
+            start = end + 1;
+        }
+    }
  
      /**
       * Remove leading and trailing Pattern_White_Space and compress
@@ -3393,13 +3338,31 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
       * @stable ICU 2.4
       */
      public UnicodeSet applyIntPropertyValue(int prop, int value) {
-        checkFrozen();
+        // All of the following include checkFrozen() before modifying this set.
          if (prop == UProperty.GENERAL_CATEGORY_MASK) {
-            applyFilter(new GeneralCategoryMaskFilter(value), UCharacterProperty.SRC_CHAR);
+            CodePointMap map = CharacterProperties.getIntPropertyMap(UProperty.GENERAL_CATEGORY);
+            applyIntPropertyValue(map, new GeneralCategoryMaskFilter(value));  // value is a bit mask here
          } else if (prop == UProperty.SCRIPT_EXTENSIONS) {
-            applyFilter(new ScriptExtensionsFilter(value), UCharacterProperty.SRC_PROPSVEC);
+            UnicodeSet inclusions = CharacterPropertiesImpl.getInclusionsForProperty(prop);
+            applyFilter(new ScriptExtensionsFilter(value), inclusions);
+        } else if (0 <= prop && prop < UProperty.BINARY_LIMIT) {
+            if (value == 0 || value == 1) {  // binary properties only have the values 0 (false) and 1 (true)
+                set(CharacterProperties.getBinaryPropertySet(prop));
+                if (value == 0) {  // value 0 selects the code points that lack the property
+                    complement();
+                }
+            } else {
+                clear();  // any other value matches no code point
+            }
+        } else if (UProperty.INT_START <= prop && prop < UProperty.INT_LIMIT) {
+            CodePointMap map = CharacterProperties.getIntPropertyMap(prop);
+            applyIntPropertyValue(map, new IntValueFilter(value));
          } else {
-            applyFilter(new IntPropertyFilter(prop, value), UCharacterProperty.INSTANCE.getSource(prop));
+            // This code used to always call getInclusions(property source)
+            // which throws an exception for an unsupported property.
+            throw new IllegalArgumentException("unsupported property " + prop);
+            // Otherwise we would just clear() this set because
+            // getIntPropertyValue(c, prop) returns 0 for all code points.
          }
          return this;
      }
@@ -3499,7 +3462,8 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
                  case UProperty.NUMERIC_VALUE:
                  {
                      double value = Double.parseDouble(PatternProps.trimWhiteSpace(valueAlias));
-                    applyFilter(new NumericValueFilter(value), UCharacterProperty.SRC_CHAR);
+                    applyFilter(new NumericValueFilter(value),
+                            CharacterPropertiesImpl.getInclusionsForProperty(p));
                      return this;
                  }
                  case UProperty.NAME:
@@ -3525,7 +3489,8 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
                      // VersionInfo.getInstance() does not do
                      // 'loose' matching.
                      VersionInfo version = VersionInfo.getInstance(mungeCharName(valueAlias));
-                    applyFilter(new VersionFilter(version), UCharacterProperty.SRC_PROPSVEC);
+                    applyFilter(new VersionFilter(version),
+                            CharacterPropertiesImpl.getInclusionsForProperty(p));
                      return this;
                  }
                  case UProperty.SCRIPT_EXTENSIONS:
@@ -4881,7 +4846,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
       * of UnicodeSets.
       * <p>
       * WARNING: If this function is used with a UnicodeProperty, and the
-     * Unassigned characters (gc=Cn) are different than in ICU other than in ICU, you MUST call
+     * Unassigned characters (gc=Cn) are different than in ICU, you MUST call
       * {@code UnicodeProperty.ResetCacheProperties} afterwards. If you then call {@code UnicodeSet.setDefaultXSymbolTable}
       * with null to clear the value, you MUST also call {@code UnicodeProperty.ResetCacheProperties}.
       *
@@ -4891,7 +4856,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
       */
      @Deprecated
      public static void setDefaultXSymbolTable(XSymbolTable xSymbolTable) {
-        INCLUSIONS = null; // If the properties override inclusions, these have to be regenerated.
+        // If the properties override inclusions, these have to be regenerated.
+        // TODO: Check if the Unicode Tools or Unicode Utilities really need this.
+        CharacterPropertiesImpl.clear();
          XSYMBOL_TABLE = xSymbolTable;
      }
  }
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterTest.java

index d7e4d91360663e8e68457f8ed176e102faa7583c..fc8e89dd33d6840839f6d1363d678c31e7c09d0a 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterTest.java
@@ -25,6 +25,7 @@ import com.ibm.icu.impl.Normalizer2Impl;
  import com.ibm.icu.impl.PatternProps;
  import com.ibm.icu.impl.UCharacterName;
  import com.ibm.icu.impl.Utility;
+import com.ibm.icu.lang.CharacterProperties;
  import com.ibm.icu.lang.UCharacter;
  import com.ibm.icu.lang.UCharacterCategory;
  import com.ibm.icu.lang.UCharacterDirection;
@@ -35,6 +36,7 @@ import com.ibm.icu.text.Normalizer2;
  import com.ibm.icu.text.UTF16;
  import com.ibm.icu.text.UnicodeSet;
  import com.ibm.icu.text.UnicodeSetIterator;
+import com.ibm.icu.util.CodePointMap;
  import com.ibm.icu.util.RangeValueIterator;
  import com.ibm.icu.util.ULocale;
  import com.ibm.icu.util.ValueIterator;
@@ -3641,4 +3643,67 @@ public final class UCharacterTest extends TestFmwk
          int output = UCharacter.getCharFromNameAlias(alias);
          assertEquals("alias for '" + input + "'", input, output);
      }
+
+    @Test
+    public void TestBinaryCharacterProperties() {
+        for (int badProp : new int[] { -1, UProperty.BINARY_LIMIT }) {
+            try {  // one try block per value, so that an expected exception cannot skip a check
+                CharacterProperties.getBinaryPropertySet(badProp);
+                fail("getBinaryPropertySet(" + badProp + ") did not throw an exception");
+            } catch(IllegalArgumentException expected) {
+            }
+        }
+        // Spot-check getBinaryPropertySet() vs. hasBinaryProperty().
+        for (int prop = 0; prop < UProperty.BINARY_LIMIT; ++prop) {
+            UnicodeSet set = CharacterProperties.getBinaryPropertySet(prop);
+            int size = set.size();
+            if (size == 0) {  // empty set: none of a few sample code points may have the property
+                assertFalse("!hasBinaryProperty(U+0020, " + prop + ')',
+                        UCharacter.hasBinaryProperty(0x20, prop));
+                assertFalse("!hasBinaryProperty(U+0061, " + prop + ')',
+                        UCharacter.hasBinaryProperty(0x61, prop));
+                assertFalse("!hasBinaryProperty(U+4E00, " + prop + ')',
+                        UCharacter.hasBinaryProperty(0x4e00, prop));
+            } else {
+                int c = set.charAt(0);  // lowest code point in the set
+                if (c > 0) {
+                    assertFalse("!hasBinaryProperty(" + Utility.hex(c - 1) + ", " + prop + ')',
+                            UCharacter.hasBinaryProperty(c - 1, prop));
+                }
+                assertTrue("hasBinaryProperty(" + Utility.hex(c) + ", " + prop + ')',
+                        UCharacter.hasBinaryProperty(c, prop));
+                c = set.charAt(size - 1);  // highest code point in the set
+                assertTrue("hasBinaryProperty(" + Utility.hex(c) + ", " + prop + ')',
+                        UCharacter.hasBinaryProperty(c, prop));
+                if (c < 0x10ffff) {
+                    assertFalse("!hasBinaryProperty(" + Utility.hex(c + 1) + ", " + prop + ')',
+                            UCharacter.hasBinaryProperty(c + 1, prop));
+                }
+            }
+        }
+    }
+
+    @Test
+    public void TestIntCharacterProperties() {
+        for (int badProp : new int[] { UProperty.INT_START - 1, UProperty.INT_LIMIT }) {
+            try {  // one try block per value, so that an expected exception cannot skip a check
+                CharacterProperties.getIntPropertyMap(badProp);
+                fail("getIntPropertyMap(" + badProp + ") did not throw an exception");
+            } catch(IllegalArgumentException expected) {
+            }
+        }
+        // Spot-check getIntPropertyMap() vs. getIntPropertyValue().
+        CodePointMap.Range range = new CodePointMap.Range();
+        for (int prop = UProperty.INT_START; prop < UProperty.INT_LIMIT; ++prop) {
+            CodePointMap map = CharacterProperties.getIntPropertyMap(prop);
+            assertTrue("int property first range", map.getRange(0, null, range));
+            int c = (range.getStart() + range.getEnd()) / 2;  // midpoint of the first range
+            assertEquals("int property first range value at " + Utility.hex(c),
+                    UCharacter.getIntPropertyValue(c, prop), range.getValue());
+            assertTrue("int property later range", map.getRange(0x5000, null, range));
+            int end = range.getEnd();  // last code point of the range that starts at U+5000
+            assertEquals("int property later range value at " + Utility.hex(end),
+                    UCharacter.getIntPropertyValue(end, prop), range.getValue());
+        }
+    }
  }
author	Markus Scherer <markus.icu@gmail.com>
	Fri, 7 Sep 2018 20:40:19 +0000 (13:40 -0700)
committer	Shane Carr <shane@unicode.org>
	Thu, 27 Sep 2018 21:27:39 +0000 (14:27 -0700)
icu4c/source/common/Makefile.in		patch \| blob \| history
icu4c/source/common/characterproperties.cpp	[new file with mode: 0644]	patch \| blob
icu4c/source/common/common.vcxproj		patch \| blob \| history
icu4c/source/common/common.vcxproj.filters		patch \| blob \| history
icu4c/source/common/common_uwp.vcxproj		patch \| blob \| history
icu4c/source/common/mutex.h		patch \| blob \| history
icu4c/source/common/normalizer2impl.cpp		patch \| blob \| history
icu4c/source/common/normalizer2impl.h		patch \| blob \| history
icu4c/source/common/ucln_cmn.h		patch \| blob \| history
icu4c/source/common/ucptrie.cpp		patch \| blob \| history
icu4c/source/common/ucptrie_impl.h		patch \| blob \| history
icu4c/source/common/umutablecptrie.cpp		patch \| blob \| history
icu4c/source/common/unicode/uchar.h		patch \| blob \| history
icu4c/source/common/unicode/ucnv.h		patch \| blob \| history
icu4c/source/common/unicode/ucpmap.h	[new file with mode: 0644]	patch \| blob
icu4c/source/common/unicode/ucptrie.h		patch \| blob \| history
icu4c/source/common/unicode/umutablecptrie.h		patch \| blob \| history
icu4c/source/common/unicode/uniset.h		patch \| blob \| history
icu4c/source/common/unicode/uset.h		patch \| blob \| history
icu4c/source/common/uniset.cpp		patch \| blob \| history
icu4c/source/common/uniset_props.cpp		patch \| blob \| history
icu4c/source/common/uprops.cpp		patch \| blob \| history
icu4c/source/common/uprops.h		patch \| blob \| history
icu4c/source/test/cintltst/cucdtst.c		patch \| blob \| history
icu4c/source/test/cintltst/ucptrietest.c		patch \| blob \| history
icu4c/source/test/intltest/ucdtest.cpp		patch \| blob \| history
icu4c/source/test/intltest/ucdtest.h		patch \| blob \| history
icu4c/source/tools/gennorm2/n2builder.cpp		patch \| blob \| history
icu4c/source/tools/gennorm2/norms.cpp		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/CharacterPropertiesImpl.java	[new file with mode: 0644]	patch \| blob
icu4j/main/classes/core/src/com/ibm/icu/impl/UCharacterProperty.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/lang/CharacterProperties.java	[new file with mode: 0644]	patch \| blob
icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterTest.java		patch \| blob \| history