ICU-12549 Revisions to uspoof.h documentation. Actually removing identifier_info...

author Shane Carr <shane@unicode.org>

Tue, 20 Sep 2016 21:06:55 +0000 (21:06 +0000)

committer Shane Carr <shane@unicode.org>

Tue, 20 Sep 2016 21:06:55 +0000 (21:06 +0000)
author Shane Carr <shane@unicode.org>
Tue, 20 Sep 2016 21:06:55 +0000 (21:06 +0000)
committer Shane Carr <shane@unicode.org>
Tue, 20 Sep 2016 21:06:55 +0000 (21:06 +0000)
diff --git a/icu4c/source/i18n/identifier_info.cpp b/icu4c/source/i18n/identifier_info.cpp

deleted file mode 100644 (file)

index 6118dcc..0000000
--- a/icu4c/source/i18n/identifier_info.cpp
+++ /dev/null
@@ -1,313 +0,0 @@
-// Copyright (C) 2016 and later: Unicode, Inc. and others.
-// License & terms of use: http://www.unicode.org/copyright.html
-/*
-**********************************************************************
-*   Copyright (C) 2012-2014, International Business Machines
-*   Corporation and others.  All Rights Reserved.
-**********************************************************************
-*/
-
-#include "unicode/utypes.h"
-
-#include "unicode/uchar.h"
-#include "unicode/utf16.h"
-
-#include "identifier_info.h"
-#include "mutex.h"
-#include "scriptset.h"
-#include "ucln_in.h"
-#include "uvector.h"
-
-U_NAMESPACE_BEGIN
-
-static UnicodeSet *ASCII;
-static ScriptSet *JAPANESE;
-static ScriptSet *CHINESE;
-static ScriptSet *KOREAN;
-static ScriptSet *CONFUSABLE_WITH_LATIN;
-static UInitOnce gIdentifierInfoInitOnce = U_INITONCE_INITIALIZER;
-
-
-U_CDECL_BEGIN
-static UBool U_CALLCONV
-IdentifierInfo_cleanup(void) {
-    delete ASCII;
-    ASCII = NULL;
-    delete JAPANESE;
-    JAPANESE = NULL;
-    delete CHINESE;
-    CHINESE = NULL;
-    delete KOREAN;
-    KOREAN = NULL;
-    delete CONFUSABLE_WITH_LATIN;
-    CONFUSABLE_WITH_LATIN = NULL;
-    gIdentifierInfoInitOnce.reset(); 
-    return TRUE;
-}
-
-static void U_CALLCONV
-IdentifierInfo_init(UErrorCode &status) {
-    ASCII    = new UnicodeSet(0, 0x7f);
-    JAPANESE = new ScriptSet();
-    CHINESE  = new ScriptSet();
-    KOREAN   = new ScriptSet();
-    CONFUSABLE_WITH_LATIN = new ScriptSet();
-    if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL 
-            || CONFUSABLE_WITH_LATIN == NULL) {
-        status = U_MEMORY_ALLOCATION_ERROR;
-        return;
-    }
-    ASCII->freeze();
-    JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status)
-             .set(USCRIPT_KATAKANA, status);
-    CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOPOMOFO, status);
-    KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status);
-    CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status)
-              .set(USCRIPT_CHEROKEE, status);
-    ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup);
-}
-U_CDECL_END
-
-
-IdentifierInfo::IdentifierInfo(UErrorCode &status):
-         fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL), 
-         fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) {
-    umtx_initOnce(gIdentifierInfoInitOnce, &IdentifierInfo_init, status);
-    if (U_FAILURE(status)) {
-        return;
-    }
-    
-    fIdentifier = new UnicodeString();
-    fRequiredScripts = new ScriptSet();
-    fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL, &status);
-    uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet);
-    fCommonAmongAlternates = new ScriptSet();
-    fNumerics = new UnicodeSet();
-    fIdentifierProfile = new UnicodeSet(0, 0x10FFFF);
-
-    if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL ||
-                              fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) {
-        status = U_MEMORY_ALLOCATION_ERROR;
-    }
-}
-
-IdentifierInfo::~IdentifierInfo() {
-    delete fIdentifier;
-    delete fRequiredScripts;
-    uhash_close(fScriptSetSet);
-    delete fCommonAmongAlternates;
-    delete fNumerics;
-    delete fIdentifierProfile;
-}
-
-
-IdentifierInfo &IdentifierInfo::clear() {
-    fRequiredScripts->resetAll();
-    uhash_removeAll(fScriptSetSet);
-    fNumerics->clear();
-    fCommonAmongAlternates->resetAll();
-    return *this;
-}
-
-
-IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) {
-    *fIdentifierProfile = identifierProfile;
-    return *this;
-}
-
-
-const UnicodeSet &IdentifierInfo::getIdentifierProfile() const {
-    return *fIdentifierProfile;
-}
-
-
-IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) {
-    if (U_FAILURE(status)) {
-        return *this;
-    }
-    *fIdentifier = identifier;
-    clear();
-    ScriptSet scriptsForCP;
-    UChar32 cp;
-    for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) {
-        cp = identifier.char32At(i);
-        // Store a representative character for each kind of decimal digit
-        if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) {
-            // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value
-            fNumerics->add(cp - (UChar32)u_getNumericValue(cp));
-        }
-        UScriptCode extensions[500];
-        int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, UPRV_LENGTHOF(extensions), &status);
-        if (U_FAILURE(status)) {
-            return *this;
-        }
-        scriptsForCP.resetAll();
-        for (int32_t j=0; j<extensionsCount; j++) {
-            scriptsForCP.set(extensions[j], status);
-        }
-        scriptsForCP.reset(USCRIPT_COMMON, status);
-        scriptsForCP.reset(USCRIPT_INHERITED, status);
-        switch (scriptsForCP.countMembers()) {
-          case 0: break;
-          case 1:
-            // Single script, record it.
-            fRequiredScripts->Union(scriptsForCP);
-            break;
-          default:
-            if (!fRequiredScripts->intersects(scriptsForCP) 
-                    && !uhash_geti(fScriptSetSet, &scriptsForCP)) {
-                // If the set hasn't been added already, add it
-                //    (Add a copy, fScriptSetSet takes ownership of the copy.)
-                uhash_puti(fScriptSetSet, new ScriptSet(scriptsForCP), 1, &status);
-            }
-            break;
-        }
-    }
-    // Now make a final pass through ScriptSetSet to remove alternates that came before singles.
-    // [Kana], [Kana Hira] => [Kana]
-    // This is relatively infrequent, so doesn't have to be optimized.
-    // We also compute any commonalities among the alternates.
-    if (uhash_count(fScriptSetSet) > 0) {
-        fCommonAmongAlternates->setAll();
-        for (int32_t it = UHASH_FIRST;;) {
-            const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it);
-            if (nextHashEl == NULL) {
-                break;
-            }
-            ScriptSet *next = static_cast<ScriptSet *>(nextHashEl->key.pointer);
-            // [Kana], [Kana Hira] => [Kana]
-            if (fRequiredScripts->intersects(*next)) {
-                uhash_removeElement(fScriptSetSet, nextHashEl);
-            } else {
-                fCommonAmongAlternates->intersect(*next);
-                // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]]
-                for (int32_t otherIt = UHASH_FIRST;;) {
-                    const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt);
-                    if (otherHashEl == NULL) {
-                        break;
-                    }
-                    ScriptSet *other = static_cast<ScriptSet *>(otherHashEl->key.pointer);
-                    if (next != other && next->contains(*other)) {
-                        uhash_removeElement(fScriptSetSet, nextHashEl);
-                        break;
-                    }
-                }
-            }
-        }
-    }
-    if (uhash_count(fScriptSetSet) == 0) {
-        fCommonAmongAlternates->resetAll();
-    }
-    return *this;
-}
-
-
-const UnicodeString *IdentifierInfo::getIdentifier() const {
-    return fIdentifier;
-}
-
-const ScriptSet *IdentifierInfo::getScripts() const {
-    return fRequiredScripts;
-}
-
-const UHashtable *IdentifierInfo::getAlternates() const {
-    return fScriptSetSet;
-}
-
-
-const UnicodeSet *IdentifierInfo::getNumerics() const {
-    return fNumerics;
-}
-
-const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const {
-    return fCommonAmongAlternates;
-}
-
-#if !UCONFIG_NO_NORMALIZATION
-
-URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const {
-    if (!fIdentifierProfile->containsAll(*fIdentifier) || getNumerics()->size() > 1) {
-        return USPOOF_UNRESTRICTIVE;
-    }
-    if (ASCII->containsAll(*fIdentifier)) {
-        return USPOOF_ASCII;
-    }
-    // This is a bit tricky. We look at a number of factors.
-    // The number of scripts in the text.
-    // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc])
-    // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.)
-
-    // Note: the requiredScripts set omits COMMON and INHERITED; they are taken out at the
-    //       time it is created, in setIdentifier().
-    int32_t cardinalityPlus = fRequiredScripts->countMembers() + 
-            (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1);
-    if (cardinalityPlus < 2) {
-        return USPOOF_SINGLE_SCRIPT_RESTRICTIVE;
-    }
-    if (containsWithAlternates(*JAPANESE, *fRequiredScripts) || containsWithAlternates(*CHINESE, *fRequiredScripts)
-            || containsWithAlternates(*KOREAN, *fRequiredScripts)) {
-        return USPOOF_HIGHLY_RESTRICTIVE;
-    }
-    if (cardinalityPlus == 2 && 
-            fRequiredScripts->test(USCRIPT_LATIN, status) && 
-            !fRequiredScripts->intersects(*CONFUSABLE_WITH_LATIN)) {
-        return USPOOF_MODERATELY_RESTRICTIVE;
-    }
-    return USPOOF_MINIMALLY_RESTRICTIVE;
-}
-
-#endif /* !UCONFIG_NO_NORMALIZATION */
-
-int32_t IdentifierInfo::getScriptCount() const {
-    // Note: Common and Inherited scripts were removed by setIdentifier(), and do not appear in fRequiredScripts.
-    int32_t count = fRequiredScripts->countMembers() +
-            (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1);
-    return count;
-}
-    
-
-
-UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const {
-    if (!container.contains(containee)) {
-        return FALSE;
-    }
-    for (int32_t iter = UHASH_FIRST; ;) {
-        const UHashElement *hashEl = uhash_nextElement(fScriptSetSet, &iter);
-        if (hashEl == NULL) {
-            break;
-        }
-        ScriptSet *alternatives = static_cast<ScriptSet *>(hashEl->key.pointer);
-        if (!container.intersects(*alternatives)) {
-            return false;
-        }
-    }
-    return true;
-}
-
-UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) {
-    UVector sorted(status);
-    if (U_FAILURE(status)) {
-        return dest;
-    }
-    for (int32_t pos = UHASH_FIRST; ;) {
-        const UHashElement *el = uhash_nextElement(alternates, &pos);
-        if (el == NULL) {
-            break;
-        }
-        ScriptSet *ss = static_cast<ScriptSet *>(el->key.pointer);
-        sorted.addElement(ss, status);
-    }
-    sorted.sort(uhash_compareScriptSet, status);
-    UnicodeString separator = UNICODE_STRING_SIMPLE("; ");
-    for (int32_t i=0; i<sorted.size(); i++) {
-        if (i>0) {
-            dest.append(separator);
-        }
-        ScriptSet *ss = static_cast<ScriptSet *>(sorted.elementAt(i));
-        ss->displayScripts(dest);
-    }
-    return dest;
-}
-
-U_NAMESPACE_END
-
diff --git a/icu4c/source/i18n/identifier_info.h b/icu4c/source/i18n/identifier_info.h

deleted file mode 100644 (file)

index 8322203..0000000
--- a/icu4c/source/i18n/identifier_info.h
+++ /dev/null
@@ -1,192 +0,0 @@
-// Copyright (C) 2016 and later: Unicode, Inc. and others.
-// License & terms of use: http://www.unicode.org/copyright.html
-/*
-**********************************************************************
-*   Copyright (C) 2014, International Business Machines
-*   Corporation and others.  All Rights Reserved.
-**********************************************************************
-*
-* indentifier_info.h
-* 
-* created on: 2013 Jan 7
-* created by: Andy Heninger
-*/
-
-#ifndef __IDENTIFIER_INFO_H__
-#define __IDENTIFIER_INFO_H__
-
-#include "unicode/utypes.h"
-
-#include "unicode/uniset.h"
-#include "unicode/uspoof.h"
-#include "uhash.h"
-
-U_NAMESPACE_BEGIN
-
-class ScriptSet;
-
-// TODO(andy): review consistency of reference vs pointer arguments to the funcions.
-
-/**
- * This class analyzes a possible identifier for script and identifier status. Use it by calling setIdentifierProfile
- * then setIdentifier. Available methods include:
- * <ol>
- * <li>call getScripts for the specific scripts in the identifier. The identifier contains at least one character in
- * each of these.
- * <li>call getAlternates to get cases where a character is not limited to a single script. For example, it could be
- * either Katakana or Hiragana.
- * <li>call getCommonAmongAlternates to find out if any scripts are common to all the alternates.
- * <li>call getNumerics to get a representative character (with value zero) for each of the decimal number systems in
- * the identifier.
- * <li>call getRestrictionLevel to see what the UTS36 restriction level is.
- * </ol>
- * 
- * This is a port from ICU4J of class com.ibm.icu.text.IdentifierInfo
- */
-class U_I18N_API IdentifierInfo : public UMemory {
-
-  public:
-    /**
-     * Create an identifier info object. Subsequently, call setIdentifier(), etc.
-     * @internal
-     */
-    IdentifierInfo(UErrorCode &status);
-
-    /**
-      * Destructor
-      */
-    virtual ~IdentifierInfo();
-
-  private:
-    /* Disallow copying for now. Can be added if there's a need. */
-    IdentifierInfo(const IdentifierInfo &other);
-
-  public:
-     
-    /**
-     * Set the identifier profile: the characters that are to be allowed in the identifier.
-     * 
-     * @param identifierProfile the characters that are to be allowed in the identifier
-     * @return this
-     * @internal
-     */
-    IdentifierInfo &setIdentifierProfile(const UnicodeSet &identifierProfile);
-
-    /**
-     * Get the identifier profile: the characters that are to be allowed in the identifier.
-     * 
-     * @return The characters that are to be allowed in the identifier.
-     * @internal
-     */
-    const UnicodeSet &getIdentifierProfile() const;
-
-
-    /**
-     * Set an identifier to analyze. Afterwards, call methods like getScripts()
-     * 
-     * @param identifier the identifier to analyze
-     * @param status Errorcode, set if errors occur.
-     * @return this
-     * @internal
-     */
-    IdentifierInfo &setIdentifier(const UnicodeString &identifier, UErrorCode &status);
-
-
-    /**
-     * Get the identifier that was analyzed. The returned string is owned by the ICU library,
-     * and must not be deleted by the caller.
-     * 
-     * @return the identifier that was analyzed.
-     * @internal
-     */
-    const UnicodeString *getIdentifier() const;
-    
-
-    /**
-     * Get the scripts found in the identifiers.
-     * 
-     * @return the set of explicit scripts.
-     * @internal
-     */
-    const ScriptSet *getScripts() const;
-
-    /**
-     * Get the set of alternate scripts found in the identifiers. That is, when a character can be in two scripts, then
-     * the set consisting of those scripts will be returned.
-     * 
-     * @return a uhash, with each key being of type (ScriptSet *). 
-     *         This is a set, not a map, so the value stored in the uhash is not relevant.
-     *         (It is, in fact, 1).
-     *         Ownership of the uhash and its contents remains with the IndetifierInfo object, 
-     *         and remains valid until a new identifer is set or until the object is deleted.
-     * @internal
-     */
-    const UHashtable *getAlternates() const;
-
-    /**
-     * Get the representative characters (zeros) for the numerics found in the identifier.
-     * 
-     * @return the set of explicit scripts.
-     * @internal
-     */
-    const UnicodeSet *getNumerics() const;
-
-    /**
-     * Find out which scripts are in common among the alternates.
-     * 
-     * @return the set of scripts that are in common among the alternates.
-     * @internal
-     */
-    const ScriptSet *getCommonAmongAlternates() const;
-
-    /**
-      * Get the number of scripts appearing in the identifier.
-      *   Note: Common and Inherited scripts are omitted from the count.
-      *   Note: Result may be high when the identifier contains characters
-      *         with alternate scripts. The distinction between
-      *         0, 1 and > 1 will remain valid, however.
-      * @return the number of scripts.
-      */
-    int32_t getScriptCount() const;
-
-#if !UCONFIG_NO_NORMALIZATION
-
-    /**
-     * Find the "tightest" restriction level that the identifier satisfies.
-     * 
-     * @return the restriction level.
-     * @internal
-     */
-    URestrictionLevel getRestrictionLevel(UErrorCode &status) const;
-
-#endif /*!UCONFIG_NO_NORMALIZATION */
-
-    UnicodeString toString() const;
-
-    /**
-     * Produce a readable string of alternates.
-     * 
-     * @param alternates a UHashtable of UScriptSets.
-     *        Keys only, no meaningful values in the UHash.
-     * @return display form
-     * @internal
-     */
-    static UnicodeString &displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status);
-
-  private:
-
-    IdentifierInfo  & clear();
-    UBool             containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const;
-
-    UnicodeString     *fIdentifier;
-    ScriptSet         *fRequiredScripts;
-    UHashtable        *fScriptSetSet;
-    ScriptSet         *fCommonAmongAlternates;
-    UnicodeSet        *fNumerics;
-    UnicodeSet        *fIdentifierProfile;
-};
-
-U_NAMESPACE_END
-
-#endif // __IDENTIFIER_INFO_H__
-
diff --git a/icu4c/source/i18n/unicode/uspoof.h b/icu4c/source/i18n/unicode/uspoof.h

index 5151993ca9b67547d2e82d1c83cacf4f397ea484..996d98d95267e8054bc22f9ba90808ed66b195bc 100644 (file)
--- a/icu4c/source/i18n/unicode/uspoof.h
+++ b/icu4c/source/i18n/unicode/uspoof.h
@@ -42,10 +42,10 @@
   * <a href="http://unicode.org/reports/tr39">Unicode Technical Standard #39</a>, has two main functions:
   *
   * <ol>
- * <li>Checking whether two strings are visually <em>confusable</em> with each other, such as "desordenado" and
- * "ԁеѕогԁепаԁо".</li>
+ * <li>Checking whether two strings are visually <em>confusable</em> with each other, such as "Harvest" and
+ * &quot;&Eta;arvest&quot;, where the second string starts with the Greek capital letter Eta.</li>
   * <li>Checking whether an individual string is likely to be an attempt at confusing the reader (<em>spoof
- * detection</em>), such as "pаypаl" spelled with Cyrillic 'а' characters.</li>
+ * detection</em>), such as "paypal" with some Latin characters substituted with Cyrillic look-alikes.</li>
   * </ol>
   *
   * <p>
@@ -63,19 +63,25 @@
   *
   * \code{.c}
   * UErrorCode status = U_ZERO_ERROR;
+ * UChar* str1 = (UChar*) u"Harvest";
+ * UChar* str2 = (UChar*) u"\u0397arvest";  // with U+0397 GREEK CAPITAL LETTER ETA
+ *
   * USpoofChecker* sc = uspoof_open(&status);
   * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
- * int32_t bitmask = uspoof_areConfusable(sc, (UChar*) u"desordenado", -1, (UChar*) u"ԁеѕогԁепаԁо", -1, &status);
- * UBool result = (bitmask & USPOOF_ALL_CHECKS) != 0;
- * printf("areConfusable: %d (success: %d)\n", result, U_SUCCESS(status));  // areConfusable: 1 (success: 1)
+ *
+ * int32_t bitmask = uspoof_areConfusable(sc, str1, -1, str2, -1, &status);
+ * UBool result = bitmask != 0;
+ * // areConfusable: 1 (status: U_ZERO_ERROR)
+ * printf("areConfusable: %d (status: %s)\n", result, u_errorName(status));
   * uspoof_close(sc);
   * \endcode
   *
   * <p>
- * The second line of the example creates a <code>USpoofChecker</code> object; the third line enables confusable
- * checking and disables all other checks; the fourth line performs the confusability test; and the fifth line extracts
- * the result out of the confusability test. For best performance, the instance should be created once (e.g., upon
- * application startup), and the efficient {@link uspoof_areConfusable} method can be used at runtime.
+ * The call to {@link uspoof_open} creates a <code>USpoofChecker</code> object; the call to {@link uspoof_setChecks}
+ * enables confusable checking and disables all other checks; the call to {@link uspoof_areConfusable} performs the
+ * confusability test; and the following line extracts the result out of the return value. For best performance,
+ * the instance should be created once (e.g., upon application startup), and the efficient
+ * {@link uspoof_areConfusable} method can be used at runtime.
   *
   * <p>
   * The type {@link LocalUSpoofCheckerPointer} is exposed for C++ programmers.  It will automatically call
@@ -95,27 +101,28 @@
   *
   * \code{.c}
   * UErrorCode status = U_ZERO_ERROR;
- * UChar* str1 = (UChar*) u"desordenado";
- * UChar* str2 = (UChar*) u"ԁеѕогԁепаԁо";
+ * UChar* str1 = (UChar*) u"Harvest";
+ * UChar* str2 = (UChar*) u"\u0397arvest";  // with U+0397 GREEK CAPITAL LETTER ETA
   *
   * USpoofChecker* sc = uspoof_open(&status);
   * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
   *
   * // Get skeleton 1
   * int32_t skel1Len = uspoof_getSkeleton(sc, 0, str1, -1, NULL, 0, &status);
- * UChar* skel1 = (UChar*) malloc(skel1Len * sizeof(UChar));
+ * UChar* skel1 = (UChar*) malloc(++skel1Len * sizeof(UChar));
   * status = U_ZERO_ERROR;
   * uspoof_getSkeleton(sc, 0, str1, -1, skel1, skel1Len, &status);
   *
   * // Get skeleton 2
   * int32_t skel2Len = uspoof_getSkeleton(sc, 0, str2, -1, NULL, 0, &status);
- * UChar* skel2 = (UChar*) malloc(skel2Len * sizeof(UChar));
+ * UChar* skel2 = (UChar*) malloc(++skel2Len * sizeof(UChar));
   * status = U_ZERO_ERROR;
   * uspoof_getSkeleton(sc, 0, str2, -1, skel2, skel2Len, &status);
   *
   * // Are the skeletons the same?
- * UBool result = (skel1Len == skel2Len) && memcmp(skel1, skel2, skel1Len) == 0;
- * printf("areConfusable: %d (success: %d)\n", result, U_SUCCESS(status));  // areConfusable: 1 (success: 1)
+ * UBool result = u_strCompare(skel1, -1, skel2, -1, FALSE) == 0;
+ * // areConfusable: 1 (status: U_ZERO_ERROR)
+ * printf("areConfusable: %d (status: %s)\n", result, u_errorName(status));
   * uspoof_close(sc);
   * free(skel1);
   * free(skel2);
@@ -126,21 +133,19 @@
   * {uspoof_areConfusable} many times in a loop, {uspoof_getSkeleton} can be used instead, as shown below:
   *
   * \code{.c}
- * // Setup:
   * UErrorCode status = U_ZERO_ERROR;
- * UChar* dictionary[2] = { (UChar*) u"lorem", (UChar*) u"ipsum" };
- * UChar* skeletons[sizeof(dictionary)/sizeof(UChar*)];
- * int32_t skeletonLengths[sizeof(dictionary)/sizeof(UChar*)];
+ * #define DICTIONARY_LENGTH 2
+ * UChar* dictionary[DICTIONARY_LENGTH] = { (UChar*) u"lorem", (UChar*) u"ipsum" };
+ * UChar* skeletons[DICTIONARY_LENGTH];
   * UChar* str = (UChar*) u"1orern";
   *
   * // Setup:
   * USpoofChecker* sc = uspoof_open(&status);
   * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
- * for (size_t i=0; i<sizeof(dictionary)/sizeof(UChar*); i++) {
+ * for (size_t i=0; i<DICTIONARY_LENGTH; i++) {
   *     UChar* word = dictionary[i];
   *     int32_t len = uspoof_getSkeleton(sc, 0, word, -1, NULL, 0, &status);
- *     skeletons[i] = (UChar*) malloc(len * sizeof(UChar));
- *     skeletonLengths[i] = len;
+ *     skeletons[i] = (UChar*) malloc(++len * sizeof(UChar));
   *     status = U_ZERO_ERROR;
   *     uspoof_getSkeleton(sc, 0, word, -1, skeletons[i], len, &status);
   * }
@@ -148,22 +153,20 @@
   * // Live Check:
   * {
   *     int32_t len = uspoof_getSkeleton(sc, 0, str, -1, NULL, 0, &status);
- *     UChar* skel = (UChar*) malloc(len * sizeof(UChar));
+ *     UChar* skel = (UChar*) malloc(++len * sizeof(UChar));
   *     status = U_ZERO_ERROR;
   *     uspoof_getSkeleton(sc, 0, str, -1, skel, len, &status);
   *     UBool result = FALSE;
- *     for (size_t i=0; i<sizeof(dictionary)/sizeof(UChar*); i++) {
- *         if (len == skeletonLengths[i] && memcmp(skel, skeletons[i], len) == 0) {
- *             result = TRUE;
- *         }
+ *     for (size_t i=0; i<DICTIONARY_LENGTH; i++) {
+ *         result = u_strCompare(skel, -1, skeletons[i], -1, FALSE) == 0;
+ *         if (result == TRUE) { break; }
   *     }
- *     // Has confusable in dictionary: 1 (success: 1)
- *     printf("Has confusable in dictionary: %d (success: %d)\n", result, U_SUCCESS(status));
+ *     // Has confusable in dictionary: 1 (status: U_ZERO_ERROR)
+ *     printf("Has confusable in dictionary: %d (status: %s)\n", result, u_errorName(status));
   *     free(skel);
   * }
   *
- * // Cleanup:
- * for (size_t i=0; i<sizeof(dictionary)/sizeof(UChar*); i++) {
+ * for (size_t i=0; i<DICTIONARY_LENGTH; i++) {
   *     free(skeletons[i]);
   * }
   * uspoof_close(sc);
@@ -182,7 +185,7 @@
   *
   * \code{.c}
   * UErrorCode status = U_ZERO_ERROR;
- * UChar* str = (UChar*) u"pаypаl";  // with Cyrillic 'а' characters
+ * UChar* str = (UChar*) u"p\u0430ypal";  // with U+0430 CYRILLIC SMALL LETTER A
   *
   * // Get the default set of allowable characters:
   * USet* allowed = uset_openEmpty();
@@ -195,7 +198,8 @@
   *
   * int32_t bitmask = uspoof_check(sc, str, -1, NULL, &status);
   * UBool result = bitmask != 0;
- * printf("fails checks: %d (success: %d)\n", result, U_SUCCESS(status));  // fails checks: 1 (success: 1)
+ * // fails checks: 1 (status: U_ZERO_ERROR)
+ * printf("fails checks: %d (status: %s)\n", result, u_errorName(status));
   * uspoof_close(sc);
   * uset_close(allowed);
   * \endcode
@@ -216,7 +220,7 @@
   *
   * \code{.c}
   * UErrorCode status = U_ZERO_ERROR;
- * UChar* str = (UChar*) u"pаypаl";  // with Cyrillic 'а' characters
+ * UChar* str = (UChar*) u"p\u0430ypal";  // with U+0430 CYRILLIC SMALL LETTER A
   *
   * // Get the default set of allowable characters:
   * USet* allowed = uset_openEmpty();
@@ -233,8 +237,8 @@
   * int32_t failures1 = bitmask;
   * int32_t failures2 = uspoof_getCheckResultChecks(checkResult, &status);
   * assert(failures1 == failures2);
- * // checks that failed: 16 (success: 1)
- * printf("checks that failed: %d (success: %d)\n", failures1, U_SUCCESS(status));
+ * // checks that failed: 0x00000010 (status: U_ZERO_ERROR)
+ * printf("checks that failed: %#010x (status: %s)\n", failures1, u_errorName(status));
   *
   * // Cleanup:
   * uspoof_close(sc);
@@ -247,7 +251,7 @@
   *
   * \code{.cpp}
   * UErrorCode status = U_ZERO_ERROR;
- * UnicodeString str((UChar*) u"pаypаl");  // with Cyrillic 'а' characters
+ * UnicodeString str((UChar*) u"p\u0430ypal");  // with U+0430 CYRILLIC SMALL LETTER A
   *
   * // Get the default set of allowable characters:
   * UnicodeSet allowed;
@@ -264,8 +268,8 @@
   * int32_t failures1 = bitmask;
   * int32_t failures2 = uspoof_getCheckResultChecks(checkResult.getAlias(), &status);
   * assert(failures1 == failures2);
- * // checks that failed: 16 (success: 1)
- * printf("checks that failed: %d (success: %d)\n", failures1, U_SUCCESS(status));
+ * // checks that failed: 0x00000010 (status: U_ZERO_ERROR)
+ * printf("checks that failed: %#010x (status: %s)\n", failures1, u_errorName(status));
   *
   * // Explicit cleanup not necessary.
   * \endcode
@@ -291,14 +295,15 @@
   *
   * \code{.c}
   * UErrorCode status = U_ZERO_ERROR;
- * UChar* str = (UChar*) u"৪8";
+ * UChar* str = (UChar*) u"8\u09EA";  // 8 mixed with U+09EA BENGALI DIGIT FOUR
   *
   * USpoofChecker* sc = uspoof_open(&status);
   * uspoof_setChecks(sc, USPOOF_INVISIBLE | USPOOF_MIXED_NUMBERS, &status);
   *
   * int32_t bitmask = uspoof_check2(sc, str, -1, NULL, &status);
   * UBool result = bitmask != 0;
- * printf("fails checks: %d (success: %d)\n", result, U_SUCCESS(status));  // fails checks: 1 (success: 1)
+ * // fails checks: 1 (status: U_ZERO_ERROR)
+ * printf("fails checks: %d (status: %s)\n", result, u_errorName(status));
   * uspoof_close(sc);
   * \endcode
   *
@@ -307,7 +312,7 @@
   *
   * \code{.cpp}
   * UErrorCode status = U_ZERO_ERROR;
- * UnicodeString str((UChar*) u"pаypаl");  // with Cyrillic 'а' characters
+ * UnicodeString str((UChar*) u"p\u0430ypal");  // with U+0430 CYRILLIC SMALL LETTER A
   *
   * // Get the default set of allowable characters:
   * UnicodeSet allowed;
@@ -323,14 +328,14 @@
   * int32_t bitmask = uspoof_check2UnicodeString(sc.getAlias(), str, checkResult.getAlias(), &status);
   *
   * URestrictionLevel restrictionLevel = uspoof_getCheckResultRestrictionLevel(checkResult.getAlias(), &status);
- * // Since USPOOF_AUX_INFO was enabled, the restriction level is also available via the bitmask:
+ * // Since USPOOF_AUX_INFO was enabled, the restriction level is also available in the upper bits of the bitmask:
   * assert((restrictionLevel & bitmask) == restrictionLevel);
- * // Restriction level: 1342177280 (success: 1)
- * printf("Restriction level: %d (success: %d)\n", restrictionLevel, U_SUCCESS(status));
+ * // Restriction level: 0x50000000 (status: U_ZERO_ERROR)
+ * printf("Restriction level: %#010x (status: %s)\n", restrictionLevel, u_errorName(status));
   * \endcode
   *
   * <p>
- * The code '1342177280' corresponds to the restriction level USPOOF_MINIMALLY_RESTRICTIVE.  Since
+ * The code '0x50000000' corresponds to the restriction level USPOOF_MINIMALLY_RESTRICTIVE.  Since
   * USPOOF_MINIMALLY_RESTRICTIVE is weaker than USPOOF_MODERATELY_RESTRICTIVE, the string fails the check.
   *
   * <p>
@@ -351,13 +356,13 @@
   * A <code>USpoofChecker</code> instance may be used repeatedly to perform checks on any number of identifiers.
   *
   * <p>
- * <b>Thread Safety:</b> Thread Safety: The test functions for checking a single identifier, or for testing whether
+ * <b>Thread Safety:</b> The test functions for checking a single identifier, or for testing whether
   * two identifiers are possible confusable, are thread safe. They may called concurrently, from multiple threads,
   * using the same USpoofChecker instance.
   *
   * <p>
   * More generally, the standard ICU thread safety rules apply: functions that take a const USpoofChecker parameter are
- * thread safe. Those that take a non-const USpoofChecier are not thread safe..
+ * thread safe. Those that take a non-const USpoofChecker are not thread safe.
   *
   * @stable ICU 4.6
   */
@@ -419,13 +424,9 @@ typedef enum USpoofChecks {
       * the checks to some subset of SINGLE_SCRIPT_CONFUSABLE, MIXED_SCRIPT_CONFUSABLE, or WHOLE_SCRIPT_CONFUSABLE to
       * make {@link uspoof_areConfusable} return only those types of confusables.
       *
-     * <p>Note: if you wish to use {@link uspoof_getSkeleton}, it is required that you enable at least one of the
-     * CONFUSABLE flags.
-     *
       * @see uspoof_areConfusable
       * @see uspoof_getSkeleton
       * @draft ICU 58
-     * @provisional This API might change or be removed in a future release.
       */
      USPOOF_CONFUSABLE               =   USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE,
  
@@ -471,7 +472,7 @@ typedef enum USpoofChecks {
      USPOOF_INVISIBLE                =  32,
  
      /** Check that an identifier contains only characters from a specified set
-      * of acceptable characters.  See {@link uspoof_setAllowedChars}
+      * of acceptable characters.  See {@link uspoof_setAllowedChars} and
        * {@link uspoof_setAllowedLocales}.  Note that a string that fails this check
        * will also fail the {@link USPOOF_RESTRICTION_LEVEL} check.
        */
@@ -750,14 +751,16 @@ U_STABLE int32_t U_EXPORT2
  uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status);
  
  /**
-  * Set the loosest restriction level allowed for strings. The default if this is not called is
-  * {@link USPOOF_HIGHLY_RESTRICTIVE}. Calling this method enables the {@link USPOOF_RESTRICTION_LEVEL} and
-  * {@link USPOOF_MIXED_NUMBERS} checks, corresponding to Sections 5.1 and 5.2 of UTS 39. To customize which checks are
-  * to be performed by {@link uspoof_check}, see {@link uspoof_setChecks}.
-  * @param restrictionLevel The loosest restriction level allowed.
-  * @see URestrictionLevel
-  * @stable ICU 51
-  */
+ * Set the loosest restriction level allowed for strings. The default if this is not called is
+ * {@link USPOOF_HIGHLY_RESTRICTIVE}. Calling this method enables the {@link USPOOF_RESTRICTION_LEVEL} and
+ * {@link USPOOF_MIXED_NUMBERS} checks, corresponding to Sections 5.1 and 5.2 of UTS 39. To customize which checks are
+ * to be performed by {@link uspoof_check}, see {@link uspoof_setChecks}.
+ *
+ * @param sc       The USpoofChecker
+ * @param restrictionLevel The loosest restriction level allowed.
+ * @see URestrictionLevel
+ * @stable ICU 51
+ */
  U_STABLE void U_EXPORT2
  uspoof_setRestrictionLevel(USpoofChecker *sc, URestrictionLevel restrictionLevel);
  
@@ -1059,6 +1062,8 @@ uspoof_checkUnicodeString(const USpoofChecker *sc,
   * @param sc      The USpoofChecker
   * @param id      The identifier to be checked for possible security issues,
   *                in UTF-16 format.
+ * @param length  the length of the string to be checked, or -1 if the string is
+ *                zero terminated.
   * @param checkResult  An instance of USpoofCheckResult to be filled with
   *                details about the identifier.  Can be NULL.
   * @param status  The error code, set if an error occurred while attempting to
@@ -1259,7 +1264,7 @@ uspoof_getCheckResultNumerics(const USpoofCheckResult *checkResult, UErrorCode *
   *
   * <ul>
   *   <li>{@link USPOOF_SINGLE_SCRIPT_CONFUSABLE}</li>
- *   <li>{@link USPOOF_MIXED_SCRIPT_CONFUSABLE</li>
+ *   <li>{@link USPOOF_MIXED_SCRIPT_CONFUSABLE}</li>
   *   <li>{@link USPOOF_WHOLE_SCRIPT_CONFUSABLE}</li>
   * </ul>
   *
diff --git a/icu4c/source/i18n/uspoof_impl.cpp b/icu4c/source/i18n/uspoof_impl.cpp

index 70026697f07a16df16076e7843f2917ac4b13822..a7ce8ee2601fee3ecadfb9a3eb41ed51f9bf778f 100644 (file)
--- a/icu4c/source/i18n/uspoof_impl.cpp
+++ b/icu4c/source/i18n/uspoof_impl.cpp
@@ -62,13 +62,13 @@ void SpoofImpl::construct(UErrorCode& status) {
      if (U_FAILURE(status)) { return; }
  
      UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
-    allowedCharsSet->freeze();
      fAllowedCharsSet = allowedCharsSet;
      fAllowedLocales  = uprv_strdup("");
      if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
          status = U_MEMORY_ALLOCATION_ERROR;
          return;
      }
+    allowedCharsSet->freeze();
  }
  
  
@@ -85,10 +85,10 @@ SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status)  :
          fSpoofData = src.fSpoofData->addReference();
      }
      fAllowedCharsSet = static_cast<const UnicodeSet *>(src.fAllowedCharsSet->clone());
-    if (fAllowedCharsSet == NULL) {
+    fAllowedLocales = uprv_strdup(src.fAllowedLocales);
+    if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
          status = U_MEMORY_ALLOCATION_ERROR;
      }
-    fAllowedLocales = uprv_strdup(src.fAllowedLocales);
      fRestrictionLevel = src.fRestrictionLevel;
  }
  
diff --git a/icu4c/source/i18n/uspoof_impl.h b/icu4c/source/i18n/uspoof_impl.h

index aa95dbcb541ac0af762888806ef3615b5b84963d..c6e5c938e5aa5dd61dc724d8308d304632de401d 100644 (file)
--- a/icu4c/source/i18n/uspoof_impl.h
+++ b/icu4c/source/i18n/uspoof_impl.h
@@ -123,7 +123,7 @@ public:
      // Used to convert this CheckResult to the older int32_t return value API
      int32_t toCombinedBitmask(int32_t expectedChecks);
  
-    // Data Members (all stack-allocated)
+    // Data Members
      int32_t fMagic;                        // Internal sanity check.
      int32_t fChecks;                       // Bit vector of checks that were failed.
      UnicodeSet fNumerics;                  // Set of numerics found in the string.
diff --git a/icu4c/source/i18n/uspoof_wsconf.cpp b/icu4c/source/i18n/uspoof_wsconf.cpp

deleted file mode 100644 (file)

index 477a3b7..0000000
--- a/icu4c/source/i18n/uspoof_wsconf.cpp
+++ /dev/null
@@ -1,438 +0,0 @@
-// Copyright (C) 2016 and later: Unicode, Inc. and others.
-// License & terms of use: http://www.unicode.org/copyright.html
-/*
-******************************************************************************
-*
-*   Copyright (C) 2008-2013, International Business Machines
-*   Corporation and others.  All Rights Reserved.
-*
-******************************************************************************
-*   file name:  uspoof_wsconf.cpp
-*   encoding:   US-ASCII
-*   tab size:   8 (not used)
-*   indentation:4
-*
-*   created on: 2009Jan05  (refactoring earlier files)
-*   created by: Andy Heninger
-*
-*   Internal functions for compililing Whole Script confusable source data
-*   into its binary (runtime) form.  The binary data format is described
-*   in uspoof_impl.h
-*/
-
-#include "unicode/utypes.h"
-#include "unicode/uspoof.h"
-
-#if !UCONFIG_NO_NORMALIZATION
-
-#if !UCONFIG_NO_REGULAR_EXPRESSIONS 
-
-#include "unicode/unorm.h"
-#include "unicode/uregex.h"
-#include "unicode/ustring.h"
-#include "cmemory.h"
-#include "scriptset.h"
-#include "uspoof_impl.h"
-#include "uhash.h"
-#include "uvector.h"
-#include "uassert.h"
-#include "uspoof_wsconf.h"
-
-U_NAMESPACE_USE
-
-
-// Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt
-// Example Lines:
-//   006F          ; Latn; Deva; A #      (o)  LATIN SMALL LETTER O
-//   0048..0049    ; Latn; Grek; A #  [2] (H..I)  LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
-//    |               |     |    |
-//    |               |     |    |---- Which table, Any Case or Lower Case (A or L)
-//    |               |     |----------Target script.   We need this.
-//    |               |----------------Src script.  Should match the script of the source
-//    |                                code points.  Beyond checking that, we don't keep it.
-//    |--------------------------------Source code points or range.
-//
-// The expression will match _all_ lines, including erroneous lines.
-// The result of the parse is returned via the contents of the (match) groups.
-static const char *parseExp = 
-        "(?m)"                                         // Multi-line mode
-        "^([ \\t]*(?:#.*?)?)$"                         // A blank or comment line.  Matches Group 1.
-        "|^(?:"                                        //   OR
-        "\\s*([0-9A-F]{4,})(?:..([0-9A-F]{4,}))?\\s*;" // Code point range.  Groups 2 and 3.
-        "\\s*([A-Za-z]+)\\s*;"                         // The source script.  Group 4.
-        "\\s*([A-Za-z]+)\\s*;"                         // The target script.  Group 5.
-        "\\s*(?:(A)|(L))"                              // The table A or L.   Group 6 or 7
-        "[ \\t]*(?:#.*?)?"                             // Trailing commment
-        ")$|"                                          //   OR
-        "^(.*?)$";                                     // An error line.      Group 8.
-                                                       //    Any line not matching the preceding
-                                                       //    parts of the expression.will match
-                                                       //    this, and thus be flagged as an error
-
-
-// Extract a regular expression match group into a char * string.
-//    The group must contain only invariant characters.
-//    Used for script names
-// 
-static void extractGroup(
-    URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) {
-
-    UChar ubuf[50];
-    ubuf[0] = 0;
-    destBuf[0] = 0;
-    int32_t len = uregex_group(e, group, ubuf, 50, &status);
-    if (U_FAILURE(status) || len == -1 || len >= destCapacity) {
-        return;
-    }
-    UnicodeString s(FALSE, ubuf, len);   // Aliasing constructor
-    s.extract(0, len, destBuf, destCapacity, US_INV);
-}
-
-
-
-U_NAMESPACE_BEGIN
-
-//  Build the Whole Script Confusable data
-//
-//     TODO:  Reorganize.  Either get rid of the WSConfusableDataBuilder class,
-//                         because everything is local to this one build function anyhow,
-//                           OR
-//                         break this function into more reasonably sized pieces, with
-//                         state in WSConfusableDataBuilder.
-//
-void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
-          int32_t confusablesWSLen, UParseError *pe, UErrorCode &status) 
-{
-    if (U_FAILURE(status)) {
-        return;
-    }
-    URegularExpression *parseRegexp = NULL;
-    int32_t             inputLen    = 0;
-    UChar              *input       = NULL;
-    int32_t             lineNum     = 0;
-    
-    UVector            *scriptSets        = NULL;
-    uint32_t            rtScriptSetsCount = 2;
-
-    UTrie2             *anyCaseTrie   = NULL;
-    UTrie2             *lowerCaseTrie = NULL;
-
-    anyCaseTrie = utrie2_open(0, 0, &status);
-    lowerCaseTrie = utrie2_open(0, 0, &status);
-
-    UnicodeString pattern(parseExp, -1, US_INV);
-
-    // The scriptSets vector provides a mapping from TRIE values to the set of scripts.
-    //
-    // Reserved TRIE values:
-    //   0:  Code point has no whole script confusables.
-    //   1:  Code point is of script Common or Inherited.
-    //       These code points do not participate in whole script confusable detection.
-    //       (This is logically equivalent to saying that they contain confusables in
-    //        all scripts)
-    //
-    // Because Trie values are indexes into the ScriptSets vector, pre-fill
-    // vector positions 0 and 1 to avoid conflicts with the reserved values.
-    
-    scriptSets = new UVector(status);
-    if (scriptSets == NULL) {
-        status = U_MEMORY_ALLOCATION_ERROR;
-        goto cleanup;
-    }
-    scriptSets->addElement((void *)NULL, status);
-    scriptSets->addElement((void *)NULL, status);
-
-    // Convert the user input data from UTF-8 to UChar (UTF-16)
-    u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status);
-    if (status != U_BUFFER_OVERFLOW_ERROR) {
-        goto cleanup;
-    }
-    status = U_ZERO_ERROR;
-    input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
-    if (input == NULL) {
-        status = U_MEMORY_ALLOCATION_ERROR;
-        goto cleanup;
-    }
-    u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status);
-
-    parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);
-
-    // Zap any Byte Order Mark at the start of input.  Changing it to a space is benign
-    //   given the syntax of the input.
-    if (*input == 0xfeff) {
-        *input = 0x20;
-    }
-
-    // Parse the input, one line per iteration of this loop.
-    uregex_setText(parseRegexp, input, inputLen, &status);
-    while (uregex_findNext(parseRegexp, &status)) {
-        lineNum++;
-        if (uregex_start(parseRegexp, 1, &status) >= 0) {
-            // this was a blank or comment line.
-            continue;
-        }
-        if (uregex_start(parseRegexp, 8, &status) >= 0) {
-            // input file syntax error.
-            status = U_PARSE_ERROR;
-            goto cleanup;
-        }
-        if (U_FAILURE(status)) {
-            goto cleanup;
-        }
-
-        // Pick up the start and optional range end code points from the parsed line.
-        UChar32  startCodePoint = SpoofImpl::ScanHex(
-            input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status);
-        UChar32  endCodePoint = startCodePoint;
-        if (uregex_start(parseRegexp, 3, &status) >=0) {
-            endCodePoint = SpoofImpl::ScanHex(
-                input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status);
-        }
-
-        // Extract the two script names from the source line.  We need these in an 8 bit
-        //   default encoding (will be EBCDIC on IBM mainframes) in order to pass them on
-        //   to the ICU u_getPropertyValueEnum() function.  Ugh.
-        char  srcScriptName[20];
-        char  targScriptName[20];
-        extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status);
-        extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status);
-        UScriptCode srcScript  =
-            static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName));
-        UScriptCode targScript =
-            static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName));
-        if (U_FAILURE(status)) {
-            goto cleanup;
-        }
-        if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) {
-            status = U_INVALID_FORMAT_ERROR;
-            goto cleanup;
-        }
-
-        // select the table - (A) any case or (L) lower case only
-        UTrie2 *table = anyCaseTrie;
-        if (uregex_start(parseRegexp, 7, &status) >= 0) {
-            table = lowerCaseTrie;
-        }
-
-        // Build the set of scripts containing confusable characters for
-        //   the code point(s) specified in this input line.
-        // Sanity check that the script of the source code point is the same
-        //   as the source script indicated in the input file.  Failure of this check is
-        //   an error in the input file.
-        // Include the source script in the set (needed for Mixed Script Confusable detection).
-        //
-        UChar32 cp;
-        for (cp=startCodePoint; cp<=endCodePoint; cp++) {
-            int32_t setIndex = utrie2_get32(table, cp);
-            BuilderScriptSet *bsset = NULL;
-            if (setIndex > 0) {
-                U_ASSERT(setIndex < scriptSets->size());
-                bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex));
-            } else {
-                bsset = new BuilderScriptSet();
-                if (bsset == NULL) {
-                    status = U_MEMORY_ALLOCATION_ERROR;
-                    goto cleanup;
-                }
-                bsset->codePoint = cp;
-                bsset->trie = table;
-                bsset->sset = new ScriptSet();
-                setIndex = scriptSets->size();
-                bsset->index = setIndex;
-                bsset->rindex = 0;
-                if (bsset->sset == NULL) {
-                    status = U_MEMORY_ALLOCATION_ERROR;
-                    goto cleanup;
-                }
-                scriptSets->addElement(bsset, status);
-                utrie2_set32(table, cp, setIndex, &status);
-            }
-            bsset->sset->set(targScript, status);
-            bsset->sset->set(srcScript, status);
-
-            if (U_FAILURE(status)) {
-                goto cleanup;
-            }
-            UScriptCode cpScript = uscript_getScript(cp, &status);
-            if (cpScript != srcScript) {
-                status = U_INVALID_FORMAT_ERROR;
-                goto cleanup;
-            }
-        }
-    }
-
-    // Eliminate duplicate script sets.  At this point we have a separate
-    // script set for every code point that had data in the input file.
-    //
-    // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them
-    //
-    // printf("Number of scriptSets: %d\n", scriptSets->size());
-    {
-        int32_t duplicateCount = 0;
-        rtScriptSetsCount = 2;
-        for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) {
-            BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri));
-            if (outerSet->index != static_cast<uint32_t>(outeri)) {
-                // This set was already identified as a duplicate.
-                //   It will not be allocated a position in the runtime array of ScriptSets.
-                continue;
-            }
-            outerSet->rindex = rtScriptSetsCount++;
-            for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) {
-                BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri));
-                if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) {
-                    delete innerSet->sset;
-                    innerSet->scriptSetOwned = FALSE;
-                    innerSet->sset = outerSet->sset;
-                    innerSet->index = outeri;
-                    innerSet->rindex = outerSet->rindex;
-                    duplicateCount++;
-                }
-                // But this doesn't get all.  We need to fix the TRIE.
-            }
-        }
-        // printf("Number of distinct script sets: %d\n", rtScriptSetsCount);
-    }
-
-    
-
-    // Update the Trie values to be reflect the run time script indexes (after duplicate merging).
-    //    (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets
-    //     are unused, which is why the loop index starts at 2.)
-    {
-        for (int32_t i=2; i<scriptSets->size(); i++) {
-            BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
-            if (bSet->rindex != (uint32_t)i) {
-                utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status);
-            }
-        }
-    }
-
-    // For code points with script==Common or script==Inherited,
-    //   Set the reserved value of 1 into both Tries.  These characters do not participate
-    //   in Whole Script Confusable detection; this reserved value is the means
-    //   by which they are detected.
-    {
-        UnicodeSet ignoreSet;
-        ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
-        UnicodeSet inheritedSet;
-        inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
-        ignoreSet.addAll(inheritedSet);
-        for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) {
-            UChar32 rangeStart = ignoreSet.getRangeStart(rn);
-            UChar32 rangeEnd   = ignoreSet.getRangeEnd(rn);
-            utrie2_setRange32(anyCaseTrie,   rangeStart, rangeEnd, 1, TRUE, &status);
-            utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
-        }
-    }
-
-    // Serialize the data to the Spoof Detector
-    {
-        utrie2_freeze(anyCaseTrie,   UTRIE2_16_VALUE_BITS, &status);
-        int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status);
-        // printf("Any case Trie size: %d\n", size);
-        if (status != U_BUFFER_OVERFLOW_ERROR) {
-            goto cleanup;
-        }
-        status = U_ZERO_ERROR;
-        spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit;
-        spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size;
-        spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie;
-        void *where = spImpl->fSpoofData->reserveSpace(size, status);
-        utrie2_serialize(anyCaseTrie, where, size, &status);
-        
-        utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status);
-        size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status);
-        // printf("Lower case Trie size: %d\n", size);
-        if (status != U_BUFFER_OVERFLOW_ERROR) {
-            goto cleanup;
-        }
-        status = U_ZERO_ERROR;
-        spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit;
-        spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size;
-        spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie;
-        where = spImpl->fSpoofData->reserveSpace(size, status);
-        utrie2_serialize(lowerCaseTrie, where, size, &status);
-
-        spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit;
-        spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount;
-        ScriptSet *rtScriptSets =  static_cast<ScriptSet *>
-            (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status));
-        uint32_t rindex = 2;
-        for (int32_t i=2; i<scriptSets->size(); i++) {
-            BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
-            if (bSet->rindex < rindex) {
-                // We have already copied this script set to the serialized data.
-                continue;
-            }
-            U_ASSERT(rindex == bSet->rindex);
-            rtScriptSets[rindex] = *bSet->sset;   // Assignment of a ScriptSet just copies the bits.
-            rindex++;
-        }
-    }
-
-    // Open new utrie2s from the serialized data.  We don't want to keep the ones
-    //   we just built because we would then have two copies of the data, one internal to
-    //   the utries that we have already constructed, and one in the serialized data area.
-    //   An alternative would be to not pre-serialize the Trie data, but that makes the
-    //   spoof detector data different, depending on how the detector was constructed.
-    //   It's simpler to keep the data always the same.
-    
-    spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized(
-            UTRIE2_16_VALUE_BITS,
-            (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie,
-            spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
-            NULL,
-            &status);
-
-    spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized(
-            UTRIE2_16_VALUE_BITS,
-            (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie,
-            spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
-            NULL,
-            &status);
-
-    
-
-cleanup:
-    if (U_FAILURE(status)) {
-        pe->line = lineNum;
-    }
-    uregex_close(parseRegexp);
-    uprv_free(input);
-
-    int32_t i;
-    if (scriptSets != NULL) {
-        for (i=0; i<scriptSets->size(); i++) {
-            BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
-            delete bsset;
-        }
-        delete scriptSets;
-    }
-    utrie2_close(anyCaseTrie);
-    utrie2_close(lowerCaseTrie);
-    return;
-}
-
-U_NAMESPACE_END
-
-
-
-BuilderScriptSet::BuilderScriptSet() {
-    codePoint = -1;
-    trie = NULL;
-    sset = NULL;
-    index = 0;
-    rindex = 0;
-    scriptSetOwned = TRUE;
-}
-
-BuilderScriptSet::~BuilderScriptSet() {
-    if (scriptSetOwned) {
-        delete sset;
-    }
-}
-
-#endif
-#endif //  !UCONFIG_NO_REGULAR_EXPRESSIONS 
-
diff --git a/icu4c/source/i18n/uspoof_wsconf.h b/icu4c/source/i18n/uspoof_wsconf.h

deleted file mode 100644 (file)

index 4ef0c0f..0000000
--- a/icu4c/source/i18n/uspoof_wsconf.h
+++ /dev/null
@@ -1,72 +0,0 @@
-// Copyright (C) 2016 and later: Unicode, Inc. and others.
-// License & terms of use: http://www.unicode.org/copyright.html
-/*
-******************************************************************************
-*
-*   Copyright (C) 2008-2012, International Business Machines
-*   Corporation and others.  All Rights Reserved.
-*
-******************************************************************************
-*   file name:  uspoof_buildwsconf.h
-*   encoding:   US-ASCII
-*   tab size:   8 (not used)
-*   indentation:4
-*
-*   created on: 2009Jan19
-*   created by: Andy Heninger
-*
-*   Internal classes and functions
-*   for compiling whole script confusable data into its binary (runtime) form.
-*/
-
-#ifndef __USPOOF_BUILDWSCONF_H__
-#define __USPOOF_BUILDWSCONF_H__
-
-#include "unicode/utypes.h"
-
-#if !UCONFIG_NO_NORMALIZATION
-
-#if !UCONFIG_NO_REGULAR_EXPRESSIONS 
-
-#include "uspoof_impl.h"
-#include "utrie2.h"
-
-
-U_NAMESPACE_BEGIN
-
-//
-// class BuilderScriptSet.   Represents the set of scripts (Script Codes)
-//             containing characters that are confusable with one specific
-//             code point.
-//
-
-class BuilderScriptSet: public UMemory {
-  public:
-    UChar32      codePoint;       // The source code point.
-    UTrie2      *trie;            // Any-case or Lower-case Trie.
-                                  //   These Trie tables are the final result of the
-                                  //   build.  This flag indicates which of the two
-                                  //   this set of data is for.
-    ScriptSet   *sset;            // The set of scripts itself.
-
-                                  // Vectors of all B
-    uint32_t     index;           // Index of this set in the Build Time vector
-                                  //   of script sets.
-    uint32_t     rindex;          // Index of this set in the final (runtime)
-                                  //   array of sets.
-    UBool        scriptSetOwned;  // True if this BuilderScriptSet owns (should delete)
-                                  //   its underlying sset.
-
-    BuilderScriptSet();
-    ~BuilderScriptSet();
-};
-
-
-void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
-          int32_t confusablesWSLen, UParseError *pe, UErrorCode &status); 
-
-U_NAMESPACE_END
-
-#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 
-#endif // !UCONFIG_NO_NORMALIZATION 
-#endif
diff --git a/icu4c/source/test/cintltst/spooftest.c b/icu4c/source/test/cintltst/spooftest.c

index 6a82372ab7a1fb16b197265dfbabd66924c10d14..f012db0f91912e5ca20057a7eef3a9218b195c73 100644 (file)
--- a/icu4c/source/test/cintltst/spooftest.c
+++ b/icu4c/source/test/cintltst/spooftest.c
@@ -478,7 +478,7 @@ static void TestUSpoofCAPI(void) {
          const UChar* tests[] = { goodLatin, scMixed, scLatin,
                  goodCyrl, goodGreek, lll_Latin_a, lll_Latin_b, han_Hiragana };
  
-        for (int32_t i=0; i<sizeof(tests)/sizeof(UChar*); i++) {
+        for (int32_t i=0; i<UPRV_LENGTHOF(tests); i++) {
              const UChar* str = tests[i];
  
              // Basic test
author	Shane Carr <shane@unicode.org>
	Tue, 20 Sep 2016 21:06:55 +0000 (21:06 +0000)
committer	Shane Carr <shane@unicode.org>
	Tue, 20 Sep 2016 21:06:55 +0000 (21:06 +0000)
icu4c/source/i18n/identifier_info.cpp	[deleted file]	patch \| blob \| history
icu4c/source/i18n/identifier_info.h	[deleted file]	patch \| blob \| history
icu4c/source/i18n/unicode/uspoof.h		patch \| blob \| history
icu4c/source/i18n/uspoof_impl.cpp		patch \| blob \| history
icu4c/source/i18n/uspoof_impl.h		patch \| blob \| history
icu4c/source/i18n/uspoof_wsconf.cpp	[deleted file]	patch \| blob \| history
icu4c/source/i18n/uspoof_wsconf.h	[deleted file]	patch \| blob \| history
icu4c/source/test/cintltst/spooftest.c		patch \| blob \| history