+++ /dev/null
-// Copyright (C) 2016 and later: Unicode, Inc. and others.
-// License & terms of use: http://www.unicode.org/copyright.html
-/*
-**********************************************************************
-* Copyright (C) 2012-2014, International Business Machines
-* Corporation and others. All Rights Reserved.
-**********************************************************************
-*/
-
-#include "unicode/utypes.h"
-
-#include "unicode/uchar.h"
-#include "unicode/utf16.h"
-
-#include "identifier_info.h"
-#include "mutex.h"
-#include "scriptset.h"
-#include "ucln_in.h"
-#include "uvector.h"
-
-U_NAMESPACE_BEGIN
-
-static UnicodeSet *ASCII;
-static ScriptSet *JAPANESE;
-static ScriptSet *CHINESE;
-static ScriptSet *KOREAN;
-static ScriptSet *CONFUSABLE_WITH_LATIN;
-static UInitOnce gIdentifierInfoInitOnce = U_INITONCE_INITIALIZER;
-
-
-U_CDECL_BEGIN
-static UBool U_CALLCONV
-IdentifierInfo_cleanup(void) {
- delete ASCII;
- ASCII = NULL;
- delete JAPANESE;
- JAPANESE = NULL;
- delete CHINESE;
- CHINESE = NULL;
- delete KOREAN;
- KOREAN = NULL;
- delete CONFUSABLE_WITH_LATIN;
- CONFUSABLE_WITH_LATIN = NULL;
- gIdentifierInfoInitOnce.reset();
- return TRUE;
-}
-
-static void U_CALLCONV
-IdentifierInfo_init(UErrorCode &status) {
- ASCII = new UnicodeSet(0, 0x7f);
- JAPANESE = new ScriptSet();
- CHINESE = new ScriptSet();
- KOREAN = new ScriptSet();
- CONFUSABLE_WITH_LATIN = new ScriptSet();
- if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL
- || CONFUSABLE_WITH_LATIN == NULL) {
- status = U_MEMORY_ALLOCATION_ERROR;
- return;
- }
- ASCII->freeze();
- JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status)
- .set(USCRIPT_KATAKANA, status);
- CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOPOMOFO, status);
- KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status);
- CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status)
- .set(USCRIPT_CHEROKEE, status);
- ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup);
-}
-U_CDECL_END
-
-
-IdentifierInfo::IdentifierInfo(UErrorCode &status):
- fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL),
- fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) {
- umtx_initOnce(gIdentifierInfoInitOnce, &IdentifierInfo_init, status);
- if (U_FAILURE(status)) {
- return;
- }
-
- fIdentifier = new UnicodeString();
- fRequiredScripts = new ScriptSet();
- fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL, &status);
- uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet);
- fCommonAmongAlternates = new ScriptSet();
- fNumerics = new UnicodeSet();
- fIdentifierProfile = new UnicodeSet(0, 0x10FFFF);
-
- if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL ||
- fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) {
- status = U_MEMORY_ALLOCATION_ERROR;
- }
-}
-
-IdentifierInfo::~IdentifierInfo() {
- delete fIdentifier;
- delete fRequiredScripts;
- uhash_close(fScriptSetSet);
- delete fCommonAmongAlternates;
- delete fNumerics;
- delete fIdentifierProfile;
-}
-
-
-IdentifierInfo &IdentifierInfo::clear() {
- fRequiredScripts->resetAll();
- uhash_removeAll(fScriptSetSet);
- fNumerics->clear();
- fCommonAmongAlternates->resetAll();
- return *this;
-}
-
-
-IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) {
- *fIdentifierProfile = identifierProfile;
- return *this;
-}
-
-
-const UnicodeSet &IdentifierInfo::getIdentifierProfile() const {
- return *fIdentifierProfile;
-}
-
-
-IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) {
- if (U_FAILURE(status)) {
- return *this;
- }
- *fIdentifier = identifier;
- clear();
- ScriptSet scriptsForCP;
- UChar32 cp;
- for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) {
- cp = identifier.char32At(i);
- // Store a representative character for each kind of decimal digit
- if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) {
- // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value
- fNumerics->add(cp - (UChar32)u_getNumericValue(cp));
- }
- UScriptCode extensions[500];
- int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, UPRV_LENGTHOF(extensions), &status);
- if (U_FAILURE(status)) {
- return *this;
- }
- scriptsForCP.resetAll();
- for (int32_t j=0; j<extensionsCount; j++) {
- scriptsForCP.set(extensions[j], status);
- }
- scriptsForCP.reset(USCRIPT_COMMON, status);
- scriptsForCP.reset(USCRIPT_INHERITED, status);
- switch (scriptsForCP.countMembers()) {
- case 0: break;
- case 1:
- // Single script, record it.
- fRequiredScripts->Union(scriptsForCP);
- break;
- default:
- if (!fRequiredScripts->intersects(scriptsForCP)
- && !uhash_geti(fScriptSetSet, &scriptsForCP)) {
- // If the set hasn't been added already, add it
- // (Add a copy, fScriptSetSet takes ownership of the copy.)
- uhash_puti(fScriptSetSet, new ScriptSet(scriptsForCP), 1, &status);
- }
- break;
- }
- }
- // Now make a final pass through ScriptSetSet to remove alternates that came before singles.
- // [Kana], [Kana Hira] => [Kana]
- // This is relatively infrequent, so doesn't have to be optimized.
- // We also compute any commonalities among the alternates.
- if (uhash_count(fScriptSetSet) > 0) {
- fCommonAmongAlternates->setAll();
- for (int32_t it = UHASH_FIRST;;) {
- const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it);
- if (nextHashEl == NULL) {
- break;
- }
- ScriptSet *next = static_cast<ScriptSet *>(nextHashEl->key.pointer);
- // [Kana], [Kana Hira] => [Kana]
- if (fRequiredScripts->intersects(*next)) {
- uhash_removeElement(fScriptSetSet, nextHashEl);
- } else {
- fCommonAmongAlternates->intersect(*next);
- // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]]
- for (int32_t otherIt = UHASH_FIRST;;) {
- const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt);
- if (otherHashEl == NULL) {
- break;
- }
- ScriptSet *other = static_cast<ScriptSet *>(otherHashEl->key.pointer);
- if (next != other && next->contains(*other)) {
- uhash_removeElement(fScriptSetSet, nextHashEl);
- break;
- }
- }
- }
- }
- }
- if (uhash_count(fScriptSetSet) == 0) {
- fCommonAmongAlternates->resetAll();
- }
- return *this;
-}
-
-
-const UnicodeString *IdentifierInfo::getIdentifier() const {
- return fIdentifier;
-}
-
-const ScriptSet *IdentifierInfo::getScripts() const {
- return fRequiredScripts;
-}
-
-const UHashtable *IdentifierInfo::getAlternates() const {
- return fScriptSetSet;
-}
-
-
-const UnicodeSet *IdentifierInfo::getNumerics() const {
- return fNumerics;
-}
-
-const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const {
- return fCommonAmongAlternates;
-}
-
-#if !UCONFIG_NO_NORMALIZATION
-
-URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const {
- if (!fIdentifierProfile->containsAll(*fIdentifier) || getNumerics()->size() > 1) {
- return USPOOF_UNRESTRICTIVE;
- }
- if (ASCII->containsAll(*fIdentifier)) {
- return USPOOF_ASCII;
- }
- // This is a bit tricky. We look at a number of factors.
- // The number of scripts in the text.
- // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc])
- // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.)
-
- // Note: the requiredScripts set omits COMMON and INHERITED; they are taken out at the
- // time it is created, in setIdentifier().
- int32_t cardinalityPlus = fRequiredScripts->countMembers() +
- (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1);
- if (cardinalityPlus < 2) {
- return USPOOF_SINGLE_SCRIPT_RESTRICTIVE;
- }
- if (containsWithAlternates(*JAPANESE, *fRequiredScripts) || containsWithAlternates(*CHINESE, *fRequiredScripts)
- || containsWithAlternates(*KOREAN, *fRequiredScripts)) {
- return USPOOF_HIGHLY_RESTRICTIVE;
- }
- if (cardinalityPlus == 2 &&
- fRequiredScripts->test(USCRIPT_LATIN, status) &&
- !fRequiredScripts->intersects(*CONFUSABLE_WITH_LATIN)) {
- return USPOOF_MODERATELY_RESTRICTIVE;
- }
- return USPOOF_MINIMALLY_RESTRICTIVE;
-}
-
-#endif /* !UCONFIG_NO_NORMALIZATION */
-
-int32_t IdentifierInfo::getScriptCount() const {
- // Note: Common and Inherited scripts were removed by setIdentifier(), and do not appear in fRequiredScripts.
- int32_t count = fRequiredScripts->countMembers() +
- (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1);
- return count;
-}
-
-
-
-UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const {
- if (!container.contains(containee)) {
- return FALSE;
- }
- for (int32_t iter = UHASH_FIRST; ;) {
- const UHashElement *hashEl = uhash_nextElement(fScriptSetSet, &iter);
- if (hashEl == NULL) {
- break;
- }
- ScriptSet *alternatives = static_cast<ScriptSet *>(hashEl->key.pointer);
- if (!container.intersects(*alternatives)) {
- return false;
- }
- }
- return true;
-}
-
-UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) {
- UVector sorted(status);
- if (U_FAILURE(status)) {
- return dest;
- }
- for (int32_t pos = UHASH_FIRST; ;) {
- const UHashElement *el = uhash_nextElement(alternates, &pos);
- if (el == NULL) {
- break;
- }
- ScriptSet *ss = static_cast<ScriptSet *>(el->key.pointer);
- sorted.addElement(ss, status);
- }
- sorted.sort(uhash_compareScriptSet, status);
- UnicodeString separator = UNICODE_STRING_SIMPLE("; ");
- for (int32_t i=0; i<sorted.size(); i++) {
- if (i>0) {
- dest.append(separator);
- }
- ScriptSet *ss = static_cast<ScriptSet *>(sorted.elementAt(i));
- ss->displayScripts(dest);
- }
- return dest;
-}
-
-U_NAMESPACE_END
-
+++ /dev/null
-// Copyright (C) 2016 and later: Unicode, Inc. and others.
-// License & terms of use: http://www.unicode.org/copyright.html
-/*
-**********************************************************************
-* Copyright (C) 2014, International Business Machines
-* Corporation and others. All Rights Reserved.
-**********************************************************************
-*
-* indentifier_info.h
-*
-* created on: 2013 Jan 7
-* created by: Andy Heninger
-*/
-
-#ifndef __IDENTIFIER_INFO_H__
-#define __IDENTIFIER_INFO_H__
-
-#include "unicode/utypes.h"
-
-#include "unicode/uniset.h"
-#include "unicode/uspoof.h"
-#include "uhash.h"
-
-U_NAMESPACE_BEGIN
-
-class ScriptSet;
-
-// TODO(andy): review consistency of reference vs pointer arguments to the funcions.
-
-/**
- * This class analyzes a possible identifier for script and identifier status. Use it by calling setIdentifierProfile
- * then setIdentifier. Available methods include:
- * <ol>
- * <li>call getScripts for the specific scripts in the identifier. The identifier contains at least one character in
- * each of these.
- * <li>call getAlternates to get cases where a character is not limited to a single script. For example, it could be
- * either Katakana or Hiragana.
- * <li>call getCommonAmongAlternates to find out if any scripts are common to all the alternates.
- * <li>call getNumerics to get a representative character (with value zero) for each of the decimal number systems in
- * the identifier.
- * <li>call getRestrictionLevel to see what the UTS36 restriction level is.
- * </ol>
- *
- * This is a port from ICU4J of class com.ibm.icu.text.IdentifierInfo
- */
-class U_I18N_API IdentifierInfo : public UMemory {
-
- public:
- /**
- * Create an identifier info object. Subsequently, call setIdentifier(), etc.
- * @internal
- */
- IdentifierInfo(UErrorCode &status);
-
- /**
- * Destructor
- */
- virtual ~IdentifierInfo();
-
- private:
- /* Disallow copying for now. Can be added if there's a need. */
- IdentifierInfo(const IdentifierInfo &other);
-
- public:
-
- /**
- * Set the identifier profile: the characters that are to be allowed in the identifier.
- *
- * @param identifierProfile the characters that are to be allowed in the identifier
- * @return this
- * @internal
- */
- IdentifierInfo &setIdentifierProfile(const UnicodeSet &identifierProfile);
-
- /**
- * Get the identifier profile: the characters that are to be allowed in the identifier.
- *
- * @return The characters that are to be allowed in the identifier.
- * @internal
- */
- const UnicodeSet &getIdentifierProfile() const;
-
-
- /**
- * Set an identifier to analyze. Afterwards, call methods like getScripts()
- *
- * @param identifier the identifier to analyze
- * @param status Errorcode, set if errors occur.
- * @return this
- * @internal
- */
- IdentifierInfo &setIdentifier(const UnicodeString &identifier, UErrorCode &status);
-
-
- /**
- * Get the identifier that was analyzed. The returned string is owned by the ICU library,
- * and must not be deleted by the caller.
- *
- * @return the identifier that was analyzed.
- * @internal
- */
- const UnicodeString *getIdentifier() const;
-
-
- /**
- * Get the scripts found in the identifiers.
- *
- * @return the set of explicit scripts.
- * @internal
- */
- const ScriptSet *getScripts() const;
-
- /**
- * Get the set of alternate scripts found in the identifiers. That is, when a character can be in two scripts, then
- * the set consisting of those scripts will be returned.
- *
- * @return a uhash, with each key being of type (ScriptSet *).
- * This is a set, not a map, so the value stored in the uhash is not relevant.
- * (It is, in fact, 1).
- * Ownership of the uhash and its contents remains with the IndetifierInfo object,
- * and remains valid until a new identifer is set or until the object is deleted.
- * @internal
- */
- const UHashtable *getAlternates() const;
-
- /**
- * Get the representative characters (zeros) for the numerics found in the identifier.
- *
- * @return the set of explicit scripts.
- * @internal
- */
- const UnicodeSet *getNumerics() const;
-
- /**
- * Find out which scripts are in common among the alternates.
- *
- * @return the set of scripts that are in common among the alternates.
- * @internal
- */
- const ScriptSet *getCommonAmongAlternates() const;
-
- /**
- * Get the number of scripts appearing in the identifier.
- * Note: Common and Inherited scripts are omitted from the count.
- * Note: Result may be high when the identifier contains characters
- * with alternate scripts. The distinction between
- * 0, 1 and > 1 will remain valid, however.
- * @return the number of scripts.
- */
- int32_t getScriptCount() const;
-
-#if !UCONFIG_NO_NORMALIZATION
-
- /**
- * Find the "tightest" restriction level that the identifier satisfies.
- *
- * @return the restriction level.
- * @internal
- */
- URestrictionLevel getRestrictionLevel(UErrorCode &status) const;
-
-#endif /*!UCONFIG_NO_NORMALIZATION */
-
- UnicodeString toString() const;
-
- /**
- * Produce a readable string of alternates.
- *
- * @param alternates a UHashtable of UScriptSets.
- * Keys only, no meaningful values in the UHash.
- * @return display form
- * @internal
- */
- static UnicodeString &displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status);
-
- private:
-
- IdentifierInfo & clear();
- UBool containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const;
-
- UnicodeString *fIdentifier;
- ScriptSet *fRequiredScripts;
- UHashtable *fScriptSetSet;
- ScriptSet *fCommonAmongAlternates;
- UnicodeSet *fNumerics;
- UnicodeSet *fIdentifierProfile;
-};
-
-U_NAMESPACE_END
-
-#endif // __IDENTIFIER_INFO_H__
-
* <a href="http://unicode.org/reports/tr39">Unicode Technical Standard #39</a>, has two main functions:
*
* <ol>
- * <li>Checking whether two strings are visually <em>confusable</em> with each other, such as "desordenado" and
- * "ԁеѕогԁепаԁо".</li>
+ * <li>Checking whether two strings are visually <em>confusable</em> with each other, such as "Harvest" and
+ * "Ηarvest", where the second string starts with the Greek capital letter Eta.</li>
* <li>Checking whether an individual string is likely to be an attempt at confusing the reader (<em>spoof
- * detection</em>), such as "pаypаl" spelled with Cyrillic 'а' characters.</li>
+ * detection</em>), such as "paypal" with some Latin characters substituted with Cyrillic look-alikes.</li>
* </ol>
*
* <p>
*
* \code{.c}
* UErrorCode status = U_ZERO_ERROR;
+ * UChar* str1 = (UChar*) u"Harvest";
+ * UChar* str2 = (UChar*) u"\u0397arvest"; // with U+0397 GREEK CAPITAL LETTER ETA
+ *
* USpoofChecker* sc = uspoof_open(&status);
* uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
- * int32_t bitmask = uspoof_areConfusable(sc, (UChar*) u"desordenado", -1, (UChar*) u"ԁеѕогԁепаԁо", -1, &status);
- * UBool result = (bitmask & USPOOF_ALL_CHECKS) != 0;
- * printf("areConfusable: %d (success: %d)\n", result, U_SUCCESS(status)); // areConfusable: 1 (success: 1)
+ *
+ * int32_t bitmask = uspoof_areConfusable(sc, str1, -1, str2, -1, &status);
+ * UBool result = bitmask != 0;
+ * // areConfusable: 1 (status: U_ZERO_ERROR)
+ * printf("areConfusable: %d (status: %s)\n", result, u_errorName(status));
* uspoof_close(sc);
* \endcode
*
* <p>
- * The second line of the example creates a <code>USpoofChecker</code> object; the third line enables confusable
- * checking and disables all other checks; the fourth line performs the confusability test; and the fifth line extracts
- * the result out of the confusability test. For best performance, the instance should be created once (e.g., upon
- * application startup), and the efficient {@link uspoof_areConfusable} method can be used at runtime.
+ * The call to {@link uspoof_open} creates a <code>USpoofChecker</code> object; the call to {@link uspoof_setChecks}
+ * enables confusable checking and disables all other checks; the call to {@link uspoof_areConfusable} performs the
+ * confusability test; and the following line extracts the result out of the return value. For best performance,
+ * the instance should be created once (e.g., upon application startup), and the efficient
+ * {@link uspoof_areConfusable} method can be used at runtime.
*
* <p>
* The type {@link LocalUSpoofCheckerPointer} is exposed for C++ programmers. It will automatically call
*
* \code{.c}
* UErrorCode status = U_ZERO_ERROR;
- * UChar* str1 = (UChar*) u"desordenado";
- * UChar* str2 = (UChar*) u"ԁеѕогԁепаԁо";
+ * UChar* str1 = (UChar*) u"Harvest";
+ * UChar* str2 = (UChar*) u"\u0397arvest"; // with U+0397 GREEK CAPITAL LETTER ETA
*
* USpoofChecker* sc = uspoof_open(&status);
* uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
*
* // Get skeleton 1
* int32_t skel1Len = uspoof_getSkeleton(sc, 0, str1, -1, NULL, 0, &status);
- * UChar* skel1 = (UChar*) malloc(skel1Len * sizeof(UChar));
+ * UChar* skel1 = (UChar*) malloc(++skel1Len * sizeof(UChar));
* status = U_ZERO_ERROR;
* uspoof_getSkeleton(sc, 0, str1, -1, skel1, skel1Len, &status);
*
* // Get skeleton 2
* int32_t skel2Len = uspoof_getSkeleton(sc, 0, str2, -1, NULL, 0, &status);
- * UChar* skel2 = (UChar*) malloc(skel2Len * sizeof(UChar));
+ * UChar* skel2 = (UChar*) malloc(++skel2Len * sizeof(UChar));
* status = U_ZERO_ERROR;
* uspoof_getSkeleton(sc, 0, str2, -1, skel2, skel2Len, &status);
*
* // Are the skeletons the same?
- * UBool result = (skel1Len == skel2Len) && memcmp(skel1, skel2, skel1Len) == 0;
- * printf("areConfusable: %d (success: %d)\n", result, U_SUCCESS(status)); // areConfusable: 1 (success: 1)
+ * UBool result = u_strCompare(skel1, -1, skel2, -1, FALSE) == 0;
+ * // areConfusable: 1 (status: U_ZERO_ERROR)
+ * printf("areConfusable: %d (status: %s)\n", result, u_errorName(status));
* uspoof_close(sc);
* free(skel1);
* free(skel2);
* {uspoof_areConfusable} many times in a loop, {uspoof_getSkeleton} can be used instead, as shown below:
*
* \code{.c}
- * // Setup:
* UErrorCode status = U_ZERO_ERROR;
- * UChar* dictionary[2] = { (UChar*) u"lorem", (UChar*) u"ipsum" };
- * UChar* skeletons[sizeof(dictionary)/sizeof(UChar*)];
- * int32_t skeletonLengths[sizeof(dictionary)/sizeof(UChar*)];
+ * #define DICTIONARY_LENGTH 2
+ * UChar* dictionary[DICTIONARY_LENGTH] = { (UChar*) u"lorem", (UChar*) u"ipsum" };
+ * UChar* skeletons[DICTIONARY_LENGTH];
* UChar* str = (UChar*) u"1orern";
*
* // Setup:
* USpoofChecker* sc = uspoof_open(&status);
* uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
- * for (size_t i=0; i<sizeof(dictionary)/sizeof(UChar*); i++) {
+ * for (size_t i=0; i<DICTIONARY_LENGTH; i++) {
* UChar* word = dictionary[i];
* int32_t len = uspoof_getSkeleton(sc, 0, word, -1, NULL, 0, &status);
- * skeletons[i] = (UChar*) malloc(len * sizeof(UChar));
- * skeletonLengths[i] = len;
+ * skeletons[i] = (UChar*) malloc(++len * sizeof(UChar));
* status = U_ZERO_ERROR;
* uspoof_getSkeleton(sc, 0, word, -1, skeletons[i], len, &status);
* }
* // Live Check:
* {
* int32_t len = uspoof_getSkeleton(sc, 0, str, -1, NULL, 0, &status);
- * UChar* skel = (UChar*) malloc(len * sizeof(UChar));
+ * UChar* skel = (UChar*) malloc(++len * sizeof(UChar));
* status = U_ZERO_ERROR;
* uspoof_getSkeleton(sc, 0, str, -1, skel, len, &status);
* UBool result = FALSE;
- * for (size_t i=0; i<sizeof(dictionary)/sizeof(UChar*); i++) {
- * if (len == skeletonLengths[i] && memcmp(skel, skeletons[i], len) == 0) {
- * result = TRUE;
- * }
+ * for (size_t i=0; i<DICTIONARY_LENGTH; i++) {
+ * result = u_strCompare(skel, -1, skeletons[i], -1, FALSE) == 0;
+ * if (result == TRUE) { break; }
* }
- * // Has confusable in dictionary: 1 (success: 1)
- * printf("Has confusable in dictionary: %d (success: %d)\n", result, U_SUCCESS(status));
+ * // Has confusable in dictionary: 1 (status: U_ZERO_ERROR)
+ * printf("Has confusable in dictionary: %d (status: %s)\n", result, u_errorName(status));
* free(skel);
* }
*
- * // Cleanup:
- * for (size_t i=0; i<sizeof(dictionary)/sizeof(UChar*); i++) {
+ * for (size_t i=0; i<DICTIONARY_LENGTH; i++) {
* free(skeletons[i]);
* }
* uspoof_close(sc);
*
* \code{.c}
* UErrorCode status = U_ZERO_ERROR;
- * UChar* str = (UChar*) u"pаypаl"; // with Cyrillic 'а' characters
+ * UChar* str = (UChar*) u"p\u0430ypal"; // with U+0430 CYRILLIC SMALL LETTER A
*
* // Get the default set of allowable characters:
* USet* allowed = uset_openEmpty();
*
* int32_t bitmask = uspoof_check(sc, str, -1, NULL, &status);
* UBool result = bitmask != 0;
- * printf("fails checks: %d (success: %d)\n", result, U_SUCCESS(status)); // fails checks: 1 (success: 1)
+ * // fails checks: 1 (status: U_ZERO_ERROR)
+ * printf("fails checks: %d (status: %s)\n", result, u_errorName(status));
* uspoof_close(sc);
* uset_close(allowed);
* \endcode
*
* \code{.c}
* UErrorCode status = U_ZERO_ERROR;
- * UChar* str = (UChar*) u"pаypаl"; // with Cyrillic 'а' characters
+ * UChar* str = (UChar*) u"p\u0430ypal"; // with U+0430 CYRILLIC SMALL LETTER A
*
* // Get the default set of allowable characters:
* USet* allowed = uset_openEmpty();
* int32_t failures1 = bitmask;
* int32_t failures2 = uspoof_getCheckResultChecks(checkResult, &status);
* assert(failures1 == failures2);
- * // checks that failed: 16 (success: 1)
- * printf("checks that failed: %d (success: %d)\n", failures1, U_SUCCESS(status));
+ * // checks that failed: 0x00000010 (status: U_ZERO_ERROR)
+ * printf("checks that failed: %#010x (status: %s)\n", failures1, u_errorName(status));
*
* // Cleanup:
* uspoof_close(sc);
*
* \code{.cpp}
* UErrorCode status = U_ZERO_ERROR;
- * UnicodeString str((UChar*) u"pаypаl"); // with Cyrillic 'а' characters
+ * UnicodeString str((UChar*) u"p\u0430ypal"); // with U+0430 CYRILLIC SMALL LETTER A
*
* // Get the default set of allowable characters:
* UnicodeSet allowed;
* int32_t failures1 = bitmask;
* int32_t failures2 = uspoof_getCheckResultChecks(checkResult.getAlias(), &status);
* assert(failures1 == failures2);
- * // checks that failed: 16 (success: 1)
- * printf("checks that failed: %d (success: %d)\n", failures1, U_SUCCESS(status));
+ * // checks that failed: 0x00000010 (status: U_ZERO_ERROR)
+ * printf("checks that failed: %#010x (status: %s)\n", failures1, u_errorName(status));
*
* // Explicit cleanup not necessary.
* \endcode
*
* \code{.c}
* UErrorCode status = U_ZERO_ERROR;
- * UChar* str = (UChar*) u"৪8";
+ * UChar* str = (UChar*) u"8\u09EA"; // 8 mixed with U+09EA BENGALI DIGIT FOUR
*
* USpoofChecker* sc = uspoof_open(&status);
* uspoof_setChecks(sc, USPOOF_INVISIBLE | USPOOF_MIXED_NUMBERS, &status);
*
* int32_t bitmask = uspoof_check2(sc, str, -1, NULL, &status);
* UBool result = bitmask != 0;
- * printf("fails checks: %d (success: %d)\n", result, U_SUCCESS(status)); // fails checks: 1 (success: 1)
+ * // fails checks: 1 (status: U_ZERO_ERROR)
+ * printf("fails checks: %d (status: %s)\n", result, u_errorName(status));
* uspoof_close(sc);
* \endcode
*
*
* \code{.cpp}
* UErrorCode status = U_ZERO_ERROR;
- * UnicodeString str((UChar*) u"pаypаl"); // with Cyrillic 'а' characters
+ * UnicodeString str((UChar*) u"p\u0430ypal"); // with U+0430 CYRILLIC SMALL LETTER A
*
* // Get the default set of allowable characters:
* UnicodeSet allowed;
* int32_t bitmask = uspoof_check2UnicodeString(sc.getAlias(), str, checkResult.getAlias(), &status);
*
* URestrictionLevel restrictionLevel = uspoof_getCheckResultRestrictionLevel(checkResult.getAlias(), &status);
- * // Since USPOOF_AUX_INFO was enabled, the restriction level is also available via the bitmask:
+ * // Since USPOOF_AUX_INFO was enabled, the restriction level is also available in the upper bits of the bitmask:
* assert((restrictionLevel & bitmask) == restrictionLevel);
- * // Restriction level: 1342177280 (success: 1)
- * printf("Restriction level: %d (success: %d)\n", restrictionLevel, U_SUCCESS(status));
+ * // Restriction level: 0x50000000 (status: U_ZERO_ERROR)
+ * printf("Restriction level: %#010x (status: %s)\n", restrictionLevel, u_errorName(status));
* \endcode
*
* <p>
- * The code '1342177280' corresponds to the restriction level USPOOF_MINIMALLY_RESTRICTIVE. Since
+ * The code '0x50000000' corresponds to the restriction level USPOOF_MINIMALLY_RESTRICTIVE. Since
* USPOOF_MINIMALLY_RESTRICTIVE is weaker than USPOOF_MODERATELY_RESTRICTIVE, the string fails the check.
*
* <p>
* A <code>USpoofChecker</code> instance may be used repeatedly to perform checks on any number of identifiers.
*
* <p>
- * <b>Thread Safety:</b> Thread Safety: The test functions for checking a single identifier, or for testing whether
+ * <b>Thread Safety:</b> The test functions for checking a single identifier, or for testing whether
* two identifiers are possibly confusable, are thread safe. They may be called concurrently, from multiple threads,
* using the same USpoofChecker instance.
*
* <p>
* More generally, the standard ICU thread safety rules apply: functions that take a const USpoofChecker parameter are
- * thread safe. Those that take a non-const USpoofChecier are not thread safe..
+ * thread safe. Those that take a non-const USpoofChecker are not thread safe.
*
* @stable ICU 4.6
*/
* the checks to some subset of SINGLE_SCRIPT_CONFUSABLE, MIXED_SCRIPT_CONFUSABLE, or WHOLE_SCRIPT_CONFUSABLE to
* make {@link uspoof_areConfusable} return only those types of confusables.
*
- * <p>Note: if you wish to use {@link uspoof_getSkeleton}, it is required that you enable at least one of the
- * CONFUSABLE flags.
- *
* @see uspoof_areConfusable
* @see uspoof_getSkeleton
* @draft ICU 58
- * @provisional This API might change or be removed in a future release.
*/
USPOOF_CONFUSABLE = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE,
USPOOF_INVISIBLE = 32,
/** Check that an identifier contains only characters from a specified set
- * of acceptable characters. See {@link uspoof_setAllowedChars}
+ * of acceptable characters. See {@link uspoof_setAllowedChars} and
* {@link uspoof_setAllowedLocales}. Note that a string that fails this check
* will also fail the {@link USPOOF_RESTRICTION_LEVEL} check.
*/
uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status);
/**
- * Set the loosest restriction level allowed for strings. The default if this is not called is
- * {@link USPOOF_HIGHLY_RESTRICTIVE}. Calling this method enables the {@link USPOOF_RESTRICTION_LEVEL} and
- * {@link USPOOF_MIXED_NUMBERS} checks, corresponding to Sections 5.1 and 5.2 of UTS 39. To customize which checks are
- * to be performed by {@link uspoof_check}, see {@link uspoof_setChecks}.
- * @param restrictionLevel The loosest restriction level allowed.
- * @see URestrictionLevel
- * @stable ICU 51
- */
+ * Set the loosest restriction level allowed for strings. The default if this is not called is
+ * {@link USPOOF_HIGHLY_RESTRICTIVE}. Calling this method enables the {@link USPOOF_RESTRICTION_LEVEL} and
+ * {@link USPOOF_MIXED_NUMBERS} checks, corresponding to Sections 5.1 and 5.2 of UTS 39. To customize which checks are
+ * to be performed by {@link uspoof_check}, see {@link uspoof_setChecks}.
+ *
+ * @param sc The USpoofChecker
+ * @param restrictionLevel The loosest restriction level allowed.
+ * @see URestrictionLevel
+ * @stable ICU 51
+ */
U_STABLE void U_EXPORT2
uspoof_setRestrictionLevel(USpoofChecker *sc, URestrictionLevel restrictionLevel);
* @param sc The USpoofChecker
* @param id The identifier to be checked for possible security issues,
* in UTF-16 format.
+ * @param length the length of the string to be checked, or -1 if the string is
+ * zero terminated.
* @param checkResult An instance of USpoofCheckResult to be filled with
* details about the identifier. Can be NULL.
* @param status The error code, set if an error occurred while attempting to
*
* <ul>
* <li>{@link USPOOF_SINGLE_SCRIPT_CONFUSABLE}</li>
- * <li>{@link USPOOF_MIXED_SCRIPT_CONFUSABLE</li>
+ * <li>{@link USPOOF_MIXED_SCRIPT_CONFUSABLE}</li>
* <li>{@link USPOOF_WHOLE_SCRIPT_CONFUSABLE}</li>
* </ul>
*
if (U_FAILURE(status)) { return; }
UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
- allowedCharsSet->freeze();
fAllowedCharsSet = allowedCharsSet;
fAllowedLocales = uprv_strdup("");
if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
+ allowedCharsSet->freeze();
}
fSpoofData = src.fSpoofData->addReference();
}
fAllowedCharsSet = static_cast<const UnicodeSet *>(src.fAllowedCharsSet->clone());
- if (fAllowedCharsSet == NULL) {
+ fAllowedLocales = uprv_strdup(src.fAllowedLocales);
+ if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
}
- fAllowedLocales = uprv_strdup(src.fAllowedLocales);
fRestrictionLevel = src.fRestrictionLevel;
}
// Used to convert this CheckResult to the older int32_t return value API
int32_t toCombinedBitmask(int32_t expectedChecks);
- // Data Members (all stack-allocated)
+ // Data Members
int32_t fMagic; // Internal sanity check.
int32_t fChecks; // Bit vector of checks that were failed.
UnicodeSet fNumerics; // Set of numerics found in the string.
+++ /dev/null
-// Copyright (C) 2016 and later: Unicode, Inc. and others.
-// License & terms of use: http://www.unicode.org/copyright.html
-/*
-******************************************************************************
-*
-* Copyright (C) 2008-2013, International Business Machines
-* Corporation and others. All Rights Reserved.
-*
-******************************************************************************
-* file name: uspoof_wsconf.cpp
-* encoding: US-ASCII
-* tab size: 8 (not used)
-* indentation:4
-*
-* created on: 2009Jan05 (refactoring earlier files)
-* created by: Andy Heninger
-*
-* Internal functions for compiling Whole Script confusable source data
-* into its binary (runtime) form. The binary data format is described
-* in uspoof_impl.h
-*/
-
-#include "unicode/utypes.h"
-#include "unicode/uspoof.h"
-
-#if !UCONFIG_NO_NORMALIZATION
-
-#if !UCONFIG_NO_REGULAR_EXPRESSIONS
-
-#include "unicode/unorm.h"
-#include "unicode/uregex.h"
-#include "unicode/ustring.h"
-#include "cmemory.h"
-#include "scriptset.h"
-#include "uspoof_impl.h"
-#include "uhash.h"
-#include "uvector.h"
-#include "uassert.h"
-#include "uspoof_wsconf.h"
-
-U_NAMESPACE_USE
-
-
-// Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt
-// Example Lines:
-//   006F ; Latn; Deva; A # (o) LATIN SMALL LETTER O
-//   0048..0049 ; Latn; Grek; A # [2] (H..I) LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
-//
-// Fields of a data line, left to right:
-//   1. Source code point, or a range of code points written as "start..end".
-//   2. Source script. Should match the script of the source code points.
-//      Beyond checking that, we don't keep it.
-//   3. Target script. We need this.
-//   4. Which table, Any Case or Lower Case (A or L).
-//
-// The expression will match _all_ lines, including erroneous lines.
-// The result of the parse is returned via the contents of the (match) groups.
-//
-// The range separator is matched as a literal "..": the dots are escaped, because
-// unescaped dots would accept any two characters between the two code points and
-// let a malformed range slip past the error-line alternative (Group 8).
-static const char *parseExp =
-    "(?m)"                                              // Multi-line mode
-    "^([ \\t]*(?:#.*?)?)$"                              // A blank or comment line. Matches Group 1.
-    "|^(?:"                                             //   OR
-    "\\s*([0-9A-F]{4,})(?:\\.\\.([0-9A-F]{4,}))?\\s*;"  // Code point range. Groups 2 and 3.
-    "\\s*([A-Za-z]+)\\s*;"                              // The source script. Group 4.
-    "\\s*([A-Za-z]+)\\s*;"                              // The target script. Group 5.
-    "\\s*(?:(A)|(L))"                                   // The table A or L. Group 6 or 7
-    "[ \\t]*(?:#.*?)?"                                  // Trailing comment
-    ")$|"                                               //   OR
-    "^(.*?)$";                                          // An error line. Group 8.
-                                                        //   Any line not matching the preceding
-                                                        //   parts of the expression will match
-                                                        //   this, and thus be flagged as an error.
-
-
-// Copy the text of one regular expression match group into a caller-supplied
-// char buffer, converting from UTF-16 with the invariant-character converter.
-// Used for script names, which must consist of invariant characters only.
-//
-// destBuf is always set to the empty string first. It stays empty when the
-// group cannot be fetched (failure status, or a length of -1) or when the
-// group text plus its terminator would not fit in destCapacity.
-//
-static void extractGroup(
-    URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) {
-
-    destBuf[0] = 0;
-
-    UChar groupText[50];
-    groupText[0] = 0;
-    const int32_t groupLen = uregex_group(e, group, groupText, 50, &status);
-
-    const UBool usable = U_SUCCESS(status) && groupLen != -1 && groupLen < destCapacity;
-    if (usable) {
-        // Read-only alias over the local buffer; no copy of the UTF-16 text is made.
-        UnicodeString alias(FALSE, groupText, groupLen);
-        alias.extract(0, groupLen, destBuf, destCapacity, US_INV);
-    }
-}
-
-
-
-U_NAMESPACE_BEGIN
-
-// Build the Whole Script Confusable data
-//
-// Parses the UTF-8 text of confusablesWholeScript.txt, builds the any-case and
-// lower-case tries plus the array of script sets they index, and serializes all
-// three into spImpl->fSpoofData. Finally the tries are re-opened from the
-// serialized bytes and the build-time tries are discarded.
-//
-//   spImpl            The spoof checker whose fSpoofData receives the result.
-//   confusablesWS     The source data, in UTF-8.
-//   confusablesWSLen  Length of confusablesWS, passed through to u_strFromUTF8().
-//   pe                On failure, pe->line is set to the number of the last input
-//                     line that was examined.
-//   status            In/out error code. Nothing is done if it already indicates failure.
-//                     U_PARSE_ERROR is set for a line that does not match the expected
-//                     syntax, U_INVALID_FORMAT_ERROR for an unknown script name or for a
-//                     code point whose script differs from the stated source script.
-//
-// TODO: Reorganize. Either get rid of the WSConfusableDataBuilder class,
-//       because everything is local to this one build function anyhow,
-//       OR
-//       break this function into more reasonably sized pieces, with
-//       state in WSConfusableDataBuilder.
-//
-void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
-          int32_t confusablesWSLen, UParseError *pe, UErrorCode &status)
-{
-    if (U_FAILURE(status)) {
-        return;
-    }
-    URegularExpression *parseRegexp = NULL;
-    int32_t             inputLen    = 0;
-    UChar              *input       = NULL;
-    int32_t             lineNum     = 0;
-
-    UVector            *scriptSets        = NULL;
-    uint32_t            rtScriptSetsCount = 2;
-
-    UTrie2             *anyCaseTrie   = NULL;
-    UTrie2             *lowerCaseTrie = NULL;
-
-    anyCaseTrie = utrie2_open(0, 0, &status);
-    lowerCaseTrie = utrie2_open(0, 0, &status);
-
-    UnicodeString pattern(parseExp, -1, US_INV);
-
-    // The scriptSets vector provides a mapping from TRIE values to the set of scripts.
-    //
-    // Reserved TRIE values:
-    //   0:  Code point has no whole script confusables.
-    //   1:  Code point is of script Common or Inherited.
-    //       These code points do not participate in whole script confusable detection.
-    //       (This is logically equivalent to saying that they contain confusables in
-    //        all scripts)
-    //
-    // Because Trie values are indexes into the ScriptSets vector, pre-fill
-    // vector positions 0 and 1 to avoid conflicts with the reserved values.
-
-    scriptSets = new UVector(status);
-    if (scriptSets == NULL) {
-        status = U_MEMORY_ALLOCATION_ERROR;
-        goto cleanup;
-    }
-    scriptSets->addElement((void *)NULL, status);
-    scriptSets->addElement((void *)NULL, status);
-
-    // Convert the user input data from UTF-8 to UChar (UTF-16)
-    // The first call only measures: it is expected to end with a buffer overflow.
-    u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status);
-    if (status != U_BUFFER_OVERFLOW_ERROR) {
-        goto cleanup;
-    }
-    status = U_ZERO_ERROR;
-    input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
-    if (input == NULL) {
-        status = U_MEMORY_ALLOCATION_ERROR;
-        goto cleanup;
-    }
-    u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status);
-
-    parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);
-
-    // Zap any Byte Order Mark at the start of input. Changing it to a space is benign
-    //   given the syntax of the input.
-    if (*input == 0xfeff) {
-        *input = 0x20;
-    }
-
-    // Parse the input, one line per iteration of this loop.
-    uregex_setText(parseRegexp, input, inputLen, &status);
-    while (uregex_findNext(parseRegexp, &status)) {
-        lineNum++;
-        if (uregex_start(parseRegexp, 1, &status) >= 0) {
-            // this was a blank or comment line.
-            continue;
-        }
-        if (uregex_start(parseRegexp, 8, &status) >= 0) {
-            // input file syntax error.
-            status = U_PARSE_ERROR;
-            goto cleanup;
-        }
-        if (U_FAILURE(status)) {
-            goto cleanup;
-        }
-
-        // Pick up the start and optional range end code points from the parsed line.
-        UChar32 startCodePoint = SpoofImpl::ScanHex(
-            input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status);
-        UChar32 endCodePoint = startCodePoint;
-        if (uregex_start(parseRegexp, 3, &status) >=0) {
-            endCodePoint = SpoofImpl::ScanHex(
-                input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status);
-        }
-
-        // Extract the two script names from the source line. We need these in an 8 bit
-        //   default encoding (will be EBCDIC on IBM mainframes) in order to pass them on
-        //   to the ICU u_getPropertyValueEnum() function. Ugh.
-        char srcScriptName[20];
-        char targScriptName[20];
-        extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status);
-        extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status);
-        UScriptCode srcScript =
-            static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName));
-        UScriptCode targScript =
-            static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName));
-        if (U_FAILURE(status)) {
-            goto cleanup;
-        }
-        if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) {
-            status = U_INVALID_FORMAT_ERROR;
-            goto cleanup;
-        }
-
-        // select the table - (A) any case or (L) lower case only
-        UTrie2 *table = anyCaseTrie;
-        if (uregex_start(parseRegexp, 7, &status) >= 0) {
-            table = lowerCaseTrie;
-        }
-
-        // Build the set of scripts containing confusable characters for
-        //   the code point(s) specified in this input line.
-        // Sanity check that the script of the source code point is the same
-        //   as the source script indicated in the input file. Failure of this check is
-        //   an error in the input file.
-        // Include the source script in the set (needed for Mixed Script Confusable detection).
-        //
-        UChar32 cp;
-        for (cp=startCodePoint; cp<=endCodePoint; cp++) {
-            int32_t setIndex = utrie2_get32(table, cp);
-            BuilderScriptSet *bsset = NULL;
-            if (setIndex > 0) {
-                // This code point already has a set; its trie value is the vector index.
-                U_ASSERT(setIndex < scriptSets->size());
-                bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex));
-            } else {
-                bsset = new BuilderScriptSet();
-                if (bsset == NULL) {
-                    status = U_MEMORY_ALLOCATION_ERROR;
-                    goto cleanup;
-                }
-                bsset->codePoint = cp;
-                bsset->trie = table;
-                bsset->sset = new ScriptSet();
-                setIndex = scriptSets->size();
-                bsset->index = setIndex;
-                bsset->rindex = 0;
-                if (bsset->sset == NULL) {
-                    // bsset is not in scriptSets yet, so the cleanup code would not find it.
-                    delete bsset;
-                    status = U_MEMORY_ALLOCATION_ERROR;
-                    goto cleanup;
-                }
-                scriptSets->addElement(bsset, status);
-                if (scriptSets->size() == setIndex) {
-                    // The vector did not take the element, so nothing else owns bsset.
-                    // Judge by the size rather than by status, so that an element that
-                    // was stored is never deleted here and again during cleanup.
-                    delete bsset;
-                    if (U_SUCCESS(status)) {
-                        status = U_MEMORY_ALLOCATION_ERROR;
-                    }
-                    goto cleanup;
-                }
-                utrie2_set32(table, cp, setIndex, &status);
-            }
-            bsset->sset->set(targScript, status);
-            bsset->sset->set(srcScript, status);
-
-            if (U_FAILURE(status)) {
-                goto cleanup;
-            }
-            UScriptCode cpScript = uscript_getScript(cp, &status);
-            if (cpScript != srcScript) {
-                status = U_INVALID_FORMAT_ERROR;
-                goto cleanup;
-            }
-        }
-    }
-
-    // Eliminate duplicate script sets. At this point we have a separate
-    // script set for every code point that had data in the input file.
-    //
-    // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them
-    //
-    // printf("Number of scriptSets: %d\n", scriptSets->size());
-    {
-        int32_t duplicateCount = 0;
-        rtScriptSetsCount = 2;
-        for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) {
-            BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri));
-            if (outerSet->index != static_cast<uint32_t>(outeri)) {
-                // This set was already identified as a duplicate.
-                //   It will not be allocated a position in the runtime array of ScriptSets.
-                continue;
-            }
-            outerSet->rindex = rtScriptSetsCount++;
-            for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) {
-                BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri));
-                if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) {
-                    // Same contents, different object: share the outer set's ScriptSet.
-                    delete innerSet->sset;
-                    innerSet->scriptSetOwned = FALSE;
-                    innerSet->sset = outerSet->sset;
-                    innerSet->index = outeri;
-                    innerSet->rindex = outerSet->rindex;
-                    duplicateCount++;
-                }
-                // But this doesn't get all. We need to fix the TRIE.
-            }
-        }
-        // printf("Number of distinct script sets: %d\n", rtScriptSetsCount);
-    }
-
-
-
-    // Update the Trie values to reflect the run time script indexes (after duplicate merging).
-    //    (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets
-    //     are unused, which is why the loop index starts at 2.)
-    {
-        for (int32_t i=2; i<scriptSets->size(); i++) {
-            BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
-            if (bSet->rindex != (uint32_t)i) {
-                utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status);
-            }
-        }
-    }
-
-    // For code points with script==Common or script==Inherited,
-    //   Set the reserved value of 1 into both Tries. These characters do not participate
-    //   in Whole Script Confusable detection; this reserved value is the means
-    //   by which they are detected.
-    {
-        UnicodeSet ignoreSet;
-        ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
-        UnicodeSet inheritedSet;
-        inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
-        ignoreSet.addAll(inheritedSet);
-        for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) {
-            UChar32 rangeStart = ignoreSet.getRangeStart(rn);
-            UChar32 rangeEnd = ignoreSet.getRangeEnd(rn);
-            utrie2_setRange32(anyCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
-            utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
-        }
-    }
-
-    // Serialize the data to the Spoof Detector
-    {
-        utrie2_freeze(anyCaseTrie, UTRIE2_16_VALUE_BITS, &status);
-        // Measuring call: a buffer overflow is the expected outcome.
-        int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status);
-        // printf("Any case Trie size: %d\n", size);
-        if (status != U_BUFFER_OVERFLOW_ERROR) {
-            goto cleanup;
-        }
-        status = U_ZERO_ERROR;
-        spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit;
-        spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size;
-        spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie;
-        void *where = spImpl->fSpoofData->reserveSpace(size, status);
-        utrie2_serialize(anyCaseTrie, where, size, &status);
-
-        utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status);
-        size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status);
-        // printf("Lower case Trie size: %d\n", size);
-        if (status != U_BUFFER_OVERFLOW_ERROR) {
-            // fSpoofData still refers to the build-time any-case trie, which is closed
-            // during cleanup. Detach it so that no dangling pointer is left behind.
-            spImpl->fSpoofData->fAnyCaseTrie = NULL;
-            goto cleanup;
-        }
-        status = U_ZERO_ERROR;
-        spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit;
-        spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size;
-        spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie;
-        where = spImpl->fSpoofData->reserveSpace(size, status);
-        utrie2_serialize(lowerCaseTrie, where, size, &status);
-
-        spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit;
-        spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount;
-        ScriptSet *rtScriptSets = static_cast<ScriptSet *>
-            (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status));
-        uint32_t rindex = 2;
-        // Nothing is copied if no space was handed back, so that a null
-        // rtScriptSets is never written through.
-        for (int32_t i=2; rtScriptSets != NULL && i<scriptSets->size(); i++) {
-            BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
-            if (bSet->rindex < rindex) {
-                // We have already copied this script set to the serialized data.
-                continue;
-            }
-            U_ASSERT(rindex == bSet->rindex);
-            rtScriptSets[rindex] = *bSet->sset; // Assignment of a ScriptSet just copies the bits.
-            rindex++;
-        }
-    }
-
-    // Open new utrie2s from the serialized data. We don't want to keep the ones
-    //   we just built because we would then have two copies of the data, one internal to
-    //   the utries that we have already constructed, and one in the serialized data area.
-    //   An alternative would be to not pre-serialize the Trie data, but that makes the
-    //   spoof detector data different, depending on how the detector was constructed.
-    //   It's simpler to keep the data always the same.
-
-    spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized(
-        UTRIE2_16_VALUE_BITS,
-        (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie,
-        spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
-        NULL,
-        &status);
-
-    // The lower-case trie has its own serialized length; it must not borrow the
-    // length recorded for the any-case trie.
-    spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized(
-        UTRIE2_16_VALUE_BITS,
-        (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie,
-        spImpl->fSpoofData->fRawData->fLowerCaseTrieLength,
-        NULL,
-        &status);
-
-
-
-cleanup:
-    if (U_FAILURE(status)) {
-        pe->line = lineNum;
-    }
-    uregex_close(parseRegexp);
-    uprv_free(input);
-
-    // The vector holds plain pointers, so each BuilderScriptSet is deleted by hand.
-    int32_t i;
-    if (scriptSets != NULL) {
-        for (i=0; i<scriptSets->size(); i++) {
-            BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
-            delete bsset;
-        }
-        delete scriptSets;
-    }
-    utrie2_close(anyCaseTrie);
-    utrie2_close(lowerCaseTrie);
-    return;
-}
-
-U_NAMESPACE_END
-
-
-
-// A new BuilderScriptSet refers to no code point (-1), no trie and no script set,
-// has both indexes at zero, and is marked as owning whatever sset it is given later.
-BuilderScriptSet::BuilderScriptSet()
-    : codePoint(-1),
-      trie(NULL),
-      sset(NULL),
-      index(0),
-      rindex(0),
-      scriptSetOwned(TRUE) {
-}
-
-// Releases the underlying ScriptSet, but only when this object owns it.
-BuilderScriptSet::~BuilderScriptSet() {
-    if (!scriptSetOwned) {
-        return;
-    }
-    delete sset;
-}
-
-#endif
-#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
-
+++ /dev/null
-// Copyright (C) 2016 and later: Unicode, Inc. and others.
-// License & terms of use: http://www.unicode.org/copyright.html
-/*
-******************************************************************************
-*
-* Copyright (C) 2008-2012, International Business Machines
-* Corporation and others. All Rights Reserved.
-*
-******************************************************************************
-* file name: uspoof_buildwsconf.h
-* encoding: US-ASCII
-* tab size: 8 (not used)
-* indentation:4
-*
-* created on: 2009Jan19
-* created by: Andy Heninger
-*
-* Internal classes and functions
-* for compiling whole script confusable data into its binary (runtime) form.
-*/
-
-#ifndef __USPOOF_BUILDWSCONF_H__
-#define __USPOOF_BUILDWSCONF_H__
-
-#include "unicode/utypes.h"
-
-#if !UCONFIG_NO_NORMALIZATION
-
-#if !UCONFIG_NO_REGULAR_EXPRESSIONS
-
-#include "uspoof_impl.h"
-#include "utrie2.h"
-
-
-U_NAMESPACE_BEGIN
-
-//
-// class BuilderScriptSet. Build-time record for one source code point: the set
-// of scripts (Script Codes) containing characters that are confusable with
-// that code point, plus the bookkeeping needed to place it in the output.
-//
-
-class BuilderScriptSet: public UMemory {
- public:
- UChar32 codePoint; // The source code point.
- UTrie2 *trie; // Any-case or Lower-case Trie.
- // These Trie tables are the final result of the
- // build. This pointer indicates which of the two
- // this set of data is for.
- ScriptSet *sset; // The set of scripts itself. May be shared; see scriptSetOwned.
-
- // Positions of this set in the build-time and runtime collections of script sets.
- uint32_t index; // Index of this set in the Build Time vector
- // of script sets.
- uint32_t rindex; // Index of this set in the final (runtime)
- // array of sets.
- UBool scriptSetOwned; // True if this BuilderScriptSet owns (should delete)
- // its underlying sset.
-
- BuilderScriptSet(); // Starts with no code point, trie or sset; owns its sset by default.
- ~BuilderScriptSet(); // Deletes sset only when scriptSetOwned is true.
-};
-
-
-void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS, // confusablesWS: the whole-script confusables source text, in UTF-8.
- int32_t confusablesWSLen, UParseError *pe, UErrorCode &status); // On failure, pe->line is set to the last input line examined.
-
-U_NAMESPACE_END
-
-#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
-#endif // !UCONFIG_NO_NORMALIZATION
-#endif
const UChar* tests[] = { goodLatin, scMixed, scLatin,
goodCyrl, goodGreek, lll_Latin_a, lll_Latin_b, han_Hiragana };
- for (int32_t i=0; i<sizeof(tests)/sizeof(UChar*); i++) {
+ for (int32_t i=0; i<UPRV_LENGTHOF(tests); i++) {
const UChar* str = tests[i];
// Basic test