uspoof.o uspoof_impl.o uspoof_build.o uspoof_conf.o uspoof_wsconf.o decfmtst.o smpdtfst.o \
ztrans.o zrule.o vzone.o fphdlimp.o fpositer.o locdspnm.o \
decNumber.o decContext.o alphaindex.o tznames.o tznames_impl.o tzgnames.o \
-tzfmt.o compactdecimalformat.o gender.o region.o
+tzfmt.o compactdecimalformat.o gender.o region.o scriptset.o identifier_info.o
## Header files to install
HEADERS = $(srcdir)/unicode/*.h
<ClCompile Include="gregocal.cpp" />\r
<ClCompile Include="gregoimp.cpp" />\r
<ClCompile Include="hebrwcal.cpp" />\r
+ <ClCompile Include="identifier_info.cpp" />\r
<ClCompile Include="indiancal.cpp" />\r
<ClCompile Include="islamcal.cpp" />\r
<ClCompile Include="japancal.cpp" />\r
<ClCompile Include="reldtfmt.cpp" />\r
<ClCompile Include="selfmt.cpp" />\r
<ClCompile Include="simpletz.cpp" />\r
+ <ClCompile Include="scriptset.cpp" />\r
<ClCompile Include="smpdtfmt.cpp" />\r
<ClCompile Include="smpdtfst.cpp" />\r
<ClCompile Include="taiwncal.cpp" />\r
</Command>\r
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>\r
</CustomBuild>\r
+ <ClInclude Include="identifier_info.h" />\r
+ <ClInclude Include="scriptset.h" />\r
<ClInclude Include="uspoof_conf.h" />\r
<ClInclude Include="uspoof_impl.h" />\r
<ClInclude Include="uspoof_wsconf.h" />\r
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />\r
<ImportGroup Label="ExtensionTargets">\r
</ImportGroup>\r
-</Project>
\ No newline at end of file
+</Project>\r
<ClCompile Include="ucsdet.cpp">\r
<Filter>charset detect</Filter>\r
</ClCompile>\r
+ <ClCompile Include="identifier_info.cpp">\r
+ <Filter>spoof</Filter>\r
+ </ClCompile>\r
+ <ClCompile Include="scriptset.cpp">\r
+ <Filter>spoof</Filter>\r
+ </ClCompile>\r
<ClCompile Include="uspoof.cpp">\r
<Filter>spoof</Filter>\r
</ClCompile>\r
<ClInclude Include="inputext.h">\r
<Filter>charset detect</Filter>\r
</ClInclude>\r
+ <ClInclude Include="identifier_info.h">\r
+ <Filter>spoof</Filter>\r
+ </ClInclude>\r
+ <ClInclude Include="scriptset.h">\r
+ <Filter>spoof</Filter>\r
+ </ClInclude>\r
<ClInclude Include="uspoof_conf.h">\r
<Filter>spoof</Filter>\r
</ClInclude>\r
<Filter>formatting</Filter>\r
</CustomBuild>\r
</ItemGroup>\r
-</Project>
\ No newline at end of file
+</Project>\r
--- /dev/null
+/*
+**********************************************************************
+* Copyright (C) 2012-2013, International Business Machines
+* Corporation and others. All Rights Reserved.
+**********************************************************************
+*/
+
+#include "unicode/utypes.h"
+
+#include "unicode/uchar.h"
+#include "unicode/utf16.h"
+
+#include "identifier_info.h"
+#include "mutex.h"
+#include "scriptset.h"
+#include "ucln_in.h"
+#include "uvector.h"
+
+U_NAMESPACE_BEGIN
+
+#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
+
+// Serializes the one-time creation of the static sets below; taken by the
+// IdentifierInfo constructor.
+static UMutex gInitMutex = U_MUTEX_INITIALIZER;
+// TRUE once the static sets have been built; reset to FALSE by IdentifierInfo::cleanup().
+static UBool gStaticsAreInitialized = FALSE;
+
+// Shared, lazily created constant sets consulted by getRestrictionLevel().
+// They are allocated in the constructor and released in cleanup().
+UnicodeSet *IdentifierInfo::ASCII;
+ScriptSet *IdentifierInfo::JAPANESE;
+ScriptSet *IdentifierInfo::CHINESE;
+ScriptSet *IdentifierInfo::KOREAN;
+ScriptSet *IdentifierInfo::CONFUSABLE_WITH_LATIN;
+
+// Release the shared static sets and mark them as not initialized, so that the
+// next IdentifierInfo constructor call rebuilds them. Always returns TRUE.
+UBool IdentifierInfo::cleanup() {
+    // Free everything first ...
+    delete ASCII;
+    delete JAPANESE;
+    delete CHINESE;
+    delete KOREAN;
+    delete CONFUSABLE_WITH_LATIN;
+    // ... then clear the dangling pointers.
+    ASCII = NULL;
+    JAPANESE = CHINESE = KOREAN = CONFUSABLE_WITH_LATIN = NULL;
+    gStaticsAreInitialized = FALSE;
+    return TRUE;
+}
+
+// C-linkage wrapper around IdentifierInfo::cleanup(), so that it can be handed
+// to the library cleanup registry as a plain function pointer.
+U_CDECL_BEGIN
+static UBool U_CALLCONV
+IdentifierInfo_cleanup(void) {
+ return IdentifierInfo::cleanup();
+}
+U_CDECL_END
+
+
+// Key equality callback for fScriptSetSet. A uhash key comparator must return
+// TRUE when the two keys are equal; the ordering function uhash_compareScriptSet()
+// returns 0 in that case and therefore must not be used for this purpose.
+U_CDECL_BEGIN
+static UBool U_CALLCONV
+IdentifierInfo_scriptSetKeysEqual(const UElement key1, const UElement key2) {
+    const ScriptSet *s1 = static_cast<const ScriptSet *>(key1.pointer);
+    const ScriptSet *s2 = static_cast<const ScriptSet *>(key2.pointer);
+    return (*s1 == *s2);
+}
+U_CDECL_END
+
+/**
+ * Construct an IdentifierInfo with no identifier set and an identifier profile
+ * that allows every code point (0..0x10FFFF).
+ *
+ * The first successful construction also builds the shared static sets
+ * (ASCII, JAPANESE, CHINESE, KOREAN, CONFUSABLE_WITH_LATIN) under gInitMutex
+ * and registers their cleanup function.
+ *
+ * @param status  Does nothing if already a failure on entry. Set to
+ *                U_MEMORY_ALLOCATION_ERROR if any allocation fails; the object
+ *                must not be used in that case (it may still be destroyed).
+ */
+IdentifierInfo::IdentifierInfo(UErrorCode &status):
+    fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL),
+    fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) {
+    if (U_FAILURE(status)) {
+        return;
+    }
+    {
+        Mutex lock(&gInitMutex);
+        if (!gStaticsAreInitialized) {
+            ASCII = new UnicodeSet(0, 0x7f);
+            JAPANESE = new ScriptSet();
+            CHINESE = new ScriptSet();
+            KOREAN = new ScriptSet();
+            CONFUSABLE_WITH_LATIN = new ScriptSet();
+            if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL
+                    || CONFUSABLE_WITH_LATIN == NULL) {
+                // Drop whatever was allocated; otherwise the next attempt would
+                // overwrite these pointers and leak the partial set.
+                cleanup();
+                status = U_MEMORY_ALLOCATION_ERROR;
+                return;
+            }
+            ASCII->freeze();
+            JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status)
+                     .set(USCRIPT_KATAKANA, status);
+            CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOPOMOFO, status);
+            KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status);
+            CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status)
+                                  .set(USCRIPT_CHEROKEE, status);
+            if (U_FAILURE(status)) {
+                // Never publish half-built statics as "initialized".
+                cleanup();
+                return;
+            }
+            ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup);
+            gStaticsAreInitialized = TRUE;
+        }
+    }
+    fIdentifier = new UnicodeString();
+    fRequiredScripts = new ScriptSet();
+    fScriptSetSet = uhash_open(uhash_hashScriptSet, IdentifierInfo_scriptSetKeysEqual, NULL, &status);
+    if (U_SUCCESS(status)) {
+        // Only touch the table if it was actually created. The table owns its keys.
+        uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet);
+    }
+    fCommonAmongAlternates = new ScriptSet();
+    fNumerics = new UnicodeSet();
+    fIdentifierProfile = new UnicodeSet(0, 0x10FFFF);
+
+    if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL ||
+                              fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+    }
+}
+
+// Destructor: releases every member object owned by this IdentifierInfo.
+// The static shared sets are not touched here; see cleanup().
+IdentifierInfo::~IdentifierInfo() {
+    // Closing the table also deletes the ScriptSet keys it owns.
+    uhash_close(fScriptSetSet);
+    delete fIdentifierProfile;
+    delete fNumerics;
+    delete fCommonAmongAlternates;
+    delete fRequiredScripts;
+    delete fIdentifier;
+}
+
+
+// Discard all per-identifier analysis results (required scripts, alternates,
+// numerics, common-among-alternates). The identifier profile and the stored
+// identifier string are left alone. Returns *this.
+IdentifierInfo &IdentifierInfo::clear() {
+    uhash_removeAll(fScriptSetSet);
+    fNumerics->clear();
+    fRequiredScripts->resetAll();
+    fCommonAmongAlternates->resetAll();
+    return *this;
+}
+
+
+// Replace the identifier profile with a copy of identifierProfile.
+// The caller keeps ownership of its set. Returns *this for chaining.
+IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) {
+ *fIdentifierProfile = identifierProfile;
+ return *this;
+}
+
+
+// Return a reference to the internally owned identifier profile set.
+const UnicodeSet &IdentifierInfo::getIdentifierProfile() const {
+ return *fIdentifierProfile;
+}
+
+
+/**
+ * Store a copy of the identifier and analyze it, replacing any previous results.
+ *
+ * For each code point:
+ *   - a decimal digit contributes the zero digit of its number system to fNumerics;
+ *   - its script extensions, minus Common and Inherited, are either merged into
+ *     fRequiredScripts (exactly one script) or recorded as a set of alternates
+ *     in fScriptSetSet (more than one script).
+ * A final pass drops alternates that are already satisfied by a required script
+ * or that are supersets of another alternate, and computes the scripts common
+ * to all remaining alternates.
+ *
+ * @param identifier the identifier to analyze.
+ * @param status     Does nothing if already a failure on entry. On failure the
+ *                   analysis results are incomplete and must not be used.
+ * @return *this
+ */
+IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return *this;
+    }
+    *fIdentifier = identifier;
+    clear();
+    ScriptSet scriptsForCP;
+    UChar32 cp;
+    for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) {
+        cp = identifier.char32At(i);
+        // Store a representative character for each kind of decimal digit
+        if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) {
+            // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value
+            fNumerics->add(cp - (UChar32)u_getNumericValue(cp));
+        }
+        UScriptCode extensions[500];
+        int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, LENGTHOF(extensions), &status);
+        if (U_FAILURE(status)) {
+            return *this;
+        }
+        scriptsForCP.resetAll();
+        for (int32_t j=0; j<extensionsCount; j++) {
+            scriptsForCP.set(extensions[j], status);
+        }
+        scriptsForCP.reset(USCRIPT_COMMON, status);
+        scriptsForCP.reset(USCRIPT_INHERITED, status);
+        switch (scriptsForCP.countMembers()) {
+          case 0: break;
+          case 1:
+            // Single script, record it.
+            fRequiredScripts->Union(scriptsForCP);
+            break;
+          default:
+            if (!fRequiredScripts->intersects(scriptsForCP)
+                    && !uhash_geti(fScriptSetSet, &scriptsForCP)) {
+                // If the set hasn't been added already, add it
+                // (Add a copy, fScriptSetSet takes ownership of the copy.)
+                ScriptSet *copyForTable = new ScriptSet(scriptsForCP);
+                if (copyForTable == NULL) {
+                    // A NULL key would be dereferenced by the table's hash function.
+                    status = U_MEMORY_ALLOCATION_ERROR;
+                    return *this;
+                }
+                uhash_puti(fScriptSetSet, copyForTable, 1, &status);
+            }
+            break;
+        }
+    }
+    // Now make a final pass through ScriptSetSet to remove alternates that came before singles.
+    //   [Kana], [Kana Hira] => [Kana]
+    // This is relatively infrequent, so doesn't have to be optimized.
+    // We also compute any commonalities among the alternates.
+    if (uhash_count(fScriptSetSet) > 0) {
+        fCommonAmongAlternates->setAll();
+        for (int32_t it = -1;;) {
+            const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it);
+            if (nextHashEl == NULL) {
+                break;
+            }
+            ScriptSet *next = static_cast<ScriptSet *>(nextHashEl->key.pointer);
+            // [Kana], [Kana Hira] => [Kana]
+            if (fRequiredScripts->intersects(*next)) {
+                uhash_removeElement(fScriptSetSet, nextHashEl);
+            } else {
+                fCommonAmongAlternates->intersect(*next);
+                // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]]
+                for (int32_t otherIt = -1;;) {
+                    const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt);
+                    if (otherHashEl == NULL) {
+                        break;
+                    }
+                    ScriptSet *other = static_cast<ScriptSet *>(otherHashEl->key.pointer);
+                    if (next != other && next->contains(*other)) {
+                        uhash_removeElement(fScriptSetSet, nextHashEl);
+                        break;
+                    }
+                }
+            }
+        }
+    }
+    // No alternates left (or none to begin with): nothing can be "common".
+    if (uhash_count(fScriptSetSet) == 0) {
+        fCommonAmongAlternates->resetAll();
+    }
+    return *this;
+}
+
+
+// Return the internally owned copy of the identifier last passed to setIdentifier().
+const UnicodeString *IdentifierInfo::getIdentifier() const {
+ return fIdentifier;
+}
+
+// Return the required scripts: those contributed by characters that have exactly
+// one script (Common and Inherited excluded). Owned by this object.
+const ScriptSet *IdentifierInfo::getScripts() const {
+ return fRequiredScripts;
+}
+
+// Return the table of alternate script sets; keys are ScriptSet pointers owned
+// by the table, values carry no meaning.
+const UHashtable *IdentifierInfo::getAlternates() const {
+ return fScriptSetSet;
+}
+
+
+// Return the set of representative zero digits, one for each decimal number
+// system seen in the identifier. Owned by this object.
+const UnicodeSet *IdentifierInfo::getNumerics() const {
+ return fNumerics;
+}
+
+// Return the scripts shared by every remaining alternate set; empty when there
+// are no alternates. Owned by this object.
+const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const {
+ return fCommonAmongAlternates;
+}
+
+// Determine the tightest restriction level satisfied by the identifier most
+// recently analyzed with setIdentifier(). The checks run from most to least
+// restrictive; the first one that matches decides the result.
+URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const {
+    const UnicodeString &id = *fIdentifier;
+
+    // Characters outside the profile, or digits from several number systems.
+    if (!fIdentifierProfile->containsAll(id)) {
+        return USPOOF_UNRESTRICTIVE;
+    }
+    if (getNumerics()->size() > 1) {
+        return USPOOF_UNRESTRICTIVE;
+    }
+    if (ASCII->containsAll(id)) {
+        return USPOOF_ASCII;
+    }
+
+    // Script cardinality: the required scripts, plus one when the alternates share
+    // a script, or plus the number of alternates otherwise. That shortcut is only
+    // valid because the value is never compared against anything above 2.
+    // Common and Inherited were already stripped by setIdentifier().
+    const int32_t requiredCount = fRequiredScripts->countMembers();
+    const int32_t alternateCount =
+            (fCommonAmongAlternates->countMembers() == 0) ? uhash_count(fScriptSetSet) : 1;
+    const int32_t cardinalityPlus = requiredCount + alternateCount;
+    if (cardinalityPlus < 2) {
+        return USPOOF_HIGHLY_RESTRICTIVE;
+    }
+
+    // The three permitted CJK script combinations.
+    const ScriptSet &required = *fRequiredScripts;
+    if (containsWithAlternates(*JAPANESE, required)
+            || containsWithAlternates(*CHINESE, required)
+            || containsWithAlternates(*KOREAN, required)) {
+        return USPOOF_HIGHLY_RESTRICTIVE;
+    }
+
+    // Latin plus exactly one other script that is not easily confused with Latin.
+    if (cardinalityPlus != 2) {
+        return USPOOF_MINIMALLY_RESTRICTIVE;
+    }
+    if (!required.test(USCRIPT_LATIN, status)) {
+        return USPOOF_MINIMALLY_RESTRICTIVE;
+    }
+    if (required.intersects(*CONFUSABLE_WITH_LATIN)) {
+        return USPOOF_MINIMALLY_RESTRICTIVE;
+    }
+    return USPOOF_MODERATELY_RESTRICTIVE;
+}
+
+// Number of scripts in the identifier: the required scripts, plus one if the
+// alternates share a script, otherwise plus the number of alternate sets.
+// Common and Inherited never appear in fRequiredScripts (setIdentifier() removes them).
+int32_t IdentifierInfo::getScriptCount() const {
+    int32_t fromAlternates = 1;
+    if (fCommonAmongAlternates->countMembers() == 0) {
+        fromAlternates = uhash_count(fScriptSetSet);
+    }
+    return fRequiredScripts->countMembers() + fromAlternates;
+}
+
+
+
+// Return TRUE when container holds every script in containee and also shares at
+// least one script with each alternate set recorded in fScriptSetSet.
+UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const {
+    if (!container.contains(containee)) {
+        return FALSE;
+    }
+    int32_t pos = -1;
+    const UHashElement *entry = NULL;
+    while ((entry = uhash_nextElement(fScriptSetSet, &pos)) != NULL) {
+        const ScriptSet *alternatives = static_cast<const ScriptSet *>(entry->key.pointer);
+        if (!container.intersects(*alternatives)) {
+            return FALSE;
+        }
+    }
+    return TRUE;
+}
+
+// Append a readable rendering of the alternates table to dest: the script sets
+// in sorted order, each printed by ScriptSet::displayScripts(), separated by "; ".
+// Returns dest. If the work vector cannot be created, dest is returned unchanged.
+UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) {
+    UVector ordered(status);
+    if (U_FAILURE(status)) {
+        return dest;
+    }
+    // Collect the keys; the vector only borrows them from the table.
+    int32_t iter = -1;
+    for (const UHashElement *entry = uhash_nextElement(alternates, &iter);
+            entry != NULL;
+            entry = uhash_nextElement(alternates, &iter)) {
+        ordered.addElement(static_cast<ScriptSet *>(entry->key.pointer), status);
+    }
+    ordered.sort(uhash_compareScriptSet, status);
+
+    UnicodeString separator = UNICODE_STRING_SIMPLE("; ");
+    for (int32_t idx = 0; idx < ordered.size(); ++idx) {
+        if (idx != 0) {
+            dest.append(separator);
+        }
+        static_cast<ScriptSet *>(ordered.elementAt(idx))->displayScripts(dest);
+    }
+    return dest;
+}
+
+U_NAMESPACE_END
+
--- /dev/null
+/*
+**********************************************************************
+* Copyright (C) 2013, International Business Machines
+* Corporation and others. All Rights Reserved.
+**********************************************************************
+*
+* identifier_info.h
+*
+* created on: 2013 Jan 7
+* created by: Andy Heninger
+*/
+
+#ifndef __IDENTIFIER_INFO_H__
+#define __IDENTIFIER_INFO_H__
+
+#include "unicode/utypes.h"
+
+#include "unicode/uniset.h"
+#include "unicode/uspoof.h"
+#include "uhash.h"
+
+U_NAMESPACE_BEGIN
+
+class ScriptSet;
+
+// TODO(andy): review consistency of reference vs pointer arguments to the functions.
+
+/**
+ * This class analyzes a possible identifier for script and identifier status. Use it by calling setIdentifierProfile
+ * then setIdentifier. Available methods include:
+ * <ol>
+ * <li>call getScripts for the specific scripts in the identifier. The identifier contains at least one character in
+ * each of these.
+ * <li>call getAlternates to get cases where a character is not limited to a single script. For example, it could be
+ * either Katakana or Hiragana.
+ * <li>call getCommonAmongAlternates to find out if any scripts are common to all the alternates.
+ * <li>call getNumerics to get a representative character (with value zero) for each of the decimal number systems in
+ * the identifier.
+ * <li>call getRestrictionLevel to see what the UTS36 restriction level is.
+ * </ol>
+ *
+ * This is a port from ICU4J of class com.ibm.icu.text.IdentifierInfo
+ */
+class U_I18N_API IdentifierInfo : public UMemory {
+
+ public:
+ /**
+ * Create an identifier info object. Subsequently, call setIdentifier(), etc.
+ * @internal
+ */
+ IdentifierInfo(UErrorCode &status);
+
+ /**
+ * Destructor
+ */
+ virtual ~IdentifierInfo();
+
+ private:
+    /*
+     * Disallow copying and assignment for now. Can be added if there's a need.
+     * The object owns its members through raw pointers, so the compiler-generated
+     * copy operations would make two objects delete the same members.
+     */
+    IdentifierInfo(const IdentifierInfo &other);
+    IdentifierInfo &operator=(const IdentifierInfo &other);
+
+ public:
+
+ /**
+ * Set the identifier profile: the characters that are to be allowed in the identifier.
+ *
+ * @param identifierProfile the characters that are to be allowed in the identifier
+ * @return this
+ * @internal
+ */
+ IdentifierInfo &setIdentifierProfile(const UnicodeSet &identifierProfile);
+
+ /**
+ * Get the identifier profile: the characters that are to be allowed in the identifier.
+ *
+ * @return The characters that are to be allowed in the identifier.
+ * @internal
+ */
+ const UnicodeSet &getIdentifierProfile() const;
+
+
+ /**
+ * Set an identifier to analyze. Afterwards, call methods like getScripts()
+ *
+ * @param identifier the identifier to analyze
+ * @param status Errorcode, set if errors occur.
+ * @return this
+ * @internal
+ */
+ IdentifierInfo &setIdentifier(const UnicodeString &identifier, UErrorCode &status);
+
+
+ /**
+ * Get the identifier that was analyzed. The returned string is owned by the ICU library,
+ * and must not be deleted by the caller.
+ *
+ * @return the identifier that was analyzed.
+ * @internal
+ */
+ const UnicodeString *getIdentifier() const;
+
+
+ /**
+ * Get the scripts found in the identifiers.
+ *
+ * @return the set of explicit scripts.
+ * @internal
+ */
+ const ScriptSet *getScripts() const;
+
+ /**
+ * Get the set of alternate scripts found in the identifiers. That is, when a character can be in two scripts, then
+ * the set consisting of those scripts will be returned.
+ *
+ * @return a uhash, with each key being of type (ScriptSet *).
+ * This is a set, not a map, so the value stored in the uhash is not relevant.
+ * (It is, in fact, 1).
+ * Ownership of the uhash and its contents remains with the IdentifierInfo object,
+ * and remains valid until a new identifier is set or until the object is deleted.
+ * @internal
+ */
+ const UHashtable *getAlternates() const;
+
+ /**
+ * Get the representative characters (zeros) for the numerics found in the identifier.
+ *
+ * @return the set of representative zero digits, one for each decimal number system found.
+ * @internal
+ */
+ const UnicodeSet *getNumerics() const;
+
+ /**
+ * Find out which scripts are in common among the alternates.
+ *
+ * @return the set of scripts that are in common among the alternates.
+ * @internal
+ */
+ const ScriptSet *getCommonAmongAlternates() const;
+
+ /**
+ * Get the number of scripts appearing in the identifier.
+ * Note: Common and Inherited scripts are omitted from the count.
+ * Note: Result may be high when the identifier contains characters
+ * with alternate scripts. The distinction between
+ * 0, 1 and > 1 will remain valid, however.
+ * @return the number of scripts.
+ */
+ int32_t getScriptCount() const;
+
+ /**
+ * Find the "tightest" restriction level that the identifier satisfies.
+ *
+ * @return the restriction level.
+ * @internal
+ */
+ URestrictionLevel getRestrictionLevel(UErrorCode &status) const;
+
+ UnicodeString toString() const;
+
+ /**
+ * Produce a readable string of alternates.
+ *
+ * @param alternates a UHashtable of UScriptSets.
+ * Keys only, no meaningful values in the UHash.
+ * @return display form
+ * @internal
+ */
+ static UnicodeString &displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status);
+
+ /**
+ * Static memory cleanup function.
+ * @internal
+ */
+ static UBool cleanup();
+ private:
+
+ IdentifierInfo & clear();
+ UBool containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const;
+
+ UnicodeString *fIdentifier;
+ ScriptSet *fRequiredScripts;
+ UHashtable *fScriptSetSet;
+ ScriptSet *fCommonAmongAlternates;
+ UnicodeSet *fNumerics;
+ UnicodeSet *fIdentifierProfile;
+
+ static UnicodeSet *ASCII;
+ static ScriptSet *JAPANESE;
+ static ScriptSet *CHINESE;
+ static ScriptSet *KOREAN;
+ static ScriptSet *CONFUSABLE_WITH_LATIN;
+
+
+
+};
+
+U_NAMESPACE_END
+
+#endif // __IDENTIFIER_INFO_H__
+
--- /dev/null
+/*
+**********************************************************************
+* Copyright (C) 2013, International Business Machines
+* Corporation and others. All Rights Reserved.
+**********************************************************************
+*
+* scriptset.cpp
+*
+* created on: 2013 Jan 7
+* created by: Andy Heninger
+*/
+
+#include "unicode/utypes.h"
+
+#include "unicode/uchar.h"
+#include "unicode/unistr.h"
+
+#include "scriptset.h"
+#include "uassert.h"
+
+U_NAMESPACE_BEGIN
+
+#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
+
+//----------------------------------------------------------------------------
+//
+// ScriptSet implementation
+//
+//----------------------------------------------------------------------------
+// Default constructor: creates the empty set (no script bits on).
+ScriptSet::ScriptSet() {
+    int32_t word = LENGTHOF(bits);
+    while (word-- > 0) {
+        bits[word] = 0;
+    }
+}
+
+// Destructor: nothing to release, the bit array is stored inline.
+ScriptSet::~ScriptSet() {
+}
+
+// Copy constructor: duplicates the bit array of other.
+ScriptSet::ScriptSet(const ScriptSet &other) {
+    for (int32_t word = 0; word < LENGTHOF(bits); ++word) {
+        bits[word] = other.bits[word];
+    }
+}
+
+
+// Assignment: make this set hold exactly the scripts in other. Returns *this.
+ScriptSet & ScriptSet::operator =(const ScriptSet &other) {
+    int32_t word = 0;
+    while (word < LENGTHOF(bits)) {
+        bits[word] = other.bits[word];
+        ++word;
+    }
+    return *this;
+}
+
+
+// Equality: TRUE when both sets contain exactly the same scripts.
+UBool ScriptSet::operator == (const ScriptSet &other) const {
+    int32_t word = LENGTHOF(bits);
+    while (word-- > 0) {
+        if (bits[word] != other.bits[word]) {
+            return FALSE;
+        }
+    }
+    return TRUE;
+}
+
+/**
+ * Test whether a script is a member of this set.
+ *
+ * @param script the script code to look up.
+ * @param status Does nothing (returns FALSE) if already a failure on entry.
+ *               Set to U_ILLEGAL_ARGUMENT_ERROR if script is negative or too
+ *               large to be represented in the bit array.
+ * @return TRUE if the script is in the set.
+ */
+UBool ScriptSet::test(UScriptCode script, UErrorCode &status) const {
+    if (U_FAILURE(status)) {
+        return FALSE;
+    }
+    if (script < 0 || script >= (int32_t)sizeof(bits) * 8) {
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return FALSE;
+    }
+    uint32_t index = script / 32;
+    // Unsigned operand: shifting a signed 1 into bit 31 is not portable.
+    uint32_t bit = (uint32_t)1 << (script & 31);
+    return ((bits[index] & bit) != 0);
+}
+
+
+/**
+ * Add a script to this set.
+ *
+ * @param script the script code to add.
+ * @param status Does nothing if already a failure on entry. Set to
+ *               U_ILLEGAL_ARGUMENT_ERROR (set left unchanged) if script is
+ *               negative or too large to be represented in the bit array.
+ * @return *this, to allow chaining.
+ */
+ScriptSet &ScriptSet::set(UScriptCode script, UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return *this;
+    }
+    if (script < 0 || script >= (int32_t)sizeof(bits) * 8) {
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return *this;
+    }
+    uint32_t index = script / 32;
+    // Unsigned operand: shifting a signed 1 into bit 31 is not portable.
+    uint32_t bit = (uint32_t)1 << (script & 31);
+    bits[index] |= bit;
+    return *this;
+}
+
+/**
+ * Remove a script from this set.
+ *
+ * @param script the script code to remove.
+ * @param status Does nothing if already a failure on entry. Set to
+ *               U_ILLEGAL_ARGUMENT_ERROR (set left unchanged) if script is
+ *               negative or too large to be represented in the bit array.
+ * @return *this, to allow chaining.
+ */
+ScriptSet &ScriptSet::reset(UScriptCode script, UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return *this;
+    }
+    if (script < 0 || script >= (int32_t)sizeof(bits) * 8) {
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return *this;
+    }
+    uint32_t index = script / 32;
+    // Unsigned operand: shifting a signed 1 into bit 31 is not portable.
+    uint32_t bit = (uint32_t)1 << (script & 31);
+    bits[index] &= ~bit;
+    return *this;
+}
+
+
+
+// Add every script of other to this set. Returns *this.
+ScriptSet &ScriptSet::Union(const ScriptSet &other) {
+    int32_t word = LENGTHOF(bits);
+    while (word-- > 0) {
+        bits[word] |= other.bits[word];
+    }
+    return *this;
+}
+
+// Keep only the scripts that are also in other. Returns *this.
+ScriptSet &ScriptSet::intersect(const ScriptSet &other) {
+    int32_t word = LENGTHOF(bits);
+    while (word-- > 0) {
+        bits[word] &= other.bits[word];
+    }
+    return *this;
+}
+
+// Intersect this set with the single-element set { script }.
+// If status is (or becomes) a failure, this set is left unchanged. Returns *this.
+ScriptSet &ScriptSet::intersect(UScriptCode script, UErrorCode &status) {
+    ScriptSet single;
+    single.set(script, status);
+    if (U_FAILURE(status)) {
+        return *this;
+    }
+    return this->intersect(single);
+}
+
+// Return TRUE when this set and other have at least one script in common.
+UBool ScriptSet::intersects(const ScriptSet &other) const {
+    int32_t word = 0;
+    while (word < LENGTHOF(bits)) {
+        if ((bits[word] & other.bits[word]) != 0) {
+            return TRUE;
+        }
+        ++word;
+    }
+    return FALSE;
+}
+
+// Return TRUE when every script in other is also in this set
+// (an empty other is contained in every set).
+UBool ScriptSet::contains(const ScriptSet &other) const {
+    for (int32_t word = 0; word < LENGTHOF(bits); ++word) {
+        // Any bit of other that is missing here breaks containment.
+        if ((bits[word] & other.bits[word]) != other.bits[word]) {
+            return FALSE;
+        }
+    }
+    return TRUE;
+}
+
+
+// Turn on every bit of the set, including positions that do not correspond to
+// any script code. Returns *this.
+ScriptSet &ScriptSet::setAll() {
+    int32_t word = LENGTHOF(bits);
+    while (word-- > 0) {
+        bits[word] = 0xffffffffu;
+    }
+    return *this;
+}
+
+
+// Make this the empty set. Returns *this.
+ScriptSet &ScriptSet::resetAll() {
+    int32_t word = LENGTHOF(bits);
+    while (word-- > 0) {
+        bits[word] = 0;
+    }
+    return *this;
+}
+
+// Return the number of bits that are on in this set.
+int32_t ScriptSet::countMembers() const {
+    // Clearing the lowest set bit once per iteration costs one step per member,
+    // which is cheap for the sparse sets that are the usual case.
+    int32_t members = 0;
+    for (int32_t word = 0; word < LENGTHOF(bits); ++word) {
+        for (uint32_t remaining = bits[word]; remaining != 0; remaining &= (remaining - 1)) {
+            ++members;
+        }
+    }
+    return members;
+}
+
+// Hash code: the exclusive-or of all words of the bit array.
+// Equal sets always produce equal hash codes.
+int32_t ScriptSet::hashCode() const {
+    uint32_t folded = 0;
+    int32_t word = LENGTHOF(bits);
+    while (word-- > 0) {
+        folded ^= bits[word];
+    }
+    return (int32_t)folded;
+}
+
+// Return the index of the first bit that is on at or after fromIndex,
+// or -1 if there is none or if fromIndex is negative.
+int32_t ScriptSet::nextSetBit(int32_t fromIndex) const {
+    // TODO: Wants a better implementation.
+    if (fromIndex < 0) {
+        return -1;
+    }
+    const int32_t limit = (int32_t)sizeof(bits)*8;
+    UErrorCode status = U_ZERO_ERROR;
+    int32_t candidate = fromIndex;
+    while (candidate < limit && !test((UScriptCode)candidate, status)) {
+        ++candidate;
+    }
+    return (candidate < limit) ? candidate : -1;
+}
+
+// Append the short names of all scripts in this set to dest, in increasing
+// script code order, separated by single spaces. Returns dest.
+UnicodeString &ScriptSet::displayScripts(UnicodeString &dest) const {
+    UBool needSeparator = FALSE;
+    int32_t script = nextSetBit(0);
+    while (script >= 0) {
+        if (needSeparator) {
+            dest.append(0x20);
+        }
+        needSeparator = TRUE;
+        const char *shortName = uscript_getShortName((UScriptCode(script)));
+        dest.append(UnicodeString(shortName, -1, US_INV));
+        script = nextSetBit(script + 1);
+    }
+    return dest;
+}
+
+// Replace the contents of this set with the scripts named in scriptString.
+// Names are separated by white space and are resolved as values of the script
+// property. The set is emptied first, even when status is already a failure.
+// On an unrecognized name, status is set to U_ILLEGAL_ARGUMENT_ERROR and
+// parsing stops; scripts parsed before the bad name remain in the set.
+// Returns *this.
+ScriptSet &ScriptSet::parseScripts(const UnicodeString &scriptString, UErrorCode &status) {
+ resetAll();
+ if (U_FAILURE(status)) {
+ return *this;
+ }
+ UnicodeString oneScriptName;
+ for (int32_t i=0; i<scriptString.length();) {
+ UChar32 c = scriptString.char32At(i);
+ i = scriptString.moveIndex32(i, 1);
+ if (!u_isUWhiteSpace(c)) {
+ oneScriptName.append(c);
+ // Keep accumulating unless this was the last character of the input,
+ // in which case fall through and process the final name.
+ if (i < scriptString.length()) {
+ continue;
+ }
+ }
+ // Reached white space or the end of input: process the pending name, if any.
+ if (oneScriptName.length() > 0) {
+ char buf[40];
+ // Leave room for, and force, a terminating NUL in case the name fills the buffer.
+ oneScriptName.extract(0, oneScriptName.length(), buf, sizeof(buf)-1, US_INV);
+ buf[sizeof(buf)-1] = 0;
+ int32_t sc = u_getPropertyValueEnum(UCHAR_SCRIPT, buf);
+ if (sc == UCHAR_INVALID_CODE) {
+ status = U_ILLEGAL_ARGUMENT_ERROR;
+ } else {
+ this->set((UScriptCode)sc, status);
+ }
+ if (U_FAILURE(status)) {
+ return *this;
+ }
+ oneScriptName.remove();
+ }
+ }
+ return *this;
+}
+
+U_NAMESPACE_END
+
+// Key equality function for hash tables keyed by (ScriptSet *):
+// returns TRUE when the two sets pointed to have the same contents.
+U_CAPI UBool U_EXPORT2
+uhash_equalsScriptSet(const UElement key1, const UElement key2) {
+    const icu::ScriptSet *lhs = static_cast<const icu::ScriptSet *>(key1.pointer);
+    const icu::ScriptSet *rhs = static_cast<const icu::ScriptSet *>(key2.pointer);
+    return (*lhs == *rhs);
+}
+
+/**
+ * Ordering function for (ScriptSet *) elements, usable for sorting.
+ * Sets with fewer members order first; sets of equal size are ordered by the
+ * first script code at which they differ.
+ *
+ * @return a negative value, zero, or a positive value when key0 orders before,
+ *         equal to, or after key1. The result is normalized to -1/0/+1 because
+ *         the raw differences can exceed the range of int8_t.
+ */
+U_CAPI int8_t U_EXPORT2
+uhash_compareScriptSet(UElement key0, UElement key1) {
+    icu::ScriptSet *s0 = static_cast<icu::ScriptSet *>(key0.pointer);
+    icu::ScriptSet *s1 = static_cast<icu::ScriptSet *>(key1.pointer);
+    int32_t diff = s0->countMembers() - s1->countMembers();
+    if (diff == 0) {
+        int32_t i0 = s0->nextSetBit(0);
+        int32_t i1 = s1->nextSetBit(0);
+        // Walk both sets in parallel until they differ or are exhausted (-1).
+        // Index 0 is a valid member, so only a negative index ends the walk.
+        while ((diff = i0-i1) == 0 && i0 >= 0) {
+            i0 = s0->nextSetBit(i0+1);
+            i1 = s1->nextSetBit(i1+1);
+        }
+    }
+    if (diff < 0) {
+        return -1;
+    }
+    return (int8_t)(diff > 0 ? 1 : 0);
+}
+
+// Hash function for hash tables keyed by (ScriptSet *): forwards to ScriptSet::hashCode().
+U_CAPI int32_t U_EXPORT2
+uhash_hashScriptSet(const UElement key) {
+    return static_cast<icu::ScriptSet *>(key.pointer)->hashCode();
+}
+
+// Deleter for hash tables that own (ScriptSet *) keys or values.
+U_CAPI void U_EXPORT2
+uhash_deleteScriptSet(void *obj) {
+    delete static_cast<icu::ScriptSet *>(obj);
+}
--- /dev/null
+/*
+**********************************************************************
+* Copyright (C) 2013, International Business Machines
+* Corporation and others. All Rights Reserved.
+**********************************************************************
+*
+* scriptset.h
+*
+* created on: 2013 Jan 7
+* created by: Andy Heninger
+*/
+
+#ifndef __SCRIPTSET_H__
+#define __SCRIPTSET_H__
+
+#include "unicode/utypes.h"
+#include "unicode/uobject.h"
+#include "unicode/uscript.h"
+
+#include "uelement.h"
+
+U_NAMESPACE_BEGIN
+
+//-------------------------------------------------------------------------------
+//
+// ScriptSet - A bit set representing a set of scripts.
+//
+// This class was originally used exclusively with script sets appearing
+// as part of the spoof check whole script confusable binary data. Its
+// use has since become more general, but the continued use to wrap
+// prebuilt binary data does constrain the design.
+//
+//-------------------------------------------------------------------------------
+class U_I18N_API ScriptSet: public UMemory {
+ public:
+ ScriptSet(); // Creates an empty set.
+ ScriptSet(const ScriptSet &other);
+ ~ScriptSet();
+
+ UBool operator == (const ScriptSet &other) const;
+ ScriptSet & operator = (const ScriptSet &other);
+
+ // test(), set() and reset() do nothing if status is already a failure, and set
+ // U_ILLEGAL_ARGUMENT_ERROR for a script code that does not fit in the bit array.
+ UBool test(UScriptCode script, UErrorCode &status) const;
+ ScriptSet &Union(const ScriptSet &other);
+ ScriptSet &set(UScriptCode script, UErrorCode &status);
+ ScriptSet &reset(UScriptCode script, UErrorCode &status);
+ ScriptSet &intersect(const ScriptSet &other);
+ ScriptSet &intersect(UScriptCode script, UErrorCode &status);
+ UBool intersects(const ScriptSet &other) const; // Sets contain at least one script in common.
+ UBool contains(const ScriptSet &other) const; // All set bits in other are also set in this.
+
+ ScriptSet &setAll();
+ ScriptSet &resetAll();
+ int32_t countMembers() const;
+ int32_t hashCode() const;
+ int32_t nextSetBit(int32_t script) const; // Next member at or after script, or -1 if none.
+
+ UnicodeString &displayScripts(UnicodeString &dest) const; // append script names to dest string.
+ ScriptSet & parseScripts(const UnicodeString &scriptsString, UErrorCode &status); // Replaces ScriptSet contents.
+
+ private:
+ // One bit per script code: 6 words of 32 bits, i.e. room for script codes 0..191.
+ uint32_t bits[6];
+};
+
+U_NAMESPACE_END
+
+/**
+ * Key equality function for a UHashtable keyed by (ScriptSet *).
+ * Returns TRUE when the two sets have the same contents.
+ */
+U_CAPI UBool U_EXPORT2
+uhash_equalsScriptSet(const UElement key1, const UElement key2);
+
+/**
+ * Ordering function for (ScriptSet *) elements, for use when sorting.
+ * Returns a negative value, zero or a positive value. Because zero means
+ * "equal", this is NOT suitable as a hash table key comparator; use
+ * uhash_equalsScriptSet() for that.
+ */
+U_CAPI int8_t U_EXPORT2
+uhash_compareScriptSet(UElement key0, UElement key1);
+
+/** Hash function for a UHashtable keyed by (ScriptSet *). */
+U_CAPI int32_t U_EXPORT2
+uhash_hashScriptSet(const UElement key);
+
+/** Deleter for (ScriptSet *) objects owned by a UHashtable. */
+U_CAPI void U_EXPORT2
+uhash_deleteScriptSet(void *obj);
+
+#endif // __SCRIPTSET_H__
It's usually best to have child dependencies called first. */
typedef enum ECleanupI18NType {
UCLN_I18N_START = -1,
+ UCLN_I18N_IDENTIFIER_INFO,
+ UCLN_I18N_SPOOF,
UCLN_I18N_TRANSLITERATOR,
UCLN_I18N_REGEX,
UCLN_I18N_ISLAMIC_CALENDAR,
*/
AlphabeticIndex(const Locale &locale, UErrorCode &status);
+ /**
+ * Construct an AlphabeticIndex that uses a specific collator.
+ *
+ * The index will be created with no labels; the addLabels() function must be called
+ * after creation to add the desired labels to the index.
+ *
+ * The index adopts the collator, and is responsible for deleting it.
+ * The caller should make no further use of the collator after creating the index.
+ *
+ * @param collator The collator to use to order the contents of this index.
+ * @param status Error code, will be set with the reason if the
+ * operation fails.
+ * @draft ICU 51
+ */
+ AlphabeticIndex(RuleBasedCollator *collator, UErrorCode &status);
/**
* Construct an AlphabeticIndex that uses a specific collator.
/*
***************************************************************************
-* Copyright (C) 2008-2012, International Business Machines Corporation
+* Copyright (C) 2008-2013, International Business Machines Corporation
* and others. All Rights Reserved.
***************************************************************************
* file name: uspoof.h
Any Case Confusable. */
USPOOF_ANY_CASE = 8,
+ /**
+ * Check that an identifier is no looser than the specified RestrictionLevel.
+ * The default if uspoof_setRestrictionLevel() is not called is HIGHLY_RESTRICTIVE.
+ *
+ * If USPOOF_AUX_INFO is enabled the actual restriction level of the
+ * identifier being tested will also be returned by uspoof_check().
+ *
+ * @see URestrictionLevel
+ * @see uspoof_setRestrictionLevel
+ * @see USPOOF_AUX_INFO
+ *
+ * @stable ICU 51
+ */
+ USPOOF_RESTRICTION_LEVEL = 16,
+
/** Check that an identifier contains only characters from a
* single script (plus chars from the common and inherited scripts.)
* Applies to checks of a single identifier check only.
+ * @deprecated ICU 51 Use RESTRICTION_LEVEL instead.
*/
- USPOOF_SINGLE_SCRIPT = 16,
+ USPOOF_SINGLE_SCRIPT = USPOOF_RESTRICTION_LEVEL,
/** Check an identifier for the presence of invisible characters,
* such as zero-width spaces, or character sequences that are
*/
USPOOF_CHAR_LIMIT = 64,
- USPOOF_ALL_CHECKS = 0x7f
+ /**
+ * Check that an identifier does not include decimal digits from
+ * more than one numbering system.
+ *
+ * @draft ICU 51
+ */
+ USPOOF_MIXED_NUMBERS = 128,
+
+ /**
+ * Enable all spoof checks.
+ *
+ * @stable ICU 4.6
+ */
+ USPOOF_ALL_CHECKS = 0xFFFF,
+
+ /**
+ * Enable the return of auxiliary (non-error) information in the
+ * upper bits of the check results value.
+ *
+ * If this "check" is not enabled, the results of uspoof_check() will be zero when an
+ * identifier passes all of the enabled checks.
+ *
+ * If this "check" is enabled, (uspoof_check() & USPOOF_ALL_CHECKS) will be zero
+ * when an identifier passes all checks.
+ *
+ * @draft ICU 51
+ */
+ USPOOF_AUX_INFO = 0x40000000
+
} USpoofChecks;
+ /**
+ * Constants from UAX #39 for use in setRestrictionLevel(), and
+ * for returned identifier restriction levels in check results.
+ * @draft ICU 51
+ */
+ typedef enum URestrictionLevel {
+ /**
+ * Only ASCII characters: U+0000..U+007F
+ *
+ * @draft ICU 51
+ */
+ USPOOF_ASCII = 0x10000000,
+ /**
+ * All characters in each identifier must be from a single script, or from the combinations: Latin + Han +
+ * Hiragana + Katakana; Latin + Han + Bopomofo; or Latin + Han + Hangul. Note that this level will satisfy the
+ * vast majority of Latin-script users; also that TR36 has ASCII instead of Latin.
+ *
+ * @draft ICU 51
+ */
+ USPOOF_HIGHLY_RESTRICTIVE = 0x20000000,
+ /**
+ * Allow Latin with other scripts except Cyrillic, Greek, Cherokee. Otherwise, the same as Highly Restrictive.
+ *
+ * @draft ICU 51
+ */
+ USPOOF_MODERATELY_RESTRICTIVE = 0x30000000,
+ /**
+ * Allow arbitrary mixtures of scripts. Otherwise, the same as Moderately Restrictive.
+ *
+ * @draft ICU 51
+ */
+ USPOOF_MINIMALLY_RESTRICTIVE = 0x40000000,
+ /**
+ * Any valid identifiers, including characters outside of the Identifier Profile.
+ *
+ * @draft ICU 51
+ */
+ USPOOF_UNRESTRICTIVE = 0x50000000
+ } URestrictionLevel;
+
/**
* Create a Unicode Spoof Checker, configured to perform all
* checks except for USPOOF_LOCALE_LIMIT and USPOOF_CHAR_LIMIT.
* Open a Spoof Checker from the source form of the spoof data.
* The Three inputs correspond to the Unicode data files confusables.txt
* confusablesWholeScript.txt and xidmdifications.txt as described in
- * Unicode UAX 39. The syntax of the source data is as described in UAX 39 for
+ * Unicode UAX #39. The syntax of the source data is as described in UAX #39 for
* these files, and the content of these files is acceptable input.
*
* The character encoding of the (char *) input text is UTF-8.
U_STABLE int32_t U_EXPORT2
uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status);
+/**
+ * Set the loosest restriction level allowed. The default if this function
+ * is not called is HIGHLY_RESTRICTIVE.
+ * Calling this function also enables the RESTRICTION_LEVEL check.
+ * @param sc The USpoofChecker
+ * @param restrictionLevel The loosest restriction level allowed.
+ * @see URestrictionLevel
+ * @draft ICU 51
+ */
+U_DRAFT void U_EXPORT2
+uspoof_setRestrictionLevel(USpoofChecker *sc, URestrictionLevel restrictionLevel);
+
+
+/**
+ * Get the Restriction Level that will be tested if the checks include RESTRICTION_LEVEL.
+ *
+ * @param sc The USpoofChecker
+ * @return The restriction level
+ * @see URestrictionLevel
+ * @draft ICU 51
+ */
+U_DRAFT URestrictionLevel U_EXPORT2
+uspoof_getRestrictionLevel(const USpoofChecker *sc);
+
/**
* Limit characters that are acceptable in identifiers being checked to those
* normally used with the languages associated with the specified locales.
* characters that are permitted. Ownership of the set
* remains with the caller. The incoming set is cloned by
* this function, so there are no restrictions on modifying
- * or deleting the USet after calling this function.
+ * or deleting the UnicodeSet after calling this function.
* @param status The error code, set if this function encounters a problem.
* @stable ICU 4.2
*/
* The set of checks to be performed is specified with uspoof_setChecks().
*
* @param sc The USpoofChecker
- * @param text The string to be checked for possible security issues,
+ * @param id The identifier to be checked for possible security issues,
* in UTF-16 format.
* @param length the length of the string to be checked, expressed in
* 16 bit UTF-16 code units, or -1 if the string is
* zero terminated.
- * @param position An out parameter that receives the index of the
- * first string position that fails the allowed character
- * limitation checks.
- * This parameter may be null if the position information
- * is not needed.
- * If the string passes the requested checks the
- * parameter value will not be set.
+ * @param position An out parameter.
+ * Originally, the index of the first string position that failed a check.
+ * Now, always returns zero.
+ * This parameter may be null.
* @param status The error code, set if an error occurred while attempting to
* perform the check.
* Spoofing or security issues detected with the input string are
* not reported here, but through the function's return value.
* @return An integer value with bits set for any potential security
* or spoofing issues detected. The bits are defined by
- * enum USpoofChecks. Zero is returned if no issues
- * are found with the input string.
+ * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS)
+ * will be zero if the input string passes all of the
+ * enabled checks.
* @stable ICU 4.2
*/
U_STABLE int32_t U_EXPORT2
uspoof_check(const USpoofChecker *sc,
- const UChar *text, int32_t length,
+ const UChar *id, int32_t length,
int32_t *position,
UErrorCode *status);
* The set of checks to be performed is specified with uspoof_setChecks().
*
* @param sc The USpoofChecker
- * @param text A UTF-8 string to be checked for possible security issues.
+ * @param id An identifier to be checked for possible security issues, in UTF-8 format.
* @param length the length of the string to be checked, or -1 if the string is
* zero terminated.
- * @param position An out parameter that receives the index of the
- * first string position that fails the allowed character
- * limitation checks.
- * This parameter may be null if the position information
- * is not needed.
- * If the string passes the requested checks the
- * parameter value will not be set.
+ * @param position Deprecated in ICU 51. An out parameter.
+ * Originally, the index of the first string position that failed a check.
+ * Now, always returns zero.
+ * This parameter may be null.
* @param status The error code, set if an error occurred while attempting to
* perform the check.
* Spoofing or security issues detected with the input string are
* a status of U_INVALID_CHAR_FOUND will be returned.
* @return An integer value with bits set for any potential security
* or spoofing issues detected. The bits are defined by
- * enum USpoofChecks. Zero is returned if no issues
- * are found with the input string.
+ * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS)
+ * will be zero if the input string passes all of the
+ * enabled checks.
* @stable ICU 4.2
*/
U_STABLE int32_t U_EXPORT2
uspoof_checkUTF8(const USpoofChecker *sc,
- const char *text, int32_t length,
+ const char *id, int32_t length,
int32_t *position,
UErrorCode *status);
* The set of checks to be performed is specified with uspoof_setChecks().
*
* @param sc The USpoofChecker
- * @param text A UnicodeString to be checked for possible security issues.
- * @param position An out parameter that receives the index of the
- * first string position that fails the allowed character
- * limitation checks.
- * This parameter may be null if the position information
- * is not needed.
- * If the string passes the requested checks the
- * parameter value will not be set.
+ * @param id An identifier to be checked for possible security issues.
+ * @param position Deprecated in ICU 51. An out parameter.
+ * Originally, the index of the first string position that failed a check.
+ * Now, always returns zero.
+ * This parameter may be null.
* @param status The error code, set if an error occurred while attempting to
* perform the check.
* Spoofing or security issues detected with the input string are
* not reported here, but through the function's return value.
-
* @return An integer value with bits set for any potential security
* or spoofing issues detected. The bits are defined by
- * enum USpoofChecks. Zero is returned if no issues
- * are found with the input string.
+ * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS)
+ * will be zero if the input string passes all of the
+ * enabled checks.
* @stable ICU 4.2
*/
U_STABLE int32_t U_EXPORT2
uspoof_checkUnicodeString(const USpoofChecker *sc,
- const icu::UnicodeString &text,
+ const icu::UnicodeString &id,
int32_t *position,
UErrorCode *status);
*
*
* @param sc The USpoofChecker
- * @param s1 The first of the two strings to be compared for
+ * @param id1 The first of the two identifiers to be compared for
* confusability. The strings are in UTF-16 format.
- * @param length1 the length of the first string, expressed in
+ * @param length1 the length of the first identifier, expressed in
* 16 bit UTF-16 code units, or -1 if the string is
- * zero terminated.
- * @param s2 The second of the two strings to be compared for
- * confusability. The strings are in UTF-16 format.
- * @param length2 The length of the second string, expressed in
+ * nul terminated.
+ * @param id2 The second of the two identifiers to be compared for
+ * confusability. The identifiers are in UTF-16 format.
+ * @param length2 The length of the second identifier, expressed in
* 16 bit UTF-16 code units, or -1 if the string is
- * zero terminated.
+ * nul terminated.
* @param status The error code, set if an error occurred while attempting to
* perform the check.
- * Confusability of the strings is not reported here,
+ * Confusability of the identifiers is not reported here,
* but through this function's return value.
* @return An integer value with bit(s) set corresponding to
* the type of confusability found, as defined by
- * enum USpoofChecks. Zero is returned if the strings
+ * enum USpoofChecks. Zero is returned if the identifiers
* are not confusable.
* @stable ICU 4.2
*/
U_STABLE int32_t U_EXPORT2
uspoof_areConfusable(const USpoofChecker *sc,
- const UChar *s1, int32_t length1,
- const UChar *s2, int32_t length2,
+ const UChar *id1, int32_t length1,
+ const UChar *id2, int32_t length2,
UErrorCode *status);
* USpoofChecker.
*
* @param sc The USpoofChecker
- * @param s1 The first of the two strings to be compared for
+ * @param id1 The first of the two identifiers to be compared for
+ * confusability. The strings are in UTF-8 format.
+ * @param length1 the length of the first identifier, in bytes, or -1
+ * if the string is nul terminated.
+ * @param id2 The second of the two identifiers to be compared for
* confusability. The strings are in UTF-8 format.
- * @param length1 the length of the first string, in bytes, or -1
- * if the string is zero terminated.
- * @param s2 The second of the two strings to be compared for
- * confusability. The strings are in UTF-18 format.
* @param length2 The length of the second string in bytes, or -1
- * if the string is zero terminated.
+ * if the string is nul terminated.
* @param status The error code, set if an error occurred while attempting to
* perform the check.
* Confusability of the strings is not reported here,
*/
U_STABLE int32_t U_EXPORT2
uspoof_areConfusableUTF8(const USpoofChecker *sc,
- const char *s1, int32_t length1,
- const char *s2, int32_t length2,
+ const char *id1, int32_t length1,
+ const char *id2, int32_t length2,
UErrorCode *status);
* USpoofChecker.
*
* @param sc The USpoofChecker
- * @param s1 The first of the two strings to be compared for
+ * @param id1 The first of the two identifiers to be compared for
+ * confusability. The strings are in UTF-8 format.
+ * @param id2 The second of the two identifiers to be compared for
* confusability. The strings are in UTF-8 format.
- * @param s2 The second of the two strings to be compared for
- * confusability. The strings are in UTF-18 format.
* @param status The error code, set if an error occurred while attempting to
* perform the check.
- * Confusability of the strings is not reported here,
+ * Confusability of the identifiers is not reported here,
* but through this function's return value.
* @return An integer value with bit(s) set corresponding to
* the type of confusability found, as defined by
- * enum USpoofChecks. Zero is returned if the strings
+ * enum USpoofChecks. Zero is returned if the identifiers
* are not confusable.
* @stable ICU 4.2
*/
/**
- * Get the "skeleton" for an identifier string.
- * Skeletons are a transformation of the input string;
- * Two strings are confusable if their skeletons are identical.
- * See Unicode UAX 39 for additional information.
+ * Get the "skeleton" for an identifier.
+ * Skeletons are a transformation of the input identifier;
+ * Two identifiers are confusable if their skeletons are identical.
+ * See Unicode UAX #39 for additional information.
*
* Using skeletons directly makes it possible to quickly check
* whether an identifier is confusable with any of some large
* The default is Mixed-Script, Lowercase.
* Allowed options are USPOOF_SINGLE_SCRIPT_CONFUSABLE and
* USPOOF_ANY_CASE_CONFUSABLE. The two flags may be ORed.
- * @param s The input string whose skeleton will be computed.
- * @param length The length of the input string, expressed in 16 bit
+ * @param id The input identifier whose skeleton will be computed.
+ * @param length The length of the input identifier, expressed in 16 bit
* UTF-16 code units, or -1 if the string is zero terminated.
* @param dest The output buffer, to receive the skeleton string.
* @param destCapacity The length of the output buffer, in 16 bit units.
U_STABLE int32_t U_EXPORT2
uspoof_getSkeleton(const USpoofChecker *sc,
uint32_t type,
- const UChar *s, int32_t length,
+ const UChar *id, int32_t length,
UChar *dest, int32_t destCapacity,
UErrorCode *status);
/**
- * Get the "skeleton" for an identifier string.
- * Skeletons are a transformation of the input string;
- * Two strings are confusable if their skeletons are identical.
- * See Unicode UAX 39 for additional information.
+ * Get the "skeleton" for an identifier.
+ * Skeletons are a transformation of the input identifier;
+ * Two identifiers are confusable if their skeletons are identical.
+ * See Unicode UAX #39 for additional information.
*
* Using skeletons directly makes it possible to quickly check
* whether an identifier is confusable with any of some large
* The default is Mixed-Script, Lowercase.
* Allowed options are USPOOF_SINGLE_SCRIPT_CONFUSABLE and
* USPOOF_ANY_CASE. The two flags may be ORed.
- * @param s The UTF-8 format input string whose skeleton will be computed.
+ * @param id The UTF-8 format identifier whose skeleton will be computed.
* @param length The length of the input string, in bytes,
* or -1 if the string is zero terminated.
* @param dest The output buffer, to receive the skeleton string.
U_STABLE int32_t U_EXPORT2
uspoof_getSkeletonUTF8(const USpoofChecker *sc,
uint32_t type,
- const char *s, int32_t length,
+ const char *id, int32_t length,
char *dest, int32_t destCapacity,
UErrorCode *status);
#if U_SHOW_CPLUSPLUS_API
/**
- * Get the "skeleton" for an identifier string.
- * Skeletons are a transformation of the input string;
- * Two strings are confusable if their skeletons are identical.
- * See Unicode UAX 39 for additional information.
+ * Get the "skeleton" for an identifier.
+ * Skeletons are a transformation of the input identifier;
+ * Two identifiers are confusable if their skeletons are identical.
+ * See Unicode UAX #39 for additional information.
*
* Using skeletons directly makes it possible to quickly check
* whether an identifier is confusable with any of some large
* The default is Mixed-Script, Lowercase.
* Allowed options are USPOOF_SINGLE_SCRIPT_CONFUSABLE and
* USPOOF_ANY_CASE_CONFUSABLE. The two flags may be ORed.
- * @param s The input string whose skeleton will be computed.
- * @param dest The output string, to receive the skeleton string.
+ * @param id The input identifier whose skeleton will be computed.
+ * @param dest The output UnicodeString, to receive the skeleton string.
* @param status The error code, set if an error occurred while attempting to
* perform the check.
* @return A reference to the destination (skeleton) string.
U_I18N_API icu::UnicodeString & U_EXPORT2
uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
uint32_t type,
- const icu::UnicodeString &s,
+ const icu::UnicodeString &id,
icu::UnicodeString &dest,
UErrorCode *status);
#endif /* U_SHOW_CPLUSPLUS_API */
+/**
+ * Get the set of Candidate Characters for Inclusion in Identifiers, as defined
+ * in Unicode UAX #31, http://www.unicode.org/reports/tr31/#Table_Candidate_Characters_for_Inclusion_in_Identifiers
+ *
+ * The returned set is frozen. Ownership of the set remains with the ICU library; it must not
+ * be deleted by the caller.
+ *
+ * @param status The error code, set if a problem occurs while creating the set.
+ *
+ * @draft ICU 51
+ */
+U_DRAFT const USet * U_EXPORT2
+uspoof_getInclusionSet(UErrorCode *status);
+
+/**
+ * Get the set of characters from Recommended Scripts for Inclusion in Identifiers, as defined
+ * in Unicode UAX #31, http://www.unicode.org/reports/tr31/#Table_Recommended_Scripts
+ *
+ * The returned set is frozen. Ownership of the set remains with the ICU library; it must not
+ * be deleted by the caller.
+ *
+ * @param status The error code, set if a problem occurs while creating the set.
+ *
+ * @draft ICU 51
+ */
+U_DRAFT const USet * U_EXPORT2
+uspoof_getRecommendedSet(UErrorCode *status);
+
+
+
+#if U_SHOW_CPLUSPLUS_API
+
+/**
+ * Get the set of Candidate Characters for Inclusion in Identifiers, as defined
+ * in Unicode UAX #31, http://www.unicode.org/reports/tr31/#Table_Candidate_Characters_for_Inclusion_in_Identifiers
+ *
+ * The returned set is frozen. Ownership of the set remains with the ICU library; it must not
+ * be deleted by the caller.
+ *
+ * @param status The error code, set if a problem occurs while creating the set.
+ *
+ * @draft ICU 51
+ */
+U_DRAFT const UnicodeSet * U_EXPORT2
+uspoof_getInclusionUnicodeSet(UErrorCode *status);
+
+/**
+ * Get the set of characters from Recommended Scripts for Inclusion in Identifiers, as defined
+ * in Unicode UAX #31, http://www.unicode.org/reports/tr31/#Table_Recommended_Scripts
+ *
+ * The returned set is frozen. Ownership of the set remains with the ICU library; it must not
+ * be deleted by the caller.
+ *
+ * @param status The error code, set if a problem occurs while creating the set.
+ *
+ * @draft ICU 51
+ */
+U_DRAFT const UnicodeSet * U_EXPORT2
+uspoof_getRecommendedUnicodeSet(UErrorCode *status);
+
+#endif /* U_SHOW_CPLUSPLUS_API */
+
/**
* Serialize the data for a spoof detector into a chunk of memory.
* The flattened spoof detection tables can later be used to efficiently
* instantiate a new Spoof Detector.
*
+ * The serialized spoof checker includes only the data compiled from the
+ * Unicode data tables by uspoof_openFromSource(); it does not
+ * include any other state or configuration that may have been set.
+ *
* @param sc the Spoof Detector whose data is to be serialized.
* @param data a pointer to 32-bit-aligned memory to be filled with the data,
* can be NULL if capacity==0
/*
***************************************************************************
-* Copyright (C) 2008-2012, International Business Machines Corporation
+* Copyright (C) 2008-2013, International Business Machines Corporation
* and others. All Rights Reserved.
***************************************************************************
* file name: uspoof.cpp
* Unicode Spoof Detection
*/
#include "unicode/utypes.h"
+#include "unicode/normalizer2.h"
#include "unicode/uspoof.h"
-#include "unicode/unorm.h"
#include "unicode/ustring.h"
#include "unicode/utf16.h"
#include "cmemory.h"
-#include "uspoof_impl.h"
+#include "cstring.h"
+#include "identifier_info.h"
+#include "mutex.h"
+#include "scriptset.h"
#include "uassert.h"
+#include "ucln_in.h"
+#include "uspoof_impl.h"
+#include "umutex.h"
#if !UCONFIG_NO_NORMALIZATION
U_NAMESPACE_USE
+//
+// Static Objects used by the spoof impl, their thread safe initialization and their cleanup.
+//
+static UnicodeSet *gInclusionSet = NULL;
+static UnicodeSet *gRecommendedSet = NULL;
+static const Normalizer2 *gNfdNormalizer = NULL;
+static UMutex gInitMutex = U_MUTEX_INITIALIZER;
+
+/**
+ * Cleanup hook for the file-level spoof-checker statics; initializeStatics()
+ * registers it through ucln_i18n_registerCleanup().
+ * Deletes the two lazily created UnicodeSets and clears all three cached
+ * pointers so that a later initializeStatics() call rebuilds them.
+ * @return TRUE, always.
+ */
+static UBool U_CALLCONV
+uspoof_cleanup(void) {
+    // The two sets were allocated with new in initializeStatics().
+    delete gInclusionSet;
+    delete gRecommendedSet;
+    gInclusionSet = NULL;
+    gRecommendedSet = NULL;
+
+    // The normalizer pointer is only cleared here, never deleted.
+    gNfdNormalizer = NULL;
+
+    return TRUE;
+}
+
+/**
+ * Lazily create the file-level statics used by the spoof checker:
+ * the inclusion set, the recommended set and the cached NFD normalizer.
+ *
+ * The whole function runs while holding gInitMutex. The statics are built
+ * only when gInclusionSet is still NULL; uspoof_cleanup() resets them to NULL.
+ *
+ * Both sets are frozen right after construction. The public accessors are
+ * documented as returning frozen sets, and the same objects are handed out
+ * to every caller.
+ */
+static void initializeStatics() {
+    Mutex m(&gInitMutex);
+    UErrorCode status = U_ZERO_ERROR;
+    if (gInclusionSet == NULL) {
+        gInclusionSet = new UnicodeSet(UnicodeString("["
+            "\\-.\\u00B7\\u05F3\\u05F4\\u0F0B\\u200C\\u200D\\u2019]"), status);
+        if (gInclusionSet != NULL) {
+            // Guard against a failed allocation before touching the object.
+            gInclusionSet->freeze();
+        }
+        gRecommendedSet = new UnicodeSet(UnicodeString("["
+            "[0-z\\u00C0-\\u017E\\u01A0\\u01A1\\u01AF\\u01B0\\u01CD-"
+            "\\u01DC\\u01DE-\\u01E3\\u01E6-\\u01F5\\u01F8-\\u021B\\u021E"
+            "\\u021F\\u0226-\\u0233\\u02BB\\u02BC\\u02EC\\u0300-\\u0304"
+            "\\u0306-\\u030C\\u030F-\\u0311\\u0313\\u0314\\u031B\\u0323-"
+            "\\u0328\\u032D\\u032E\\u0330\\u0331\\u0335\\u0338\\u0339"
+            "\\u0342-\\u0345\\u037B-\\u03CE\\u03FC-\\u045F\\u048A-\\u0525"
+            "\\u0531-\\u0586\\u05D0-\\u05F2\\u0621-\\u063F\\u0641-\\u0655"
+            "\\u0660-\\u0669\\u0670-\\u068D\\u068F-\\u06D5\\u06E5\\u06E6"
+            "\\u06EE-\\u06FF\\u0750-\\u07B1\\u0901-\\u0939\\u093C-\\u094D"
+            "\\u0950\\u0960-\\u0972\\u0979-\\u0A4D\\u0A5C-\\u0A74\\u0A81-"
+            "\\u0B43\\u0B47-\\u0B61\\u0B66-\\u0C56\\u0C60\\u0C61\\u0C66-"
+            "\\u0CD6\\u0CE0-\\u0CEF\\u0D02-\\u0D28\\u0D2A-\\u0D39\\u0D3D-"
+            "\\u0D43\\u0D46-\\u0D4D\\u0D57-\\u0D61\\u0D66-\\u0D8E\\u0D91-"
+            "\\u0DA5\\u0DA7-\\u0DDE\\u0DF2\\u0E01-\\u0ED9\\u0F00\\u0F20-"
+            "\\u0F8B\\u0F90-\\u109D\\u10D0-\\u10F0\\u10F7-\\u10FA\\u1200-"
+            "\\u135A\\u135F\\u1380-\\u138F\\u1401-\\u167F\\u1780-\\u17A2"
+            "\\u17A5-\\u17A7\\u17A9-\\u17B3\\u17B6-\\u17CA\\u17D2\\u17D7-"
+            "\\u17DC\\u17E0-\\u17E9\\u1810-\\u18A8\\u18AA-\\u18F5\\u1E00-"
+            "\\u1E99\\u1F00-\\u1FFC\\u2D30-\\u2D65\\u2D80-\\u2DDE\\u3005-"
+            "\\u3007\\u3041-\\u31B7\\u3400-\\u9FCB\\uA000-\\uA48C\\uA67F"
+            "\\uA717-\\uA71F\\uA788\\uAA60-\\uAA7B\\uAC00-\\uD7A3\\uFA0E-"
+            "\\uFA29\\U00020000-"
+            "\\U0002B734]-[[:Cn:][:nfkcqc=n:][:XIDC=n:]]]"), status);
+        if (gRecommendedSet != NULL) {
+            gRecommendedSet->freeze();
+        }
+        gNfdNormalizer = Normalizer2::getNFDInstance(status);
+    }
+    // Registered on every call, exactly as before.
+    ucln_i18n_registerCleanup(UCLN_I18N_SPOOF, uspoof_cleanup);
+    U_ASSERT(U_SUCCESS(status));  // TODO: remove after testing.
+    return;
+}
+
+
U_CAPI USpoofChecker * U_EXPORT2
uspoof_open(UErrorCode *status) {
if (U_FAILURE(*status)) {
return NULL;
}
+    // Make sure the shared statics exist before any checker is created.
+    initializeStatics();
SpoofImpl *si = new SpoofImpl(SpoofData::getDefault(*status), *status);
+    if (si == NULL && U_SUCCESS(*status)) {
+        // Allocation failed: report it instead of returning NULL together
+        // with a success status.
+        *status = U_MEMORY_ALLOCATION_ERROR;
+    }
if (U_FAILURE(*status)) {
delete si;
si = NULL;
}
- return (USpoofChecker *)si;
+    return reinterpret_cast<USpoofChecker *>(si);
}
if (U_FAILURE(*status)) {
return NULL;
}
+ initializeStatics();
SpoofData *sd = new SpoofData(data, length, *status);
SpoofImpl *si = new SpoofImpl(sd, *status);
if (U_FAILURE(*status)) {
delete result;
result = NULL;
}
- return (USpoofChecker *)result;
+ return reinterpret_cast<USpoofChecker *>(result);
}
return This->fChecks;
}
+/**
+ * Set the loosest restriction level this checker will accept, and enable
+ * the restriction-level check, as the public documentation of this function
+ * promises.
+ *
+ * There is no UErrorCode parameter: when sc fails validation the call is a
+ * silent no-op.
+ *
+ * @param sc               The USpoofChecker to modify.
+ * @param restrictionLevel The loosest restriction level allowed.
+ */
+U_CAPI void U_EXPORT2
+uspoof_setRestrictionLevel(USpoofChecker *sc, URestrictionLevel restrictionLevel) {
+    UErrorCode status = U_ZERO_ERROR;   // local only; never reported to the caller
+    SpoofImpl *This = SpoofImpl::validateThis(sc, status);
+    if (This != NULL) {
+        This->fRestrictionLevel = restrictionLevel;
+        // NOTE(review): assumes USPOOF_RESTRICTION_LEVEL is the USpoofChecks
+        // bit for this check, declared in unicode/uspoof.h; confirm.
+        This->fChecks |= USPOOF_RESTRICTION_LEVEL;
+    }
+}
+
+/**
+ * Report the restriction level currently stored in the checker.
+ *
+ * @param sc The USpoofChecker to query.
+ * @return   The stored level, or USPOOF_UNRESTRICTIVE when sc fails
+ *           validation (this function has no way to report an error).
+ */
+U_CAPI URestrictionLevel U_EXPORT2
+uspoof_getRestrictionLevel(const USpoofChecker *sc) {
+    UErrorCode validationStatus = U_ZERO_ERROR;
+    const SpoofImpl *impl = SpoofImpl::validateThis(sc, validationStatus);
+    return (impl != NULL) ? impl->fRestrictionLevel : USPOOF_UNRESTRICTIVE;
+}
+
U_CAPI void U_EXPORT2
uspoof_setAllowedLocales(USpoofChecker *sc, const char *localesList, UErrorCode *status) {
SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
U_CAPI const USet * U_EXPORT2
uspoof_getAllowedChars(const USpoofChecker *sc, UErrorCode *status) {
const UnicodeSet *result = uspoof_getAllowedUnicodeSet(sc, status);
- return reinterpret_cast<const USet *>(result);
+    // toUSet() is a member function, so it must not be called through a NULL
+    // pointer. The cast this line replaces accepted NULL, so keep doing so.
+    // (Presumably result is NULL when the checker fails validation; verify
+    // against uspoof_getAllowedUnicodeSet.)
+    return (result == NULL) ? NULL : result->toUSet();
}
U_CAPI const UnicodeSet * U_EXPORT2
U_CAPI void U_EXPORT2
uspoof_setAllowedChars(USpoofChecker *sc, const USet *chars, UErrorCode *status) {
- const UnicodeSet *set = reinterpret_cast<const UnicodeSet *>(chars);
+ const UnicodeSet *set = UnicodeSet::fromUSet(chars); // View the C handle as a UnicodeSet; the C++ setter on the next line does the work.
uspoof_setAllowedUnicodeSet(sc, set, status);
}
U_CAPI int32_t U_EXPORT2
uspoof_check(const USpoofChecker *sc,
- const UChar *text, int32_t length,
+ const UChar *id, int32_t length,
int32_t *position,
UErrorCode *status) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
- if (length == -1) {
- // It's not worth the bother to handle nul terminated strings everywhere.
- // Just get the length and be done with it.
- length = u_strlen(text);
- }
-
- int32_t result = 0;
- int32_t failPos = 0x7fffffff; // TODO: do we have a #define for max int32?
-
- // A count of the number of non-Common or inherited scripts.
- // Needed for both the SINGLE_SCRIPT and the WHOLE/MIXED_SCIRPT_CONFUSABLE tests.
- // Share the computation when possible. scriptCount == -1 means that we haven't
- // done it yet.
- int32_t scriptCount = -1;
-
- if ((This->fChecks) & USPOOF_SINGLE_SCRIPT) {
- scriptCount = This->scriptScan(text, length, failPos, *status);
- // printf("scriptCount (clipped to 2) = %d\n", scriptCount);
- if ( scriptCount >= 2) {
- // Note: scriptCount == 2 covers all cases of the number of scripts >= 2
- result |= USPOOF_SINGLE_SCRIPT;
- }
- }
-
- if (This->fChecks & USPOOF_CHAR_LIMIT) {
- int32_t i;
- UChar32 c;
- for (i=0; i<length ;) {
- U16_NEXT(text, i, length, c);
- if (!This->fAllowedCharsSet->contains(c)) {
- result |= USPOOF_CHAR_LIMIT;
- if (i < failPos) {
- failPos = i;
- }
- break;
- }
- }
- }
-
- if (This->fChecks &
- (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_INVISIBLE)) {
- // These are the checks that need to be done on NFD input
- NFDBuffer normalizedInput(text, length, *status);
- const UChar *nfdText = normalizedInput.getBuffer();
- int32_t nfdLength = normalizedInput.getLength();
-
- if (This->fChecks & USPOOF_INVISIBLE) {
-
- // scan for more than one occurence of the same non-spacing mark
- // in a sequence of non-spacing marks.
- int32_t i;
- UChar32 c;
- UChar32 firstNonspacingMark = 0;
- UBool haveMultipleMarks = FALSE;
- UnicodeSet marksSeenSoFar; // Set of combining marks in a single combining sequence.
-
- for (i=0; i<nfdLength ;) {
- U16_NEXT(nfdText, i, nfdLength, c);
- if (u_charType(c) != U_NON_SPACING_MARK) {
- firstNonspacingMark = 0;
- if (haveMultipleMarks) {
- marksSeenSoFar.clear();
- haveMultipleMarks = FALSE;
- }
- continue;
- }
- if (firstNonspacingMark == 0) {
- firstNonspacingMark = c;
- continue;
- }
- if (!haveMultipleMarks) {
- marksSeenSoFar.add(firstNonspacingMark);
- haveMultipleMarks = TRUE;
- }
- if (marksSeenSoFar.contains(c)) {
- // report the error, and stop scanning.
- // No need to find more than the first failure.
- result |= USPOOF_INVISIBLE;
- failPos = i;
- // TODO: Bug 8655: failPos is the position in the NFD buffer, but what we want
- // to give back to our caller is a position in the original input string.
- if (failPos > length) {
- failPos = length;
- }
- break;
- }
- marksSeenSoFar.add(c);
- }
- }
-
-
- if (This->fChecks & (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE)) {
- // The basic test is the same for both whole and mixed script confusables.
- // Compute the set of scripts that every input character has a confusable in.
- // For this computation an input character is always considered to be
- // confusable with itself in its own script.
- // If the number of such scripts is two or more, and the input consisted of
- // characters all from a single script, we have a whole script confusable.
- // (The two scripts will be the original script and the one that is confusable)
- // If the number of such scripts >= one, and the original input contained characters from
- // more than one script, we have a mixed script confusable. (We can transform
- // some of the characters, and end up with a visually similar string all in
- // one script.)
-
- if (scriptCount == -1) {
- int32_t t;
- scriptCount = This->scriptScan(text, length, t, *status);
- }
-
- ScriptSet scripts;
- This->wholeScriptCheck(nfdText, nfdLength, &scripts, *status);
- int32_t confusableScriptCount = scripts.countMembers();
- //printf("confusableScriptCount = %d\n", confusableScriptCount);
-
- if ((This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) &&
- confusableScriptCount >= 2 &&
- scriptCount == 1) {
- result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
- }
-
- if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) &&
- confusableScriptCount >= 1 &&
- scriptCount > 1) {
- result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
- }
- }
- }
- if (position != NULL && failPos != 0x7fffffff) {
- *position = failPos;
- }
+ UnicodeString idStr((length == -1), id, length); // Aliasing constructor.
+ int32_t result = uspoof_checkUnicodeString(sc, idStr, position, status);
return result;
}
U_CAPI int32_t U_EXPORT2
uspoof_checkUTF8(const USpoofChecker *sc,
- const char *text, int32_t length,
+ const char *id, int32_t length,
int32_t *position,
UErrorCode *status) {
if (U_FAILURE(*status)) {
return 0;
}
- UChar stackBuf[USPOOF_STACK_BUFFER_SIZE];
- UChar* text16 = stackBuf;
- int32_t len16;
-
- u_strFromUTF8(text16, USPOOF_STACK_BUFFER_SIZE, &len16, text, length, status);
- if (U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) {
- return 0;
- }
- if (*status == U_BUFFER_OVERFLOW_ERROR) {
- text16 = static_cast<UChar *>(uprv_malloc(len16 * sizeof(UChar) + 2));
- if (text16 == NULL) {
- *status = U_MEMORY_ALLOCATION_ERROR;
- return 0;
- }
- *status = U_ZERO_ERROR;
- u_strFromUTF8(text16, len16+1, NULL, text, length, status);
- }
+ UnicodeString idStr = UnicodeString::fromUTF8(StringPiece(id, length>=0 ? length : uprv_strlen(id)));
+ int32_t result = uspoof_checkUnicodeString(sc, idStr, position, status);
+ return result;
+}
+
- int32_t position16 = -1;
- int32_t result = uspoof_check(sc, text16, len16, &position16, status);
+U_CAPI int32_t U_EXPORT2
+uspoof_areConfusable(const USpoofChecker *sc,
+ const UChar *id1, int32_t length1,
+ const UChar *id2, int32_t length2,
+ UErrorCode *status) {
+ SpoofImpl::validateThis(sc, *status);
if (U_FAILURE(*status)) {
return 0;
}
-
- if (position16 > 0) {
- // Translate a UTF-16 based error position back to a UTF-8 offset.
- // u_strToUTF8() in preflight mode is an easy way to do it.
- U_ASSERT(position16 <= len16);
- u_strToUTF8(NULL, 0, position, text16, position16, status);
- if (position != NULL && *position > 0) {
- // position is the required buffer length from u_strToUTF8, which includes
- // space for a terminating NULL, which we don't want, hence the -1.
- *position -= 1;
- }
- *status = U_ZERO_ERROR; // u_strToUTF8, above sets BUFFER_OVERFLOW_ERROR.
- }
-
- if (text16 != stackBuf) {
- uprv_free(text16);
+ if (length1 < -1 || length2 < -1) {
+ *status = U_ILLEGAL_ARGUMENT_ERROR;
+ return 0;
}
- return result;
-
+
+ UnicodeString id1Str((length1==-1), id1, length1); // Aliasing constructor
+ UnicodeString id2Str((length2==-1), id2, length2); // Aliasing constructor
+ return uspoof_areConfusableUnicodeString(sc, id1Str, id2Str, status);
}
-/* A convenience wrapper around the public uspoof_getSkeleton that handles
- * allocating a larger buffer than provided if the original is too small.
- */
-static UChar *getSkeleton(const USpoofChecker *sc, uint32_t type, const UChar *s, int32_t inputLength,
- UChar *dest, int32_t destCapacity, int32_t *outputLength, UErrorCode *status) {
- int32_t requiredCapacity = 0;
- UChar *buf = dest;
+U_CAPI int32_t U_EXPORT2
+uspoof_areConfusableUTF8(const USpoofChecker *sc,
+ const char *id1, int32_t length1,
+ const char *id2, int32_t length2,
+ UErrorCode *status) {
+ SpoofImpl::validateThis(sc, *status);
if (U_FAILURE(*status)) {
- return NULL;
+ return 0;
}
- requiredCapacity = uspoof_getSkeleton(sc, type, s, inputLength, dest, destCapacity, status);
- if (*status == U_BUFFER_OVERFLOW_ERROR) {
- buf = static_cast<UChar *>(uprv_malloc(requiredCapacity * sizeof(UChar)));
- if (buf == NULL) {
- *status = U_MEMORY_ALLOCATION_ERROR;
- return NULL;
- }
- *status = U_ZERO_ERROR;
- uspoof_getSkeleton(sc, type, s, inputLength, buf, requiredCapacity, status);
+ if (length1 < -1 || length2 < -1) {
+ *status = U_ILLEGAL_ARGUMENT_ERROR;
+ return 0;
}
- *outputLength = requiredCapacity;
- return buf;
+ UnicodeString id1Str = UnicodeString::fromUTF8(StringPiece(id1, length1>=0? length1 : uprv_strlen(id1)));
+ UnicodeString id2Str = UnicodeString::fromUTF8(StringPiece(id2, length2>=0? length2 : uprv_strlen(id2)));
+ int32_t results = uspoof_areConfusableUnicodeString(sc, id1Str, id2Str, status);
+ return results;
}
-
+
U_CAPI int32_t U_EXPORT2
-uspoof_areConfusable(const USpoofChecker *sc,
- const UChar *s1, int32_t length1,
- const UChar *s2, int32_t length2,
- UErrorCode *status) {
+uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
+ const icu::UnicodeString &id1,
+ const icu::UnicodeString &id2,
+ UErrorCode *status) {
const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
if (U_FAILURE(*status)) {
return 0;
return 0;
}
int32_t flagsForSkeleton = This->fChecks & USPOOF_ANY_CASE;
- UChar s1SkeletonBuf[USPOOF_STACK_BUFFER_SIZE];
- UChar *s1Skeleton;
- int32_t s1SkeletonLength = 0;
-
- UChar s2SkeletonBuf[USPOOF_STACK_BUFFER_SIZE];
- UChar *s2Skeleton;
- int32_t s2SkeletonLength = 0;
int32_t result = 0;
- int32_t t;
- int32_t s1ScriptCount = This->scriptScan(s1, length1, t, *status);
- int32_t s2ScriptCount = This->scriptScan(s2, length2, t, *status);
+ IdentifierInfo *identifierInfo = This->getIdentifierInfo(*status);
+ if (U_FAILURE(*status)) {
+ return 0;
+ }
+ identifierInfo->setIdentifier(id1, *status);
+ int32_t id1ScriptCount = identifierInfo->getScriptCount();
+ identifierInfo->setIdentifier(id2, *status);
+ int32_t id2ScriptCount = identifierInfo->getScriptCount();
+ This->releaseIdentifierInfo(identifierInfo);
+ identifierInfo = NULL;
if (This->fChecks & USPOOF_SINGLE_SCRIPT_CONFUSABLE) {
- // Do the Single Script compare.
- if (s1ScriptCount <= 1 && s2ScriptCount <= 1) {
+ UnicodeString id1Skeleton;
+ UnicodeString id2Skeleton;
+ if (id1ScriptCount <= 1 && id2ScriptCount <= 1) {
flagsForSkeleton |= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
- s1Skeleton = getSkeleton(sc, flagsForSkeleton, s1, length1, s1SkeletonBuf,
- sizeof(s1SkeletonBuf)/sizeof(UChar), &s1SkeletonLength, status);
- s2Skeleton = getSkeleton(sc, flagsForSkeleton, s2, length2, s2SkeletonBuf,
- sizeof(s2SkeletonBuf)/sizeof(UChar), &s2SkeletonLength, status);
- if (s1SkeletonLength == s2SkeletonLength && u_strncmp(s1Skeleton, s2Skeleton, s1SkeletonLength) == 0) {
+ uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id1, id1Skeleton, status);
+ uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id2, id2Skeleton, status);
+ if (id1Skeleton == id2Skeleton) {
result |= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
}
- if (s1Skeleton != s1SkeletonBuf) {
- uprv_free(s1Skeleton);
- }
- if (s2Skeleton != s2SkeletonBuf) {
- uprv_free(s2Skeleton);
- }
}
}
return result;
}
- // Optimization for whole script confusables test: two identifiers are whole script confusable if
- // each is of a single script and they are mixed script confusable.
+ // Two identifiers are whole script confusable if each is of a single script
+ // and they are mixed script confusable.
UBool possiblyWholeScriptConfusables =
- s1ScriptCount <= 1 && s2ScriptCount <= 1 && (This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE);
+ id1ScriptCount <= 1 && id2ScriptCount <= 1 && (This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE);
//
// Mixed Script Check
// For getSkeleton(), resetting the USPOOF_SINGLE_SCRIPT_CONFUSABLE flag will get us
// the mixed script table skeleton, which is what we want.
// The Any Case / Lower Case bit in the skelton flags was set at the top of the function.
+ UnicodeString id1Skeleton;
+ UnicodeString id2Skeleton;
flagsForSkeleton &= ~USPOOF_SINGLE_SCRIPT_CONFUSABLE;
- s1Skeleton = getSkeleton(sc, flagsForSkeleton, s1, length1, s1SkeletonBuf,
- sizeof(s1SkeletonBuf)/sizeof(UChar), &s1SkeletonLength, status);
- s2Skeleton = getSkeleton(sc, flagsForSkeleton, s2, length2, s2SkeletonBuf,
- sizeof(s2SkeletonBuf)/sizeof(UChar), &s2SkeletonLength, status);
- if (s1SkeletonLength == s2SkeletonLength && u_strncmp(s1Skeleton, s2Skeleton, s1SkeletonLength) == 0) {
+ uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id1, id1Skeleton, status);
+ uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id2, id2Skeleton, status);
+ if (id1Skeleton == id2Skeleton) {
result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
if (possiblyWholeScriptConfusables) {
result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
}
}
- if (s1Skeleton != s1SkeletonBuf) {
- uprv_free(s1Skeleton);
- }
- if (s2Skeleton != s2SkeletonBuf) {
- uprv_free(s2Skeleton);
- }
}
return result;
}
-// Convenience function for converting a UTF-8 input to a UChar * string, including
-// reallocating a buffer when required. Parameters and their interpretation mostly
-// match u_strFromUTF8.
-static UChar * convertFromUTF8(UChar *outBuf, int32_t outBufCapacity, int32_t *outputLength,
- const char *in, int32_t inLength, UErrorCode *status) {
- if (U_FAILURE(*status)) {
- return NULL;
- }
- UChar *dest = outBuf;
- u_strFromUTF8(dest, outBufCapacity, outputLength, in, inLength, status);
- if (*status == U_BUFFER_OVERFLOW_ERROR) {
- dest = static_cast<UChar *>(uprv_malloc(*outputLength * sizeof(UChar)));
- if (dest == NULL) {
- *status = U_MEMORY_ALLOCATION_ERROR;
- return NULL;
- }
- *status = U_ZERO_ERROR;
- u_strFromUTF8(dest, *outputLength, NULL, in, inLength, status);
- }
- return dest;
-}
-
-
+// uspoof_checkUnicodeString
+//   Runs the checks enabled in This->fChecks against the identifier `id` and returns
+//   a bit mask with one USPOOF_* bit set for each enabled check that the identifier failed.
+//   When both USPOOF_RESTRICTION_LEVEL and USPOOF_AUX_INFO are enabled, the computed
+//   restriction level value is additionally ORed into the result.
+//   Returns 0 without touching *position when validateThis() yields NULL.
+//   On every path that reaches cleanupAndReturn, a non-NULL `position` is set to 0;
+//   this implementation does not report the offset of a failure.
U_CAPI int32_t U_EXPORT2
-uspoof_areConfusableUTF8(const USpoofChecker *sc,
- const char *s1, int32_t length1,
- const char *s2, int32_t length2,
- UErrorCode *status) {
-
- SpoofImpl::validateThis(sc, *status);
- if (U_FAILURE(*status)) {
+uspoof_checkUnicodeString(const USpoofChecker *sc,
+ const icu::UnicodeString &id,
+ int32_t *position,
+ UErrorCode *status) {
+ const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
+ if (This == NULL) {
return 0;
}
+ int32_t result = 0;
- UChar s1Buf[USPOOF_STACK_BUFFER_SIZE];
- int32_t lengthS1U;
- UChar *s1U = convertFromUTF8(s1Buf, USPOOF_STACK_BUFFER_SIZE, &lengthS1U, s1, length1, status);
+ // The IdentifierInfo is fetched here only when a check that reads it immediately is enabled.
+ // The confusable checks further down fetch one lazily if it is still NULL.
+ IdentifierInfo *identifierInfo = NULL;
+ if ((This->fChecks) & (USPOOF_RESTRICTION_LEVEL | USPOOF_MIXED_NUMBERS)) {
+ identifierInfo = This->getIdentifierInfo(*status);
+ if (U_FAILURE(*status)) {
+ goto cleanupAndReturn;
+ }
+ identifierInfo->setIdentifier(id, *status);
+ identifierInfo->setIdentifierProfile(*This->fAllowedCharsSet);
+ }
- UChar s2Buf[USPOOF_STACK_BUFFER_SIZE];
- int32_t lengthS2U;
- UChar *s2U = convertFromUTF8(s2Buf, USPOOF_STACK_BUFFER_SIZE, &lengthS2U, s2, length2, status);
- int32_t results = uspoof_areConfusable(sc, s1U, lengthS1U, s2U, lengthS2U, status);
-
- if (s1U != s1Buf) {
- uprv_free(s1U);
- }
- if (s2U != s2Buf) {
- uprv_free(s2U);
+ // Restriction level check: fails when the identifier's level exceeds the configured maximum.
+ if ((This->fChecks) & USPOOF_RESTRICTION_LEVEL) {
+ URestrictionLevel idRestrictionLevel = identifierInfo->getRestrictionLevel(*status);
+ if (idRestrictionLevel > This->fRestrictionLevel) {
+ result |= USPOOF_RESTRICTION_LEVEL;
+ }
+ if (This->fChecks & USPOOF_AUX_INFO) {
+ result |= idRestrictionLevel;
+ }
}
- return results;
-}
-
-U_CAPI int32_t U_EXPORT2
-uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
- const icu::UnicodeString &s1,
- const icu::UnicodeString &s2,
- UErrorCode *status) {
+ // Mixed numbers check: fails when getNumerics() holds more than one element.
+ if ((This->fChecks) & USPOOF_MIXED_NUMBERS) {
+ const UnicodeSet *numerics = identifierInfo->getNumerics();
+ if (numerics->size() > 1) {
+ result |= USPOOF_MIXED_NUMBERS;
+ }
- const UChar *u1 = s1.getBuffer();
- int32_t length1 = s1.length();
- const UChar *u2 = s2.getBuffer();
- int32_t length2 = s2.length();
+ // TODO: ICU4J returns the UnicodeSet of the numerics found in the identifier.
+ // We have no easy way to do the same in C.
+ // if (checkResult != null) {
+ // checkResult.numerics = numerics;
+ // }
+ }
- int32_t results = uspoof_areConfusable(sc, u1, length1, u2, length2, status);
- return results;
-}
+ // Character limit check: fails on the first code point not contained in fAllowedCharsSet.
+ if (This->fChecks & (USPOOF_CHAR_LIMIT)) {
+ int32_t i;
+ UChar32 c;
+ int32_t length = id.length();
+ for (i=0; i<length ;) {
+ c = id.char32At(i);
+ i += U16_LENGTH(c);
+ if (!This->fAllowedCharsSet->contains(c)) {
+ result |= USPOOF_CHAR_LIMIT;
+ break;
+ }
+ }
+ }
+ if (This->fChecks &
+ (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_INVISIBLE)) {
+ // These are the checks that need to be done on NFD input
+ UnicodeString nfdText;
+ gNfdNormalizer->normalize(id, nfdText, *status);
+ int32_t nfdLength = nfdText.length();
+ if (This->fChecks & USPOOF_INVISIBLE) {
+
+ // scan for more than one occurrence of the same non-spacing mark
+ // in a sequence of non-spacing marks.
+ int32_t i;
+ UChar32 c;
+ UChar32 firstNonspacingMark = 0;
+ UBool haveMultipleMarks = FALSE;
+ UnicodeSet marksSeenSoFar; // Set of combining marks in a single combining sequence.
+
+ for (i=0; i<nfdLength ;) {
+ c = nfdText.char32At(i);
+ i += U16_LENGTH(c);
+ if (u_charType(c) != U_NON_SPACING_MARK) {
+ firstNonspacingMark = 0;
+ if (haveMultipleMarks) {
+ marksSeenSoFar.clear();
+ haveMultipleMarks = FALSE;
+ }
+ continue;
+ }
+ if (firstNonspacingMark == 0) {
+ firstNonspacingMark = c;
+ continue;
+ }
+ if (!haveMultipleMarks) {
+ marksSeenSoFar.add(firstNonspacingMark);
+ haveMultipleMarks = TRUE;
+ }
+ if (marksSeenSoFar.contains(c)) {
+ // report the error, and stop scanning.
+ // No need to find more than the first failure.
+ result |= USPOOF_INVISIBLE;
+ break;
+ }
+ marksSeenSoFar.add(c);
+ }
+ }
+
+
+ if (This->fChecks & (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE)) {
+ // The basic test is the same for both whole and mixed script confusables.
+ // Compute the set of scripts that every input character has a confusable in.
+ // For this computation an input character is always considered to be
+ // confusable with itself in its own script.
+ //
+ // If the number of such scripts is two or more, and the input consisted of
+ // characters all from a single script, we have a whole script confusable.
+ // (The two scripts will be the original script and the one that is confusable)
+ //
+ // If the number of such scripts >= one, and the original input contained characters from
+ // more than one script, we have a mixed script confusable. (We can transform
+ // some of the characters, and end up with a visually similar string all in
+ // one script.)
+
+ if (identifierInfo == NULL) {
+ identifierInfo = This->getIdentifierInfo(*status);
+ if (U_FAILURE(*status)) {
+ goto cleanupAndReturn;
+ }
+ identifierInfo->setIdentifier(id, *status);
+ }
-U_CAPI int32_t U_EXPORT2
-uspoof_checkUnicodeString(const USpoofChecker *sc,
- const icu::UnicodeString &text,
- int32_t *position,
- UErrorCode *status) {
- int32_t result = uspoof_check(sc, text.getBuffer(), text.length(), position, status);
+ int32_t scriptCount = identifierInfo->getScriptCount();
+
+ ScriptSet scripts;
+ This->wholeScriptCheck(nfdText, &scripts, *status);
+ int32_t confusableScriptCount = scripts.countMembers();
+ //printf("confusableScriptCount = %d\n", confusableScriptCount);
+
+ if ((This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) &&
+ confusableScriptCount >= 2 &&
+ scriptCount == 1) {
+ result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
+ }
+
+ if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) &&
+ confusableScriptCount >= 1 &&
+ scriptCount > 1) {
+ result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
+ }
+ }
+ }
+
+cleanupAndReturn:
+ // identifierInfo may still be NULL here (no check needed it, or getIdentifierInfo failed).
+ This->releaseIdentifierInfo(identifierInfo);
+ if (position != NULL) {
+ *position = 0;
+ }
return result;
}
+// uspoof_getSkeleton (UChar * flavor)
+//   Thin wrapper: validates the checker and the buffer arguments, wraps `id` in a
+//   UnicodeString, delegates to uspoof_getSkeletonUnicodeString(), and extracts the
+//   result into the caller's buffer.
+//   Returns 0 if validation fails (with U_ILLEGAL_ARGUMENT_ERROR for bad length/capacity
+//   arguments); otherwise returns destStr.length(), whatever status extract() reports.
+//   `type` is not checked here; it is passed through unchanged to
+//   uspoof_getSkeletonUnicodeString().
U_CAPI int32_t U_EXPORT2
uspoof_getSkeleton(const USpoofChecker *sc,
uint32_t type,
- const UChar *s, int32_t length,
+ const UChar *id, int32_t length,
UChar *dest, int32_t destCapacity,
UErrorCode *status) {
- // TODO: this function could be sped up a bit
- // Skip the input normalization when not needed, work from callers data.
- // Put the initial skeleton straight into the caller's destination buffer.
- // It probably won't need normalization.
- // But these would make the structure more complicated.
-
- const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
+ SpoofImpl::validateThis(sc, *status);
if (U_FAILURE(*status)) {
return 0;
}
- if (length<-1 || destCapacity<0 || (destCapacity==0 && dest!=NULL) ||
- (type & ~(USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE)) != 0) {
+ if (length<-1 || destCapacity<0 || (destCapacity==0 && dest!=NULL)) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
+ // First argument is TRUE only when length == -1, i.e. when the caller passed a NUL-terminated string.
+ UnicodeString idStr((length==-1), id, length); // Aliasing constructor
+ UnicodeString destStr;
+ uspoof_getSkeletonUnicodeString(sc, type, idStr, destStr, status);
+ destStr.extract(dest, destCapacity, *status);
+ return destStr.length();
+}
+
+
+
+U_I18N_API UnicodeString & U_EXPORT2
+uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
+ uint32_t type,
+ const UnicodeString &id,
+ UnicodeString &dest,
+ UErrorCode *status) {
+ const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
+ if (U_FAILURE(*status)) {
+ return dest;
+ }
+
int32_t tableMask = 0;
switch (type) {
case 0:
break;
default:
*status = U_ILLEGAL_ARGUMENT_ERROR;
- return 0;
- }
-
- // NFD transform of the user supplied input
-
- UChar nfdStackBuf[USPOOF_STACK_BUFFER_SIZE];
- UChar *nfdInput = nfdStackBuf;
- int32_t normalizedLen = unorm_normalize(
- s, length, UNORM_NFD, 0, nfdInput, USPOOF_STACK_BUFFER_SIZE, status);
- if (*status == U_BUFFER_OVERFLOW_ERROR) {
- nfdInput = (UChar *)uprv_malloc((normalizedLen+1)*sizeof(UChar));
- if (nfdInput == NULL) {
- *status = U_MEMORY_ALLOCATION_ERROR;
- return 0;
- }
- *status = U_ZERO_ERROR;
- normalizedLen = unorm_normalize(s, length, UNORM_NFD, 0,
- nfdInput, normalizedLen+1, status);
- }
- if (U_FAILURE(*status)) {
- if (nfdInput != nfdStackBuf) {
- uprv_free(nfdInput);
- }
- return 0;
+ return dest;
}
- // buffer to hold the Unicode defined skeleton mappings for a single code point
- UChar buf[USPOOF_MAX_SKELETON_EXPANSION];
+ UnicodeString nfdId;
+ gNfdNormalizer->normalize(id, nfdId, *status);
// Apply the skeleton mapping to the NFD normalized input string
// Accumulate the skeleton, possibly unnormalized, in a UnicodeString.
int32_t inputIndex = 0;
UnicodeString skelStr;
- while (inputIndex < normalizedLen) {
- UChar32 c;
- U16_NEXT(nfdInput, inputIndex, normalizedLen, c);
- int32_t replaceLen = This->confusableLookup(c, tableMask, buf);
- skelStr.append(buf, replaceLen);
- }
-
- if (nfdInput != nfdStackBuf) {
- uprv_free(nfdInput);
- }
-
- const UChar *result = skelStr.getBuffer();
- int32_t resultLen = skelStr.length();
- UChar *normedResult = NULL;
-
- // Check the skeleton for NFD, normalize it if needed.
- // Unnormalized results should be very rare.
- if (!unorm_isNormalized(result, resultLen, UNORM_NFD, status)) {
- normalizedLen = unorm_normalize(result, resultLen, UNORM_NFD, 0, NULL, 0, status);
- normedResult = static_cast<UChar *>(uprv_malloc((normalizedLen+1)*sizeof(UChar)));
- if (normedResult == NULL) {
- *status = U_MEMORY_ALLOCATION_ERROR;
- return 0;
- }
- *status = U_ZERO_ERROR;
- unorm_normalize(result, resultLen, UNORM_NFD, 0, normedResult, normalizedLen+1, status);
- result = normedResult;
- resultLen = normalizedLen;
- }
-
- // Copy the skeleton to the caller's buffer
- if (U_SUCCESS(*status)) {
- if (destCapacity == 0 || resultLen > destCapacity) {
- *status = resultLen>destCapacity ? U_BUFFER_OVERFLOW_ERROR : U_STRING_NOT_TERMINATED_WARNING;
- } else {
- u_memcpy(dest, result, resultLen);
- if (destCapacity > resultLen) {
- dest[resultLen] = 0;
- } else {
- *status = U_STRING_NOT_TERMINATED_WARNING;
- }
- }
- }
- uprv_free(normedResult);
- return resultLen;
-}
-
-
-
-U_I18N_API UnicodeString & U_EXPORT2
-uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
- uint32_t type,
- const UnicodeString &s,
- UnicodeString &dest,
- UErrorCode *status) {
- if (U_FAILURE(*status)) {
- return dest;
- }
- dest.remove();
-
- const UChar *str = s.getBuffer();
- int32_t strLen = s.length();
- UChar smallBuf[USPOOF_STACK_BUFFER_SIZE];
- UChar *buf = smallBuf;
- int32_t outputSize = uspoof_getSkeleton(sc, type, str, strLen, smallBuf, USPOOF_STACK_BUFFER_SIZE, status);
- if (*status == U_BUFFER_OVERFLOW_ERROR) {
- buf = static_cast<UChar *>(uprv_malloc((outputSize+1)*sizeof(UChar)));
- if (buf == NULL) {
- *status = U_MEMORY_ALLOCATION_ERROR;
- return dest;
- }
- *status = U_ZERO_ERROR;
- uspoof_getSkeleton(sc, type, str, strLen, buf, outputSize+1, status);
- }
- if (U_SUCCESS(*status)) {
- dest.setTo(buf, outputSize);
+ int32_t normalizedLen = nfdId.length();
+ for (inputIndex=0; inputIndex < normalizedLen; ) {
+ UChar32 c = nfdId.char32At(inputIndex);
+ inputIndex += U16_LENGTH(c);
+ This->confusableLookup(c, tableMask, skelStr);
}
- if (buf != smallBuf) {
- uprv_free(buf);
- }
+ gNfdNormalizer->normalize(skelStr, dest, *status);
return dest;
}
+// uspoof_getSkeletonUTF8
+//   UTF-8 flavor of the skeleton function. Converts `id` to a UnicodeString, delegates to
+//   uspoof_getSkeletonUnicodeString(), and converts the resulting skeleton back to UTF-8
+//   in the caller's buffer with u_strToUTF8().
+//   Returns 0 if the checker or the arguments are rejected or if computing the skeleton
+//   fails; otherwise returns the UTF-8 length reported by u_strToUTF8().
U_CAPI int32_t U_EXPORT2
uspoof_getSkeletonUTF8(const USpoofChecker *sc,
uint32_t type,
- const char *s, int32_t length,
+ const char *id, int32_t length,
char *dest, int32_t destCapacity,
UErrorCode *status) {
- // Lacking a UTF-8 normalization API, just converting the input to
- // UTF-16 seems as good an approach as any. In typical use, input will
- // be an identifier, which is to say not too long for stack buffers.
+ SpoofImpl::validateThis(sc, *status);
if (U_FAILURE(*status)) {
return 0;
}
- // Buffers for the UChar form of the input and skeleton strings.
- UChar smallInBuf[USPOOF_STACK_BUFFER_SIZE];
- UChar *inBuf = smallInBuf;
- UChar smallOutBuf[USPOOF_STACK_BUFFER_SIZE];
- UChar *outBuf = smallOutBuf;
-
- int32_t lengthInUChars = 0;
- int32_t skelLengthInUChars = 0;
- int32_t skelLengthInUTF8 = 0;
-
- u_strFromUTF8(inBuf, USPOOF_STACK_BUFFER_SIZE, &lengthInUChars,
- s, length, status);
- if (*status == U_BUFFER_OVERFLOW_ERROR) {
- inBuf = static_cast<UChar *>(uprv_malloc((lengthInUChars+1)*sizeof(UChar)));
- if (inBuf == NULL) {
- *status = U_MEMORY_ALLOCATION_ERROR;
- goto cleanup;
- }
- *status = U_ZERO_ERROR;
- u_strFromUTF8(inBuf, lengthInUChars+1, &lengthInUChars,
- s, length, status);
- }
-
- skelLengthInUChars = uspoof_getSkeleton(sc, type, inBuf, lengthInUChars,
- outBuf, USPOOF_STACK_BUFFER_SIZE, status);
- if (*status == U_BUFFER_OVERFLOW_ERROR) {
- outBuf = static_cast<UChar *>(uprv_malloc((skelLengthInUChars+1)*sizeof(UChar)));
- if (outBuf == NULL) {
- *status = U_MEMORY_ALLOCATION_ERROR;
- goto cleanup;
- }
- *status = U_ZERO_ERROR;
- skelLengthInUChars = uspoof_getSkeleton(sc, type, inBuf, lengthInUChars,
- outBuf, skelLengthInUChars+1, status);
+ if (length<-1 || destCapacity<0 || (destCapacity==0 && dest!=NULL)) {
+ *status = U_ILLEGAL_ARGUMENT_ERROR;
+ return 0;
}
- u_strToUTF8(dest, destCapacity, &skelLengthInUTF8,
- outBuf, skelLengthInUChars, status);
-
- cleanup:
- if (inBuf != smallInBuf) {
- uprv_free(inBuf);
- }
- if (outBuf != smallOutBuf) {
- uprv_free(outBuf);
+ // After the check above, length is either -1 or >= 0; -1 means the length is taken from strlen.
+ // Cast the strlen() result so both arms of the ?: are int32_t and no implicit
+ // size_t -> int32_t narrowing happens when the StringPiece is built.
+ UnicodeString srcStr = UnicodeString::fromUTF8(
+ StringPiece(id, length>=0 ? length : static_cast<int32_t>(uprv_strlen(id))));
+ UnicodeString destStr;
+ uspoof_getSkeletonUnicodeString(sc, type, srcStr, destStr, status);
+ if (U_FAILURE(*status)) {
+ return 0;
}
- return skelLengthInUTF8;
+
+ int32_t lengthInUTF8 = 0;
+ u_strToUTF8(dest, destCapacity, &lengthInUTF8,
+ destStr.getBuffer(), destStr.length(), status);
+ return lengthInUTF8;
}
return dataSize;
}
-#endif
+// Returns gInclusionSet as a C-API USet. initializeStatics() is called first;
+// the (unnamed) UErrorCode parameter is not used.
+U_CAPI const USet * U_EXPORT2
+uspoof_getInclusionSet(UErrorCode *) {
+    initializeStatics();
+    const UnicodeSet *inclusion = gInclusionSet;
+    return inclusion->toUSet();
+}
+
+// Returns gRecommendedSet as a C-API USet. initializeStatics() is called first;
+// the (unnamed) UErrorCode parameter is not used.
+U_CAPI const USet * U_EXPORT2
+uspoof_getRecommendedSet(UErrorCode *) {
+    initializeStatics();
+    const UnicodeSet *recommended = gRecommendedSet;
+    return recommended->toUSet();
+}
+
+// C++ flavor: returns the gInclusionSet pointer itself. initializeStatics() is called
+// first; the (unnamed) UErrorCode parameter is not used.
+U_I18N_API const UnicodeSet * U_EXPORT2
+uspoof_getInclusionUnicodeSet(UErrorCode *) {
+    initializeStatics();
+    const UnicodeSet *inclusion = gInclusionSet;
+    return inclusion;
+}
+
+// C++ flavor: returns the gRecommendedSet pointer itself. initializeStatics() is called
+// first; the (unnamed) UErrorCode parameter is not used.
+U_I18N_API const UnicodeSet * U_EXPORT2
+uspoof_getRecommendedUnicodeSet(UErrorCode *) {
+    initializeStatics();
+    const UnicodeSet *recommended = gRecommendedSet;
+    return recommended;
+}
+
+
+
+#endif // !UCONFIG_NO_NORMALIZATION
/*
**********************************************************************
-* Copyright (C) 2008-2011, International Business Machines
+* Copyright (C) 2008-2013, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
#include "unicode/utypes.h"
#include "unicode/uspoof.h"
-#include "unicode/unorm.h"
#include "unicode/uchar.h"
#include "unicode/uniset.h"
#include "unicode/utf16.h"
#include "utrie2.h"
#include "cmemory.h"
#include "cstring.h"
+#include "identifier_info.h"
+#include "scriptset.h"
#include "udatamem.h"
#include "umutex.h"
#include "udataswp.h"
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)
+// Constructs a SpoofImpl around the supplied SpoofData.
+//   On entry with a failing status nothing beyond the member initializers is done.
+//   On allocation failure status is set to U_MEMORY_ALLOCATION_ERROR.
+//   fMagic starts as 0 and is set to USPOOF_MAGIC only as the last step, so it is
+//   left at 0 on both of the early-return paths above.
SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode &status) :
- fMagic(0), fSpoofData(NULL), fAllowedCharsSet(NULL) , fAllowedLocales(uprv_strdup("")) {
+ fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
+ fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
if (U_FAILURE(status)) {
return;
}
- fMagic = USPOOF_MAGIC;
fSpoofData = data;
- fChecks = USPOOF_ALL_CHECKS;
+ fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
+
UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
- if (allowedCharsSet == NULL || fAllowedLocales == NULL) {
+ fAllowedCharsSet = allowedCharsSet;
+ fAllowedLocales = uprv_strdup("");
+ if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
- allowedCharsSet->freeze();
- fAllowedCharsSet = allowedCharsSet;
+ // Freeze only after the NULL check above, so a failed allocation is never dereferenced.
+ allowedCharsSet->freeze();
+ fMagic = USPOOF_MAGIC;
}
-SpoofImpl::SpoofImpl() {
- fMagic = USPOOF_MAGIC;
- fSpoofData = NULL;
- fChecks = USPOOF_ALL_CHECKS;
+// Default constructor: all checks enabled, no spoof data, allowed characters set to the
+// frozen range 0..0x10ffff, empty allowed-locales string, and a restriction level of
+// USPOOF_HIGHLY_RESTRICTIVE.
+// NOTE(review): the result of `new UnicodeSet` is passed to freeze() without a NULL
+// check, and this constructor has no UErrorCode through which to report a failure.
+SpoofImpl::SpoofImpl() :
+ fMagic(USPOOF_MAGIC), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
+ fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
allowedCharsSet->freeze();
fAllowedCharsSet = allowedCharsSet;
fAllowedLocales = uprv_strdup("");
+ fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
}
// Copy Constructor, used by the user level clone() function.
SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) :
- fMagic(0), fSpoofData(NULL), fAllowedCharsSet(NULL) {
+ fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
+ fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
if (U_FAILURE(status)) {
return;
}
status = U_MEMORY_ALLOCATION_ERROR;
}
fAllowedLocales = uprv_strdup(src.fAllowedLocales);
+ fRestrictionLevel = src.fRestrictionLevel;
}
SpoofImpl::~SpoofImpl() {
}
delete fAllowedCharsSet;
uprv_free((void *)fAllowedLocales);
+ delete fCachedIdentifierInfo;
}
//
// implementation.
//
// Given a source character, produce the corresponding
-// replacement character(s)
+// replacement character(s), appending them to the dest string.
//
//---------------------------------------------------------------------------------------
-int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UChar *destBuf) const {
+int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UnicodeString &dest) const {
// Binary search the spoof data key table for the inChar
int32_t *low = fSpoofData->fCFUKeys;
if (inChar != midc) {
// Char not found. It maps to itself.
int i = 0;
- U16_APPEND_UNSAFE(destBuf, i, inChar)
+ dest.append(inChar);
return i;
}
foundChar:
// No key entry for this char & table.
// The input char maps to itself.
int i = 0;
- U16_APPEND_UNSAFE(destBuf, i, inChar)
+ dest.append(inChar);
return i;
}
// an index into the string table (for longer strings)
uint16_t value = fSpoofData->fCFUValues[keyTableIndex];
if (stringLen == 1) {
- destBuf[0] = value;
+ dest.append((UChar)value);
return 1;
}
U_ASSERT(value + stringLen <= fSpoofData->fRawData->fCFUStringTableLen);
UChar *src = &fSpoofData->fCFUStrings[value];
- for (ix=0; ix<stringLen; ix++) {
- destBuf[ix] = src[ix];
- }
+ dest.append(src, stringLen);
return stringLen;
}
//
//---------------------------------------------------------------------------------------
void SpoofImpl::wholeScriptCheck(
- const UChar *text, int32_t length, ScriptSet *result, UErrorCode &status) const {
-
- int32_t inputIdx = 0;
- UChar32 c;
+ const UnicodeString &text, ScriptSet *result, UErrorCode &status) const {
UTrie2 *table =
(fChecks & USPOOF_ANY_CASE) ? fSpoofData->fAnyCaseTrie : fSpoofData->fLowerCaseTrie;
result->setAll();
- while (inputIdx < length) {
- U16_NEXT(text, inputIdx, length, c);
+ int32_t length = text.length();
+ for (int32_t inputIdx=0; inputIdx < length;) {
+ UChar32 c = text.char32At(inputIdx);
+ inputIdx += U16_LENGTH(c);
uint32_t index = utrie2_get32(table, c);
if (index == 0) {
// No confusables in another script for this char.
// Until then, grab the script from the char and intersect it with the set.
UScriptCode cpScript = uscript_getScript(c, &status);
U_ASSERT(cpScript > USCRIPT_INHERITED);
- result->intersect(cpScript);
+ result->intersect(cpScript, status);
} else if (index == 1) {
// Script == Common or Inherited. Nothing to do.
} else {
}
-int32_t SpoofImpl::scriptScan
- (const UChar *text, int32_t length, int32_t &pos, UErrorCode &status) const {
- if (U_FAILURE(status)) {
- return 0;
- }
- int32_t inputIdx = 0;
- UChar32 c;
- int32_t scriptCount = 0;
- UScriptCode lastScript = USCRIPT_INVALID_CODE;
- UScriptCode sc = USCRIPT_INVALID_CODE;
- while ((inputIdx < length || length == -1) && scriptCount < 2) {
- U16_NEXT(text, inputIdx, length, c);
- if (c == 0 && length == -1) {
- break;
- }
- sc = uscript_getScript(c, &status);
- if (sc == USCRIPT_COMMON || sc == USCRIPT_INHERITED || sc == USCRIPT_UNKNOWN) {
- continue;
- }
-
- // Temporary fix: fold Japanese Hiragana and Katakana into Han.
- // Names are allowed to mix these scripts.
- // A more general solution will follow later for characters that are
- // used with multiple scripts.
-
- if (sc == USCRIPT_HIRAGANA || sc == USCRIPT_KATAKANA || sc == USCRIPT_HANGUL) {
- sc = USCRIPT_HAN;
- }
-
- if (sc != lastScript) {
- scriptCount++;
- lastScript = sc;
- }
- }
- if (scriptCount == 2) {
- pos = inputIdx;
- }
- return scriptCount;
-}
-
-
// Convert a text format hex number. Utility function used by builder code. Static.
// Input: UChar *string text. Output: a UChar32
// Input has been pre-checked, and will have no non-hex chars.
return (UChar32)val;
}
+// IdentifierInfo Cache. IdentifierInfo objects are somewhat expensive to create.
+// Maintain a one-element cache, which is sufficient to avoid repeatedly
+// creating new ones unless we get multi-thread concurrency in spoof
+// check operations, which should be statistically uncommon.
+
+// These functions are used in place of new & delete of an IdentifierInfo.
+// They will recycle the IdentifierInfo when possible.
+// They are logically const, and used within const functions that must be thread safe.
+// Hands out an IdentifierInfo: the cached one if the cache slot is occupied, otherwise
+// a newly allocated one. Returns NULL if status is already a failure on entry, if
+// allocation fails (status becomes U_MEMORY_ALLOCATION_ERROR), or if the IdentifierInfo
+// constructor reports a failure through status.
+IdentifierInfo *SpoofImpl::getIdentifierInfo(UErrorCode &status) const {
+    if (U_FAILURE(status)) {
+        return NULL;
+    }
+
+    // Take whatever is in the cache slot and leave the slot empty, under the mutex.
+    SpoofImpl *self = const_cast<SpoofImpl *>(this);
+    IdentifierInfo *info = NULL;
+    {
+        Mutex lock;
+        info = self->fCachedIdentifierInfo;
+        self->fCachedIdentifierInfo = NULL;
+    }
+    if (info != NULL) {
+        return info;
+    }
+
+    // The slot was empty: create a fresh object.
+    info = new IdentifierInfo(status);
+    if (info == NULL) {
+        if (U_SUCCESS(status)) {
+            status = U_MEMORY_ALLOCATION_ERROR;
+        }
+        return NULL;
+    }
+    if (U_FAILURE(status)) {
+        // Construction reported an error; do not hand out the object.
+        delete info;
+        return NULL;
+    }
+    return info;
+}
+
+
+// Gives back an IdentifierInfo obtained from getIdentifierInfo().
+// A NULL argument is ignored. If the cache slot is empty the object is stored there
+// for reuse; otherwise it is deleted.
+void SpoofImpl::releaseIdentifierInfo(IdentifierInfo *idInfo) const {
+    if (idInfo == NULL) {
+        return;
+    }
+    SpoofImpl *nonConstThis = const_cast<SpoofImpl *>(this);
+    {
+        Mutex m;
+        if (nonConstThis->fCachedIdentifierInfo == NULL) {
+            nonConstThis->fCachedIdentifierInfo = idInfo;
+            idInfo = NULL;
+        }
+    }
+    // idInfo is NULL here if it was moved into the cache, so this delete is then a no-op.
+    delete idInfo;
+}
+
+
//----------------------------------------------------------------------------------------------
}
-//----------------------------------------------------------------------------
-//
-// ScriptSet implementation
-//
-//----------------------------------------------------------------------------
-ScriptSet::ScriptSet() {
- for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
- bits[i] = 0;
- }
-}
-
-ScriptSet::~ScriptSet() {
-}
-
-UBool ScriptSet::operator == (const ScriptSet &other) {
- for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
- if (bits[i] != other.bits[i]) {
- return FALSE;
- }
- }
- return TRUE;
-}
-
-void ScriptSet::Union(UScriptCode script) {
- uint32_t index = script / 32;
- uint32_t bit = 1 << (script & 31);
- U_ASSERT(index < sizeof(bits)*4);
- bits[index] |= bit;
-}
-
-
-void ScriptSet::Union(const ScriptSet &other) {
- for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
- bits[i] |= other.bits[i];
- }
-}
-
-void ScriptSet::intersect(const ScriptSet &other) {
- for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
- bits[i] &= other.bits[i];
- }
-}
-
-void ScriptSet::intersect(UScriptCode script) {
- uint32_t index = script / 32;
- uint32_t bit = 1 << (script & 31);
- U_ASSERT(index < sizeof(bits)*4);
- uint32_t i;
- for (i=0; i<index; i++) {
- bits[i] = 0;
- }
- bits[index] &= bit;
- for (i=index+1; i<sizeof(bits)/sizeof(uint32_t); i++) {
- bits[i] = 0;
- }
-}
-
-
-ScriptSet & ScriptSet::operator =(const ScriptSet &other) {
- for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
- bits[i] = other.bits[i];
- }
- return *this;
-}
-
-
-void ScriptSet::setAll() {
- for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
- bits[i] = 0xffffffffu;
- }
-}
-
-
-void ScriptSet::resetAll() {
- for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
- bits[i] = 0;
- }
-}
-
-int32_t ScriptSet::countMembers() {
- // This bit counter is good for sparse numbers of '1's, which is
- // very much the case that we will usually have.
- int32_t count = 0;
- for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
- uint32_t x = bits[i];
- while (x > 0) {
- count++;
- x &= (x - 1); // and off the least significant one bit.
- }
- }
- return count;
-}
-
-
-
-//-----------------------------------------------------------------------------
-//
-// NFDBuffer Implementation.
-//
-//-----------------------------------------------------------------------------
-
-NFDBuffer::NFDBuffer(const UChar *text, int32_t length, UErrorCode &status) {
- fNormalizedText = NULL;
- fNormalizedTextLength = 0;
- fOriginalText = text;
- if (U_FAILURE(status)) {
- return;
- }
- fNormalizedText = fSmallBuf;
- fNormalizedTextLength = unorm_normalize(
- text, length, UNORM_NFD, 0, fNormalizedText, USPOOF_STACK_BUFFER_SIZE, &status);
- if (status == U_BUFFER_OVERFLOW_ERROR) {
- status = U_ZERO_ERROR;
- fNormalizedText = (UChar *)uprv_malloc((fNormalizedTextLength+1)*sizeof(UChar));
- if (fNormalizedText == NULL) {
- status = U_MEMORY_ALLOCATION_ERROR;
- } else {
- fNormalizedTextLength = unorm_normalize(text, length, UNORM_NFD, 0,
- fNormalizedText, fNormalizedTextLength+1, &status);
- }
- }
-}
-
-
-NFDBuffer::~NFDBuffer() {
- if (fNormalizedText != fSmallBuf) {
- uprv_free(fNormalizedText);
- }
- fNormalizedText = 0;
-}
-
-const UChar *NFDBuffer::getBuffer() {
- return fNormalizedText;
-}
-
-int32_t NFDBuffer::getLength() {
- return fNormalizedTextLength;
-}
-
-
-
-
-
U_NAMESPACE_END
U_NAMESPACE_USE
/*
***************************************************************************
-* Copyright (C) 2008-2011, International Business Machines Corporation
+* Copyright (C) 2008-2013, International Business Machines Corporation
* and others. All Rights Reserved.
***************************************************************************
*
#include "unicode/utypes.h"
#include "unicode/uspoof.h"
-#include "utrie2.h"
#include "unicode/uscript.h"
#include "unicode/udata.h"
+#include "utrie2.h"
#if !UCONFIG_NO_NORMALIZATION
// Magic number for sanity checking spoof data.
#define USPOOF_MAGIC 0x3845fdef
+class IdentifierInfo;
+class ScriptSet;
class SpoofData;
struct SpoofDataHeader;
struct SpoofStringLengthsElement;
-class ScriptSet;
/**
* Class SpoofImpl corresponds directly to the plain C API opaque type
* One of USPOOF_SL_TABLE_FLAG, USPOOF_MA_TABLE_FLAG, etc.
* @return The length in UTF-16 code units of the substition string.
*/
- int32_t confusableLookup(UChar32 inChar, int32_t tableMask, UChar *destBuf) const;
+ int32_t confusableLookup(UChar32 inChar, int32_t tableMask, UnicodeString &destBuf) const;
/** Set and Get AllowedLocales, implementations of the corresponding API */
void setAllowedLocales(const char *localesList, UErrorCode &status);
// Return the test bit flag to be ORed into the eventual user return value
// if a Spoof opportunity is detected.
void wholeScriptCheck(
- const UChar *text, int32_t length, ScriptSet *result, UErrorCode &status) const;
+ const UnicodeString &text, ScriptSet *result, UErrorCode &status) const;
- /** Scan a string to determine how many scripts it includes.
- * Ignore characters with script=Common and scirpt=Inherited.
- * @param text The UChar text to be scanned
- * @param length The length of the input text, -1 for nul termintated.
- * @param pos An out parameter, set to the first input postion at which
- * a second script was encountered, ignoring Common and Inherited.
- * @param status For errors.
- * @return the number of (non-common,inherited) scripts encountered,
- * clipped to a max of two.
- */
- int32_t scriptScan(const UChar *text, int32_t length, int32_t &pos, UErrorCode &status) const;
-
static UClassID U_EXPORT2 getStaticClassID(void);
virtual UClassID getDynamicClassID(void) const;
+ // IdentifierInfo Cache. IdentifierInfo objects are somewhat expensive to create.
+ // Maintain a one-element cache, which is sufficient to avoid repeatedly
+ // creating new ones unless we get multi-thread concurrency in spoof
+ // check operations, which should be statistically uncommon.
+ IdentifierInfo *getIdentifierInfo(UErrorCode &status) const;
+ void releaseIdentifierInfo(IdentifierInfo *idInfo) const;
+
//
// Data Members
//
// for this Spoof Checker. Defaults to all chars.
const char *fAllowedLocales; // The list of allowed locales.
+ URestrictionLevel fRestrictionLevel; // The maximum restriction level for an acceptable identifier.
+
+ IdentifierInfo *fCachedIdentifierInfo; // Do not use directly. See getIdentifierInfo().
};
};
-//-------------------------------------------------------------------------------
-//
-// ScriptSet - Wrapper class for the Script code bit sets that are part of the
-// whole script confusable data.
-//
-// This class is used both at data build and at run time.
-// The constructor is only used at build time.
-// At run time, just point at the prebuilt data and go.
-//
-//-------------------------------------------------------------------------------
-class ScriptSet: public UMemory {
- public:
- ScriptSet();
- ~ScriptSet();
-
- UBool operator == (const ScriptSet &other);
- ScriptSet & operator = (const ScriptSet &other);
-
- void Union(const ScriptSet &other);
- void Union(UScriptCode script);
- void intersect(const ScriptSet &other);
- void intersect(UScriptCode script);
- void setAll();
- void resetAll();
- int32_t countMembers();
-
- private:
- uint32_t bits[6];
-};
-
-
-
-
-//-------------------------------------------------------------------------------
-//
-// NFDBuffer A little class to handle the NFD normalization that is
-// needed on incoming identifiers to be checked.
-// Takes care of buffer handling and normalization
-//
-// Instances of this class are intended to be stack-allocated.
-//
-// TODO: how to map position offsets back to user values?
-//
-//--------------------------------------------------------------------------------
-class NFDBuffer: public UMemory {
-public:
- NFDBuffer(const UChar *text, int32_t length, UErrorCode &status);
- ~NFDBuffer();
- const UChar *getBuffer();
- int32_t getLength();
-
- private:
- const UChar *fOriginalText;
- UChar *fNormalizedText;
- int32_t fNormalizedTextLength;
- UChar fSmallBuf[USPOOF_STACK_BUFFER_SIZE];
-};
-
-
-
-
//-------------------------------------------------------------------------------------
//
/*
******************************************************************************
*
-* Copyright (C) 2008-2012, International Business Machines
+* Copyright (C) 2008-2013, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
#include "unicode/uregex.h"
#include "unicode/ustring.h"
#include "cmemory.h"
+#include "scriptset.h"
#include "uspoof_impl.h"
#include "uhash.h"
#include "uvector.h"
scriptSets->addElement(bsset, status);
utrie2_set32(table, cp, setIndex, &status);
}
- bsset->sset->Union(targScript);
- bsset->sset->Union(srcScript);
+ bsset->sset->set(targScript, status);
+ bsset->sset->set(srcScript, status);
if (U_FAILURE(status)) {
goto cleanup;
/********************************************************************
* COPYRIGHT:
- * Copyright (c) 2009-2012, International Business Machines Corporation and
+ * Copyright (c) 2009-2013, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/********************************************************************************
TEST_ASSERT_SUCCESS(status);
uset_close(tmpSet);
- /* Latin Identifier should now fail; other non-latin test cases should still be OK */
+ /* Latin Identifier should now fail; other non-latin test cases should still be OK
+ * Note: fail of CHAR_LIMIT also causes the restriction level to be USPOOF_UNRESTRICTIVE
+ * which will give us a USPOOF_RESTRICTION_LEVEL failure.
+ */
checkResults = uspoof_check(sc, goodLatin, -1, NULL, &status);
TEST_ASSERT_SUCCESS(status);
- TEST_ASSERT_EQ(USPOOF_CHAR_LIMIT, checkResults);
+ TEST_ASSERT_EQ(USPOOF_CHAR_LIMIT | USPOOF_RESTRICTION_LEVEL, checkResults);
checkResults = uspoof_check(sc, goodGreek, -1, NULL, &status);
TEST_ASSERT_SUCCESS(status);
checkResults = uspoof_checkUTF8(sc, utf8buf, -1, &position, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_EQ(0, checkResults);
- TEST_ASSERT_EQ(666, position);
+ TEST_ASSERT_EQ(0, position);
u_strToUTF8(utf8buf, sizeof(utf8buf), NULL, goodCyrl, -1, &status);
TEST_ASSERT_SUCCESS(status);
checkResults = uspoof_checkUTF8(sc, utf8buf, -1, &position, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_SINGLE_SCRIPT , checkResults);
- TEST_ASSERT_EQ(2, position);
+ TEST_ASSERT_EQ(0, position);
TEST_TEARDOWN;
/*
**********************************************************************
-* Copyright (C) 2011, International Business Machines Corporation
+* Copyright (C) 2011-2013, International Business Machines Corporation
* and others. All Rights Reserved.
**********************************************************************
*/
#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION && !UCONFIG_NO_FILE_IO
#include "itspoof.h"
-#include "unicode/uspoof.h"
-#include "unicode/unistr.h"
-#include "unicode/regex.h"
+
#include "unicode/normlzr.h"
+#include "unicode/regex.h"
+#include "unicode/unistr.h"
+#include "unicode/uscript.h"
+#include "unicode/uspoof.h"
+
#include "cstring.h"
+#include "identifier_info.h"
+#include "scriptset.h"
+#include "uhash.h"
+
#include <stdlib.h>
#include <stdio.h>
#define TEST_ASSERT(expr) {if ((expr)==FALSE) { \
errln("Test Failure at file %s, line %d: \"%s\" is false.\n", __FILE__, __LINE__, #expr);};}
+// TEST_ASSERT_MSG: like TEST_ASSERT, but with an extra caller-supplied message.
+//   Reports a failure through errln() when expr compares equal to FALSE.
+//   msg is printed with "%s" between the line number and the stringified
+//   expression, so it must be a C string (e.g. "testNum = 3").
+#define TEST_ASSERT_MSG(expr, msg) {if ((expr)==FALSE) { \
+ errln("Test Failure at file %s, line %d, %s: \"%s\" is false.\n", __FILE__, __LINE__, msg, #expr);};}
+
#define TEST_ASSERT_EQ(a, b) { if ((a) != (b)) { \
errln("Test Failure at file %s, line %d: \"%s\" (%d) != \"%s\" (%d) \n", \
__FILE__, __LINE__, #a, (a), #b, (b)); }}
errln("Test Failure at file %s, line %d: \"%s\" (%d) == \"%s\" (%d) \n", \
__FILE__, __LINE__, #a, (a), #b, (b)); }}
+// LENGTHOF: element count of an array, as an int32_t.
+//   Computed from sizeof, so the argument must be a real array, not a pointer.
+#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
+
/*
* TEST_SETUP and TEST_TEARDOWN
* macros to handle the boilerplate around setting up test case.
testSpoofAPI();
}
break;
- case 1:
+ case 1:
name = "TestSkeleton";
if (exec) {
testSkeleton();
}
break;
- case 2:
+ case 2:
name = "TestAreConfusable";
if (exec) {
testAreConfusable();
}
break;
- case 3:
+ case 3:
name = "TestInvisible";
if (exec) {
testInvisible();
}
break;
- case 4:
+ case 4:
name = "testConfData";
if (exec) {
testConfData();
}
break;
- case 5:
+ case 5:
name = "testBug8654";
if (exec) {
testBug8654();
}
break;
- default: name=""; break;
+ case 6:
+ name = "testIdentifierInfo";
+ if (exec) {
+ testIdentifierInfo();
+ }
+ break;
+ case 7:
+ name = "testScriptSet";
+ if (exec) {
+ testScriptSet();
+ }
+ break;
+ case 8:
+ name = "testRestrictionLevel";
+ if (exec) {
+ testRestrictionLevel();
+ }
+ break;
+ case 9:
+ name = "testMixedNumbers";
+ if (exec) {
+ testMixedNumbers();
+ }
+ break;
+
+
+ default: name=""; break;
}
}
int32_t checkResults = uspoof_checkUnicodeString(sc, s, &position, &status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT_EQ(0, checkResults);
- TEST_ASSERT_EQ(666, position);
+ TEST_ASSERT_EQ(0, position);
TEST_TEARDOWN;
TEST_SETUP
int32_t position = -42;
TEST_ASSERT_EQ(0, uspoof_checkUnicodeString(sc, s, &position, &status));
TEST_ASSERT_SUCCESS(status);
- TEST_ASSERT(position == -42);
+ TEST_ASSERT(0 == position);
UnicodeString s2 = UnicodeString("abcd\\u0301\\u0302\\u0301ef").unescape();
TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s2, &position, &status));
TEST_ASSERT_SUCCESS(status);
- TEST_ASSERT_EQ(7, position);
+ TEST_ASSERT_EQ(0, position);
// Two acute accents, one from the composed a with acute accent, \u00e1,
// and one separate.
UnicodeString s3 = UnicodeString("abcd\\u00e1\\u0301xyz").unescape();
TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s3, &position, &status));
TEST_ASSERT_SUCCESS(status);
- TEST_ASSERT_EQ(7, position);
+ TEST_ASSERT_EQ(0, position);
TEST_TEARDOWN;
}
int32_t position = -42;
TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s, &position, &status) & USPOOF_INVISIBLE );
TEST_ASSERT_SUCCESS(status);
- TEST_ASSERT_EQ(3, position);
+ TEST_ASSERT_EQ(0, position);
TEST_TEARDOWN;
}
}
#endif // UCONFIG_NO_REGULAR_EXPRESSIONS
+// testIdentifierInfo.  Note that IdentifierInfo is not public ICU API at this time.
+//
+//   The test has three parts:
+//     1. ScriptSet basics that IdentifierInfo relies on: containment, the
+//        uhash comparison function, and the displayScripts()/parseScripts()
+//        round trip, plus IdentifierInfo::displayAlternates().
+//     2. A data-driven table checking, per identifier, the restriction level,
+//        numerics, scripts, alternates and common-among-alternates.
+//     3. A data-driven table checking getScriptCount().
+//
+//   Every UErrorCode is asserted before it is reset, so that a failure inside
+//   the code under test is reported instead of being silently overwritten.
+void IntlTestSpoof::testIdentifierInfo() {
+    UErrorCode status = U_ZERO_ERROR;
+    ScriptSet bitset12; bitset12.set(USCRIPT_LATIN, status).set(USCRIPT_HANGUL, status);
+    ScriptSet bitset2;  bitset2.set(USCRIPT_HANGUL, status);
+    TEST_ASSERT(bitset12.contains(bitset2));
+    TEST_ASSERT(bitset12.contains(bitset12));
+    TEST_ASSERT(!bitset2.contains(bitset12));
+
+    ScriptSet arabSet;  arabSet.set(USCRIPT_ARABIC, status);
+    ScriptSet latinSet; latinSet.set(USCRIPT_LATIN, status);
+    UElement arabEl;  arabEl.pointer = &arabSet;
+    UElement latinEl; latinEl.pointer = &latinSet;
+    TEST_ASSERT(uhash_compareScriptSet(arabEl, latinEl) < 0);
+    TEST_ASSERT(uhash_compareScriptSet(latinEl, arabEl) > 0);
+
+    UnicodeString scriptString;
+    bitset12.displayScripts(scriptString);
+    TEST_ASSERT(UNICODE_STRING_SIMPLE("Hang Latn") == scriptString);
+    // Report any error from the set() calls above before status is reset.
+    TEST_ASSERT_SUCCESS(status);
+
+    status = U_ZERO_ERROR;
+    UHashtable *alternates = uhash_open(uhash_hashScriptSet ,uhash_compareScriptSet, NULL, &status);
+    uhash_puti(alternates, &bitset12, 1, &status);
+    uhash_puti(alternates, &bitset2, 1, &status);
+    UnicodeString alternatesString;
+    IdentifierInfo::displayAlternates(alternatesString, alternates, status);
+    TEST_ASSERT(UNICODE_STRING_SIMPLE("Hang; Hang Latn") == alternatesString);
+    TEST_ASSERT_SUCCESS(status);
+
+    status = U_ZERO_ERROR;
+    ScriptSet tScriptSet;
+    tScriptSet.parseScripts(scriptString, status);
+    TEST_ASSERT_SUCCESS(status);
+    TEST_ASSERT(bitset12 == tScriptSet);
+    uhash_close(alternates);
+
+    struct Test {
+        const char         *fTestString;
+        URestrictionLevel   fRestrictionLevel;
+        const char         *fNumerics;
+        const char         *fScripts;
+        const char         *fAlternates;
+        const char         *fCommonAlternates;
+    } tests[] = {
+        {"\\u0061\\u2665", USPOOF_UNRESTRICTIVE, "[]", "Latn", "", ""},
+        {"\\u0061\\u3006", USPOOF_HIGHLY_RESTRICTIVE, "[]", "Latn", "Hani Hira Kana", "Hani Hira Kana"},
+        {"\\u0061\\u30FC\\u3006", USPOOF_HIGHLY_RESTRICTIVE, "[]", "Latn", "Hira Kana", "Hira Kana"},
+        {"\\u0061\\u30FC\\u3006\\u30A2", USPOOF_HIGHLY_RESTRICTIVE, "[]", "Latn Kana", "", ""},
+        {"\\u30A2\\u0061\\u30FC\\u3006", USPOOF_HIGHLY_RESTRICTIVE, "[]", "Latn Kana", "", ""},
+        {"\\u0061\\u0031\\u0661", USPOOF_UNRESTRICTIVE, "[\\u0030\\u0660]", "Latn", "Arab Thaa", "Arab Thaa"},
+        {"\\u0061\\u0031\\u0661\\u06F1", USPOOF_UNRESTRICTIVE, "[\\u0030\\u0660\\u06F0]", "Latn Arab", "", ""},
+        {"\\u0661\\u30FC\\u3006\\u0061\\u30A2\\u0031\\u0967\\u06F1", USPOOF_UNRESTRICTIVE,
+              "[\\u0030\\u0660\\u06F0\\u0966]", "Latn Kana Arab Deva", "", ""},
+        {"\\u0061\\u30A2\\u30FC\\u3006\\u0031\\u0967\\u0661\\u06F1", USPOOF_UNRESTRICTIVE,
+              "[\\u0030\\u0660\\u06F0\\u0966]", "Latn Kana Arab Deva", "", ""}
+    };
+
+    // Fetch the recommended set once and validate it before it is dereferenced.
+    // NOTE(review): a NULL result on failure is assumed here - confirm against
+    // the uspoof_getRecommendedUnicodeSet() implementation.
+    status = U_ZERO_ERROR;
+    const UnicodeSet *recommendedSet = uspoof_getRecommendedUnicodeSet(&status);
+    TEST_ASSERT_SUCCESS(status);
+    TEST_ASSERT(recommendedSet != NULL);
+    // Without a usable profile the table below cannot run; skip it (the
+    // failure has already been reported) but still run the script-count part.
+    int32_t numTests = (U_SUCCESS(status) && recommendedSet != NULL) ? LENGTHOF(tests) : 0;
+
+    int testNum;
+    for (testNum = 0; testNum < numTests; testNum++) {
+        char testNumStr[40];
+        sprintf(testNumStr, "testNum = %d", testNum);
+        Test &test = tests[testNum];
+        status = U_ZERO_ERROR;
+        UnicodeString testString(test.fTestString);  // Note: may do charset conversion.
+        testString = testString.unescape();
+        IdentifierInfo idInfo(status);
+        TEST_ASSERT_SUCCESS(status);
+        idInfo.setIdentifierProfile(*recommendedSet);
+        idInfo.setIdentifier(testString, status);
+        TEST_ASSERT_SUCCESS(status);
+        TEST_ASSERT_MSG(*idInfo.getIdentifier() == testString, testNumStr);
+
+        URestrictionLevel restrictionLevel = test.fRestrictionLevel;
+        TEST_ASSERT_MSG(restrictionLevel == idInfo.getRestrictionLevel(status), testNumStr);
+        TEST_ASSERT_SUCCESS(status);
+
+        status = U_ZERO_ERROR;
+        UnicodeSet numerics(UnicodeString(test.fNumerics).unescape(), status);
+        TEST_ASSERT_SUCCESS(status);
+        TEST_ASSERT_MSG(numerics == *idInfo.getNumerics(), testNumStr);
+
+        ScriptSet scripts;
+        scripts.parseScripts(UnicodeString(test.fScripts), status);
+        TEST_ASSERT_MSG(scripts == *idInfo.getScripts(), testNumStr);
+
+        UnicodeString alternatesStr;
+        IdentifierInfo::displayAlternates(alternatesStr, idInfo.getAlternates(), status);
+        TEST_ASSERT_MSG(UnicodeString(test.fAlternates) == alternatesStr, testNumStr);
+
+        ScriptSet commonAlternates;
+        commonAlternates.parseScripts(UnicodeString(test.fCommonAlternates), status);
+        TEST_ASSERT_MSG(commonAlternates == *idInfo.getCommonAmongAlternates(), testNumStr);
+        // Covers parseScripts() / displayAlternates() above.
+        TEST_ASSERT_SUCCESS(status);
+    }
+
+    // Test of getScriptCount()
+    //   Script and or Script Extension for chars used in the tests
+    //     \\u3013  ; Bopo Hang Hani Hira Kana # So       GETA MARK
+    //     \\uA838  ; Deva Gujr Guru Kthi Takr # Sc       NORTH INDIC RUPEE MARK
+    //     \\u0951  ; Deva Latn                # Mn       DEVANAGARI STRESS SIGN UDATTA
+    //
+    //     \\u0370  ; Greek                    # L        GREEK CAPITAL LETTER HETA
+    //     \\u0481  ; Cyrillic                 # L&       CYRILLIC SMALL LETTER KOPPA
+    //     \\u0904  ; Devanagari               # Lo       DEVANAGARI LETTER SHORT A
+    //     \\u3041  ; Hiragana                 # Lo       HIRAGANA LETTER SMALL A
+    //     1234     ; Common                   #          ascii digits
+    //     \\u0300  ; Inherited                # Mn       COMBINING GRAVE ACCENT
+
+    struct ScriptTest {
+        const char *fTestString;
+        int32_t     fScriptCount;
+    } scriptTests[] = {
+        {"Hello", 1},
+        {"Hello\\u0370", 2},
+        {"1234", 0},
+        {"Hello1234\\u0300", 1},   // Common and Inherited are ignored.
+        {"\\u0030", 0},
+        {"abc\\u0951", 1},
+        {"abc\\u3013", 2},
+        {"\\uA838\\u0951", 1},     // Triggers commonAmongAlternates path.
+        {"\\u3013\\uA838", 2}
+    };
+
+    status = U_ZERO_ERROR;
+    IdentifierInfo identifierInfo(status);
+    TEST_ASSERT_SUCCESS(status);
+    for (testNum=0; testNum<LENGTHOF(scriptTests); testNum++) {
+        ScriptTest &test = scriptTests[testNum];
+        char msgBuf[100];
+        sprintf(msgBuf, "testNum = %d ", testNum);
+        UnicodeString testString = UnicodeString(test.fTestString).unescape();
+
+        status = U_ZERO_ERROR;
+        identifierInfo.setIdentifier(testString, status);
+        TEST_ASSERT_SUCCESS(status);
+        int32_t scriptCount = identifierInfo.getScriptCount();
+        TEST_ASSERT_MSG(test.fScriptCount == scriptCount, msgBuf);
+    }
+}
+
+// testScriptSet.  Unit test of the (internal) ScriptSet class:
+//   equality, set/reset/test of single scripts, setAll/resetAll,
+//   contains/intersects, assignment, intersect, countMembers and
+//   iteration with nextSetBit().
+//
+//   status is asserted at the end of every section, before it is reset,
+//   so an error raised by set()/reset()/test() cannot be lost.
+void IntlTestSpoof::testScriptSet() {
+    ScriptSet s1;
+    ScriptSet s2;
+    UErrorCode status = U_ZERO_ERROR;
+
+    // Two default-constructed sets compare equal; setting one bit breaks equality.
+    TEST_ASSERT(s1 == s2);
+    s1.set(USCRIPT_ARABIC,status);
+    TEST_ASSERT_SUCCESS(status);
+    TEST_ASSERT(!(s1 == s2));
+    TEST_ASSERT(s1.test(USCRIPT_ARABIC, status));
+    TEST_ASSERT(s1.test(USCRIPT_GREEK, status) == FALSE);
+    TEST_ASSERT_SUCCESS(status);
+
+    status = U_ZERO_ERROR;
+    s1.reset(USCRIPT_ARABIC, status);
+    TEST_ASSERT(s1 == s2);
+    TEST_ASSERT_SUCCESS(status);
+
+    // setAll() / resetAll(), probed at both ends of the script code range.
+    status = U_ZERO_ERROR;
+    s1.setAll();
+    TEST_ASSERT(s1.test(USCRIPT_COMMON, status));
+    TEST_ASSERT(s1.test(USCRIPT_ETHIOPIC, status));
+    TEST_ASSERT(s1.test(USCRIPT_CODE_LIMIT, status));
+    s1.resetAll();
+    TEST_ASSERT(!s1.test(USCRIPT_COMMON, status));
+    TEST_ASSERT(!s1.test(USCRIPT_ETHIOPIC, status));
+    TEST_ASSERT(!s1.test(USCRIPT_CODE_LIMIT, status));
+    TEST_ASSERT_SUCCESS(status);
+
+    // contains() and intersects().
+    status = U_ZERO_ERROR;
+    s1.set(USCRIPT_TAKRI, status);
+    s1.set(USCRIPT_BLISSYMBOLS, status);
+    s2.setAll();
+    TEST_ASSERT(s2.contains(s1));
+    TEST_ASSERT(!s1.contains(s2));
+    TEST_ASSERT(s2.intersects(s1));
+    TEST_ASSERT(s1.intersects(s2));
+    s2.reset(USCRIPT_TAKRI, status);
+    TEST_ASSERT(!s2.contains(s1));
+    TEST_ASSERT(!s1.contains(s2));
+    TEST_ASSERT(s1.intersects(s2));
+    TEST_ASSERT(s2.intersects(s1));
+    TEST_ASSERT_SUCCESS(status);
+
+    // Assignment, intersect() and countMembers().
+    status = U_ZERO_ERROR;
+    s1.resetAll();
+    s1.set(USCRIPT_NKO, status);
+    s1.set(USCRIPT_COMMON, status);
+    s2 = s1;
+    TEST_ASSERT(s2 == s1);
+    TEST_ASSERT_EQ(2, s2.countMembers());
+    s2.intersect(s1);
+    TEST_ASSERT(s2 == s1);
+    s2.setAll();
+    TEST_ASSERT(!(s2 == s1));
+    TEST_ASSERT(s2.countMembers() >= USCRIPT_CODE_LIMIT);
+    s2.intersect(s1);
+    TEST_ASSERT(s2 == s1);
+
+    s2.setAll();
+    s2.reset(USCRIPT_COMMON, status);
+    s2.intersect(s1);
+    TEST_ASSERT(s2.countMembers() == 1);
+
+    // nextSetBit(): the three members are expected in the order checked by the
+    // cases below, followed by -1 once the set is exhausted.
+    s1.resetAll();
+    s1.set(USCRIPT_AFAKA, status);
+    s1.set(USCRIPT_VAI, status);
+    s1.set(USCRIPT_INHERITED, status);
+    int32_t n = -1;
+    for (int32_t i=0; i<4; i++) {
+        n = s1.nextSetBit(n+1);
+        switch (i) {
+          case 0: TEST_ASSERT_EQ(USCRIPT_INHERITED, n); break;
+          case 1: TEST_ASSERT_EQ(USCRIPT_VAI, n); break;
+          case 2: TEST_ASSERT_EQ(USCRIPT_AFAKA, n); break;
+          case 3: TEST_ASSERT_EQ(-1, (int32_t)n); break;
+          default: TEST_ASSERT(FALSE);
+        }
+    }
+    TEST_ASSERT_SUCCESS(status);
+}
+
+
+// testRestrictionLevel.
+//   For each test identifier:
+//     - check that IdentifierInfo computes the expected restriction level, and
+//     - for every restriction level a spoof checker can be configured with,
+//       check that uspoof_checkUnicodeString() fails exactly when the
+//       identifier's level is greater than the configured one, or when the
+//       identifier contains characters outside the recommended set.
+void IntlTestSpoof::testRestrictionLevel() {
+    struct Test {
+        const char         *fId;
+        URestrictionLevel   fExpectedRestrictionLevel;
+    } tests[] = {
+        {"\\u0061\\u03B3\\u2665", USPOOF_UNRESTRICTIVE},
+        {"a",                     USPOOF_ASCII},
+        {"\\u03B3",               USPOOF_HIGHLY_RESTRICTIVE},
+        {"\\u0061\\u30A2\\u30FC", USPOOF_HIGHLY_RESTRICTIVE},
+        {"\\u0061\\u0904",        USPOOF_MODERATELY_RESTRICTIVE},
+        {"\\u0061\\u03B3",        USPOOF_MINIMALLY_RESTRICTIVE}
+    };
+    char msgBuffer[100];
+
+    URestrictionLevel restrictionLevels[] = { USPOOF_ASCII, USPOOF_HIGHLY_RESTRICTIVE,
+         USPOOF_MODERATELY_RESTRICTIVE, USPOOF_MINIMALLY_RESTRICTIVE, USPOOF_UNRESTRICTIVE};
+
+    UErrorCode status = U_ZERO_ERROR;
+    IdentifierInfo idInfo(status);
+    TEST_ASSERT_SUCCESS(status);
+
+    // Fetch the recommended set once and validate it before dereferencing it.
+    // NOTE(review): a NULL result on failure is assumed here - confirm against
+    // the uspoof_getRecommendedUnicodeSet() implementation.
+    const UnicodeSet *recommendedSet = uspoof_getRecommendedUnicodeSet(&status);
+    TEST_ASSERT_SUCCESS(status);
+    TEST_ASSERT(recommendedSet != NULL);
+    if (U_FAILURE(status) || recommendedSet == NULL) {
+        return;     // Already reported; nothing below can run without the profile.
+    }
+    idInfo.setIdentifierProfile(*recommendedSet);
+
+    for (int32_t testNum=0; testNum < LENGTHOF(tests); testNum++) {
+        status = U_ZERO_ERROR;
+        const Test &test = tests[testNum];
+        UnicodeString testString = UnicodeString(test.fId).unescape();
+        URestrictionLevel expectedLevel = test.fExpectedRestrictionLevel;
+        idInfo.setIdentifier(testString, status);
+        sprintf(msgBuffer, "testNum = %d ", testNum);
+        TEST_ASSERT_SUCCESS(status);
+        TEST_ASSERT_MSG(expectedLevel == idInfo.getRestrictionLevel(status), msgBuffer);
+        TEST_ASSERT_SUCCESS(status);
+
+        // Depends only on the identifier, so compute it once per test string
+        // rather than once per restriction level.
+        const UBool outsideRecommendedSet = !recommendedSet->containsAll(testString);
+
+        for (int levelIndex=0; levelIndex<LENGTHOF(restrictionLevels); levelIndex++) {
+            status = U_ZERO_ERROR;
+            URestrictionLevel levelSetInSpoofChecker = restrictionLevels[levelIndex];
+            USpoofChecker *sc = uspoof_open(&status);
+            uspoof_setChecks(sc, USPOOF_RESTRICTION_LEVEL, &status);
+            uspoof_setAllowedChars(sc, uspoof_getRecommendedSet(&status), &status);
+            uspoof_setRestrictionLevel(sc, levelSetInSpoofChecker);
+            UBool actualValue = uspoof_checkUnicodeString(sc, testString, NULL, &status) != 0;
+
+            // we want to fail if the text is (say) MODERATE and the testLevel is ASCII
+            UBool expectedFailure = expectedLevel > levelSetInSpoofChecker || outsideRecommendedSet;
+            sprintf(msgBuffer, "testNum = %d, levelIndex = %d", testNum, levelIndex);
+            TEST_ASSERT_MSG(expectedFailure == actualValue, msgBuffer);
+            TEST_ASSERT_SUCCESS(status);
+            uspoof_close(sc);
+        }
+    }
+}
+
+
+// testMixedNumbers.
+//   For each test string:
+//     - check that IdentifierInfo::getNumerics() yields the expected set
+//       (the table lists one entry per decimal-digit system used), and
+//     - check that a spoof checker configured with only USPOOF_MIXED_NUMBERS
+//       flags the string exactly when that set has more than one member.
+//
+//   All error codes are asserted; a failure to create or run the checker is
+//   reported as such rather than showing up only as a mismatched result.
+void IntlTestSpoof::testMixedNumbers() {
+    struct Test {
+        const char *fTestString;
+        const char *fExpectedSet;
+    } tests[] = {
+        {"1",              "[0]"},
+        {"\\u0967",        "[\\u0966]"},
+        {"1\\u0967",       "[0\\u0966]"},
+        {"\\u0661\\u06F1", "[\\u0660\\u06F0]"}
+    };
+    UErrorCode status = U_ZERO_ERROR;
+    IdentifierInfo idInfo(status);
+    TEST_ASSERT_SUCCESS(status);
+    for (int32_t testNum=0; testNum < LENGTHOF(tests); testNum++) {
+        char msgBuf[100];
+        sprintf(msgBuf, "testNum = %d ", testNum);
+        Test &test = tests[testNum];
+
+        status = U_ZERO_ERROR;
+        UnicodeString testString = UnicodeString(test.fTestString).unescape();
+        UnicodeSet expectedSet(UnicodeString(test.fExpectedSet).unescape(), status);
+        idInfo.setIdentifier(testString, status);
+        TEST_ASSERT_SUCCESS(status);
+        TEST_ASSERT_MSG(expectedSet == *idInfo.getNumerics(), msgBuf);
+
+        status = U_ZERO_ERROR;
+        USpoofChecker *sc = uspoof_open(&status);
+        uspoof_setChecks(sc, USPOOF_MIXED_NUMBERS, &status); // only check this
+        int32_t result = uspoof_checkUnicodeString(sc, testString, NULL, &status);
+        // Covers uspoof_open(), uspoof_setChecks() and the check itself.
+        TEST_ASSERT_SUCCESS(status);
+        UBool mixedNumberFailure = ((result & USPOOF_MIXED_NUMBERS) != 0);
+        TEST_ASSERT_MSG((expectedSet.size() > 1) == mixedNumberFailure, msgBuf);
+        uspoof_close(sc);
+    }
+}
/*
**********************************************************************
-* Copyright (C) 2011, International Business Machines Corporation
+* Copyright (C) 2011-2013, International Business Machines Corporation
* and others. All Rights Reserved.
**********************************************************************
*/
void testBug8654();
+ void testIdentifierInfo();
+
+ void testScriptSet();
+
+ void testRestrictionLevel();
+
+ void testMixedNumbers();
+
// Internal function to run a single skeleton test case.
void checkSkeleton(const USpoofChecker *sc, uint32_t flags,
const char *input, const char *expected, int32_t lineNum);