From: Mark Davis Date: Fri, 30 Nov 2012 17:51:08 +0000 (+0000) Subject: ICU-7645 First cut at spoof detection changed. All marked @internal for now. X-Git-Tag: milestone-59-0-1~3290 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=95098e216b15091186e5a1b872a532f7b6d79f9c;p=icu ICU-7645 First cut at spoof detection changed. All marked @internal for now. X-SVN-Rev: 32910 --- diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/IdentifierInfo.java b/icu4j/main/classes/core/src/com/ibm/icu/text/IdentifierInfo.java new file mode 100644 index 00000000000..0e1d312412a --- /dev/null +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/IdentifierInfo.java @@ -0,0 +1,653 @@ +/* + *************************************************************************** + * Copyright (C) 2008-2012, Google, International Business Machines Corporation + * and others. All Rights Reserved. + *************************************************************************** + */ +package com.ibm.icu.text; + +import java.util.BitSet; +import java.util.Collection; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Set; + +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UCharacterCategory; +import com.ibm.icu.lang.UScript; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.Freezable; + +/** + * This class analyzes a possible identifier for script and identifier status. Use it by calling setIdentifierProfile + * then setIdentifier. At this point: + *
    <ol> + *
  <li>call getScripts for the specific scripts in the identifier. The identifier contains at least one character in + * each of these. + *
  <li>call getAlternates to get cases where a character is not limited to a single script. For example, it could be + * either Katakana or Hiragana. + *
  <li>call getCommonAmongAlternates to find out if any scripts are common to all the alternates. + *
  <li>call getNumerics to get a representative character (with value zero) for each of the decimal number systems in + * the identifier. + *
  <li>call getRestrictionLevel to see what the UTS36 restriction level is. (This has some proposed changes from the + * current one, however.) + * </ol>
+ * + * @author markdavis + * @internal + */ +public class IdentifierInfo { + + public enum RestrictionLevel { + /** + * Only ASCII characters: U+0000..U+007F + * + * @internal + */ + ASCII, + /** + * All characters in each identifier must be from a single script, or from the combinations: Latin + Han + + * Hiragana + Katakana; Latin + Han + Bopomofo; or Latin + Han + Hangul. Note that this level will satisfy the + * vast majority of Latin-script users; also that TR36 has ASCII instead of Latin. + * + * @internal + */ + HIGHLY_RESTRICTIVE, + /** + * Allow Latin with other scripts except Cyrillic, Greek, Cherokee Otherwise, the same as Highly Restrictive + * + * @internal + */ + MODERATELY_RESTRICTIVE, + /** + * Allow arbitrary mixtures of scripts, such as Ωmega, Teχ, HλLF-LIFE, Toys-Я-Us. Otherwise, the same as + * Moderately Restrictive + * + * @internal + */ + MINIMALLY_RESTRICTIVE, + /** + * Any valid identifiers, including characters outside of the Identifier Profile, such as I♥NY.org + * + * @internal + */ + UNRESTRICTIVE + } + + private static final UnicodeSet ASCII = new UnicodeSet(0, 0x7F).freeze(); + + private String identifier; + private final BitSet requiredScripts = new BitSet(); + private final Set scriptSetSet = new HashSet(); + private final BitSet commonAmongAlternates = new BitSet(); + private final UnicodeSet numerics = new UnicodeSet(); + private final UnicodeSet identifierProfile = new UnicodeSet(0, 0x10FFFF); + + private IdentifierInfo clear() { + requiredScripts.clear(); + scriptSetSet.clear(); + numerics.clear(); + commonAmongAlternates.clear(); + return this; + } + + /** + * Set the identifier profile, for what is allowed. 
+ * + * @param identifierProfile + * @return + * @internal + */ + public IdentifierInfo setIdentifierProfile(UnicodeSet identifierProfile) { + this.numerics.set(numerics); + return this; + } + + /** + * Get the identifier profile + * + * @return + * @internal + */ + public UnicodeSet getIdentifierProfile() { + return new UnicodeSet(identifierProfile); + } + + /** + * Set an identifier to analyse. + * + * @param identifier + * @return the identifier info. + * @internal + */ + public IdentifierInfo setIdentifier(String identifier) { + this.identifier = identifier; + clear(); + BitSet temp = new BitSet(); // Will reuse this. + int cp; + for (int i = 0; i < identifier.length(); i += Character.charCount(i)) { + cp = Character.codePointAt(identifier, i); + // Store a representative character for each kind of decimal digit + if (UCharacter.getType(cp) == UCharacterCategory.DECIMAL_DIGIT_NUMBER) { + // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value + numerics.add(cp - UCharacter.getNumericValue(cp)); + } + UScript.getScriptExtensions(cp, temp); + temp.clear(UScript.COMMON); + temp.clear(UScript.INHERITED); +// if (temp.cardinality() == 0) { +// // HACK for older version of ICU +// requiredScripts.set(UScript.getScript(cp)); +// } else + if (temp.cardinality() == 1) { + // Single script, record it. + requiredScripts.or(temp); + } else if (!requiredScripts.intersects(temp) && scriptSetSet.add(temp)) { + // If the set hasn't been added already, add it and create new temporary for the next pass, + // so we don't rewrite what's already in the set. + temp = new BitSet(); + } + } + // Now make a final pass through to remove alternates that came before singles. + // [Kana], [Kana Hira] => [Kana] + // This is relatively infrequent, so doesn't have to be optimized. 
+ if (scriptSetSet.size() == 0) { + commonAmongAlternates.clear(); + } else { + commonAmongAlternates.set(0, UScript.CODE_LIMIT); + for (Iterator it = scriptSetSet.iterator(); it.hasNext();) { + final BitSet next = it.next(); + if (requiredScripts.intersects(next)) { + it.remove(); + } else { + // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]] + for (BitSet other : scriptSetSet) { + if (next != other && contains(next, other)) { + it.remove(); + break; + } + } + } + commonAmongAlternates.and(next); // get the intersection. + } + if (commonAmongAlternates.size() == 0) { + commonAmongAlternates.clear(); + } + } + // Note that the above code doesn't minimize alternatives. That is, it does not collapse + // [[Arab Syrc Thaa]; [Arab Syrc]] to [[Arab Syrc]] + // That would be a possible optimization, but is probably not worth the extra processing + return this; + } + + static final BitSet COMMON_AND_INHERITED = set(new BitSet(), UScript.COMMON, UScript.INHERITED); + +// /** +// * Test whether an identifier has multiple scripts +// * +// * @param identifier +// * @return true if it does +// */ +// public static boolean isMultiScript(String identifier) { +// // Non-optimized code, for simplicity +// Set setOfScriptSets = new HashSet(); +// BitSet temp = new BitSet(); +// int cp; +// for (int i = 0; i < identifier.length(); i += Character.charCount(i)) { +// cp = Character.codePointAt(identifier, i); +// UScript.getScriptExtensions(cp, temp); +// if (temp.cardinality() == 0) { +// // HACK for older version of ICU +// final int script = UScript.getScript(cp); +// temp.set(script); +// } +// temp.andNot(COMMON_AND_INHERITED); +// if (temp.cardinality() != 0 && setOfScriptSets.add(temp)) { +// // If the set hasn't been added already, add it and create new temporary for the next pass, +// // so we don't rewrite what's already in the set. 
+// temp = new BitSet(); +// } +// } +// if (setOfScriptSets.size() == 0) { +// return true; // trivially true +// } +// temp.clear(); +// // check to see that there is at least one script common to all the sets +// boolean first = true; +// for (BitSet other : setOfScriptSets) { +// if (first) { +// temp.or(other); +// first = false; +// } else { +// temp.and(other); +// } +// } +// return temp.cardinality() != 0; +// } +// +// /** +// * Test whether an identifier has mixed number systems. +// * +// * @param identifier +// * @return true if mixed +// */ +// public static boolean hasMixedNumberSystems(String identifier) { +// int cp; +// UnicodeSet numerics = new UnicodeSet(); +// for (int i = 0; i < identifier.length(); i += Character.charCount(i)) { +// cp = Character.codePointAt(identifier, i); +// // Store a representative character for each kind of decimal digit +// switch (UCharacter.getType(cp)) { +// case UCharacterCategory.DECIMAL_DIGIT_NUMBER: +// // Just store the zero character as a representative for comparison. +// // Unicode guarantees it is cp - value +// numerics.add(cp - UCharacter.getNumericValue(cp)); +// break; +// case UCharacterCategory.OTHER_NUMBER: +// case UCharacterCategory.LETTER_NUMBER: +// throw new IllegalArgumentException("Should not be in identifiers."); +// } +// } +// return numerics.size() > 1; +// } + + /** + * Get the identifer that was analysed. + * + * @return + * @internal + */ + public String getIdentifier() { + return identifier; + } + + /** + * Get the scripts found in the identifiers + * + * @return the set of explicit scripts. + * @internal + */ + public BitSet getScripts() { + return (BitSet) requiredScripts.clone(); + } + + /** + * Get the set of alternate scripts found in the identifiers. That is, when a character can be in two scripts, then + * the set consisting of those scripts will be returned. + * + * @return the set of explicit scripts. 
+ * @internal + */ + public Set getAlternates() { + Set result = new HashSet(); + for (BitSet item : scriptSetSet) { + result.add((BitSet) item.clone()); + } + return result; + } + + /** + * Get the representative characters (zeros) for the numerics found in the identifier. + * + * @return the set of explicit scripts. + * @internal + */ + public UnicodeSet getNumerics() { + return new UnicodeSet(numerics); + } + + /** + * Find out which scripts are in common among the alternates. + * + * @return + */ + public BitSet getCommonAmongAlternates() { + return (BitSet) commonAmongAlternates.clone(); + } + + // BitSet doesn't support "contains(...)", so we have inverted constants + // They are private; they can't be made immutable in Java. + private final static BitSet JAPANESE = set(new BitSet(), UScript.LATIN, UScript.HAN, UScript.HIRAGANA, + UScript.KATAKANA); + private final static BitSet CHINESE = set(new BitSet(), UScript.LATIN, UScript.HAN, UScript.BOPOMOFO); + private final static BitSet KOREAN = set(new BitSet(), UScript.LATIN, UScript.HAN, UScript.HANGUL); + private final static BitSet CONFUSABLE_WITH_LATIN = set(new BitSet(), UScript.CYRILLIC, UScript.GREEK, + UScript.CHEROKEE); + + /** + * Find the "tightest" restriction level that the identifier satisfies. + * + * @return the restriction level. + * @internal + */ + public RestrictionLevel getRestrictionLevel() { + if (!identifierProfile.containsAll(identifier) || getNumerics().size() > 1) { + return RestrictionLevel.UNRESTRICTIVE; + } + if (ASCII.containsAll(identifier)) { + return RestrictionLevel.ASCII; + } + BitSet temp = new BitSet(); + temp.or(requiredScripts); + temp.clear(UScript.COMMON); + temp.clear(UScript.INHERITED); + // This is a bit tricky. We look at a number of factors. + // The number of scripts in the text. 
+ // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc]) + // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.) + final int cardinalityPlus = temp.cardinality() + (commonAmongAlternates.isEmpty() ? scriptSetSet.size() : 1); + if (cardinalityPlus < 2) { + return RestrictionLevel.HIGHLY_RESTRICTIVE; + } + if (containsWithAlternates(JAPANESE, temp) || containsWithAlternates(CHINESE, temp) + || containsWithAlternates(KOREAN, temp)) { + return RestrictionLevel.HIGHLY_RESTRICTIVE; + } + if (cardinalityPlus == 2 && temp.get(UScript.LATIN) && !temp.intersects(CONFUSABLE_WITH_LATIN)) { + return RestrictionLevel.MODERATELY_RESTRICTIVE; + } + return RestrictionLevel.MINIMALLY_RESTRICTIVE; + } + + @Override + public String toString() { + return identifier + ", " + identifierProfile.toPattern(false) + ", " + getRestrictionLevel() + ", " + + displayScripts(requiredScripts) + ", " + displayAlternates(scriptSetSet) + ", " + + numerics.toPattern(false); + } + + private boolean containsWithAlternates(BitSet container, BitSet containee) { + if (!contains(container, containee)) { + return false; + } + for (BitSet alternatives : scriptSetSet) { + if (!container.intersects(alternatives)) { + return false; + } + } + return true; + } + + /** + * Produce a readable string of alternates. 
+ * + * @param alternates + * @return display form + * @internal + */ + public static String displayAlternates(Collection alternates) { + StringBuilder result = new StringBuilder(); + for (BitSet item : alternates) { + if (result.length() != 0) { + result.append("; "); + } + result.append(displayScripts(item)); + } + return result.toString(); + } + + /** + * Produce a readable string of a set of scripts + * + * @param scripts + * @return + * @internal + */ + public static String displayScripts(BitSet scripts) { + StringBuilder result = new StringBuilder(); + for (int i = scripts.nextSetBit(0); i >= 0; i = scripts.nextSetBit(i + 1)) { + if (result.length() != 0) { + result.append(' '); + } + result.append(UScript.getShortName(i)); + } + return result.toString(); + } + + /** + * Parse a list of scripts into a bitset. + * + * @param scripts + * @return BitSet of UScript values. + * @internal + */ + public static BitSet parseScripts(String scriptsString) { + BitSet result = new BitSet(); + for (String item : scriptsString.trim().split(",?\\s+")) { + if (!item.isEmpty()) { + result.set(UScript.getCodeFromName(item)); + } + } + return result; + } + + /** + * Parse a list of alternates into a set of sets of UScript values. + * + * @param scriptsSetString + * @return + * @internal + */ + public static Set parseAlternates(String scriptsSetString) { + Set result = new HashSet(); + for (String item : scriptsSetString.trim().split("\\s*;\\s*")) { + if (!item.isEmpty()) { + result.add(parseScripts(item)); + } + } + return result; + } + + /** + * Test containment. Should be a method on BitSet... + * + * @param container + * @param containee + * @return + * @internal + */ + public static final boolean contains(BitSet container, BitSet containee) { + for (int i = containee.nextSetBit(0); i >= 0; i = containee.nextSetBit(i + 1)) { + if (!container.get(i)) { + return false; + } + } + return true; + } + + /** + * Sets a number of values at once. Should be on BitSet. 
+ * + * @param container + * @param containee + * @return + * @internal + */ + public static final BitSet set(BitSet bitset, int... values) { + for (int value : values) { + bitset.set(value); + } + return bitset; + } + + // public static final class FreezableBitSet extends BitSet implements Freezable { + // private boolean frozen; + // + // public FreezableBitSet() { + // super(); + // } + // public FreezableBitSet(int nbits) { + // super(nbits); + // } + // /* (non-Javadoc) + // * @see java.util.BitSet#and(java.util.BitSet) + // */ + // @Override + // public void and(BitSet set) { + // if (frozen) { + // throw new UnsupportedOperationException(); + // } + // super.and(set); + // } + // /* (non-Javadoc) + // * @see java.util.BitSet#andNot(java.util.BitSet) + // */ + // @Override + // public void andNot(BitSet set) { + // if (frozen) { + // throw new UnsupportedOperationException(); + // } + // super.andNot(set); + // } + // /* (non-Javadoc) + // * @see java.util.BitSet#cardinality() + // */ + // + // @Override + // public void clear() { + // if (frozen) { + // throw new UnsupportedOperationException(); + // } + // super.clear(); + // } + // /* (non-Javadoc) + // * @see java.util.BitSet#clear(int) + // */ + // @Override + // public void clear(int bitIndex) { + // if (frozen) { + // throw new UnsupportedOperationException(); + // } + // super.clear(bitIndex); + // } + // /* (non-Javadoc) + // * @see java.util.BitSet#clear(int, int) + // */ + // @Override + // public void clear(int fromIndex, int toIndex) { + // if (frozen) { + // throw new UnsupportedOperationException(); + // } + // super.clear(fromIndex, toIndex); + // } + // /* (non-Javadoc) + // * @see java.util.BitSet#clone() + // */ + // @Override + // public Object clone() { + // return super.clone(); + // } + // /* (non-Javadoc) + // * @see java.util.BitSet#equals(java.lang.Object) + // */ + // @Override + // public boolean equals(Object obj) { + // if (obj == null || obj.getClass() != FreezableBitSet.class) 
{ + // return false; + // } + // return super.equals((BitSet)obj); + // } + // + // /* (non-Javadoc) + // * @see java.util.BitSet#flip(int) + // */ + // @Override + // public void flip(int bitIndex) { + // if (frozen) { + // throw new UnsupportedOperationException(); + // } + // super.flip(bitIndex); + // } + // /* (non-Javadoc) + // * @see java.util.BitSet#flip(int, int) + // */ + // @Override + // public void flip(int fromIndex, int toIndex) { + // if (frozen) { + // throw new UnsupportedOperationException(); + // } + // super.flip(fromIndex, toIndex); + // } + // /* (non-Javadoc) + // * @see java.util.BitSet#or(java.util.BitSet) + // */ + // @Override + // public void or(BitSet set) { + // if (frozen) { + // throw new UnsupportedOperationException(); + // } + // super.or(set); + // } + // /* (non-Javadoc) + // * @see java.util.BitSet#set(int) + // */ + // @Override + // public void set(int bitIndex) { + // if (frozen) { + // throw new UnsupportedOperationException(); + // } + // super.set(bitIndex); + // } + // /* (non-Javadoc) + // * @see java.util.BitSet#set(int, boolean) + // */ + // @Override + // public void set(int bitIndex, boolean value) { + // if (frozen) { + // throw new UnsupportedOperationException(); + // } + // super.set(bitIndex, value); + // } + // /* (non-Javadoc) + // * @see java.util.BitSet#set(int, int) + // */ + // @Override + // public void set(int fromIndex, int toIndex) { + // if (frozen) { + // throw new UnsupportedOperationException(); + // } + // super.set(fromIndex, toIndex); + // } + // /* (non-Javadoc) + // * @see java.util.BitSet#set(int, int, boolean) + // */ + // @Override + // public void set(int fromIndex, int toIndex, boolean value) { + // if (frozen) { + // throw new UnsupportedOperationException(); + // } + // super.set(fromIndex, toIndex, value); + // } + // /* (non-Javadoc) + // * @see java.util.BitSet#xor(java.util.BitSet) + // */ + // @Override + // public void xor(BitSet set) { + // if (frozen) { + // throw new 
UnsupportedOperationException(); + // } + // super.xor(set); + // } + // /* (non-Javadoc) + // * @see com.ibm.icu.util.Freezable#isFrozen() + // */ + // public boolean isFrozen() { + // return frozen; + // } + // /* (non-Javadoc) + // * @see com.ibm.icu.util.Freezable#freeze() + // */ + // public FreezableBitSet freeze() { + // frozen = true; + // return this; + // } + // /* (non-Javadoc) + // * @see com.ibm.icu.util.Freezable#cloneAsThawed() + // */ + // public FreezableBitSet cloneAsThawed() { + // FreezableBitSet result = new FreezableBitSet(size()); + // result.or(this); + // return result; + // } + // } +} diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/SpoofChecker.java b/icu4j/main/classes/core/src/com/ibm/icu/text/SpoofChecker.java index 24310aba903..8b7c87332d3 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/SpoofChecker.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/SpoofChecker.java @@ -33,6 +33,7 @@ import com.ibm.icu.lang.UCharacter; import com.ibm.icu.lang.UCharacterCategory; import com.ibm.icu.lang.UProperty; import com.ibm.icu.lang.UScript; +import com.ibm.icu.text.IdentifierInfo.RestrictionLevel; import com.ibm.icu.util.ULocale; /** @@ -219,13 +220,28 @@ public class SpoofChecker { * @stable ICU 4.6 */ public static final int CHAR_LIMIT = 64; + + /** + * Check that an identifier is no looser than the specified RestrictionLevel. + * + * @internal + */ + public static final int RESTRICTION_LEVEL = 128; + + /** + * Check that an identifier contains only characters from a specified set of acceptable characters. See + * Builder.setAllowedChars() and Builder.setAllowedLocales(). + * + * @internal + */ + public static final int MIXED_NUMBERS = 256; /** * Enable all spoof checks. * * @stable ICU 4.6 */ - public static final int ALL_CHECKS = 0x7f; + public static final int ALL_CHECKS = 0xFFFFFFFF; // Magic number for sanity checking spoof binary resource data. 
static final int MAGIC = 0x3845fdef; @@ -249,6 +265,7 @@ public class SpoofChecker { UnicodeSet fAllowedCharsSet; // The UnicodeSet of allowed characters. // for this Spoof Checker. Defaults to all chars. Set fAllowedLocales; // The list of allowed locales. + private RestrictionLevel restrictionLevel; /** * Constructor: Create a default Unicode Spoof Checker Builder, configured to perform all checks except for @@ -263,6 +280,7 @@ public class SpoofChecker { fSpoofData = null; fAllowedCharsSet = new UnicodeSet(0, 0x10ffff); fAllowedLocales = new LinkedHashSet(); + restrictionLevel = RestrictionLevel.MINIMALLY_RESTRICTIVE; } /** @@ -279,6 +297,7 @@ public class SpoofChecker { fAllowedCharsSet = src.fAllowedCharsSet.cloneAsThawed(); fAllowedLocales = new LinkedHashSet(); fAllowedLocales.addAll(src.fAllowedLocales); + restrictionLevel = src.restrictionLevel; } /** @@ -305,11 +324,12 @@ public class SpoofChecker { result.fAllowedCharsSet = (UnicodeSet) (this.fAllowedCharsSet.clone()); result.fAllowedCharsSet.freeze(); result.fAllowedLocales = this.fAllowedLocales; + result.restrictionLevel = this.restrictionLevel; return result; } /** - * Specify the source form of the spoof data Spoof Checker. The Three inputs correspond to the Unicode data + * Specify the source form of the spoof data Spoof Checker. The inputs correspond to the Unicode data * files confusables.txt and confusablesWholeScript.txt as described in Unicode UAX 39. The syntax of the source * data is as described in UAX 39 for these files, and the content of these files is acceptable input. * @@ -447,6 +467,16 @@ public class SpoofChecker { fChecks |= CHAR_LIMIT; return this; } + + /** + * Set the loosest restriction level allowed. + * @param restrictionLevel The loosest restriction level allowed. 
+ * @return self + */ + public Builder setRestrictionLevel(RestrictionLevel restrictionLevel) { + this.restrictionLevel = restrictionLevel; + return this; + } // Structure for the Whole Script Confusable Data // See Unicode UAX-39, Unicode Security Mechanisms, for a description of the @@ -1391,6 +1421,28 @@ public class SpoofChecker { // haven't done it yet. int scriptCount = -1; + // Allocate an identifier info if needed. + // Note: we may want to allocate one per SpoofChecker and synchronize + + IdentifierInfo identifierInfo = null; + if (0 != ((this.fChecks) & (RESTRICTION_LEVEL | MIXED_NUMBERS))) { + identifierInfo = new IdentifierInfo().setIdentifier(text); + } + + if (0 != ((this.fChecks) & RESTRICTION_LEVEL)) { + RestrictionLevel textRestrictionLevel = identifierInfo.getRestrictionLevel(); + if (textRestrictionLevel.compareTo(restrictionLevel) > 0) { + result |= RESTRICTION_LEVEL; + } + } + + if (0 != ((this.fChecks) & MIXED_NUMBERS)) { + UnicodeSet numerics = identifierInfo.getNumerics(); + if (numerics.size() > 1) { + result |= MIXED_NUMBERS; + } + } + if (0 != ((this.fChecks) & SINGLE_SCRIPT)) { scriptCount = this.scriptScan(text, checkResult); // no need to set failPos, it will be set to checkResult.position inside this.scriptScan @@ -1881,6 +1933,7 @@ public class SpoofChecker { private SpoofData fSpoofData; private Set fAllowedLocales; // The Set of allowed locales. private UnicodeSet fAllowedCharsSet; // The UnicodeSet of allowed characters. + private RestrictionLevel restrictionLevel; // for this Spoof Checker. Defaults to all chars. 
// diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/text/SpoofCheckerTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/text/SpoofCheckerTest.java index e2f17d809f2..5b832fc35fd 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/text/SpoofCheckerTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/text/SpoofCheckerTest.java @@ -1,6 +1,6 @@ /* ******************************************************************************* - * Copyright (C) 2009-2011, International Business Machines Corporation and * + * Copyright (C) 2009-2012, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ @@ -10,7 +10,11 @@ import java.io.BufferedReader; import java.io.IOException; import java.io.Reader; import java.text.ParseException; +import java.util.Arrays; +import java.util.BitSet; +import java.util.HashSet; import java.util.LinkedHashSet; +import java.util.List; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -19,8 +23,12 @@ import com.ibm.icu.dev.test.TestFmwk; import com.ibm.icu.dev.test.TestUtil; import com.ibm.icu.dev.test.TestUtil.JavaVendor; import com.ibm.icu.impl.Utility; +import com.ibm.icu.lang.UScript; +import com.ibm.icu.text.IdentifierInfo; +import com.ibm.icu.text.IdentifierInfo.RestrictionLevel; import com.ibm.icu.text.Normalizer2; import com.ibm.icu.text.SpoofChecker; +import com.ibm.icu.text.SpoofChecker.CheckResult; import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.util.ULocale; @@ -185,7 +193,7 @@ public class SpoofCheckerTest extends TestFmwk { * don't want to see in this test. 
*/ sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).setAllowedLocales(allowedLocales).build(); - + SpoofChecker.CheckResult result = new SpoofChecker.CheckResult(); checkResults = sc.failsChecks(goodLatin); assertFalse("", checkResults); @@ -254,7 +262,7 @@ public class SpoofCheckerTest extends TestFmwk { assertTrue("", checkResults); assertEquals("", SpoofChecker.MIXED_SCRIPT_CONFUSABLE | SpoofChecker.SINGLE_SCRIPT, result.checks); assertEquals("", 2, result.position); - + result.position = 666; checkResults = sc.failsChecks(han_Hiragana, result); assertFalse("", checkResults); @@ -294,7 +302,7 @@ public class SpoofCheckerTest extends TestFmwk { public void TestSpoofAPI() { SpoofChecker sc = new SpoofChecker.Builder().build(); String s = "xyz"; // Many latin ranges are whole-script confusable with other scripts. - // If this test starts failing, consult confusablesWholeScript.txt + // If this test starts failing, consult confusablesWholeScript.txt SpoofChecker.CheckResult result = new SpoofChecker.CheckResult(); result.position = 666; boolean checkResults = sc.failsChecks(s, result); @@ -317,7 +325,7 @@ public class SpoofCheckerTest extends TestFmwk { SpoofChecker sc = new SpoofChecker.Builder().build(); checkSkeleton(sc, "TestSkeleton"); } - + // testSkeleton. Spot check a number of confusable skeleton substitutions from the // Unicode data file confusables.txt // Test cases chosen for substitutions of various lengths, and @@ -337,11 +345,11 @@ public class SpoofCheckerTest extends TestFmwk { + " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations." + " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations." + " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations.", - " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations." 
- + " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations." - + " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations." - + " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations.", - testName); + " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations." + + " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations." + + " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations." + + " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations.", + testName); checkSkeleton(sc, SL, "nochange", "nochange", testName); checkSkeleton(sc, MA, "love", "love", testName); @@ -428,6 +436,100 @@ public class SpoofCheckerTest extends TestFmwk { assertEquals("", 7, result.position); } + public void TestRestrictionLevel() { + Object[][] tests = { + {"a", RestrictionLevel.ASCII}, + {"γ", RestrictionLevel.HIGHLY_RESTRICTIVE}, + {"aアー", RestrictionLevel.HIGHLY_RESTRICTIVE}, + {"aऄ", RestrictionLevel.MODERATELY_RESTRICTIVE}, + {"aγ", RestrictionLevel.MINIMALLY_RESTRICTIVE}, + }; + IdentifierInfo idInfo = new IdentifierInfo(); + CheckResult checkResult = new CheckResult(); + for (Object[] test : tests) { + String testString = (String) test[0]; + RestrictionLevel expectedLevel = (RestrictionLevel) test[1]; + idInfo.setIdentifier(testString); + assertEquals("Testing restriction level for '" + testString + "'", expectedLevel, idInfo.getRestrictionLevel()); + for (RestrictionLevel testLevel : RestrictionLevel.values()) { + SpoofChecker sc = new SpoofChecker.Builder() + .setChecks(SpoofChecker.RESTRICTION_LEVEL) // only check this + .setRestrictionLevel(testLevel) + .build(); + boolean actualValue = sc.failsChecks(testString, checkResult); + + // we want to fail if the text is (say) 
MODERATE and the testLevel is ASCII + boolean expectedFailure = expectedLevel.compareTo(testLevel) > 0; + boolean t = assertEquals("Testing spoof restriction level for '" + testString + "', " + testLevel, expectedFailure, actualValue); +// if (!t) { // debugging +// actualValue = sc.failsChecks(testString, checkResult); +// // we want to fail if the text is (say) MODERATE and the testLevel is ASCII +// expectedFailure = expectedLevel.compareTo(testLevel) > 0; +// } + } + } + } + + public void TestMixedNumbers() { + Object[][] tests = { + {"1", "[0]"}, + {"१", "[०]"}, + {"1१", "[0०]"}, + {"١۱", "[٠۰]"}, + }; + IdentifierInfo idInfo = new IdentifierInfo(); + CheckResult checkResult = new CheckResult(); + for (Object[] test : tests) { + String testString = (String) test[0]; + UnicodeSet expected = new UnicodeSet((String)test[1]); + idInfo.setIdentifier(testString); + assertEquals("", expected, idInfo.getNumerics()); + + SpoofChecker sc = new SpoofChecker.Builder() + .setChecks(SpoofChecker.MIXED_NUMBERS) // only check this + .build(); + boolean actualValue = sc.failsChecks(testString, checkResult); + boolean t = assertEquals("Testing spoof mixed numbers for '" + testString + "', ", expected.size() > 1, actualValue); + } + } + + public void TestIdentifierInfo() { +// contains(BitSet, BitSet) + BitSet bitset12 = IdentifierInfo.set(new BitSet(), UScript.LATIN, UScript.HANGUL); + BitSet bitset2 = IdentifierInfo.set(new BitSet(), UScript.HANGUL); + assertTrue("", IdentifierInfo.contains(bitset12, bitset2)); + assertTrue("", IdentifierInfo.contains(bitset12, bitset12)); + assertTrue("", !IdentifierInfo.contains(bitset2, bitset12)); + +// displayAlternates(Collection) +// displayScripts(BitSet) + String scriptString = IdentifierInfo.displayScripts(bitset12); + assertEquals("", "Hang Latn", scriptString); + Set alternates = new HashSet(Arrays.asList(bitset12, bitset2)); + String alternatesString = IdentifierInfo.displayAlternates(alternates); + assertEquals("", "Hang Latn; 
Hang", alternatesString); + +// parseAlternates(String) +// parseScripts(String) + assertEquals("", bitset12, IdentifierInfo.parseScripts(scriptString)); + assertEquals("", alternates, IdentifierInfo.parseAlternates(alternatesString)); + + IdentifierInfo idInfo = new IdentifierInfo(); + String manyAlternates = "aアー〼1१١۱"; + idInfo.setIdentifier(manyAlternates); + assertEquals("", manyAlternates, idInfo.getIdentifier()); + + assertEquals("", null, idInfo.getScripts()); + assertEquals("", null, idInfo.getAlternates()); + assertEquals("", null, idInfo.getCommonAmongAlternates()); + assertEquals("", null, idInfo.getNumerics()); + assertEquals("", null, idInfo.getRestrictionLevel()); + +// TODO +// getIdentifierProfile() +// setIdentifierProfile(UnicodeSet) + } + private String parseHex(String in) { StringBuilder sb = new StringBuilder(); for (String oneCharAsHexString : in.split("\\s+")) { @@ -483,7 +585,7 @@ public class SpoofCheckerTest extends TestFmwk { Matcher parseLine = Pattern.compile( "\\ufeff?" + "(?:([0-9A-F\\s]+);([0-9A-F\\s]+);\\s*(SL|ML|SA|MA)\\s*(?:#.*?)?$)" + "|\\ufeff?(\\s*(?:#.*)?)"). // Comment line - matcher(""); + matcher(""); Normalizer2 normalizer = Normalizer2.getNFDInstance(); int lineNum = 0; String inputLine;