--- /dev/null
+/*
+ ***************************************************************************
+ * Copyright (C) 2008-2012, Google, International Business Machines Corporation
+ * and others. All Rights Reserved.
+ ***************************************************************************
+ */
+package com.ibm.icu.text;
+
+import java.util.BitSet;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Set;
+
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.lang.UCharacterCategory;
+import com.ibm.icu.lang.UScript;
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.util.Freezable;
+
+/**
+ * This class analyzes a possible identifier for script and identifier status. Use it by calling setIdentifierProfile
+ * then setIdentifier. At this point:
+ * <ol>
+ * <li>call getScripts for the specific scripts in the identifier. The identifier contains at least one character in
+ * each of these.
+ * <li>call getAlternates to get cases where a character is not limited to a single script. For example, it could be
+ * either Katakana or Hiragana.
+ * <li>call getCommonAmongAlternates to find out if any scripts are common to all the alternates.
+ * <li>call getNumerics to get a representative character (with value zero) for each of the decimal number systems in
+ * the identifier.
+ * <li>call getRestrictionLevel to see what the UTS36 restriction level is. (This has some proposed changes from the
+ * current one, however.)
+ * </ol>
+ *
+ * @author markdavis
+ * @internal
+ */
+public class IdentifierInfo {
+
+ /**
+ * Restriction levels that an identifier can satisfy, declared from the most restrictive (ASCII) to the least
+ * restrictive (UNRESTRICTIVE). The declaration order is significant: callers compare levels with
+ * {@code compareTo}, so a "looser" level must come after a "tighter" one.
+ *
+ * @internal
+ */
+ public enum RestrictionLevel {
+ /**
+ * Only ASCII characters: U+0000..U+007F
+ *
+ * @internal
+ */
+ ASCII,
+ /**
+ * All characters in each identifier must be from a single script, or from the combinations: Latin + Han +
+ * Hiragana + Katakana; Latin + Han + Bopomofo; or Latin + Han + Hangul. Note that this level will satisfy the
+ * vast majority of Latin-script users; also that TR36 has ASCII instead of Latin.
+ *
+ * @internal
+ */
+ HIGHLY_RESTRICTIVE,
+ /**
+ * Allow Latin with other scripts except Cyrillic, Greek, and Cherokee. Otherwise, the same as Highly
+ * Restrictive.
+ *
+ * @internal
+ */
+ MODERATELY_RESTRICTIVE,
+ /**
+ * Allow arbitrary mixtures of scripts, such as Ωmega, Teχ, HλLF-LIFE, Toys-Я-Us. Otherwise, the same as
+ * Moderately Restrictive.
+ *
+ * @internal
+ */
+ MINIMALLY_RESTRICTIVE,
+ /**
+ * Any valid identifiers, including characters outside of the Identifier Profile, such as I♥NY.org
+ *
+ * @internal
+ */
+ UNRESTRICTIVE
+ }
+
+ // Frozen set of U+0000..U+007F; getRestrictionLevel() uses it to recognise all-ASCII identifiers.
+ private static final UnicodeSet ASCII = new UnicodeSet(0, 0x7F).freeze();
+
+ // The identifier most recently passed to setIdentifier(); null until then.
+ private String identifier;
+ // Scripts recorded for characters whose script extensions (ignoring Common and Inherited) name exactly one script.
+ private final BitSet requiredScripts = new BitSet();
+ // One script set per distinct group of alternatives, for characters that are not limited to a single script.
+ private final Set<BitSet> scriptSetSet = new HashSet<BitSet>();
+ // Intersection of the alternate script sets, computed by setIdentifier().
+ private final BitSet commonAmongAlternates = new BitSet();
+ // The zero digit of each decimal number system seen in the identifier.
+ private final UnicodeSet numerics = new UnicodeSet();
+ // Characters allowed in identifiers; initially every code point U+0000..U+10FFFF.
+ private final UnicodeSet identifierProfile = new UnicodeSet(0, 0x10FFFF);
+
+ /**
+  * Empties every per-identifier result (scripts, alternates, common scripts and numerics).
+  * The identifier profile and the stored identifier string are not touched.
+  *
+  * @return this object, for chaining
+  */
+ private IdentifierInfo clear() {
+     commonAmongAlternates.clear();
+     numerics.clear();
+     scriptSetSet.clear();
+     requiredScripts.clear();
+     return this;
+ }
+
+ /**
+  * Set the identifier profile: the set of characters that are allowed in an identifier.
+  * The contents of the argument are copied into this object's own set.
+  *
+  * @param identifierProfile the characters allowed in identifiers
+  * @return this object, for chaining
+  * @internal
+  */
+ public IdentifierInfo setIdentifierProfile(UnicodeSet identifierProfile) {
+     // Copy the argument into the profile field. The previous body assigned the numerics set to
+     // itself and ignored the argument, so the profile could never be changed.
+     this.identifierProfile.set(identifierProfile);
+     return this;
+ }
+
+ /**
+  * Get the identifier profile.
+  *
+  * @return a new set holding the same characters as the current profile; changing it does not affect this object
+  * @internal
+  */
+ public UnicodeSet getIdentifierProfile() {
+     final UnicodeSet profileCopy = new UnicodeSet(identifierProfile);
+     return profileCopy;
+ }
+
+ /**
+  * Set an identifier to analyse. Results from any previous identifier are discarded; the required scripts,
+  * the alternate script sets, the scripts common among the alternates and the numerics are then recomputed.
+  *
+  * @param identifier the identifier to analyse
+  * @return this object, for chaining
+  * @internal
+  */
+ public IdentifierInfo setIdentifier(String identifier) {
+     this.identifier = identifier;
+     clear();
+     BitSet temp = new BitSet(); // Reused until it is stored in scriptSetSet.
+     for (int i = 0; i < identifier.length();) {
+         final int cp = Character.codePointAt(identifier, i);
+         // Advance by the width of the code point (not of the index), so a supplementary character
+         // is visited once instead of a second time as its trailing surrogate.
+         i += Character.charCount(cp);
+         // Store a representative character for each kind of decimal digit
+         if (UCharacter.getType(cp) == UCharacterCategory.DECIMAL_DIGIT_NUMBER) {
+             // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value
+             numerics.add(cp - UCharacter.getNumericValue(cp));
+         }
+         UScript.getScriptExtensions(cp, temp);
+         temp.clear(UScript.COMMON);
+         temp.clear(UScript.INHERITED);
+         final int scriptCount = temp.cardinality();
+         if (scriptCount == 1) {
+             // Single script, record it.
+             requiredScripts.or(temp);
+         } else if (scriptCount > 1 && !requiredScripts.intersects(temp) && scriptSetSet.add(temp)) {
+             // If the set hasn't been added already, add it and create new temporary for the next pass,
+             // so we don't rewrite what's already in the set.
+             temp = new BitSet();
+         }
+         // scriptCount == 0: the character is only Common/Inherited (digits, punctuation, ...). It constrains
+         // nothing, and storing an empty alternate set would wrongly empty commonAmongAlternates.
+     }
+     // Final pass: drop alternates that are made redundant by a required script or by a smaller alternate,
+     // and intersect the alternates that remain. clear() has already emptied commonAmongAlternates, so
+     // nothing more is needed when there are no alternates.
+     if (!scriptSetSet.isEmpty()) {
+         commonAmongAlternates.set(0, UScript.CODE_LIMIT);
+         for (Iterator<BitSet> it = scriptSetSet.iterator(); it.hasNext();) {
+             final BitSet next = it.next();
+             // [Kana], [Kana Hira] => [Kana]: the alternate was seen before the single script.
+             if (requiredScripts.intersects(next)) {
+                 it.remove();
+                 continue; // a removed alternate must not take part in the intersection
+             }
+             commonAmongAlternates.and(next);
+             // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]]
+             // Intersecting with 'next' first is harmless: the subset that replaces it yields the same result.
+             for (BitSet other : scriptSetSet) {
+                 if (next != other && contains(next, other)) {
+                     it.remove();
+                     break; // the inner iterator is stale after the removal, so stop using it
+                 }
+             }
+         }
+         if (scriptSetSet.isEmpty()) {
+             // Every alternate was covered by a required script, so nothing is common among alternates.
+             commonAmongAlternates.clear();
+         }
+     }
+     return this;
+ }
+
+ // The two script codes that do not identify a specific script. Within this class the constant is
+ // referenced only from the commented-out isMultiScript() below.
+ static final BitSet COMMON_AND_INHERITED = set(new BitSet(), UScript.COMMON, UScript.INHERITED);
+
+// /**
+// * Test whether an identifier has multiple scripts
+// *
+// * @param identifier
+// * @return true if it does
+// */
+// public static boolean isMultiScript(String identifier) {
+// // Non-optimized code, for simplicity
+// Set<BitSet> setOfScriptSets = new HashSet<BitSet>();
+// BitSet temp = new BitSet();
+// int cp;
+// for (int i = 0; i < identifier.length(); i += Character.charCount(i)) {
+// cp = Character.codePointAt(identifier, i);
+// UScript.getScriptExtensions(cp, temp);
+// if (temp.cardinality() == 0) {
+// // HACK for older version of ICU
+// final int script = UScript.getScript(cp);
+// temp.set(script);
+// }
+// temp.andNot(COMMON_AND_INHERITED);
+// if (temp.cardinality() != 0 && setOfScriptSets.add(temp)) {
+// // If the set hasn't been added already, add it and create new temporary for the next pass,
+// // so we don't rewrite what's already in the set.
+// temp = new BitSet();
+// }
+// }
+// if (setOfScriptSets.size() == 0) {
+// return true; // trivially true
+// }
+// temp.clear();
+// // check to see that there is at least one script common to all the sets
+// boolean first = true;
+// for (BitSet other : setOfScriptSets) {
+// if (first) {
+// temp.or(other);
+// first = false;
+// } else {
+// temp.and(other);
+// }
+// }
+// return temp.cardinality() != 0;
+// }
+//
+// /**
+// * Test whether an identifier has mixed number systems.
+// *
+// * @param identifier
+// * @return true if mixed
+// */
+// public static boolean hasMixedNumberSystems(String identifier) {
+// int cp;
+// UnicodeSet numerics = new UnicodeSet();
+// for (int i = 0; i < identifier.length(); i += Character.charCount(i)) {
+// cp = Character.codePointAt(identifier, i);
+// // Store a representative character for each kind of decimal digit
+// switch (UCharacter.getType(cp)) {
+// case UCharacterCategory.DECIMAL_DIGIT_NUMBER:
+// // Just store the zero character as a representative for comparison.
+// // Unicode guarantees it is cp - value
+// numerics.add(cp - UCharacter.getNumericValue(cp));
+// break;
+// case UCharacterCategory.OTHER_NUMBER:
+// case UCharacterCategory.LETTER_NUMBER:
+// throw new IllegalArgumentException("Should not be in identifiers.");
+// }
+// }
+// return numerics.size() > 1;
+// }
+
+ /**
+  * Get the identifier that was analysed.
+  *
+  * @return the string most recently passed to setIdentifier, or null if none has been set
+  * @internal
+  */
+ public String getIdentifier() {
+     return this.identifier;
+ }
+
+ /**
+  * Get the scripts found in the identifier.
+  *
+  * @return a copy of the set of explicit scripts, as a BitSet of UScript values
+  * @internal
+  */
+ public BitSet getScripts() {
+     final Object scriptsCopy = requiredScripts.clone();
+     return (BitSet) scriptsCopy;
+ }
+
+ /**
+  * Get the set of alternate scripts found in the identifier. That is, when a character can be in two scripts,
+  * then the set consisting of those scripts will be returned.
+  *
+  * @return a new set holding a copy of each alternate script set
+  * @internal
+  */
+ public Set<BitSet> getAlternates() {
+     final Set<BitSet> copies = new HashSet<BitSet>();
+     final Iterator<BitSet> it = scriptSetSet.iterator();
+     while (it.hasNext()) {
+         final BitSet alternate = it.next();
+         copies.add((BitSet) alternate.clone());
+     }
+     return copies;
+ }
+
+ /**
+  * Get the representative characters (zeros) for the numerics found in the identifier.
+  *
+  * @return a new set holding the zero digit of each decimal number system that was seen
+  * @internal
+  */
+ public UnicodeSet getNumerics() {
+     final UnicodeSet zeroDigits = new UnicodeSet(numerics);
+     return zeroDigits;
+ }
+
+ /**
+  * Find out which scripts are in common among the alternates.
+  *
+  * @return a copy of the set of scripts shared by the alternates, as a BitSet of UScript values
+  */
+ public BitSet getCommonAmongAlternates() {
+     final Object commonCopy = commonAmongAlternates.clone();
+     return (BitSet) commonCopy;
+ }
+
+ // Script combinations used by getRestrictionLevel(). BitSet has no "contains(...)" method, so these are
+ // tested with the static contains(container, containee) helper of this class.
+ // They are private because a BitSet cannot be made immutable in Java; they must never be modified.
+ // JAPANESE, CHINESE and KOREAN are the combinations that still count as Highly Restrictive.
+ private final static BitSet JAPANESE = set(new BitSet(), UScript.LATIN, UScript.HAN, UScript.HIRAGANA,
+ UScript.KATAKANA);
+ private final static BitSet CHINESE = set(new BitSet(), UScript.LATIN, UScript.HAN, UScript.BOPOMOFO);
+ private final static BitSet KOREAN = set(new BitSet(), UScript.LATIN, UScript.HAN, UScript.HANGUL);
+ // Scripts that rule out Moderately Restrictive when they appear among the required scripts.
+ private final static BitSet CONFUSABLE_WITH_LATIN = set(new BitSet(), UScript.CYRILLIC, UScript.GREEK,
+ UScript.CHEROKEE);
+
+ /**
+  * Find the "tightest" restriction level that the identifier satisfies.
+  * An identifier must have been set first; the stored identifier is dereferenced here.
+  *
+  * @return the restriction level.
+  * @internal
+  */
+ public RestrictionLevel getRestrictionLevel() {
+     // Characters outside the profile, or digits from several number systems: nothing tighter applies.
+     final boolean outsideProfile = !identifierProfile.containsAll(identifier);
+     if (outsideProfile || numerics.size() > 1) {
+         return RestrictionLevel.UNRESTRICTIVE;
+     }
+     if (ASCII.containsAll(identifier)) {
+         return RestrictionLevel.ASCII;
+     }
+     final BitSet scripts = (BitSet) requiredScripts.clone();
+     scripts.clear(UScript.COMMON);
+     scripts.clear(UScript.INHERITED);
+     // Weight = number of scripts in the text, plus 1 when the alternates share some script
+     // (eg [Arab Thaa]; [Arab Syrc]), otherwise plus the number of alternates. Counting alternates this way
+     // only works because the weight is never tested beyond 2.
+     final int alternateWeight;
+     if (commonAmongAlternates.isEmpty()) {
+         alternateWeight = scriptSetSet.size();
+     } else {
+         alternateWeight = 1;
+     }
+     final int weight = scripts.cardinality() + alternateWeight;
+     // A single script, or one of the allowed Latin + CJK combinations.
+     if (weight < 2 || containsWithAlternates(JAPANESE, scripts) || containsWithAlternates(CHINESE, scripts)
+             || containsWithAlternates(KOREAN, scripts)) {
+         return RestrictionLevel.HIGHLY_RESTRICTIVE;
+     }
+     // Latin plus exactly one other script that is not Cyrillic, Greek or Cherokee.
+     final boolean latinPlusOne = weight == 2 && scripts.get(UScript.LATIN)
+             && !scripts.intersects(CONFUSABLE_WITH_LATIN);
+     return latinPlusOne ? RestrictionLevel.MODERATELY_RESTRICTIVE : RestrictionLevel.MINIMALLY_RESTRICTIVE;
+ }
+
+ /**
+  * Produce a debugging string: identifier, profile pattern, restriction level, required scripts, alternates
+  * and numerics, separated by ", ".
+  *
+  * @return the display form; the restriction level is shown as "null" when no identifier has been set yet
+  */
+ @Override
+ public String toString() {
+     // getRestrictionLevel() dereferences the identifier, so calling it on a fresh instance would make
+     // toString() throw; skip it until an identifier has been set.
+     final RestrictionLevel level = identifier == null ? null : getRestrictionLevel();
+     return identifier + ", " + identifierProfile.toPattern(false) + ", " + level + ", "
+             + displayScripts(requiredScripts) + ", " + displayAlternates(scriptSetSet) + ", "
+             + numerics.toPattern(false);
+ }
+
+ /**
+  * Test whether {@code container} covers {@code containee} and also shares at least one script with every
+  * alternate script set of the current identifier.
+  *
+  * @param container the candidate script combination
+  * @param containee the scripts that must all be in the container
+  * @return true if both conditions hold
+  */
+ private boolean containsWithAlternates(BitSet container, BitSet containee) {
+     boolean satisfied = contains(container, containee);
+     final Iterator<BitSet> it = scriptSetSet.iterator();
+     // Stops at the first alternate that has no script in the container.
+     while (satisfied && it.hasNext()) {
+         satisfied = container.intersects(it.next());
+     }
+     return satisfied;
+ }
+
+ /**
+  * Produce a readable string of alternates: each script set is formatted with displayScripts, and the
+  * results are joined with "; " in the iteration order of the collection.
+  *
+  * @param alternates the script sets to display
+  * @return display form
+  * @internal
+  */
+ public static String displayAlternates(Collection<BitSet> alternates) {
+     final StringBuilder text = new StringBuilder();
+     final Iterator<BitSet> it = alternates.iterator();
+     while (it.hasNext()) {
+         final BitSet scripts = it.next();
+         if (text.length() > 0) {
+             text.append("; ");
+         }
+         text.append(displayScripts(scripts));
+     }
+     return text.toString();
+ }
+
+ /**
+  * Produce a readable string of a set of scripts: the short name of each set bit, in ascending order of
+  * script code, separated by single spaces.
+  *
+  * @param scripts BitSet of UScript values
+  * @return display form
+  * @internal
+  */
+ public static String displayScripts(BitSet scripts) {
+     final StringBuilder names = new StringBuilder();
+     int script = scripts.nextSetBit(0);
+     while (script >= 0) {
+         if (names.length() > 0) {
+             names.append(' ');
+         }
+         names.append(UScript.getShortName(script));
+         script = scripts.nextSetBit(script + 1);
+     }
+     return names.toString();
+ }
+
+ /**
+  * Parse a list of scripts into a bitset. Names are separated by whitespace, optionally preceded by a comma;
+  * empty items are skipped.
+  * NOTE(review): the value from UScript.getCodeFromName is passed straight to BitSet.set; confirm what it
+  * returns for an unrecognized name.
+  *
+  * @param scriptsString the list of script names
+  * @return BitSet of UScript values.
+  * @internal
+  */
+ public static BitSet parseScripts(String scriptsString) {
+     final BitSet codes = new BitSet();
+     final String[] names = scriptsString.trim().split(",?\\s+");
+     for (int i = 0; i < names.length; i++) {
+         if (names[i].isEmpty()) {
+             continue;
+         }
+         codes.set(UScript.getCodeFromName(names[i]));
+     }
+     return codes;
+ }
+
+ /**
+  * Parse a list of alternates into a set of sets of UScript values. Alternates are separated by semicolons
+  * (surrounding whitespace is ignored) and each one is parsed with parseScripts; empty items are skipped.
+  *
+  * @param scriptsSetString the list of alternates
+  * @return the parsed script sets
+  * @internal
+  */
+ public static Set<BitSet> parseAlternates(String scriptsSetString) {
+     final Set<BitSet> parsed = new HashSet<BitSet>();
+     final String[] groups = scriptsSetString.trim().split("\\s*;\\s*");
+     for (int i = 0; i < groups.length; i++) {
+         if (groups[i].isEmpty()) {
+             continue;
+         }
+         parsed.add(parseScripts(groups[i]));
+     }
+     return parsed;
+ }
+
+ /**
+  * Test containment: whether every bit set in {@code containee} is also set in {@code container}.
+  * Should be a method on BitSet...
+  *
+  * @param container the set that may hold the other
+  * @param containee the set whose bits are looked up in the container
+  * @return true if the container has every bit of the containee (always true for an empty containee)
+  * @internal
+  */
+ public static final boolean contains(BitSet container, BitSet containee) {
+     int bit = containee.nextSetBit(0);
+     while (bit >= 0) {
+         if (!container.get(bit)) {
+             return false;
+         }
+         bit = containee.nextSetBit(bit + 1);
+     }
+     return true;
+ }
+
+ /**
+  * Sets a number of values at once. Should be on BitSet.
+  *
+  * @param bitset the set to modify
+  * @param values the bit indexes to set
+  * @return the same bitset instance that was passed in
+  * @internal
+  */
+ public static final BitSet set(BitSet bitset, int... values) {
+     for (int i = 0; i < values.length; i++) {
+         bitset.set(values[i]);
+     }
+     return bitset;
+ }
+
+ // public static final class FreezableBitSet extends BitSet implements Freezable<FreezableBitSet> {
+ // private boolean frozen;
+ //
+ // public FreezableBitSet() {
+ // super();
+ // }
+ // public FreezableBitSet(int nbits) {
+ // super(nbits);
+ // }
+ // /* (non-Javadoc)
+ // * @see java.util.BitSet#and(java.util.BitSet)
+ // */
+ // @Override
+ // public void and(BitSet set) {
+ // if (frozen) {
+ // throw new UnsupportedOperationException();
+ // }
+ // super.and(set);
+ // }
+ // /* (non-Javadoc)
+ // * @see java.util.BitSet#andNot(java.util.BitSet)
+ // */
+ // @Override
+ // public void andNot(BitSet set) {
+ // if (frozen) {
+ // throw new UnsupportedOperationException();
+ // }
+ // super.andNot(set);
+ // }
+ // /* (non-Javadoc)
+ // * @see java.util.BitSet#cardinality()
+ // */
+ //
+ // @Override
+ // public void clear() {
+ // if (frozen) {
+ // throw new UnsupportedOperationException();
+ // }
+ // super.clear();
+ // }
+ // /* (non-Javadoc)
+ // * @see java.util.BitSet#clear(int)
+ // */
+ // @Override
+ // public void clear(int bitIndex) {
+ // if (frozen) {
+ // throw new UnsupportedOperationException();
+ // }
+ // super.clear(bitIndex);
+ // }
+ // /* (non-Javadoc)
+ // * @see java.util.BitSet#clear(int, int)
+ // */
+ // @Override
+ // public void clear(int fromIndex, int toIndex) {
+ // if (frozen) {
+ // throw new UnsupportedOperationException();
+ // }
+ // super.clear(fromIndex, toIndex);
+ // }
+ // /* (non-Javadoc)
+ // * @see java.util.BitSet#clone()
+ // */
+ // @Override
+ // public Object clone() {
+ // return super.clone();
+ // }
+ // /* (non-Javadoc)
+ // * @see java.util.BitSet#equals(java.lang.Object)
+ // */
+ // @Override
+ // public boolean equals(Object obj) {
+ // if (obj == null || obj.getClass() != FreezableBitSet.class) {
+ // return false;
+ // }
+ // return super.equals((BitSet)obj);
+ // }
+ //
+ // /* (non-Javadoc)
+ // * @see java.util.BitSet#flip(int)
+ // */
+ // @Override
+ // public void flip(int bitIndex) {
+ // if (frozen) {
+ // throw new UnsupportedOperationException();
+ // }
+ // super.flip(bitIndex);
+ // }
+ // /* (non-Javadoc)
+ // * @see java.util.BitSet#flip(int, int)
+ // */
+ // @Override
+ // public void flip(int fromIndex, int toIndex) {
+ // if (frozen) {
+ // throw new UnsupportedOperationException();
+ // }
+ // super.flip(fromIndex, toIndex);
+ // }
+ // /* (non-Javadoc)
+ // * @see java.util.BitSet#or(java.util.BitSet)
+ // */
+ // @Override
+ // public void or(BitSet set) {
+ // if (frozen) {
+ // throw new UnsupportedOperationException();
+ // }
+ // super.or(set);
+ // }
+ // /* (non-Javadoc)
+ // * @see java.util.BitSet#set(int)
+ // */
+ // @Override
+ // public void set(int bitIndex) {
+ // if (frozen) {
+ // throw new UnsupportedOperationException();
+ // }
+ // super.set(bitIndex);
+ // }
+ // /* (non-Javadoc)
+ // * @see java.util.BitSet#set(int, boolean)
+ // */
+ // @Override
+ // public void set(int bitIndex, boolean value) {
+ // if (frozen) {
+ // throw new UnsupportedOperationException();
+ // }
+ // super.set(bitIndex, value);
+ // }
+ // /* (non-Javadoc)
+ // * @see java.util.BitSet#set(int, int)
+ // */
+ // @Override
+ // public void set(int fromIndex, int toIndex) {
+ // if (frozen) {
+ // throw new UnsupportedOperationException();
+ // }
+ // super.set(fromIndex, toIndex);
+ // }
+ // /* (non-Javadoc)
+ // * @see java.util.BitSet#set(int, int, boolean)
+ // */
+ // @Override
+ // public void set(int fromIndex, int toIndex, boolean value) {
+ // if (frozen) {
+ // throw new UnsupportedOperationException();
+ // }
+ // super.set(fromIndex, toIndex, value);
+ // }
+ // /* (non-Javadoc)
+ // * @see java.util.BitSet#xor(java.util.BitSet)
+ // */
+ // @Override
+ // public void xor(BitSet set) {
+ // if (frozen) {
+ // throw new UnsupportedOperationException();
+ // }
+ // super.xor(set);
+ // }
+ // /* (non-Javadoc)
+ // * @see com.ibm.icu.util.Freezable#isFrozen()
+ // */
+ // public boolean isFrozen() {
+ // return frozen;
+ // }
+ // /* (non-Javadoc)
+ // * @see com.ibm.icu.util.Freezable#freeze()
+ // */
+ // public FreezableBitSet freeze() {
+ // frozen = true;
+ // return this;
+ // }
+ // /* (non-Javadoc)
+ // * @see com.ibm.icu.util.Freezable#cloneAsThawed()
+ // */
+ // public FreezableBitSet cloneAsThawed() {
+ // FreezableBitSet result = new FreezableBitSet(size());
+ // result.or(this);
+ // return result;
+ // }
+ // }
+}
import com.ibm.icu.lang.UCharacterCategory;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
+import com.ibm.icu.text.IdentifierInfo.RestrictionLevel;
import com.ibm.icu.util.ULocale;
/**
* @stable ICU 4.6
*/
public static final int CHAR_LIMIT = 64;
+
+ /**
+ * Check that an identifier is no looser than the specified RestrictionLevel.
+ *
+ * @internal
+ */
+ public static final int RESTRICTION_LEVEL = 128;
+
+ /**
+ * Check that an identifier does not mix digits from more than one decimal number system (for example,
+ * ASCII digits together with Devanagari digits).
+ *
+ * @internal
+ */
+ public static final int MIXED_NUMBERS = 256;
/**
* Enable all spoof checks.
*
* @stable ICU 4.6
*/
- public static final int ALL_CHECKS = 0x7f;
+ public static final int ALL_CHECKS = 0xFFFFFFFF;
// Magic number for sanity checking spoof binary resource data.
static final int MAGIC = 0x3845fdef;
UnicodeSet fAllowedCharsSet; // The UnicodeSet of allowed characters.
// for this Spoof Checker. Defaults to all chars.
Set<ULocale> fAllowedLocales; // The list of allowed locales.
+ private RestrictionLevel restrictionLevel;
/**
* Constructor: Create a default Unicode Spoof Checker Builder, configured to perform all checks except for
fSpoofData = null;
fAllowedCharsSet = new UnicodeSet(0, 0x10ffff);
fAllowedLocales = new LinkedHashSet<ULocale>();
+ restrictionLevel = RestrictionLevel.MINIMALLY_RESTRICTIVE;
}
/**
fAllowedCharsSet = src.fAllowedCharsSet.cloneAsThawed();
fAllowedLocales = new LinkedHashSet<ULocale>();
fAllowedLocales.addAll(src.fAllowedLocales);
+ restrictionLevel = src.restrictionLevel;
}
/**
result.fAllowedCharsSet = (UnicodeSet) (this.fAllowedCharsSet.clone());
result.fAllowedCharsSet.freeze();
result.fAllowedLocales = this.fAllowedLocales;
+ result.restrictionLevel = this.restrictionLevel;
return result;
}
/**
- * Specify the source form of the spoof data Spoof Checker. The Three inputs correspond to the Unicode data
+ * Specify the source form of the spoof data Spoof Checker. The inputs correspond to the Unicode data
* files confusables.txt and confusablesWholeScript.txt as described in Unicode UAX 39. The syntax of the source
* data is as described in UAX 39 for these files, and the content of these files is acceptable input.
*
fChecks |= CHAR_LIMIT;
return this;
}
+
+ /**
+ * Set the loosest restriction level allowed. The level is only consulted when the RESTRICTION_LEVEL check
+ * is enabled; an identifier whose own restriction level is looser than this one then fails that check.
+ *
+ * @param restrictionLevel The loosest restriction level allowed.
+ * @return self
+ * @internal
+ */
+ public Builder setRestrictionLevel(RestrictionLevel restrictionLevel) {
+ this.restrictionLevel = restrictionLevel;
+ return this;
+ }
// Structure for the Whole Script Confusable Data
// See Unicode UAX-39, Unicode Security Mechanisms, for a description of the
// haven't done it yet.
int scriptCount = -1;
+ // Allocate an identifier info if needed.
+ // Note: we may want to allocate one per SpoofChecker and synchronize
+
+ IdentifierInfo identifierInfo = null;
+ if (0 != ((this.fChecks) & (RESTRICTION_LEVEL | MIXED_NUMBERS))) {
+ identifierInfo = new IdentifierInfo().setIdentifier(text);
+ }
+
+ if (0 != ((this.fChecks) & RESTRICTION_LEVEL)) {
+ RestrictionLevel textRestrictionLevel = identifierInfo.getRestrictionLevel();
+ if (textRestrictionLevel.compareTo(restrictionLevel) > 0) {
+ result |= RESTRICTION_LEVEL;
+ }
+ }
+
+ if (0 != ((this.fChecks) & MIXED_NUMBERS)) {
+ UnicodeSet numerics = identifierInfo.getNumerics();
+ if (numerics.size() > 1) {
+ result |= MIXED_NUMBERS;
+ }
+ }
+
if (0 != ((this.fChecks) & SINGLE_SCRIPT)) {
scriptCount = this.scriptScan(text, checkResult);
// no need to set failPos, it will be set to checkResult.position inside this.scriptScan
private SpoofData fSpoofData;
private Set<ULocale> fAllowedLocales; // The Set of allowed locales.
private UnicodeSet fAllowedCharsSet; // The UnicodeSet of allowed characters.
+ private RestrictionLevel restrictionLevel;
// for this Spoof Checker. Defaults to all chars.
//
/*
*******************************************************************************
- * Copyright (C) 2009-2011, International Business Machines Corporation and *
+ * Copyright (C) 2009-2012, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
import java.io.IOException;
import java.io.Reader;
import java.text.ParseException;
+import java.util.Arrays;
+import java.util.BitSet;
+import java.util.HashSet;
import java.util.LinkedHashSet;
+import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.ibm.icu.dev.test.TestUtil;
import com.ibm.icu.dev.test.TestUtil.JavaVendor;
import com.ibm.icu.impl.Utility;
+import com.ibm.icu.lang.UScript;
+import com.ibm.icu.text.IdentifierInfo;
+import com.ibm.icu.text.IdentifierInfo.RestrictionLevel;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.SpoofChecker;
+import com.ibm.icu.text.SpoofChecker.CheckResult;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;
* don't want to see in this test.
*/
sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).setAllowedLocales(allowedLocales).build();
-
+
SpoofChecker.CheckResult result = new SpoofChecker.CheckResult();
checkResults = sc.failsChecks(goodLatin);
assertFalse("", checkResults);
assertTrue("", checkResults);
assertEquals("", SpoofChecker.MIXED_SCRIPT_CONFUSABLE | SpoofChecker.SINGLE_SCRIPT, result.checks);
assertEquals("", 2, result.position);
-
+
result.position = 666;
checkResults = sc.failsChecks(han_Hiragana, result);
assertFalse("", checkResults);
public void TestSpoofAPI() {
SpoofChecker sc = new SpoofChecker.Builder().build();
String s = "xyz"; // Many latin ranges are whole-script confusable with other scripts.
- // If this test starts failing, consult confusablesWholeScript.txt
+ // If this test starts failing, consult confusablesWholeScript.txt
SpoofChecker.CheckResult result = new SpoofChecker.CheckResult();
result.position = 666;
boolean checkResults = sc.failsChecks(s, result);
SpoofChecker sc = new SpoofChecker.Builder().build();
checkSkeleton(sc, "TestSkeleton");
}
-
+
// testSkeleton. Spot check a number of confusable skeleton substitutions from the
// Unicode data file confusables.txt
// Test cases chosen for substitutions of various lengths, and
+ " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations."
+ " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations."
+ " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations.",
- " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
- + " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
- + " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
- + " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations.",
- testName);
+ " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
+ + " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
+ + " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
+ + " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations.",
+ testName);
checkSkeleton(sc, SL, "nochange", "nochange", testName);
checkSkeleton(sc, MA, "love", "love", testName);
assertEquals("", 7, result.position);
}
+ /**
+ * For each sample string, checks the level reported by IdentifierInfo.getRestrictionLevel(), then builds a
+ * SpoofChecker for every RestrictionLevel and checks that the RESTRICTION_LEVEL check fails exactly when
+ * the string's expected level is looser than the level the checker allows.
+ */
+ public void TestRestrictionLevel() {
+ // Each row: { identifier, expected restriction level }
+ Object[][] tests = {
+ {"a", RestrictionLevel.ASCII},
+ {"γ", RestrictionLevel.HIGHLY_RESTRICTIVE},
+ {"aアー", RestrictionLevel.HIGHLY_RESTRICTIVE},
+ {"aऄ", RestrictionLevel.MODERATELY_RESTRICTIVE},
+ {"aγ", RestrictionLevel.MINIMALLY_RESTRICTIVE},
+ };
+ IdentifierInfo idInfo = new IdentifierInfo();
+ CheckResult checkResult = new CheckResult();
+ for (Object[] test : tests) {
+ String testString = (String) test[0];
+ RestrictionLevel expectedLevel = (RestrictionLevel) test[1];
+ idInfo.setIdentifier(testString);
+ assertEquals("Testing restriction level for '" + testString + "'", expectedLevel, idInfo.getRestrictionLevel());
+ for (RestrictionLevel testLevel : RestrictionLevel.values()) {
+ SpoofChecker sc = new SpoofChecker.Builder()
+ .setChecks(SpoofChecker.RESTRICTION_LEVEL) // only check this
+ .setRestrictionLevel(testLevel)
+ .build();
+ boolean actualValue = sc.failsChecks(testString, checkResult);
+
+ // we want to fail if the text is (say) MODERATE and the testLevel is ASCII
+ boolean expectedFailure = expectedLevel.compareTo(testLevel) > 0;
+ // 't' is only read by the commented-out debugging block below.
+ boolean t = assertEquals("Testing spoof restriction level for '" + testString + "', " + testLevel, expectedFailure, actualValue);
+// if (!t) { // debugging
+// actualValue = sc.failsChecks(testString, checkResult);
+// // we want to fail if the text is (say) MODERATE and the testLevel is ASCII
+// expectedFailure = expectedLevel.compareTo(testLevel) > 0;
+// }
+ }
+ }
+ }
+
+ /**
+ * For each sample string, checks the zero digits reported by IdentifierInfo.getNumerics(), then checks that
+ * a SpoofChecker with only the MIXED_NUMBERS check enabled fails exactly when more than one zero digit
+ * (that is, more than one decimal number system) is expected.
+ */
+ public void TestMixedNumbers() {
+ // Each row: { identifier, UnicodeSet pattern of the expected zero digits }
+ Object[][] tests = {
+ {"1", "[0]"},
+ {"१", "[०]"},
+ {"1१", "[0०]"},
+ {"١۱", "[٠۰]"},
+ };
+ IdentifierInfo idInfo = new IdentifierInfo();
+ CheckResult checkResult = new CheckResult();
+ for (Object[] test : tests) {
+ String testString = (String) test[0];
+ UnicodeSet expected = new UnicodeSet((String)test[1]);
+ idInfo.setIdentifier(testString);
+ assertEquals("", expected, idInfo.getNumerics());
+
+ SpoofChecker sc = new SpoofChecker.Builder()
+ .setChecks(SpoofChecker.MIXED_NUMBERS) // only check this
+ .build();
+ boolean actualValue = sc.failsChecks(testString, checkResult);
+ // NOTE(review): 't' is never read in this method.
+ boolean t = assertEquals("Testing spoof mixed numbers for '" + testString + "', ", expected.size() > 1, actualValue);
+ }
+ }
+
+ /**
+ * Exercises the static helpers of IdentifierInfo (set, contains, displayScripts, displayAlternates,
+ * parseScripts, parseAlternates) and the getters after setIdentifier.
+ */
+ public void TestIdentifierInfo() {
+// contains(BitSet, BitSet)
+ BitSet bitset12 = IdentifierInfo.set(new BitSet(), UScript.LATIN, UScript.HANGUL);
+ BitSet bitset2 = IdentifierInfo.set(new BitSet(), UScript.HANGUL);
+ assertTrue("", IdentifierInfo.contains(bitset12, bitset2));
+ assertTrue("", IdentifierInfo.contains(bitset12, bitset12));
+ assertTrue("", !IdentifierInfo.contains(bitset2, bitset12));
+
+// displayAlternates(Collection<BitSet>)
+// displayScripts(BitSet)
+ String scriptString = IdentifierInfo.displayScripts(bitset12);
+ assertEquals("", "Hang Latn", scriptString);
+ // NOTE(review): raw HashSet; Set<BitSet> is the declared type.
+ Set<BitSet> alternates = new HashSet(Arrays.asList(bitset12, bitset2));
+ String alternatesString = IdentifierInfo.displayAlternates(alternates);
+ // NOTE(review): displayAlternates follows the iteration order of the collection, so this expected string
+ // depends on the order in which the HashSet happens to iterate its two elements.
+ assertEquals("", "Hang Latn; Hang", alternatesString);
+
+// parseAlternates(String)
+// parseScripts(String)
+ assertEquals("", bitset12, IdentifierInfo.parseScripts(scriptString));
+ assertEquals("", alternates, IdentifierInfo.parseAlternates(alternatesString));
+
+ IdentifierInfo idInfo = new IdentifierInfo();
+ String manyAlternates = "aアー〼1१١۱";
+ idInfo.setIdentifier(manyAlternates);
+ assertEquals("", manyAlternates, idInfo.getIdentifier());
+
+ // NOTE(review): the null expectations below look like placeholders. Each of these getters returns a
+ // freshly created copy or an enum constant, never null, so real expected values still need to be filled in.
+ assertEquals("", null, idInfo.getScripts());
+ assertEquals("", null, idInfo.getAlternates());
+ assertEquals("", null, idInfo.getCommonAmongAlternates());
+ assertEquals("", null, idInfo.getNumerics());
+ assertEquals("", null, idInfo.getRestrictionLevel());
+
+// TODO
+// getIdentifierProfile()
+// setIdentifierProfile(UnicodeSet)
+ }
+
private String parseHex(String in) {
StringBuilder sb = new StringBuilder();
for (String oneCharAsHexString : in.split("\\s+")) {
Matcher parseLine = Pattern.compile(
"\\ufeff?" + "(?:([0-9A-F\\s]+);([0-9A-F\\s]+);\\s*(SL|ML|SA|MA)\\s*(?:#.*?)?$)"
+ "|\\ufeff?(\\s*(?:#.*)?)"). // Comment line
- matcher("");
+ matcher("");
Normalizer2 normalizer = Normalizer2.getNFDInstance();
int lineNum = 0;
String inputLine;