]> granicus.if.org Git - icu/commitdiff
ICU-7645 First cut at spoof detection changed. All marked @internal for now.
authorMark Davis <mark@macchiato.com>
Fri, 30 Nov 2012 17:51:08 +0000 (17:51 +0000)
committerMark Davis <mark@macchiato.com>
Fri, 30 Nov 2012 17:51:08 +0000 (17:51 +0000)
X-SVN-Rev: 32910

icu4j/main/classes/core/src/com/ibm/icu/text/IdentifierInfo.java [new file with mode: 0644]
icu4j/main/classes/core/src/com/ibm/icu/text/SpoofChecker.java
icu4j/main/tests/core/src/com/ibm/icu/dev/test/text/SpoofCheckerTest.java

diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/IdentifierInfo.java b/icu4j/main/classes/core/src/com/ibm/icu/text/IdentifierInfo.java
new file mode 100644 (file)
index 0000000..0e1d312
--- /dev/null
@@ -0,0 +1,653 @@
+/*
+ ***************************************************************************
+ * Copyright (C) 2008-2012, Google, International Business Machines Corporation
+ * and others. All Rights Reserved.
+ ***************************************************************************
+ */
+package com.ibm.icu.text;
+
+import java.util.BitSet;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Set;
+
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.lang.UCharacterCategory;
+import com.ibm.icu.lang.UScript;
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.util.Freezable;
+
+/**
+ * This class analyzes a possible identifier for script and identifier status. Use it by calling setIdentifierProfile
+ * then setIdentifier. At this point:
+ * <ol>
+ * <li>call getScripts for the specific scripts in the identifier. The identifier contains at least one character in
+ * each of these.
+ * <li>call getAlternates to get cases where a character is not limited to a single script. For example, it could be
+ * either Katakana or Hiragana.
+ * <li>call getCommonAmongAlternates to find out if any scripts are common to all the alternates.
+ * <li>call getNumerics to get a representative character (with value zero) for each of the decimal number systems in
+ * the identifier.
+ * <li>call getRestrictionLevel to see what the UTS36 restriction level is. (This has some proposed changes from the
+ * current one, however.)
+ * </ol>
+ * 
+ * @author markdavis
+ * @internal
+ */
+public class IdentifierInfo {
+
+    /**
+     * The restriction levels an identifier can be classified into. The constants are declared from the
+     * tightest level ({@code ASCII}) to the loosest ({@code UNRESTRICTIVE}), so the natural enum ordering
+     * can be used to compare levels.
+     *
+     * @internal
+     */
+    public enum RestrictionLevel {
+        /**
+         * Only ASCII characters: U+0000..U+007F
+         *
+         * @internal
+         */
+        ASCII,
+        /**
+         * All characters in each identifier must be from a single script, or from the combinations: Latin + Han +
+         * Hiragana + Katakana; Latin + Han + Bopomofo; or Latin + Han + Hangul. Note that this level will satisfy the
+         * vast majority of Latin-script users; also that TR36 has ASCII instead of Latin.
+         *
+         * @internal
+         */
+        HIGHLY_RESTRICTIVE,
+        /**
+         * Allow Latin with other scripts except Cyrillic, Greek and Cherokee. Otherwise, the same as Highly
+         * Restrictive.
+         *
+         * @internal
+         */
+        MODERATELY_RESTRICTIVE,
+        /**
+         * Allow arbitrary mixtures of scripts, such as Ωmega, Teχ, HλLF-LIFE, Toys-Я-Us. Otherwise, the same as
+         * Moderately Restrictive.
+         *
+         * @internal
+         */
+        MINIMALLY_RESTRICTIVE,
+        /**
+         * Any valid identifiers, including characters outside of the Identifier Profile, such as I♥NY.org
+         *
+         * @internal
+         */
+        UNRESTRICTIVE
+    }
+
+    // Frozen set of the code points U+0000..U+007F, used for the ASCII restriction level test.
+    private static final UnicodeSet ASCII = new UnicodeSet(0, 0x7F).freeze();
+
+    // The string most recently passed to setIdentifier; null until setIdentifier has been called.
+    private String identifier;
+    // Scripts of the characters that belong to exactly one script (Common and Inherited are not recorded).
+    private final BitSet requiredScripts = new BitSet();
+    // The "alternates": one script set per character that could belong to any of several scripts.
+    private final Set<BitSet> scriptSetSet = new HashSet<BitSet>();
+    // Intersection of the alternate script sets, filled in by setIdentifier.
+    private final BitSet commonAmongAlternates = new BitSet();
+    // One representative (the zero digit) for each decimal number system seen in the identifier.
+    private final UnicodeSet numerics = new UnicodeSet();
+    // The characters allowed in identifiers; initialised to all code points U+0000..U+10FFFF.
+    private final UnicodeSet identifierProfile = new UnicodeSet(0, 0x10FFFF);
+
+    /**
+     * Empties the four result collections (numerics, common scripts, alternates and required scripts).
+     * The stored identifier and the identifier profile are left untouched.
+     *
+     * @return this object, to allow chaining
+     */
+    private IdentifierInfo clear() {
+        numerics.clear();
+        commonAmongAlternates.clear();
+        scriptSetSet.clear();
+        requiredScripts.clear();
+        return this;
+    }
+
+    /**
+     * Set the identifier profile, that is, the set of characters that are allowed in an identifier.
+     * The contents of the argument are copied into this object's own set.
+     *
+     * @param identifierProfile
+     *            the set of allowed characters
+     * @return this object, to allow chaining
+     * @internal
+     */
+    public IdentifierInfo setIdentifierProfile(UnicodeSet identifierProfile) {
+        // Store the caller's profile. Assigning numerics to itself here would silently ignore the
+        // argument and leave the profile at its default of "all code points".
+        this.identifierProfile.set(identifierProfile);
+        return this;
+    }
+
+    /**
+     * Get the identifier profile.
+     *
+     * @return a new UnicodeSet constructed from the current profile
+     * @internal
+     */
+    public UnicodeSet getIdentifierProfile() {
+        final UnicodeSet copy = new UnicodeSet(identifierProfile);
+        return copy;
+    }
+
+    /**
+     * Set an identifier to analyse. Any results from a previously analysed identifier are discarded, then
+     * the required scripts, the alternates, the scripts common to the alternates and the numerics are
+     * recomputed for the new identifier.
+     *
+     * @param identifier
+     *            the string to analyse
+     * @return the identifier info.
+     * @throws NullPointerException
+     *             if identifier is null
+     * @internal
+     */
+    public IdentifierInfo setIdentifier(String identifier) {
+        this.identifier = identifier;
+        clear();
+        BitSet temp = new BitSet(); // Will reuse this.
+        int cp;
+        // Step by the width of the code point just read (1 or 2 chars). Passing the index i to
+        // charCount would make supplementary characters be visited a second time as a lone surrogate.
+        for (int i = 0; i < identifier.length(); i += Character.charCount(cp)) {
+            cp = Character.codePointAt(identifier, i);
+            // Store a representative character for each kind of decimal digit
+            if (UCharacter.getType(cp) == UCharacterCategory.DECIMAL_DIGIT_NUMBER) {
+                // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value
+                numerics.add(cp - UCharacter.getNumericValue(cp));
+            }
+            UScript.getScriptExtensions(cp, temp);
+            temp.clear(UScript.COMMON);
+            temp.clear(UScript.INHERITED);
+            final int scriptCount = temp.cardinality();
+            if (scriptCount == 1) {
+                // Single script, record it.
+                requiredScripts.or(temp);
+            } else if (scriptCount > 1 && !requiredScripts.intersects(temp) && scriptSetSet.add(temp)) {
+                // If the set hasn't been added already, add it and create new temporary for the next pass,
+                // so we don't rewrite what's already in the set.
+                temp = new BitSet();
+            }
+            // scriptCount == 0 (only Common/Inherited): nothing to record. An empty set must not become
+            // an alternate, because every other alternate "contains" it and would be discarded below.
+        }
+        // Now make a final pass through to remove alternates that came before singles.
+        // [Kana], [Kana Hira] => [Kana]
+        // This is relatively infrequent, so doesn't have to be optimized.
+        for (Iterator<BitSet> it = scriptSetSet.iterator(); it.hasNext();) {
+            final BitSet next = it.next();
+            if (requiredScripts.intersects(next)) {
+                it.remove();
+            } else {
+                // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]]
+                for (BitSet other : scriptSetSet) {
+                    if (next != other && contains(next, other)) {
+                        it.remove();
+                        break;
+                    }
+                }
+            }
+        }
+        // Intersect only the alternates that survived the pass above, so that a removed alternate
+        // cannot leave scripts behind. With no alternates left the result stays empty.
+        commonAmongAlternates.clear();
+        if (!scriptSetSet.isEmpty()) {
+            commonAmongAlternates.set(0, UScript.CODE_LIMIT);
+            for (BitSet alternate : scriptSetSet) {
+                commonAmongAlternates.and(alternate);
+            }
+        }
+        return this;
+    }
+
+    // Bit set with exactly the UScript.COMMON and UScript.INHERITED bits set.
+    static final BitSet COMMON_AND_INHERITED = set(new BitSet(), UScript.COMMON, UScript.INHERITED);
+
+//    /**
+//     * Test whether an identifier has multiple scripts
+//     * 
+//     * @param identifier
+//     * @return true if it does
+//     */
+//    public static boolean isMultiScript(String identifier) {
+//        // Non-optimized code, for simplicity
+//        Set<BitSet> setOfScriptSets = new HashSet<BitSet>();
+//        BitSet temp = new BitSet();
+//        int cp;
+//        for (int i = 0; i < identifier.length(); i += Character.charCount(i)) {
+//            cp = Character.codePointAt(identifier, i);
+//            UScript.getScriptExtensions(cp, temp);
+//            if (temp.cardinality() == 0) {
+//                // HACK for older version of ICU
+//                final int script = UScript.getScript(cp);
+//                temp.set(script);
+//            }
+//            temp.andNot(COMMON_AND_INHERITED);
+//            if (temp.cardinality() != 0 && setOfScriptSets.add(temp)) {
+//                // If the set hasn't been added already, add it and create new temporary for the next pass,
+//                // so we don't rewrite what's already in the set.
+//                temp = new BitSet();
+//            }
+//        }
+//        if (setOfScriptSets.size() == 0) {
+//            return true; // trivially true
+//        }
+//        temp.clear();
+//        // check to see that there is at least one script common to all the sets
+//        boolean first = true;
+//        for (BitSet other : setOfScriptSets) {
+//            if (first) {
+//                temp.or(other);
+//                first = false;
+//            } else {
+//                temp.and(other);
+//            }
+//        }
+//        return temp.cardinality() != 0;
+//    }
+//
+//    /**
+//     * Test whether an identifier has mixed number systems.
+//     * 
+//     * @param identifier
+//     * @return true if mixed
+//     */
+//    public static boolean hasMixedNumberSystems(String identifier) {
+//        int cp;
+//        UnicodeSet numerics = new UnicodeSet();
+//        for (int i = 0; i < identifier.length(); i += Character.charCount(i)) {
+//            cp = Character.codePointAt(identifier, i);
+//            // Store a representative character for each kind of decimal digit
+//            switch (UCharacter.getType(cp)) {
+//            case UCharacterCategory.DECIMAL_DIGIT_NUMBER:
+//                // Just store the zero character as a representative for comparison.
+//                // Unicode guarantees it is cp - value
+//                numerics.add(cp - UCharacter.getNumericValue(cp));
+//                break;
+//            case UCharacterCategory.OTHER_NUMBER:
+//            case UCharacterCategory.LETTER_NUMBER:
+//                throw new IllegalArgumentException("Should not be in identifiers.");
+//            }
+//        }
+//        return numerics.size() > 1;
+//    }
+
+    /**
+     * Get the identifier that was analysed.
+     *
+     * @return the string last passed to setIdentifier, or null if none has been set
+     * @internal
+     */
+    public String getIdentifier() {
+        return this.identifier;
+    }
+
+    /**
+     * Get the scripts found in the identifier.
+     *
+     * @return a clone of the set of explicit scripts; changing it does not affect this object.
+     * @internal
+     */
+    public BitSet getScripts() {
+        final Object copy = requiredScripts.clone();
+        return (BitSet) copy;
+    }
+
+    /**
+     * Get the set of alternate scripts found in the identifier. That is, when a character can be in two
+     * scripts, then the set consisting of those scripts will be returned.
+     *
+     * @return a new set holding a clone of each alternate script set.
+     * @internal
+     */
+    public Set<BitSet> getAlternates() {
+        final Set<BitSet> copies = new HashSet<BitSet>();
+        final Iterator<BitSet> it = scriptSetSet.iterator();
+        while (it.hasNext()) {
+            copies.add((BitSet) it.next().clone());
+        }
+        return copies;
+    }
+
+    /**
+     * Get the representative characters (zeros) for the numerics found in the identifier.
+     *
+     * @return a new UnicodeSet constructed from the recorded zero digits.
+     * @internal
+     */
+    public UnicodeSet getNumerics() {
+        final UnicodeSet copy = new UnicodeSet(numerics);
+        return copy;
+    }
+
+    /**
+     * Find out which scripts are in common among the alternates.
+     *
+     * @return a clone of the set of scripts common to the alternates.
+     */
+    public BitSet getCommonAmongAlternates() {
+        final Object copy = commonAmongAlternates.clone();
+        return (BitSet) copy;
+    }
+
+    // BitSet doesn't support "contains(...)", so we have inverted constants
+    // They are private; they can't be made immutable in Java, so code in this class must not modify them.
+    // Latin + Han + Hiragana + Katakana
+    private final static BitSet JAPANESE = set(new BitSet(), UScript.LATIN, UScript.HAN, UScript.HIRAGANA,
+            UScript.KATAKANA);
+    // Latin + Han + Bopomofo
+    private final static BitSet CHINESE = set(new BitSet(), UScript.LATIN, UScript.HAN, UScript.BOPOMOFO);
+    // Latin + Han + Hangul
+    private final static BitSet KOREAN = set(new BitSet(), UScript.LATIN, UScript.HAN, UScript.HANGUL);
+    // Scripts that rule out the Moderately Restrictive level in getRestrictionLevel.
+    private final static BitSet CONFUSABLE_WITH_LATIN = set(new BitSet(), UScript.CYRILLIC, UScript.GREEK,
+            UScript.CHEROKEE);
+
+    /**
+     * Find the "tightest" restriction level that the identifier satisfies. The checks run from the
+     * loosest failure condition downwards: characters outside the profile or mixed number systems give
+     * UNRESTRICTIVE; an all-ASCII identifier gives ASCII; the remaining levels depend on the scripts and
+     * alternates recorded by setIdentifier.
+     *
+     * @return the restriction level.
+     * @internal
+     */
+    public RestrictionLevel getRestrictionLevel() {
+        final boolean outsideProfile = !identifierProfile.containsAll(identifier);
+        if (outsideProfile || getNumerics().size() > 1) {
+            return RestrictionLevel.UNRESTRICTIVE;
+        }
+        if (ASCII.containsAll(identifier)) {
+            return RestrictionLevel.ASCII;
+        }
+        final BitSet scripts = new BitSet();
+        scripts.or(requiredScripts);
+        scripts.clear(UScript.COMMON);
+        scripts.clear(UScript.INHERITED);
+        // The score is the number of explicit scripts, plus 1 when the alternates share at least one
+        // script (eg [Arab Thaa]; [Arab Syrc]), plus the number of alternates otherwise. Counting the
+        // alternates this way only works because the score is never compared against more than 2.
+        final int alternateWeight;
+        if (commonAmongAlternates.isEmpty()) {
+            alternateWeight = scriptSetSet.size();
+        } else {
+            alternateWeight = 1;
+        }
+        final int cardinalityPlus = scripts.cardinality() + alternateWeight;
+        if (cardinalityPlus < 2
+                || containsWithAlternates(JAPANESE, scripts)
+                || containsWithAlternates(CHINESE, scripts)
+                || containsWithAlternates(KOREAN, scripts)) {
+            return RestrictionLevel.HIGHLY_RESTRICTIVE;
+        }
+        final boolean latinPlusOneSafeScript = cardinalityPlus == 2 && scripts.get(UScript.LATIN)
+                && !scripts.intersects(CONFUSABLE_WITH_LATIN);
+        return latinPlusOneSafeScript
+                ? RestrictionLevel.MODERATELY_RESTRICTIVE
+                : RestrictionLevel.MINIMALLY_RESTRICTIVE;
+    }
+
+    /**
+     * Builds a comma-separated summary: identifier, profile pattern, restriction level, required
+     * scripts, alternates and numerics pattern, in that order.
+     */
+    @Override
+    public String toString() {
+        final StringBuilder text = new StringBuilder();
+        text.append(identifier).append(", ");
+        text.append(identifierProfile.toPattern(false)).append(", ");
+        text.append(getRestrictionLevel()).append(", ");
+        text.append(displayScripts(requiredScripts)).append(", ");
+        text.append(displayAlternates(scriptSetSet)).append(", ");
+        text.append(numerics.toPattern(false));
+        return text.toString();
+    }
+
+    /**
+     * Tests whether container holds every script in containee and also shares at least one script with
+     * each of the recorded alternates.
+     *
+     * @param container
+     *            the candidate superset of scripts
+     * @param containee
+     *            the scripts that must all be present in container
+     * @return true if both conditions hold
+     */
+    private boolean containsWithAlternates(BitSet container, BitSet containee) {
+        boolean satisfied = contains(container, containee);
+        final Iterator<BitSet> it = scriptSetSet.iterator();
+        while (satisfied && it.hasNext()) {
+            satisfied = container.intersects(it.next());
+        }
+        return satisfied;
+    }
+
+    /**
+     * Produce a readable string of alternates: the display form of each script set, joined by "; ".
+     *
+     * @param alternates
+     *            the script sets to display
+     * @return display form
+     * @internal
+     */
+    public static String displayAlternates(Collection<BitSet> alternates) {
+        final StringBuilder text = new StringBuilder();
+        for (Iterator<BitSet> it = alternates.iterator(); it.hasNext();) {
+            final BitSet alternate = it.next();
+            // The separator is only written once something has already been appended.
+            if (text.length() > 0) {
+                text.append("; ");
+            }
+            text.append(displayScripts(alternate));
+        }
+        return text.toString();
+    }
+
+    /**
+     * Produce a readable string of a set of scripts: the short name of each script whose bit is set,
+     * in ascending bit order, separated by single spaces.
+     *
+     * @param scripts
+     *            bit set of UScript values
+     * @return display form
+     * @internal
+     */
+    public static String displayScripts(BitSet scripts) {
+        final StringBuilder text = new StringBuilder();
+        int script = scripts.nextSetBit(0);
+        while (script >= 0) {
+            if (text.length() > 0) {
+                text.append(' ');
+            }
+            text.append(UScript.getShortName(script));
+            script = scripts.nextSetBit(script + 1);
+        }
+        return text.toString();
+    }
+
+    /**
+     * Parse a list of scripts into a bitset. The string is trimmed and then split on whitespace that
+     * may be preceded by a comma; empty items are skipped.
+     *
+     * @param scriptsString
+     *            the script names to parse
+     * @return BitSet of UScript values.
+     * @internal
+     */
+    public static BitSet parseScripts(String scriptsString) {
+        final BitSet codes = new BitSet();
+        final String[] names = scriptsString.trim().split(",?\\s+");
+        for (int i = 0; i < names.length; ++i) {
+            final String name = names[i];
+            if (name.length() == 0) {
+                continue;
+            }
+            codes.set(UScript.getCodeFromName(name));
+        }
+        return codes;
+    }
+
+    /**
+     * Parse a list of alternates into a set of sets of UScript values. The string is trimmed and split
+     * on semicolons (with optional surrounding whitespace); each non-empty piece is parsed with
+     * parseScripts.
+     *
+     * @param scriptsSetString
+     *            the semicolon-separated script lists
+     * @return the parsed script sets
+     * @internal
+     */
+    public static Set<BitSet> parseAlternates(String scriptsSetString) {
+        final Set<BitSet> alternates = new HashSet<BitSet>();
+        final String[] pieces = scriptsSetString.trim().split("\\s*;\\s*");
+        for (int i = 0; i < pieces.length; ++i) {
+            final String piece = pieces[i];
+            if (piece.length() == 0) {
+                continue;
+            }
+            alternates.add(parseScripts(piece));
+        }
+        return alternates;
+    }
+
+    /**
+     * Test containment. Should be a method on BitSet...
+     *
+     * @param container
+     *            the candidate superset
+     * @param containee
+     *            the candidate subset
+     * @return true if every bit set in containee is also set in container (always true for an empty
+     *         containee)
+     * @internal
+     */
+    public static final boolean contains(BitSet container, BitSet containee) {
+        // Walk the set bits of containee and stop at the first one that container lacks.
+        int bit = containee.nextSetBit(0);
+        while (bit >= 0 && container.get(bit)) {
+            bit = containee.nextSetBit(bit + 1);
+        }
+        // A negative index means the walk ran off the end without finding a missing bit.
+        return bit < 0;
+    }
+
+    /**
+     * Sets a number of values at once. Should be on BitSet.
+     *
+     * @param bitset
+     *            the bit set to modify
+     * @param values
+     *            the bit indexes to set
+     * @return the same bitset instance that was passed in
+     * @internal
+     */
+    public static final BitSet set(BitSet bitset, int... values) {
+        for (int i = 0; i < values.length; ++i) {
+            bitset.set(values[i]);
+        }
+        return bitset;
+    }
+
+    // public static final class FreezableBitSet extends BitSet implements Freezable<FreezableBitSet> {
+    // private boolean frozen;
+    //
+    // public FreezableBitSet() {
+    // super();
+    // }
+    // public FreezableBitSet(int nbits) {
+    // super(nbits);
+    // }
+    // /* (non-Javadoc)
+    // * @see java.util.BitSet#and(java.util.BitSet)
+    // */
+    // @Override
+    // public void and(BitSet set) {
+    // if (frozen) {
+    // throw new UnsupportedOperationException();
+    // }
+    // super.and(set);
+    // }
+    // /* (non-Javadoc)
+    // * @see java.util.BitSet#andNot(java.util.BitSet)
+    // */
+    // @Override
+    // public void andNot(BitSet set) {
+    // if (frozen) {
+    // throw new UnsupportedOperationException();
+    // }
+    // super.andNot(set);
+    // }
+    // /* (non-Javadoc)
+    // * @see java.util.BitSet#cardinality()
+    // */
+    //
+    // @Override
+    // public void clear() {
+    // if (frozen) {
+    // throw new UnsupportedOperationException();
+    // }
+    // super.clear();
+    // }
+    // /* (non-Javadoc)
+    // * @see java.util.BitSet#clear(int)
+    // */
+    // @Override
+    // public void clear(int bitIndex) {
+    // if (frozen) {
+    // throw new UnsupportedOperationException();
+    // }
+    // super.clear(bitIndex);
+    // }
+    // /* (non-Javadoc)
+    // * @see java.util.BitSet#clear(int, int)
+    // */
+    // @Override
+    // public void clear(int fromIndex, int toIndex) {
+    // if (frozen) {
+    // throw new UnsupportedOperationException();
+    // }
+    // super.clear(fromIndex, toIndex);
+    // }
+    // /* (non-Javadoc)
+    // * @see java.util.BitSet#clone()
+    // */
+    // @Override
+    // public Object clone() {
+    // return super.clone();
+    // }
+    // /* (non-Javadoc)
+    // * @see java.util.BitSet#equals(java.lang.Object)
+    // */
+    // @Override
+    // public boolean equals(Object obj) {
+    // if (obj == null || obj.getClass() != FreezableBitSet.class) {
+    // return false;
+    // }
+    // return super.equals((BitSet)obj);
+    // }
+    //
+    // /* (non-Javadoc)
+    // * @see java.util.BitSet#flip(int)
+    // */
+    // @Override
+    // public void flip(int bitIndex) {
+    // if (frozen) {
+    // throw new UnsupportedOperationException();
+    // }
+    // super.flip(bitIndex);
+    // }
+    // /* (non-Javadoc)
+    // * @see java.util.BitSet#flip(int, int)
+    // */
+    // @Override
+    // public void flip(int fromIndex, int toIndex) {
+    // if (frozen) {
+    // throw new UnsupportedOperationException();
+    // }
+    // super.flip(fromIndex, toIndex);
+    // }
+    // /* (non-Javadoc)
+    // * @see java.util.BitSet#or(java.util.BitSet)
+    // */
+    // @Override
+    // public void or(BitSet set) {
+    // if (frozen) {
+    // throw new UnsupportedOperationException();
+    // }
+    // super.or(set);
+    // }
+    // /* (non-Javadoc)
+    // * @see java.util.BitSet#set(int)
+    // */
+    // @Override
+    // public void set(int bitIndex) {
+    // if (frozen) {
+    // throw new UnsupportedOperationException();
+    // }
+    // super.set(bitIndex);
+    // }
+    // /* (non-Javadoc)
+    // * @see java.util.BitSet#set(int, boolean)
+    // */
+    // @Override
+    // public void set(int bitIndex, boolean value) {
+    // if (frozen) {
+    // throw new UnsupportedOperationException();
+    // }
+    // super.set(bitIndex, value);
+    // }
+    // /* (non-Javadoc)
+    // * @see java.util.BitSet#set(int, int)
+    // */
+    // @Override
+    // public void set(int fromIndex, int toIndex) {
+    // if (frozen) {
+    // throw new UnsupportedOperationException();
+    // }
+    // super.set(fromIndex, toIndex);
+    // }
+    // /* (non-Javadoc)
+    // * @see java.util.BitSet#set(int, int, boolean)
+    // */
+    // @Override
+    // public void set(int fromIndex, int toIndex, boolean value) {
+    // if (frozen) {
+    // throw new UnsupportedOperationException();
+    // }
+    // super.set(fromIndex, toIndex, value);
+    // }
+    // /* (non-Javadoc)
+    // * @see java.util.BitSet#xor(java.util.BitSet)
+    // */
+    // @Override
+    // public void xor(BitSet set) {
+    // if (frozen) {
+    // throw new UnsupportedOperationException();
+    // }
+    // super.xor(set);
+    // }
+    // /* (non-Javadoc)
+    // * @see com.ibm.icu.util.Freezable#isFrozen()
+    // */
+    // public boolean isFrozen() {
+    // return frozen;
+    // }
+    // /* (non-Javadoc)
+    // * @see com.ibm.icu.util.Freezable#freeze()
+    // */
+    // public FreezableBitSet freeze() {
+    // frozen = true;
+    // return this;
+    // }
+    // /* (non-Javadoc)
+    // * @see com.ibm.icu.util.Freezable#cloneAsThawed()
+    // */
+    // public FreezableBitSet cloneAsThawed() {
+    // FreezableBitSet result = new FreezableBitSet(size());
+    // result.or(this);
+    // return result;
+    // }
+    // }
+}
index 24310aba903476bdac6a287db2e7febbf7555d04..8b7c87332d39b18d908035f9aeb5c87155de8805 100644 (file)
@@ -33,6 +33,7 @@ import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.lang.UCharacterCategory;
 import com.ibm.icu.lang.UProperty;
 import com.ibm.icu.lang.UScript;
+import com.ibm.icu.text.IdentifierInfo.RestrictionLevel;
 import com.ibm.icu.util.ULocale;
 
 /**
@@ -219,13 +220,28 @@ public class SpoofChecker {
      * @stable ICU 4.6
      */
     public static final int CHAR_LIMIT = 64;
+    
+    /**
+     * Check that an identifier is no looser than the specified RestrictionLevel.
+     * 
+     * @internal
+     */
+    public static final int RESTRICTION_LEVEL = 128;
+
+    /**
+     * Check that an identifier does not mix decimal digits from more than one number system, for example
+     * ASCII digits together with Devanagari digits.
+     * 
+     * @internal
+     */
+    public static final int MIXED_NUMBERS = 256;
 
     /**
      * Enable all spoof checks.
      * 
      * @stable ICU 4.6
      */
-    public static final int ALL_CHECKS = 0x7f;
+    public static final int ALL_CHECKS = 0xFFFFFFFF;
 
     // Magic number for sanity checking spoof binary resource data.
     static final int MAGIC = 0x3845fdef;
@@ -249,6 +265,7 @@ public class SpoofChecker {
         UnicodeSet fAllowedCharsSet; // The UnicodeSet of allowed characters.
                                      // for this Spoof Checker. Defaults to all chars.
         Set<ULocale> fAllowedLocales; // The list of allowed locales.
+        private RestrictionLevel restrictionLevel;
 
         /**
          * Constructor: Create a default Unicode Spoof Checker Builder, configured to perform all checks except for
@@ -263,6 +280,7 @@ public class SpoofChecker {
             fSpoofData = null;
             fAllowedCharsSet = new UnicodeSet(0, 0x10ffff);
             fAllowedLocales = new LinkedHashSet<ULocale>();
+            restrictionLevel = RestrictionLevel.MINIMALLY_RESTRICTIVE;
         }
 
         /**
@@ -279,6 +297,7 @@ public class SpoofChecker {
             fAllowedCharsSet = src.fAllowedCharsSet.cloneAsThawed();
             fAllowedLocales = new LinkedHashSet<ULocale>();
             fAllowedLocales.addAll(src.fAllowedLocales);
+            restrictionLevel = src.restrictionLevel;
         }
 
         /**
@@ -305,11 +324,12 @@ public class SpoofChecker {
             result.fAllowedCharsSet = (UnicodeSet) (this.fAllowedCharsSet.clone());
             result.fAllowedCharsSet.freeze();
             result.fAllowedLocales = this.fAllowedLocales;
+            result.restrictionLevel = this.restrictionLevel;
             return result;
         }
 
         /**
-         * Specify the source form of the spoof data Spoof Checker. The Three inputs correspond to the Unicode data
+         * Specify the source form of the spoof data Spoof Checker. The inputs correspond to the Unicode data
          * files confusables.txt and confusablesWholeScript.txt as described in Unicode UAX 39. The syntax of the source
          * data is as described in UAX 39 for these files, and the content of these files is acceptable input.
          * 
@@ -447,6 +467,16 @@ public class SpoofChecker {
             fChecks |= CHAR_LIMIT;
             return this;
         }
+        
+        /**
+         * Set the loosest restriction level allowed. The value is stored in this builder's
+         * restrictionLevel field.
+         *
+         * NOTE(review): unlike the preceding setter, which ends with "fChecks |= CHAR_LIMIT", this method
+         * does not turn on the RESTRICTION_LEVEL bit in fChecks - confirm that this is intended.
+         *
+         * @param restrictionLevel The loosest restriction level allowed.
+         * @return self
+         * @internal
+         */
+        public Builder setRestrictionLevel(RestrictionLevel restrictionLevel) {
+            this.restrictionLevel = restrictionLevel;
+            return this;
+        }
 
         // Structure for the Whole Script Confusable Data
         // See Unicode UAX-39, Unicode Security Mechanisms, for a description of the
@@ -1391,6 +1421,28 @@ public class SpoofChecker {
         // haven't done it yet.
         int scriptCount = -1;
 
+        // Allocate an identifier info if needed.
+        // Note: we may want to allocate one per SpoofChecker and synchronize
+        
+        IdentifierInfo identifierInfo = null;
+        if (0 != ((this.fChecks) & (RESTRICTION_LEVEL | MIXED_NUMBERS))) {
+            identifierInfo = new IdentifierInfo().setIdentifier(text);
+        }
+        
+        if (0 != ((this.fChecks) & RESTRICTION_LEVEL)) {
+            RestrictionLevel textRestrictionLevel = identifierInfo.getRestrictionLevel();
+            if (textRestrictionLevel.compareTo(restrictionLevel) > 0) {
+                result |= RESTRICTION_LEVEL;
+            }
+        }
+        
+        if (0 != ((this.fChecks) & MIXED_NUMBERS)) {
+            UnicodeSet numerics = identifierInfo.getNumerics();
+            if (numerics.size() > 1) {
+                result |= MIXED_NUMBERS;
+            }
+        }
+        
         if (0 != ((this.fChecks) & SINGLE_SCRIPT)) {
             scriptCount = this.scriptScan(text, checkResult);
             // no need to set failPos, it will be set to checkResult.position inside this.scriptScan
@@ -1881,6 +1933,7 @@ public class SpoofChecker {
     private SpoofData fSpoofData;
     private Set<ULocale> fAllowedLocales; // The Set of allowed locales.
     private UnicodeSet fAllowedCharsSet; // The UnicodeSet of allowed characters.
+    private RestrictionLevel restrictionLevel;
 
     // for this Spoof Checker. Defaults to all chars.
     //
index e2f17d809f2706beb96ed17d0bc01b4cbba73434..5b832fc35fdaf8822371af6bf7e07b1109696d53 100644 (file)
@@ -1,6 +1,6 @@
 /*
  *******************************************************************************
- * Copyright (C) 2009-2011, International Business Machines Corporation and    *
+ * Copyright (C) 2009-2012, International Business Machines Corporation and    *
  * others. All Rights Reserved.                                                *
  *******************************************************************************
  */
@@ -10,7 +10,11 @@ import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.Reader;
 import java.text.ParseException;
+import java.util.Arrays;
+import java.util.BitSet;
+import java.util.HashSet;
 import java.util.LinkedHashSet;
+import java.util.List;
 import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
@@ -19,8 +23,12 @@ import com.ibm.icu.dev.test.TestFmwk;
 import com.ibm.icu.dev.test.TestUtil;
 import com.ibm.icu.dev.test.TestUtil.JavaVendor;
 import com.ibm.icu.impl.Utility;
+import com.ibm.icu.lang.UScript;
+import com.ibm.icu.text.IdentifierInfo;
+import com.ibm.icu.text.IdentifierInfo.RestrictionLevel;
 import com.ibm.icu.text.Normalizer2;
 import com.ibm.icu.text.SpoofChecker;
+import com.ibm.icu.text.SpoofChecker.CheckResult;
 import com.ibm.icu.text.UnicodeSet;
 import com.ibm.icu.util.ULocale;
 
@@ -185,7 +193,7 @@ public class SpoofCheckerTest extends TestFmwk {
          * don't want to see in this test.
          */
         sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).setAllowedLocales(allowedLocales).build();
-        
+
         SpoofChecker.CheckResult result = new SpoofChecker.CheckResult();
         checkResults = sc.failsChecks(goodLatin);
         assertFalse("", checkResults);
@@ -254,7 +262,7 @@ public class SpoofCheckerTest extends TestFmwk {
         assertTrue("", checkResults);
         assertEquals("", SpoofChecker.MIXED_SCRIPT_CONFUSABLE | SpoofChecker.SINGLE_SCRIPT, result.checks);
         assertEquals("", 2, result.position);
-        
+
         result.position = 666;
         checkResults = sc.failsChecks(han_Hiragana, result);
         assertFalse("", checkResults);
@@ -294,7 +302,7 @@ public class SpoofCheckerTest extends TestFmwk {
     public void TestSpoofAPI() {
         SpoofChecker sc = new SpoofChecker.Builder().build();
         String s = "xyz";  // Many latin ranges are whole-script confusable with other scripts.
-                           // If this test starts failing, consult confusablesWholeScript.txt
+        // If this test starts failing, consult confusablesWholeScript.txt
         SpoofChecker.CheckResult result = new SpoofChecker.CheckResult();
         result.position = 666;
         boolean checkResults = sc.failsChecks(s, result);
@@ -317,7 +325,7 @@ public class SpoofCheckerTest extends TestFmwk {
         SpoofChecker sc = new SpoofChecker.Builder().build();
         checkSkeleton(sc, "TestSkeleton");
     }
-    
+
     // testSkeleton. Spot check a number of confusable skeleton substitutions from the
     // Unicode data file confusables.txt
     // Test cases chosen for substitutions of various lengths, and
@@ -337,11 +345,11 @@ public class SpoofCheckerTest extends TestFmwk {
                         + " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations."
                         + " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations."
                         + " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations.",
-                " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
-                        + " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
-                        + " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
-                        + " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations.",
-                testName);
+                        " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
+                                + " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
+                                + " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
+                                + " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations.",
+                                testName);
 
         checkSkeleton(sc, SL, "nochange", "nochange", testName);
         checkSkeleton(sc, MA, "love", "love", testName);
@@ -428,6 +436,100 @@ public class SpoofCheckerTest extends TestFmwk {
         assertEquals("", 7, result.position);
     }
 
+    public void TestRestrictionLevel() { // Table-driven check of IdentifierInfo.getRestrictionLevel() and SpoofChecker's RESTRICTION_LEVEL check.
+        Object[][] tests = { // each row: {identifier, expected RestrictionLevel}
+                {"a", RestrictionLevel.ASCII},
+                {"γ", RestrictionLevel.HIGHLY_RESTRICTIVE},
+                {"aアー", RestrictionLevel.HIGHLY_RESTRICTIVE},
+                {"aऄ", RestrictionLevel.MODERATELY_RESTRICTIVE},
+                {"aγ", RestrictionLevel.MINIMALLY_RESTRICTIVE},
+        };
+        IdentifierInfo idInfo = new IdentifierInfo();
+        CheckResult checkResult = new CheckResult();
+        for (Object[] test : tests) {
+            String testString = (String) test[0];
+            RestrictionLevel expectedLevel = (RestrictionLevel) test[1];
+            idInfo.setIdentifier(testString);
+            assertEquals("Testing restriction level for '" + testString + "'", expectedLevel, idInfo.getRestrictionLevel());
+            for (RestrictionLevel testLevel : RestrictionLevel.values()) { // try every level as the checker's configured level
+                SpoofChecker sc = new SpoofChecker.Builder()
+                .setChecks(SpoofChecker.RESTRICTION_LEVEL) // enable only the restriction-level check
+                .setRestrictionLevel(testLevel)
+                .build();
+                boolean actualValue = sc.failsChecks(testString, checkResult);
+
+                // failsChecks should report a failure exactly when the identifier's expected level compares greater than testLevel (e.g. the text is MODERATELY_RESTRICTIVE and testLevel is ASCII)
+                boolean expectedFailure = expectedLevel.compareTo(testLevel) > 0;
+                boolean t = assertEquals("Testing spoof restriction level for '" + testString + "', " + testLevel, expectedFailure, actualValue);
+//                if (!t) { // debugging aid (disabled): re-run the failing case so it can be stepped through; 't' is only read here
+//                    actualValue = sc.failsChecks(testString, checkResult);
+//                    // we want to fail if the text is (say) MODERATE and the testLevel is ASCII
+//                    expectedFailure = expectedLevel.compareTo(testLevel) > 0;
+//                }
+            }
+        }
+    }
+
+    public void TestMixedNumbers() { // Table-driven check of IdentifierInfo.getNumerics() and SpoofChecker's MIXED_NUMBERS check.
+        Object[][] tests = { // each row: {identifier, UnicodeSet pattern expected from getNumerics()}
+                {"1", "[0]"},
+                {"१", "[०]"},
+                {"1१", "[0०]"},
+                {"١۱", "[٠۰]"},
+        };
+        IdentifierInfo idInfo = new IdentifierInfo();
+        CheckResult checkResult = new CheckResult();
+        for (Object[] test : tests) {
+            String testString = (String) test[0];
+            UnicodeSet expected = new UnicodeSet((String)test[1]);
+            idInfo.setIdentifier(testString);
+            assertEquals("", expected, idInfo.getNumerics());
+
+            SpoofChecker sc = new SpoofChecker.Builder()
+            .setChecks(SpoofChecker.MIXED_NUMBERS) // enable only the mixed-numbers check
+            .build();
+            boolean actualValue = sc.failsChecks(testString, checkResult); // a failure is expected only when the expected set has more than one entry
+            boolean t = assertEquals("Testing spoof mixed numbers for '" + testString + "', ", expected.size() > 1, actualValue);
+        }
+    }
+    
+    public void TestIdentifierInfo() { // Exercises IdentifierInfo's static BitSet helpers, then the getters after setIdentifier().
+//        contains(BitSet, BitSet): {LATIN, HANGUL} must contain {HANGUL} and itself, but not the reverse
+        BitSet bitset12 = IdentifierInfo.set(new BitSet(), UScript.LATIN, UScript.HANGUL);
+        BitSet bitset2 = IdentifierInfo.set(new BitSet(), UScript.HANGUL);
+        assertTrue("", IdentifierInfo.contains(bitset12, bitset2));
+        assertTrue("", IdentifierInfo.contains(bitset12, bitset12));
+        assertTrue("", !IdentifierInfo.contains(bitset2, bitset12));
+
+//      displayAlternates(Collection<BitSet>): expected output joins the script lists with "; "
+//      displayScripts(BitSet): expected output is the script codes separated by spaces
+        String scriptString = IdentifierInfo.displayScripts(bitset12);
+        assertEquals("", "Hang Latn", scriptString);
+        Set<BitSet> alternates = new HashSet(Arrays.asList(bitset12, bitset2)); // NOTE(review): raw HashSet; consider HashSet<BitSet>
+        String alternatesString = IdentifierInfo.displayAlternates(alternates);
+        assertEquals("", "Hang Latn; Hang", alternatesString);
+
+//        parseAlternates(String): must round-trip the displayAlternates output above
+//        parseScripts(String): must round-trip the displayScripts output above
+        assertEquals("", bitset12, IdentifierInfo.parseScripts(scriptString));
+        assertEquals("", alternates, IdentifierInfo.parseAlternates(alternatesString));
+
+        IdentifierInfo idInfo = new IdentifierInfo();
+        String manyAlternates = "aアー〼1१١۱";
+        idInfo.setIdentifier(manyAlternates);
+        assertEquals("", manyAlternates, idInfo.getIdentifier());
+
+        assertEquals("", null, idInfo.getScripts()); // NOTE(review): the null expectations on these five getters look like placeholders; confirm the intended values
+        assertEquals("", null, idInfo.getAlternates());
+        assertEquals("", null, idInfo.getCommonAmongAlternates());
+        assertEquals("", null, idInfo.getNumerics());
+        assertEquals("", null, idInfo.getRestrictionLevel());
+
+// TODO: not yet covered by this test:
+//        getIdentifierProfile()
+//        setIdentifierProfile(UnicodeSet)
+    }
+
     private String parseHex(String in) {
         StringBuilder sb = new StringBuilder();
         for (String oneCharAsHexString : in.split("\\s+")) {
@@ -483,7 +585,7 @@ public class SpoofCheckerTest extends TestFmwk {
             Matcher parseLine = Pattern.compile(
                     "\\ufeff?" + "(?:([0-9A-F\\s]+);([0-9A-F\\s]+);\\s*(SL|ML|SA|MA)\\s*(?:#.*?)?$)"
                             + "|\\ufeff?(\\s*(?:#.*)?)"). // Comment line
-                    matcher("");
+                            matcher("");
             Normalizer2 normalizer = Normalizer2.getNFDInstance();
             int lineNum = 0;
             String inputLine;