From 8d59026fd4617d38e5ed4f98d1274f45eb67234d Mon Sep 17 00:00:00 2001 From: Shane Carr Date: Tue, 13 Sep 2016 22:16:02 +0000 Subject: [PATCH] ICU-12549 Updating SpoofChecker to latest Unicode specification (Java version). X-SVN-Rev: 39219 --- .../src/com/ibm/icu/text/SpoofChecker.java | 2309 +++++++---------- icu4j/main/shared/data/icudata.jar | 4 +- icu4j/main/shared/data/icutzdata.jar | 2 +- icu4j/main/shared/data/testdata.jar | 2 +- .../icu/dev/test/text/SpoofCheckerTest.java | 378 ++- 5 files changed, 1080 insertions(+), 1615 deletions(-) diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/SpoofChecker.java b/icu4j/main/classes/core/src/com/ibm/icu/text/SpoofChecker.java index 8b97a1df733..a7ad518105a 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/SpoofChecker.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/SpoofChecker.java @@ -11,7 +11,6 @@ package com.ibm.icu.text; -import java.io.DataOutputStream; import java.io.IOException; import java.io.LineNumberReader; import java.io.Reader; @@ -19,12 +18,14 @@ import java.nio.ByteBuffer; import java.text.ParseException; import java.util.ArrayList; import java.util.Arrays; +import java.util.BitSet; import java.util.Collections; import java.util.Comparator; import java.util.HashSet; import java.util.Hashtable; import java.util.LinkedHashSet; import java.util.Locale; +import java.util.MissingResourceException; import java.util.Set; import java.util.Vector; import java.util.regex.Matcher; @@ -32,8 +33,6 @@ import java.util.regex.Pattern; import com.ibm.icu.impl.ICUBinary; import com.ibm.icu.impl.ICUBinary.Authenticate; -import com.ibm.icu.impl.Trie2; -import com.ibm.icu.impl.Trie2Writable; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.lang.UCharacterCategory; import com.ibm.icu.lang.UProperty; @@ -41,147 +40,225 @@ import com.ibm.icu.lang.UScript; import com.ibm.icu.util.ULocale; /** + *

+ * This class, based on Unicode Technical Report #36 and + * Unicode Technical Standard #39, has two main functions: * - * Unicode Security and Spoofing Detection. + *

    + *
  1. Checking whether two strings are visually confusable with each other, such as "desordenado" and + * "ԁеѕогԁепаԁо".
  2. + *
  3. Checking whether an individual string is likely to be an attempt at confusing the reader (spoof + * detection), such as "pаypаl" spelled with Cyrillic 'а' characters.
  4. + *
* - *

This class is intended to check strings, typically - * identifiers of some type, such as URLs, for the presence of - * characters that are likely to be visually confusing - - * for cases where the displayed form of an identifier may - * not be what it appears to be. + *

+ * Although originally designed as a method for flagging suspicious identifier strings such as URLs, + * SpoofChecker has a number of other practical use cases, such as preventing attempts to evade bad-word + * content filters. * - *

Unicode Technical Report #36, - * http://unicode.org/reports/tr36 and - * Unicode Technical Standard #39, - * http://unicode.org/reports/tr39 - * "Unicode security considerations", give more background on - * security and spoofing issues with Unicode identifiers. - * The tests and checks provided by this module implement the recommendations - * from these Unicode documents. + *

Confusables

* - *

The tests available on identifiers fall into two general categories: - *

+ *

+ * The following example shows how to use SpoofChecker to check for confusability between two strings: * - *

The steps to perform confusability testing are - *

+ *
+ * 
+ * SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build();
+ * int result = sc.areConfusable("desordenado", "ԁеѕогԁепаԁо");
+ * System.out.println(result != 0);  // true
+ * 
+ * 
* - *

A SpoofChecker instance may be used repeatedly to perform checks on any number - * of identifiers. + *

+ * SpoofChecker uses a builder paradigm: options are specified within the context of a lightweight + * {@link SpoofChecker.Builder} object, and upon calling {@link SpoofChecker.Builder#build}, expensive data loading + * operations are performed, and an immutable SpoofChecker is returned. * - *

Thread Safety: The methods on SpoofChecker objects are thread safe. - * The test functions for checking a single identifier, or for testing - * whether two identifiers are potentially confusable, may called concurrently - * from multiple threads using the same SpoofChecker instance. + *

+ * The first line of the example creates a SpoofChecker object with confusable-checking enabled; the second + * line performs the confusability test. For best performance, the instance should be created once (e.g., upon + * application startup), and the more efficient {@link SpoofChecker#areConfusable} method can be used at runtime. * + *

+ * UTS 39 defines two strings to be confusable if they map to the same skeleton string. A skeleton can + * be thought of as a "hash code". {@link SpoofChecker#getSkeleton} computes the skeleton for a particular string, so + * the following snippet is equivalent to the example above: * - *

Descriptions of the available checks. + *

+ * 
+ * SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build();
+ * boolean result = sc.getSkeleton("desordenado").equals(sc.getSkeleton("ԁеѕогԁепаԁо"));
+ * System.out.println(result);  // true
+ * 
+ * 
* - *

When testing whether pairs of identifiers are confusable, with areConfusable() - * the relevant tests are + *

+ * If you need to check if a string is confusable with any string in a dictionary of many strings, rather than calling + * {@link SpoofChecker#areConfusable} many times in a loop, {@link SpoofChecker#getSkeleton} can be used instead, as + * shown below: * - *

+ *
+ * 
+ * // Setup:
+ * String[] DICTIONARY = new String[]{ "lorem", "ipsum" }; // example
+ * SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build();
+ * HashSet<String> skeletons = new HashSet<String>();
+ * for (String word : DICTIONARY) {
+ *   skeletons.add(sc.getSkeleton(word));
+ * }
  *
- * 

The safest approach is to enable all three of these checks as a group. + * // Live Check: + * boolean result = skeletons.contains(sc.getSkeleton("1orern")); + * System.out.println(result); // true + * + *

* - *

ANY_CASE is a modifier for the above tests. If the identifiers being checked can - * be of mixed case and are used in a case-sensitive manner, this option should be specified. + *

+ * Note: Since the Unicode confusables mapping table is frequently updated, confusable skeletons are not + * guaranteed to be the same between ICU releases. We therefore recommend that you always compute confusable skeletons + * at runtime and do not rely on creating a permanent, or difficult to update, database of skeletons. * - *

If the identifiers being checked are used in a case-insensitive manner, and if they are - * displayed to users in lower-case form only, the ANY_CASE option should not be - * specified. Confusabality issues involving upper case letters will not be reported. + *

Spoof Detection

* - *

When performing tests on a single identifier, with the check() family of functions, - * the relevant tests are: + *

+ * The following snippet shows a minimal example of using SpoofChecker to perform spoof detection on a + * string: * - *

+ * + *
+ * SpoofChecker sc = new SpoofChecker.Builder()
+ *     .setAllowedChars(SpoofChecker.RECOMMENDED.cloneAsThawed().addAll(SpoofChecker.INCLUSION))
+ *     .setRestrictionLevel(SpoofChecker.RestrictionLevel.MODERATELY_RESTRICTIVE)
+ *     .setChecks(SpoofChecker.ALL_CHECKS &~ SpoofChecker.CONFUSABLE)
+ *     .build();
+ * boolean result = sc.failsChecks("pаypаl");  // with Cyrillic 'а' characters
+ * System.out.println(result);  // true
+ * 
+ *
* - *

Note on Scripts: - *

Characters from the Unicode Scripts "Common" and "Inherited" are ignored when considering - * the script of an identifier. Common characters include digits and symbols that - * are normally used with text from many different scripts.
+ *

+ * As in the case for confusability checking, it is good practice to create one SpoofChecker instance at + * startup, and call the cheaper {@link SpoofChecker#failsChecks} online. In the second line, we specify the set of + * allowed characters to be those with type RECOMMENDED or INCLUSION, according to the recommendation in UTS 39. In the + * third line, the restriction level is set to MODERATELY_RESTRICTIVE. In the fourth line, the CONFUSABLE checks are disabled. It is good practice to disable them if you won't be using the + * instance to perform confusability checking. + * + *

+ * To get more details on why a string failed the checks, use a {@link SpoofChecker.CheckResult}: + * + *

+ * 
+ * SpoofChecker sc = new SpoofChecker.Builder()
+ *     .setAllowedChars(SpoofChecker.RECOMMENDED.cloneAsThawed().addAll(SpoofChecker.INCLUSION))
+ *     .setRestrictionLevel(SpoofChecker.RestrictionLevel.MODERATELY_RESTRICTIVE)
+ *     .setChecks(SpoofChecker.ALL_CHECKS &~ SpoofChecker.CONFUSABLE)
+ *     .build();
+ * SpoofChecker.CheckResult checkResult = new SpoofChecker.CheckResult();
+ * boolean result = sc.failsChecks("pаypаl", checkResult);
+ * System.out.println(checkResult.checks);  // 16
+ * 
+ * 
+ * + *

+ * The {@code checks} field of the {@link SpoofChecker.CheckResult} is a bitmask of the checks that failed. In this case, there was one check that failed: + * {@link SpoofChecker#RESTRICTION_LEVEL}, corresponding to the fifth bit (16). The possible checks are: + * + *

+ * + *

+ * These checks can be enabled independently of each other. For example, if you were interested in checking for only the + * INVISIBLE and MIXED_NUMBERS conditions, you could do: + * + *

+ * 
+ * SpoofChecker sc = new SpoofChecker.Builder()
+ *     .setChecks(SpoofChecker.INVISIBLE | SpoofChecker.MIXED_NUMBERS)
+ *     .build();
+ * boolean result = sc.failsChecks("৪8");
+ * System.out.println(result);  // true
+ * 
+ * 
+ * + *

+ * Note: The Restriction Level is the most powerful of the checks. The full logic is documented in + * UTS 39, but the basic idea is that strings + * are restricted to contain characters from only a single script, except that most scripts are allowed to have + * Latin characters interspersed. Although the default restriction level is HIGHLY_RESTRICTIVE, it is + * recommended that users set their restriction level to MODERATELY_RESTRICTIVE, which allows Latin mixed + * with all other scripts except Cyrillic, Greek, and Cherokee, with which it is often confusable. For more details on + * the levels, see UTS 39 or {@link SpoofChecker.RestrictionLevel}. The Restriction Level test is aware of the set of + * allowed characters set in {@link SpoofChecker.Builder#setAllowedChars}. Note that characters which have script code + * COMMON or INHERITED, such as numbers and punctuation, are ignored when computing whether a string has multiple + * scripts. + * + *

Additional Information

+ * + *

+ * A SpoofChecker instance may be used repeatedly to perform checks on any number of identifiers. + * + *

+ * Thread Safety: The methods on SpoofChecker objects are thread safe. The test functions for + * checking a single identifier, or for testing whether two identifiers are potentially confusable, may be called + * concurrently from multiple threads using the same SpoofChecker instance. * * @stable ICU 4.6 */ public class SpoofChecker { /** - * Constants from UAX 31 for use in setRestrictionLevel. + * Constants from UTS 39 for use in setRestrictionLevel. + * * @stable ICU 53 */ public enum RestrictionLevel { /** - * Only ASCII characters: U+0000..U+007F + * All characters in the string are in the identifier profile and all characters in the string are in the ASCII + * range. * * @stable ICU 53 */ ASCII, /** - * All characters in each identifier must be from a single script. + * The string classifies as ASCII-Only, or all characters in the string are in the identifier profile and the + * string is single-script, according to the definition in UTS 39 section 5.1. * * @stable ICU 53 */ SINGLE_SCRIPT_RESTRICTIVE, - /** - * All characters in each identifier must be from a single script, or from the combinations: Latin + Han + - * Hiragana + Katakana; Latin + Han + Bopomofo; or Latin + Han + Hangul. Note that this level will satisfy the - * vast majority of Latin-script users; also that TR36 has ASCII instead of Latin. + /** + * The string classifies as Single Script, or all characters in the string are in the identifier profile and the + * string is covered by any of the following sets of scripts, according to the definition in UTS 39 section 5.1: + *

* * @stable ICU 53 */ HIGHLY_RESTRICTIVE, /** - * Allow Latin with other scripts except Cyrillic, Greek, Cherokee Otherwise, the same as Highly Restrictive + * The string classifies as Highly Restrictive, or all characters in the string are in the identifier profile + * and the string is covered by Latin and any one other Recommended or Aspirational script, except Cyrillic, + * Greek, and Cherokee. + * + * This is the default restriction level as of ICU 58. * * @stable ICU 53 */ MODERATELY_RESTRICTIVE, /** - * Allow arbitrary mixtures of scripts, such as Ωmega, Teχ, HλLF-LIFE, Toys-Я-Us. Otherwise, the same as - * Moderately Restrictive + * All characters in the string are in the identifier profile. Allow arbitrary mixtures of scripts, such as + * Ωmega, Teχ, HλLF-LIFE, Toys-Я-Us. * * @stable ICU 53 */ @@ -191,91 +268,89 @@ public class SpoofChecker { * * @stable ICU 53 */ - UNRESTRICTIVE + UNRESTRICTIVE, } - /** - * Security Profile constant from UAX 31 for use in setAllowedChars. - * Will probably be replaced by UnicodeSet property. - * @internal - * @deprecated This API is ICU internal only. + * Security Profile constant from UTS 39 for use in {@link SpoofChecker.Builder#setAllowedChars}. + * + * @draft ICU 58 + * @provisional This API might change or be removed in a future release. 
*/ - @Deprecated - public static final UnicodeSet INCLUSION = new UnicodeSet("[" + - "\\u0027\\u002D-\\u002E\\u003A\\u00B7\\u0375\\u058A\\u05F3-\\u05F4"+ - "\\u06FD-\\u06FE\\u0F0B\\u200C-\\u200D\\u2010\\u2019\\u2027\\u30A0\\u30FB]").freeze(); - // Note: data from http://unicode.org/Public/security/latest/xidmodifications.txt version 6.3.0 + public static final UnicodeSet INCLUSION = new UnicodeSet( + "['\\-.\\:\\u00B7\\u0375\\u058A\\u05F3\\u05F4\\u06FD\\u06FE\\u0F0B\\u200C\\u200D\\u2010\\u" + + "2019\\u2027\\u30A0\\u30FB]").freeze(); + // Note: data from http://unicode.org/Public/security/9.0.0/IdentifierStatus.txt + // There is tooling to generate this constant in the unicodetools project: + // org.unicode.text.tools.RecommendedSetGenerator + // It will print the Java and C++ code to the console for easy copy-paste into this file. /** - * Security Profile constant from UAX 31 for use in setAllowedChars. - * Will probably be replaced by UnicodeSet property. - * @internal - * @deprecated This API is ICU internal only. + * Security Profile constant from UTS 39 for use in {@link SpoofChecker.Builder#setAllowedChars}. + * + * @draft ICU 58 + * @provisional This API might change or be removed in a future release. 
*/ - @Deprecated public static final UnicodeSet RECOMMENDED = new UnicodeSet( - "[\\u0030-\\u0039\\u0041-\\u005A\\u005F\\u0061-\\u007A\\u00C0-\\u00D6\\u00D8-\\u00F6" + - "\\u00F8-\\u0131\\u0134-\\u013E\\u0141-\\u0148\\u014A-\\u017E\\u018F\\u01A0-\\u01A1" + - "\\u01AF-\\u01B0\\u01CD-\\u01DC\\u01DE-\\u01E3\\u01E6-\\u01F0\\u01F4-\\u01F5\\u01F8-\\u021B" + - "\\u021E-\\u021F\\u0226-\\u0233\\u0259\\u02BB-\\u02BC\\u02EC\\u0300-\\u0304\\u0306-\\u030C" + - "\\u030F-\\u0311\\u0313-\\u0314\\u031B\\u0323-\\u0328\\u032D-\\u032E\\u0330-\\u0331" + - "\\u0335\\u0338-\\u0339\\u0342\\u0345\\u037B-\\u037D\\u0386\\u0388-\\u038A\\u038C" + - "\\u038E-\\u03A1\\u03A3-\\u03CE\\u03FC-\\u045F\\u048A-\\u0529\\u052E-\\u052F\\u0531-\\u0556" + - "\\u0559\\u0561-\\u0586\\u05B4\\u05D0-\\u05EA\\u05F0-\\u05F2\\u0620-\\u063F\\u0641-\\u0655" + - "\\u0660-\\u0669\\u0670-\\u0672\\u0674\\u0679-\\u068D\\u068F-\\u06D3\\u06D5\\u06E5-\\u06E6" + - "\\u06EE-\\u06FC\\u06FF\\u0750-\\u07B1\\u08A0-\\u08AC\\u08B2\\u0901-\\u094D\\u094F-\\u0950" + - "\\u0956-\\u0957\\u0960-\\u0963\\u0966-\\u096F\\u0971-\\u0977\\u0979-\\u097F\\u0981-\\u0983" + - "\\u0985-\\u098C\\u098F-\\u0990\\u0993-\\u09A8\\u09AA-\\u09B0\\u09B2\\u09B6-\\u09B9" + - "\\u09BC-\\u09C4\\u09C7-\\u09C8\\u09CB-\\u09CE\\u09D7\\u09E0-\\u09E3\\u09E6-\\u09F1" + - "\\u0A01-\\u0A03\\u0A05-\\u0A0A\\u0A0F-\\u0A10\\u0A13-\\u0A28\\u0A2A-\\u0A30\\u0A32" + - "\\u0A35\\u0A38-\\u0A39\\u0A3C\\u0A3E-\\u0A42\\u0A47-\\u0A48\\u0A4B-\\u0A4D\\u0A5C" + - "\\u0A66-\\u0A74\\u0A81-\\u0A83\\u0A85-\\u0A8D\\u0A8F-\\u0A91\\u0A93-\\u0AA8\\u0AAA-\\u0AB0" + - "\\u0AB2-\\u0AB3\\u0AB5-\\u0AB9\\u0ABC-\\u0AC5\\u0AC7-\\u0AC9\\u0ACB-\\u0ACD\\u0AD0" + - "\\u0AE0-\\u0AE3\\u0AE6-\\u0AEF\\u0B01-\\u0B03\\u0B05-\\u0B0C\\u0B0F-\\u0B10\\u0B13-\\u0B28" + - "\\u0B2A-\\u0B30\\u0B32-\\u0B33\\u0B35-\\u0B39\\u0B3C-\\u0B43\\u0B47-\\u0B48\\u0B4B-\\u0B4D" + - "\\u0B56-\\u0B57\\u0B5F-\\u0B61\\u0B66-\\u0B6F\\u0B71\\u0B82-\\u0B83\\u0B85-\\u0B8A" + - 
"\\u0B8E-\\u0B90\\u0B92-\\u0B95\\u0B99-\\u0B9A\\u0B9C\\u0B9E-\\u0B9F\\u0BA3-\\u0BA4" + - "\\u0BA8-\\u0BAA\\u0BAE-\\u0BB9\\u0BBE-\\u0BC2\\u0BC6-\\u0BC8\\u0BCA-\\u0BCD\\u0BD0" + - "\\u0BD7\\u0BE6-\\u0BEF\\u0C01-\\u0C03\\u0C05-\\u0C0C\\u0C0E-\\u0C10\\u0C12-\\u0C28" + - "\\u0C2A-\\u0C33\\u0C35-\\u0C39\\u0C3D-\\u0C44\\u0C46-\\u0C48\\u0C4A-\\u0C4D\\u0C55-\\u0C56" + - "\\u0C60-\\u0C61\\u0C66-\\u0C6F\\u0C82-\\u0C83\\u0C85-\\u0C8C\\u0C8E-\\u0C90\\u0C92-\\u0CA8" + - "\\u0CAA-\\u0CB3\\u0CB5-\\u0CB9\\u0CBC-\\u0CC4\\u0CC6-\\u0CC8\\u0CCA-\\u0CCD\\u0CD5-\\u0CD6" + - "\\u0CE0-\\u0CE3\\u0CE6-\\u0CEF\\u0CF1-\\u0CF2\\u0D02-\\u0D03\\u0D05-\\u0D0C\\u0D0E-\\u0D10" + - "\\u0D12-\\u0D3A\\u0D3D-\\u0D43\\u0D46-\\u0D48\\u0D4A-\\u0D4E\\u0D57\\u0D60-\\u0D61" + - "\\u0D66-\\u0D6F\\u0D7A-\\u0D7F\\u0D82-\\u0D83\\u0D85-\\u0D8E\\u0D91-\\u0D96\\u0D9A-\\u0DA5" + - "\\u0DA7-\\u0DB1\\u0DB3-\\u0DBB\\u0DBD\\u0DC0-\\u0DC6\\u0DCA\\u0DCF-\\u0DD4\\u0DD6" + - "\\u0DD8-\\u0DDE\\u0DF2\\u0E01-\\u0E32\\u0E34-\\u0E3A\\u0E40-\\u0E4E\\u0E50-\\u0E59" + - "\\u0E81-\\u0E82\\u0E84\\u0E87-\\u0E88\\u0E8A\\u0E8D\\u0E94-\\u0E97\\u0E99-\\u0E9F" + - "\\u0EA1-\\u0EA3\\u0EA5\\u0EA7\\u0EAA-\\u0EAB\\u0EAD-\\u0EB2\\u0EB4-\\u0EB9\\u0EBB-\\u0EBD" + - "\\u0EC0-\\u0EC4\\u0EC6\\u0EC8-\\u0ECD\\u0ED0-\\u0ED9\\u0EDE-\\u0EDF\\u0F00\\u0F20-\\u0F29" + - "\\u0F35\\u0F37\\u0F3E-\\u0F42\\u0F44-\\u0F47\\u0F49-\\u0F4C\\u0F4E-\\u0F51\\u0F53-\\u0F56" + - "\\u0F58-\\u0F5B\\u0F5D-\\u0F68\\u0F6A-\\u0F6C\\u0F71-\\u0F72\\u0F74\\u0F7A-\\u0F80" + - "\\u0F82-\\u0F84\\u0F86-\\u0F92\\u0F94-\\u0F97\\u0F99-\\u0F9C\\u0F9E-\\u0FA1\\u0FA3-\\u0FA6" + - "\\u0FA8-\\u0FAB\\u0FAD-\\u0FB8\\u0FBA-\\u0FBC\\u0FC6\\u1000-\\u1049\\u1050-\\u109D" + - "\\u10C7\\u10CD\\u10D0-\\u10F0\\u10F7-\\u10FA\\u10FD-\\u10FF\\u1200-\\u1248\\u124A-\\u124D" + - "\\u1250-\\u1256\\u1258\\u125A-\\u125D\\u1260-\\u1288\\u128A-\\u128D\\u1290-\\u12B0" + - "\\u12B2-\\u12B5\\u12B8-\\u12BE\\u12C0\\u12C2-\\u12C5\\u12C8-\\u12D6\\u12D8-\\u1310" + - 
"\\u1312-\\u1315\\u1318-\\u135A\\u135D-\\u135F\\u1380-\\u138F\\u1780-\\u17A2\\u17A5-\\u17A7" + - "\\u17A9-\\u17B3\\u17B6-\\u17CA\\u17D2\\u17D7\\u17DC\\u17E0-\\u17E9\\u1E00-\\u1E99" + - "\\u1E9E\\u1EA0-\\u1EF9\\u1F00-\\u1F15\\u1F18-\\u1F1D\\u1F20-\\u1F45\\u1F48-\\u1F4D" + - "\\u1F50-\\u1F57\\u1F59\\u1F5B\\u1F5D\\u1F5F-\\u1F70\\u1F72\\u1F74\\u1F76\\u1F78" + - "\\u1F7A\\u1F7C\\u1F80-\\u1FB4\\u1FB6-\\u1FBA\\u1FBC\\u1FC2-\\u1FC4\\u1FC6-\\u1FC8" + - "\\u1FCA\\u1FCC\\u1FD0-\\u1FD2\\u1FD6-\\u1FDA\\u1FE0-\\u1FE2\\u1FE4-\\u1FEA\\u1FEC" + - "\\u1FF2-\\u1FF4\\u1FF6-\\u1FF8\\u1FFA\\u1FFC\\u2D27\\u2D2D\\u2D80-\\u2D96\\u2DA0-\\u2DA6" + - "\\u2DA8-\\u2DAE\\u2DB0-\\u2DB6\\u2DB8-\\u2DBE\\u2DC0-\\u2DC6\\u2DC8-\\u2DCE\\u2DD0-\\u2DD6" + - "\\u2DD8-\\u2DDE\\u3005-\\u3007\\u3041-\\u3096\\u3099-\\u309A\\u309D-\\u309E\\u30A1-\\u30FA" + - "\\u30FC-\\u30FE\\u3105-\\u312D\\u31A0-\\u31BA\\u3400-\\u4DB5\\u4E00-\\u9FD5\\uA660-\\uA661" + - "\\uA674-\\uA67B\\uA67F\\uA69F\\uA717-\\uA71F\\uA788\\uA78D-\\uA78E\\uA790-\\uA793" + - "\\uA7A0-\\uA7AA\\uA7FA\\uA9E7-\\uA9FE\\uAA60-\\uAA76\\uAA7A-\\uAA7F\\uAB01-\\uAB06" + - "\\uAB09-\\uAB0E\\uAB11-\\uAB16\\uAB20-\\uAB26\\uAB28-\\uAB2E\\uAC00-\\uD7A3\\uFA0E-\\uFA0F" + - "\\uFA11\\uFA13-\\uFA14\\uFA1F\\uFA21\\uFA23-\\uFA24\\uFA27-\\uFA29\\U00020000-\\U0002A6D6" + - "\\U0002A700-\\U0002B734\\U0002B740-\\U0002B81D\\U0002B820-\\U0002CEA1]" - ).freeze(); - // Note: data from http://unicode.org/Public/security/latest/xidmodifications.txt version 8.0.0 - // There is no tooling to generate this from the .txt file, - // copy the set contents from ICU4C source/i18n/uspoof.cpp recommendedPat. - // (Add '+' for string concatenation.) 
+ "[0-9A-Z_a-z\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u0131\\u0134-\\u013E\\u0141-\\u014" + + "8\\u014A-\\u017E\\u018F\\u01A0\\u01A1\\u01AF\\u01B0\\u01CD-\\u01DC\\u01DE-\\u01E3\\u01E" + + "6-\\u01F0\\u01F4\\u01F5\\u01F8-\\u021B\\u021E\\u021F\\u0226-\\u0233\\u0259\\u02BB\\u02B" + + "C\\u02EC\\u0300-\\u0304\\u0306-\\u030C\\u030F-\\u0311\\u0313\\u0314\\u031B\\u0323-\\u03" + + "28\\u032D\\u032E\\u0330\\u0331\\u0335\\u0338\\u0339\\u0342\\u0345\\u037B-\\u037D\\u0386" + + "\\u0388-\\u038A\\u038C\\u038E-\\u03A1\\u03A3-\\u03CE\\u03FC-\\u045F\\u048A-\\u0529\\u05" + + "2E\\u052F\\u0531-\\u0556\\u0559\\u0561-\\u0586\\u05B4\\u05D0-\\u05EA\\u05F0-\\u05F2\\u0" + + "620-\\u063F\\u0641-\\u0655\\u0660-\\u0669\\u0670-\\u0672\\u0674\\u0679-\\u068D\\u068F-" + + "\\u06D3\\u06D5\\u06E5\\u06E6\\u06EE-\\u06FC\\u06FF\\u0750-\\u07B1\\u08A0-\\u08AC\\u08B2" + + "\\u08B6-\\u08BD\\u0901-\\u094D\\u094F\\u0950\\u0956\\u0957\\u0960-\\u0963\\u0966-\\u096" + + "F\\u0971-\\u0977\\u0979-\\u097F\\u0981-\\u0983\\u0985-\\u098C\\u098F\\u0990\\u0993-\\u0" + + "9A8\\u09AA-\\u09B0\\u09B2\\u09B6-\\u09B9\\u09BC-\\u09C4\\u09C7\\u09C8\\u09CB-\\u09CE\\u" + + "09D7\\u09E0-\\u09E3\\u09E6-\\u09F1\\u0A01-\\u0A03\\u0A05-\\u0A0A\\u0A0F\\u0A10\\u0A13-" + + "\\u0A28\\u0A2A-\\u0A30\\u0A32\\u0A35\\u0A38\\u0A39\\u0A3C\\u0A3E-\\u0A42\\u0A47\\u0A48\\" + + "u0A4B-\\u0A4D\\u0A5C\\u0A66-\\u0A74\\u0A81-\\u0A83\\u0A85-\\u0A8D\\u0A8F-\\u0A91\\u0A9" + + "3-\\u0AA8\\u0AAA-\\u0AB0\\u0AB2\\u0AB3\\u0AB5-\\u0AB9\\u0ABC-\\u0AC5\\u0AC7-\\u0AC9\\u0" + + "ACB-\\u0ACD\\u0AD0\\u0AE0-\\u0AE3\\u0AE6-\\u0AEF\\u0B01-\\u0B03\\u0B05-\\u0B0C\\u0B0F\\" + + "u0B10\\u0B13-\\u0B28\\u0B2A-\\u0B30\\u0B32\\u0B33\\u0B35-\\u0B39\\u0B3C-\\u0B43\\u0B47" + + "\\u0B48\\u0B4B-\\u0B4D\\u0B56\\u0B57\\u0B5F-\\u0B61\\u0B66-\\u0B6F\\u0B71\\u0B82\\u0B83" + + "\\u0B85-\\u0B8A\\u0B8E-\\u0B90\\u0B92-\\u0B95\\u0B99\\u0B9A\\u0B9C\\u0B9E\\u0B9F\\u0BA3" + + "\\u0BA4\\u0BA8-\\u0BAA\\u0BAE-\\u0BB9\\u0BBE-\\u0BC2\\u0BC6-\\u0BC8\\u0BCA-\\u0BCD\\u0B" + + 
"D0\\u0BD7\\u0BE6-\\u0BEF\\u0C01-\\u0C03\\u0C05-\\u0C0C\\u0C0E-\\u0C10\\u0C12-\\u0C28\\u" + + "0C2A-\\u0C33\\u0C35-\\u0C39\\u0C3D-\\u0C44\\u0C46-\\u0C48\\u0C4A-\\u0C4D\\u0C55\\u0C56" + + "\\u0C60\\u0C61\\u0C66-\\u0C6F\\u0C80\\u0C82\\u0C83\\u0C85-\\u0C8C\\u0C8E-\\u0C90\\u0C92" + + "-\\u0CA8\\u0CAA-\\u0CB3\\u0CB5-\\u0CB9\\u0CBC-\\u0CC4\\u0CC6-\\u0CC8\\u0CCA-\\u0CCD\\u0" + + "CD5\\u0CD6\\u0CE0-\\u0CE3\\u0CE6-\\u0CEF\\u0CF1\\u0CF2\\u0D02\\u0D03\\u0D05-\\u0D0C\\u0" + + "D0E-\\u0D10\\u0D12-\\u0D3A\\u0D3D-\\u0D43\\u0D46-\\u0D48\\u0D4A-\\u0D4E\\u0D54-\\u0D57" + + "\\u0D60\\u0D61\\u0D66-\\u0D6F\\u0D7A-\\u0D7F\\u0D82\\u0D83\\u0D85-\\u0D8E\\u0D91-\\u0D9" + + "6\\u0D9A-\\u0DA5\\u0DA7-\\u0DB1\\u0DB3-\\u0DBB\\u0DBD\\u0DC0-\\u0DC6\\u0DCA\\u0DCF-\\u0" + + "DD4\\u0DD6\\u0DD8-\\u0DDE\\u0DF2\\u0E01-\\u0E32\\u0E34-\\u0E3A\\u0E40-\\u0E4E\\u0E50-\\" + + "u0E59\\u0E81\\u0E82\\u0E84\\u0E87\\u0E88\\u0E8A\\u0E8D\\u0E94-\\u0E97\\u0E99-\\u0E9F\\u" + + "0EA1-\\u0EA3\\u0EA5\\u0EA7\\u0EAA\\u0EAB\\u0EAD-\\u0EB2\\u0EB4-\\u0EB9\\u0EBB-\\u0EBD\\" + + "u0EC0-\\u0EC4\\u0EC6\\u0EC8-\\u0ECD\\u0ED0-\\u0ED9\\u0EDE\\u0EDF\\u0F00\\u0F20-\\u0F29" + + "\\u0F35\\u0F37\\u0F3E-\\u0F42\\u0F44-\\u0F47\\u0F49-\\u0F4C\\u0F4E-\\u0F51\\u0F53-\\u0F" + + "56\\u0F58-\\u0F5B\\u0F5D-\\u0F68\\u0F6A-\\u0F6C\\u0F71\\u0F72\\u0F74\\u0F7A-\\u0F80\\u0" + + "F82-\\u0F84\\u0F86-\\u0F92\\u0F94-\\u0F97\\u0F99-\\u0F9C\\u0F9E-\\u0FA1\\u0FA3-\\u0FA6" + + "\\u0FA8-\\u0FAB\\u0FAD-\\u0FB8\\u0FBA-\\u0FBC\\u0FC6\\u1000-\\u1049\\u1050-\\u109D\\u10" + + "C7\\u10CD\\u10D0-\\u10F0\\u10F7-\\u10FA\\u10FD-\\u10FF\\u1200-\\u1248\\u124A-\\u124D\\u" + + "1250-\\u1256\\u1258\\u125A-\\u125D\\u1260-\\u1288\\u128A-\\u128D\\u1290-\\u12B0\\u12B2" + + "-\\u12B5\\u12B8-\\u12BE\\u12C0\\u12C2-\\u12C5\\u12C8-\\u12D6\\u12D8-\\u1310\\u1312-\\u1" + + "315\\u1318-\\u135A\\u135D-\\u135F\\u1380-\\u138F\\u1780-\\u17A2\\u17A5-\\u17A7\\u17A9-" + + "\\u17B3\\u17B6-\\u17CA\\u17D2\\u17D7\\u17DC\\u17E0-\\u17E9\\u1C80-\\u1C88\\u1E00-\\u1E9" + + 
"9\\u1E9E\\u1EA0-\\u1EF9\\u1F00-\\u1F15\\u1F18-\\u1F1D\\u1F20-\\u1F45\\u1F48-\\u1F4D\\u1" + + "F50-\\u1F57\\u1F59\\u1F5B\\u1F5D\\u1F5F-\\u1F70\\u1F72\\u1F74\\u1F76\\u1F78\\u1F7A\\u1F" + + "7C\\u1F80-\\u1FB4\\u1FB6-\\u1FBA\\u1FBC\\u1FC2-\\u1FC4\\u1FC6-\\u1FC8\\u1FCA\\u1FCC\\u1" + + "FD0-\\u1FD2\\u1FD6-\\u1FDA\\u1FE0-\\u1FE2\\u1FE4-\\u1FEA\\u1FEC\\u1FF2-\\u1FF4\\u1FF6-" + + "\\u1FF8\\u1FFA\\u1FFC\\u2D27\\u2D2D\\u2D80-\\u2D96\\u2DA0-\\u2DA6\\u2DA8-\\u2DAE\\u2DB0" + + "-\\u2DB6\\u2DB8-\\u2DBE\\u2DC0-\\u2DC6\\u2DC8-\\u2DCE\\u2DD0-\\u2DD6\\u2DD8-\\u2DDE\\u3" + + "005-\\u3007\\u3041-\\u3096\\u3099\\u309A\\u309D\\u309E\\u30A1-\\u30FA\\u30FC-\\u30FE\\u" + + "3105-\\u312D\\u31A0-\\u31BA\\u3400-\\u4DB5\\u4E00-\\u9FD5\\uA660\\uA661\\uA674-\\uA67B" + + "\\uA67F\\uA69F\\uA717-\\uA71F\\uA788\\uA78D\\uA78E\\uA790-\\uA793\\uA7A0-\\uA7AA\\uA7AE" + + "\\uA7FA\\uA9E7-\\uA9FE\\uAA60-\\uAA76\\uAA7A-\\uAA7F\\uAB01-\\uAB06\\uAB09-\\uAB0E\\uAB" + + "11-\\uAB16\\uAB20-\\uAB26\\uAB28-\\uAB2E\\uAC00-\\uD7A3\\uFA0E\\uFA0F\\uFA11\\uFA13\\uF" + + "A14\\uFA1F\\uFA21\\uFA23\\uFA24\\uFA27-\\uFA29\\U00020000-\\U0002A6D6\\U0002A700-\\U0" + + "002B734\\U0002B740-\\U0002B81D\\U0002B820-\\U0002CEA1]").freeze(); + // Note: data from http://unicode.org/Public/security/9.0.0/IdentifierStatus.txt + // There is tooling to generate this constant in the unicodetools project: + // org.unicode.text.tools.RecommendedSetGenerator + // It will print the Java and C++ code to the console for easy copy-paste into this file. /** * Constants for the kinds of checks that USpoofChecker can perform. These values are used both to select the set of @@ -284,63 +359,66 @@ public class SpoofChecker { */ /** - * Single script confusable test. When testing whether two identifiers are confusable, report that they are if both - * are from the same script and they are visually confusable. Note: this test is not applicable to a check of a - * single identifier. 
+ * When performing the two-string {@link SpoofChecker#areConfusable} test, this flag in the return value indicates + * that the two strings are visually confusable and that they are from the same script, according to UTS 39 section + * 4. * * @stable ICU 4.6 */ public static final int SINGLE_SCRIPT_CONFUSABLE = 1; /** - * Mixed script confusable test. - * - *

When checking a single identifier, report a problem if the identifier contains multiple scripts, and is also - * confusable with some other identifier in a single script. - * - *

When testing whether two identifiers are confusable, report that they are if the two IDs are visually confusable, - * and and at least one contains characters from more than one script. + * When performing the two-string {@link SpoofChecker#areConfusable} test, this flag in the return value indicates + * that the two strings are visually confusable and that they are not from the same script, according to UTS + * 39 section 4. * * @stable ICU 4.6 */ public static final int MIXED_SCRIPT_CONFUSABLE = 2; /** - * Whole script confusable test. - * - *

When checking a single identifier, report a problem if The identifier is of a single script, and there exists a - * confusable identifier in another script. - * - *

When testing whether two Identifiers are confusable, report that they are if each is of a single script, the - * scripts of the two identifiers are different, and the identifiers are visually confusable. + * When performing the two-string {@link SpoofChecker#areConfusable} test, this flag in the return value indicates + * that the two strings are visually confusable and that they are not from the same script but both of them are + * single-script strings, according to UTS 39 section 4. * * @stable ICU 4.6 */ public static final int WHOLE_SCRIPT_CONFUSABLE = 4; /** - * Any Case Modifier for confusable identifier tests. + * Enable this flag in {@link SpoofChecker.Builder#setChecks} to turn on all types of confusables. You may set the + * checks to some subset of SINGLE_SCRIPT_CONFUSABLE, MIXED_SCRIPT_CONFUSABLE, or WHOLE_SCRIPT_CONFUSABLE to make + * {@link SpoofChecker#areConfusable} return only those types of confusables. * - *

When specified, consider all characters, of any case, when looking for confusables. If ANY_CASE is not specified, - * identifiers being checked are assumed to have been case folded, and upper case conusable characters will not be - * checked. + *

+ * Note: if you wish to use {@link SpoofChecker#getSkeleton}, it is required that you enable at least one of the + * CONFUSABLE flags. * - * @stable ICU 4.6 + * @draft ICU 58 + * @provisional This API might change or be removed in a future release. */ - public static final int ANY_CASE = 8; + public static final int CONFUSABLE = SINGLE_SCRIPT_CONFUSABLE | MIXED_SCRIPT_CONFUSABLE | WHOLE_SCRIPT_CONFUSABLE; /** - * Check that an identifier is no looser than the specified RestrictionLevel. - * The default if this is not called is HIGHLY_RESTRICTIVE. + * This flag is deprecated and no longer affects the behavior of SpoofChecker. * - * @internal - * @deprecated This API is ICU internal only. + * @deprecated ICU 58 This API was deprecated in UTS 39 revision 11 and is no longer used. */ @Deprecated + public static final int ANY_CASE = 8; + + /** + * Check that an identifier satisfies the requirements for the restriction level specified in + * {@link SpoofChecker.Builder#setRestrictionLevel}. The default restriction level is + * {@link RestrictionLevel#HIGHLY_RESTRICTIVE}. + * + * @draft ICU 58 + * @provisional This API might change or be removed in a future release. + */ public static final int RESTRICTION_LEVEL = 16; /** - * Check that an identifer contains only characters from a single script (plus chars from the common and inherited + * Check that an identifier contains only characters from a single script (plus chars from the common and inherited * scripts.) Applies to checks of a single identifier check only. * * @deprecated ICU 51 Use RESTRICTION_LEVEL @@ -359,19 +437,20 @@ public class SpoofChecker { /** * Check that an identifier contains only characters from a specified set of acceptable characters. See - * Builder.setAllowedChars() and Builder.setAllowedLocales(). + * {@link Builder#setAllowedChars} and {@link Builder#setAllowedLocales}. Note that a string that fails this check + * will also fail the {@link #RESTRICTION_LEVEL} check. 
* * @stable ICU 4.6 */ public static final int CHAR_LIMIT = 64; /** - * Check that an identifier does not mix numbers. + * Check that an identifier does not mix numbers from different numbering systems. For more information, see UTS 39 + * section 5.3. * - * @internal - * @deprecated This API is ICU internal only. + * @draft ICU 58 + * @provisional This API might change or be removed in a future release. */ - @Deprecated public static final int MIXED_NUMBERS = 128; // Update CheckResult.toString() when a new check is added. @@ -383,9 +462,8 @@ public class SpoofChecker { */ public static final int ALL_CHECKS = 0xFFFFFFFF; - - // Magic number for sanity checking spoof binary resource data. - static final int MAGIC = 0x3845fdef; + // Used for checking for ASCII-Only restriction level + static final UnicodeSet ASCII = new UnicodeSet(0, 0x7F).freeze(); /** * private constructor: a SpoofChecker has to be built by the builder @@ -429,10 +507,10 @@ public class SpoofChecker { */ public Builder(SpoofChecker src) { fChecks = src.fChecks; - fSpoofData = src.fSpoofData; // For the data, we will either use the source data - // as-is, or drop the builder's reference to it - // and generate new data, depending on what our - // caller does with the builder. + fSpoofData = src.fSpoofData; // For the data, we will either use the source data + // as-is, or drop the builder's reference to it + // and generate new data, depending on what our + // caller does with the builder. fAllowedCharsSet.set(src.fAllowedCharsSet); fAllowedLocales.addAll(src.fAllowedLocales); fRestrictionLevel = src.fRestrictionLevel; @@ -445,19 +523,20 @@ public class SpoofChecker { * @stable ICU 4.6 */ public SpoofChecker build() { - if (fSpoofData == null) { // read binary file + // TODO: Make this data loading be lazy (see #12696). + if (fSpoofData == null) { + // read binary file fSpoofData = SpoofData.getDefault(); } // Copy all state from the builder to the new SpoofChecker. 
- // Make sure that everything is either cloned or copied, so - // that subsequent re-use of the builder won't modify the built - // SpoofChecker. + // Make sure that everything is either cloned or copied, so + // that subsequent re-use of the builder won't modify the built + // SpoofChecker. // - // One exception to this: the SpoofData is just assigned. - // If the builder subsequently needs to modify fSpoofData - // it will create a new SpoofData object first. - + // One exception to this: the SpoofData is just assigned. + // If the builder subsequently needs to modify fSpoofData + // it will create a new SpoofData object first. SpoofChecker result = new SpoofChecker(); result.fChecks = this.fChecks; @@ -470,35 +549,77 @@ public class SpoofChecker { } /** - * Specify the source form of the spoof data Spoof Checker. The inputs correspond to the Unicode data - * files confusables.txt and confusablesWholeScript.txt as described in Unicode UAX 39. The syntax of the source - * data is as described in UAX 39 for these files, and the content of these files is acceptable input. + * Specify the source form of the spoof data Spoof Checker. The inputs correspond to the Unicode data file + * confusables.txt as described in Unicode UAX 39. The syntax of the source data is as described in UAX 39 for + * these files, and the content of these files is acceptable input. * * @param confusables * the Reader of confusable characters definitions, as found in file confusables.txt from * unicode.org. - * @param confusablesWholeScript - * the Reader of whole script confusables definitions, as found in the file - * xonfusablesWholeScript.txt from unicode.org. * @throws ParseException * To report syntax errors in the input. - * @stable ICU 4.6 + * + * @draft ICU 58 + * @provisional This API might change or be removed in a future release. 
*/ - public Builder setData(Reader confusables, Reader confusablesWholeScript) throws ParseException, - java.io.IOException { + public Builder setData(Reader confusables) throws ParseException, IOException { // Compile the binary data from the source (text) format. - // Drop the builder's reference to any pre-existing data, which may - // be in use in an already-built checker. + // Drop the builder's reference to any pre-existing data, which may + // be in use in an already-built checker. fSpoofData = new SpoofData(); ConfusabledataBuilder.buildConfusableData(confusables, fSpoofData); - WSConfusableDataBuilder.buildWSConfusableData(confusablesWholeScript, fSpoofData); return this; } /** - * Specify the set of checks that will be performed by the check functions of this Spoof Checker. + * Deprecated as of ICU 58; use {@link SpoofChecker.Builder#setData(Reader confusables)} instead. + * + * @param confusables + * the Reader of confusable characters definitions, as found in file confusables.txt from + * unicode.org. + * @param confusablesWholeScript + * No longer supported. + * @throws ParseException + * To report syntax errors in the input. + * + * @deprecated ICU 58 + */ + @Deprecated + public Builder setData(Reader confusables, Reader confusablesWholeScript) throws ParseException, IOException { + setData(confusables); + return this; + } + + /** + * Specify the bitmask of checks that will be performed by {@link SpoofChecker#failsChecks}. Calling this method + * overwrites any checks that may have already been enabled. By default, all checks are enabled. + * + * To enable specific checks and disable all others, the "whitelisted" checks should be ORed together. For + * example, to fail strings containing characters outside of the set specified by {@link #setAllowedChars} and + * also strings that contain digits from mixed numbering systems: + * + *

+         * {@code
+         * builder.setChecks(SpoofChecker.CHAR_LIMIT | SpoofChecker.MIXED_NUMBERS);
+         * }
+         * 
+ * + * To disable specific checks and enable all others, the "blacklisted" checks should be ANDed away from + * ALL_CHECKS. For example, if you are not planning to use the {@link SpoofChecker#areConfusable} functionality, + * it is good practice to disable the CONFUSABLE check: + * + *
+         * {@code
+         * builder.setChecks(SpoofChecker.ALL_CHECKS & ~SpoofChecker.CONFUSABLE);
+         * }
+         * 
+ * + * Note that methods such as {@link #setAllowedChars}, {@link #setAllowedLocales}, and + * {@link #setRestrictionLevel} will enable certain checks when called. Those methods will OR the check they + * enable onto the existing bitmask specified by this method. For more details, see the documentation of those + * methods. * * @param checks * The set of checks that this spoof checker will perform. The value is an 'or' of the desired @@ -527,15 +648,15 @@ public class SpoofChecker { * * Supplying an empty string removes all restrictions; characters from any script will be allowed. * - * The CHAR_LIMIT test is automatically enabled for this SpoofChecker when calling this function with a + * The {@link #CHAR_LIMIT} test is automatically enabled for this SpoofChecker when calling this function with a * non-empty list of locales. * - * The Unicode Set of characters that will be allowed is accessible via the getAllowedChars() function. + * The Unicode Set of characters that will be allowed is accessible via the {@link #getAllowedChars} function. * setAllowedLocales() will replace any previously applied set of allowed characters. * * Adjustments, such as additions or deletions of certain classes of characters, can be made to the result of - * setAllowedLocales() by fetching the resulting set with getAllowedChars(), manipulating it with the Unicode - * Set API, then resetting the spoof detectors limits with setAllowedChars() + * {@link #setAllowedLocales} by fetching the resulting set with {@link #getAllowedChars}, manipulating it with + * the Unicode Set API, then resetting the spoof detector's limits with {@link #setAllowedChars}. * * @param locales * A Set of ULocales, from which the language and associated script are extracted. If the locales Set @@ -581,6 +702,7 @@ public class SpoofChecker { * Limit characters that are acceptable in identifiers being checked to those normally used with the languages * associated with the specified locales. 
Any previously specified list of locales is replaced by the new * settings. + * * @param locales * A Set of Locales, from which the language and associated script are extracted. If the locales Set * is null, no restrictions will be placed on the allowed characters. @@ -616,9 +738,9 @@ public class SpoofChecker { /** * Limit the acceptable characters to those specified by a Unicode Set. Any previously specified character limit * is is replaced by the new settings. This includes limits on characters that were set with the - * setAllowedLocales() function. Note that the RESTRICTED set is useful; + * setAllowedLocales() function. Note that the RESTRICTED set is useful. * - * The CHAR_LIMIT test is automatically enabled for this SpoofChecker by this function. + * The {@link #CHAR_LIMIT} test is automatically enabled for this SpoofChecker by this function. * * @param chars * A Unicode Set containing the list of characters that are permitted. The incoming set is cloned by @@ -634,330 +756,27 @@ public class SpoofChecker { return this; } - /** - * Set the loosest restriction level allowed. The default if this is not called is HIGHLY_RESTRICTIVE. - * This method also sets RESTRICTION_LEVEL. - * @param restrictionLevel The loosest restriction level allowed. + * Set the loosest restriction level allowed for strings. The default if this is not called is + * {@link RestrictionLevel#HIGHLY_RESTRICTIVE}. Calling this method enables the {@link #RESTRICTION_LEVEL} and + * {@link #MIXED_NUMBERS} checks, corresponding to Sections 5.2 and 5.3 of UTS 39. To customize which checks are + * to be performed by {@link SpoofChecker#failsChecks}, see {@link #setChecks}. + * + * @param restrictionLevel + * The loosest restriction level allowed. * @return self - * @internal - * @deprecated This API is ICU internal only. + * @provisional This API might change or be removed in a future release. 
+ * @draft ICU 58 */ - @Deprecated public Builder setRestrictionLevel(RestrictionLevel restrictionLevel) { fRestrictionLevel = restrictionLevel; - fChecks |= RESTRICTION_LEVEL; + fChecks |= RESTRICTION_LEVEL | MIXED_NUMBERS; return this; } - // Structure for the Whole Script Confusable Data - // See Unicode UAX-39, Unicode Security Mechanisms, for a description of the - // Whole Script confusable data - // - // The data provides mappings from code points to a set of scripts - // that contain characters that might be confused with the code point. - // There are two mappings, one for lower case only, and one for characters - // of any case. - // - // The actual data consists of a utrie2 to map from a code point to an offset, - // and an array of UScriptSets (essentially bit maps) that is indexed - // by the offsets obtained from the Trie. - // - // - - /* - * Internal functions for compiling Whole Script confusable source data into its binary (runtime) form. The - * binary data format is described in uspoof_impl.h - */ - private static class WSConfusableDataBuilder { - - // Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt - // Example Lines: - // 006F ; Latn; Deva; A # (o) LATIN SMALL LETTER O - // 0048..0049 ; Latn; Grek; A # [2] (H..I) LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I - // | | | | - // | | | |---- Which table, Any Case or Lower Case (A or L) - // | | |----------Target script. We need this. - // | |----------------Src script. Should match the script of the source - // | code points. Beyond checking that, we don't keep it. - // |--------------------------------Source code points or range. - // - // The expression will match _all_ lines, including erroneous lines. - // The result of the parse is returned via the contents of the (match) groups. - static String parseExp = - "(?m)" + // Multi-line mode - "^([ \\t]*(?:#.*?)?)$" + // A blank or comment line. Matches Group 1. 
- "|^(?:" + // OR - "\\s*([0-9A-F]{4,})(?:..([0-9A-F]{4,}))?\\s*;" + // Code point range. Groups 2 and 3. - "\\s*([A-Za-z]+)\\s*;" + // The source script. Group 4. - "\\s*([A-Za-z]+)\\s*;" + // The target script. Group 5. - "\\s*(?:(A)|(L))" + // The table A or L. Group 6 or 7 - "[ \\t]*(?:#.*?)?" + // Trailing commment - ")$|" + // OR - "^(.*?)$"; // An error line. Group 8. - // Any line not matching the preceding - // parts of the expression will match - // this, and thus be flagged as an error - - - static void readWholeFileToString(Reader reader, StringBuffer buffer) throws java.io.IOException { - // Convert the user input data from UTF-8 to char (UTF-16) - LineNumberReader lnr = new LineNumberReader(reader); - do { - String line = lnr.readLine(); - if (line == null) { - break; - } - buffer.append(line); - buffer.append('\n'); - } while (true); - } - - // Build the Whole Script Confusable data - // - static void buildWSConfusableData(Reader confusablesWS, SpoofData dest) - throws ParseException, java.io.IOException { - Pattern parseRegexp = null; - StringBuffer input = new StringBuffer(); - int lineNum = 0; - - ArrayList scriptSets = null; - int rtScriptSetsCount = 2; - - Trie2Writable anyCaseTrie = new Trie2Writable(0, 0); - Trie2Writable lowerCaseTrie = new Trie2Writable(0, 0); - - // The scriptSets vector provides a mapping from TRIE values to the set - // of scripts. - // - // Reserved TRIE values: - // 0: Code point has no whole script confusables. - // 1: Code point is of script Common or Inherited. - // - // These code points do not participate in whole script confusable detection. - // (This is logically equivalent to saying that they contain confusables - // in all scripts) - // - // Because Trie values are indexes into the ScriptSets vector, pre-fill - // vector positions 0 and 1 to avoid conflicts with the reserved values. 
- - scriptSets = new ArrayList(); - scriptSets.add(null); - scriptSets.add(null); - - readWholeFileToString(confusablesWS, input); - - parseRegexp = Pattern.compile(parseExp); - - // Zap any Byte Order Mark at the start of input. Changing it to a space - // is benign - // given the syntax of the input. - if (input.charAt(0) == 0xfeff) { - input.setCharAt(0, (char) 0x20); - } - - // Parse the input, one line per iteration of this loop. - Matcher matcher = parseRegexp.matcher(input); - while (matcher.find()) { - lineNum++; - if (matcher.start(1) >= 0) { - // this was a blank or comment line. - continue; - } - if (matcher.start(8) >= 0) { - // input file syntax error. - throw new ParseException("ConfusablesWholeScript, line " + lineNum + ": Unrecognized input: " - + matcher.group(), matcher.start()); - } - - // Pick up the start and optional range end code points from the - // parsed line. - int startCodePoint = Integer.parseInt(matcher.group(2), 16); - if (startCodePoint > 0x10ffff) { - throw new ParseException("ConfusablesWholeScript, line " + lineNum - + ": out of range code point: " + matcher.group(2), matcher.start(2)); - } - int endCodePoint = startCodePoint; - if (matcher.start(3) >= 0) { - endCodePoint = Integer.parseInt(matcher.group(3), 16); - } - if (endCodePoint > 0x10ffff) { - throw new ParseException("ConfusablesWholeScript, line " + lineNum - + ": out of range code point: " + matcher.group(3), matcher.start(3)); - } - - // Extract the two script names from the source line. 
- String srcScriptName = matcher.group(4); - String targScriptName = matcher.group(5); - int srcScript = UCharacter.getPropertyValueEnum(UProperty.SCRIPT, srcScriptName); - int targScript = UCharacter.getPropertyValueEnum(UProperty.SCRIPT, targScriptName); - if (srcScript == UScript.INVALID_CODE) { - throw new ParseException("ConfusablesWholeScript, line " + lineNum - + ": Invalid script code t: " + matcher.group(4), matcher.start(4)); - } - if (targScript == UScript.INVALID_CODE) { - throw new ParseException("ConfusablesWholeScript, line " + lineNum - + ": Invalid script code t: " + matcher.group(5), matcher.start(5)); - } - - // select the table - (A) any case or (L) lower case only - Trie2Writable table = anyCaseTrie; - if (matcher.start(7) >= 0) { - table = lowerCaseTrie; - } - - // Build the set of scripts containing confusable characters for - // the code point(s) specified in this input line. - // Sanity check that the script of the source code point is the same - // as the source script indicated in the input file. Failure of this - // check is an error in the input file. - // - // Include the source script in the set (needed for Mixed Script - // Confusable detection). 
- // - int cp; - for (cp = startCodePoint; cp <= endCodePoint; cp++) { - int setIndex = table.get(cp); - BuilderScriptSet bsset = null; - if (setIndex > 0) { - assert (setIndex < scriptSets.size()); - bsset = scriptSets.get(setIndex); - } else { - bsset = new BuilderScriptSet(); - bsset.codePoint = cp; - bsset.trie = table; - bsset.sset = new ScriptSet(); - setIndex = scriptSets.size(); - bsset.index = setIndex; - bsset.rindex = 0; - scriptSets.add(bsset); - table.set(cp, setIndex); - } - bsset.sset.Union(targScript); - bsset.sset.Union(srcScript); - - int cpScript = UScript.getScript(cp); - if (cpScript != srcScript) { - // status = U_INVALID_FORMAT_ERROR; - throw new ParseException("ConfusablesWholeScript, line " + lineNum - + ": Mismatch between source script and code point " + Integer.toString(cp, 16), - matcher.start(5)); - } - } - } - - // Eliminate duplicate script sets. At this point we have a separate - // script set for every code point that had data in the input file. - // - // We eliminate underlying ScriptSet objects, not the BuildScriptSets - // that wrap them - // - // printf("Number of scriptSets: %d\n", scriptSets.size()); - //int duplicateCount = 0; - rtScriptSetsCount = 2; - for (int outeri = 2; outeri < scriptSets.size(); outeri++) { - BuilderScriptSet outerSet = scriptSets.get(outeri); - if (outerSet.index != outeri) { - // This set was already identified as a duplicate. - // It will not be allocated a position in the runtime array - // of ScriptSets. - continue; - } - outerSet.rindex = rtScriptSetsCount++; - for (int inneri = outeri + 1; inneri < scriptSets.size(); inneri++) { - BuilderScriptSet innerSet = scriptSets.get(inneri); - if (outerSet.sset.equals(innerSet.sset) && outerSet.sset != innerSet.sset) { - innerSet.sset = outerSet.sset; - innerSet.index = outeri; - innerSet.rindex = outerSet.rindex; - //duplicateCount++; - } - // But this doesn't get all. We need to fix the TRIE. 
- } - } - // printf("Number of distinct script sets: %d\n", - // rtScriptSetsCount); - - // Update the Trie values to be reflect the run time script indexes (after duplicate merging). - // (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets - // are unused, which is why the loop index starts at 2.) - for (int i = 2; i < scriptSets.size(); i++) { - BuilderScriptSet bSet = scriptSets.get(i); - if (bSet.rindex != i) { - bSet.trie.set(bSet.codePoint, bSet.rindex); - } - } - - // For code points with script==Common or script==Inherited, - // Set the reserved value of 1 into both Tries. These characters do not participate - // in Whole Script Confusable detection; this reserved value is the means - // by which they are detected. - UnicodeSet ignoreSet = new UnicodeSet(); - ignoreSet.applyIntPropertyValue(UProperty.SCRIPT, UScript.COMMON); - UnicodeSet inheritedSet = new UnicodeSet(); - inheritedSet.applyIntPropertyValue(UProperty.SCRIPT, UScript.INHERITED); - ignoreSet.addAll(inheritedSet); - for (int rn = 0; rn < ignoreSet.getRangeCount(); rn++) { - int rangeStart = ignoreSet.getRangeStart(rn); - int rangeEnd = ignoreSet.getRangeEnd(rn); - anyCaseTrie.setRange(rangeStart, rangeEnd, 1, true); - lowerCaseTrie.setRange(rangeStart, rangeEnd, 1, true); - } - - // Put the compiled data to the destination SpoofData - dest.fAnyCaseTrie = anyCaseTrie.toTrie2_16(); - dest.fLowerCaseTrie = lowerCaseTrie.toTrie2_16(); - dest.fScriptSets = new ScriptSet[rtScriptSetsCount]; - dest.fScriptSets[0] = new ScriptSet(); - dest.fScriptSets[1] = new ScriptSet(); - - int rindex = 2; - for (int i = 2; i < scriptSets.size(); i++) { - BuilderScriptSet bSet = scriptSets.get(i); - if (bSet.rindex < rindex) { - // We have already put this script set to the output data. - continue; - } - assert (rindex == bSet.rindex); - dest.fScriptSets[rindex] = bSet.sset; - rindex++; - } - } - - // class BuilderScriptSet. 
Represents the set of scripts (Script Codes) - // containing characters that are confusable with one specific - // code point. - static class BuilderScriptSet { - int codePoint; // The source code point. - Trie2Writable trie; // Any-case or Lower-case Trie. - // These Trie tables are the final result of the - // build. This flag indicates which of the two - // this set of data is for. - - ScriptSet sset; // The set of scripts itself. - - int index; // Index of this set in the Build Time vector - // of script sets. - - int rindex; // Index of this set in the final (runtime) - // array of sets. - - // its underlying sset. - - BuilderScriptSet() { - codePoint = -1; - trie = null; - sset = null; - index = 0; - rindex = 0; - } - } - - } - /* - * ***************************************************************************** - * Internal classes for compililing confusable data into its binary (runtime) form. + * ***************************************************************************** Internal classes for + * compililing confusable data into its binary (runtime) form. * ***************************************************************************** */ // --------------------------------------------------------------------- @@ -968,31 +787,27 @@ public class SpoofChecker { // // The binary structures are described in uspoof_impl.h // - // 1. parse the data, building 4 hash tables, one each for the SL, SA, ML and MA - // tables. Each maps from a int to a String. + // 1. parse the data, making a hash table mapping from a codepoint to a String. // // 2. Sort all of the strings encountered by length, since they will need to // be stored in that order in the final string table. // - // 3. Build a list of keys (UChar32s) from the four mapping tables. Sort the + // 3. Build a list of keys (UChar32s) from the mapping table. Sort the // list because that will be the ordering of our runtime table. // // 4. Generate the run time string table. 
This is generated before the key & value - // tables because we need the string indexes when building those tables. + // table because we need the string indexes when building those tables. // - // 5. Build the run-time key and value tables. These are parallel tables, and + // 5. Build the run-time key and value table. These are parallel tables, and // are built at the same time // class ConfusabledataBuilder - // An instance of this class exists while the confusable data is being built from source. - // It encapsulates the intermediate data structures that are used for building. - // It exports one static function, to do a confusable data build. + // An instance of this class exists while the confusable data is being built from source. + // It encapsulates the intermediate data structures that are used for building. + // It exports one static function, to do a confusable data build. private static class ConfusabledataBuilder { - private Hashtable fSLTable; - private Hashtable fSATable; - private Hashtable fMLTable; - private Hashtable fMATable; + private Hashtable fTable; private UnicodeSet fKeySet; // A set of all keys (UChar32s) that go into the // four mapping tables. 
@@ -1001,43 +816,49 @@ public class SpoofChecker { private StringBuffer fStringTable; private ArrayList fKeyVec; private ArrayList fValueVec; - private ArrayList fStringLengthsTable; private SPUStringPool stringPool; private Pattern fParseLine; private Pattern fParseHexNum; private int fLineNum; ConfusabledataBuilder() { - fSLTable = new Hashtable(); - fSATable = new Hashtable(); - fMLTable = new Hashtable(); - fMATable = new Hashtable(); - fKeySet = new UnicodeSet(); - fKeyVec = new ArrayList(); + fTable = new Hashtable(); + fKeySet = new UnicodeSet(); + fKeyVec = new ArrayList(); fValueVec = new ArrayList(); stringPool = new SPUStringPool(); } void build(Reader confusables, SpoofData dest) throws ParseException, java.io.IOException { StringBuffer fInput = new StringBuffer(); - WSConfusableDataBuilder.readWholeFileToString(confusables, fInput); + + // Convert the user input data from UTF-8 to char (UTF-16) + LineNumberReader lnr = new LineNumberReader(confusables); + do { + String line = lnr.readLine(); + if (line == null) { + break; + } + fInput.append(line); + fInput.append('\n'); + } while (true); // Regular Expression to parse a line from Confusables.txt. The expression will match // any line. What was matched is determined by examining which capture groups have a match. - // Capture Group 1: the source char - // Capture Group 2: the replacement chars - // Capture Group 3-6 the table type, SL, SA, ML, or MA - // Capture Group 7: A blank or comment only line. - // Capture Group 8: A syntactically invalid line. Anything that didn't match before. + // Capture Group 1: the source char + // Capture Group 2: the replacement chars + // Capture Group 3-6 the table type, SL, SA, ML, or MA (deprecated) + // Capture Group 7: A blank or comment only line. + // Capture Group 8: A syntactically invalid line. Anything that didn't match before. // Example Line from the confusables.txt source file: - // "1D702 ; 006E 0329 ; SL # MATHEMATICAL ITALIC SMALL ETA ... 
" + // "1D702 ; 006E 0329 ; SL # MATHEMATICAL ITALIC SMALL ETA ... " fParseLine = Pattern.compile("(?m)^[ \\t]*([0-9A-Fa-f]+)[ \\t]+;" + // Match the source char - "[ \\t]*([0-9A-Fa-f]+" + // Match the replacement char(s) - "(?:[ \\t]+[0-9A-Fa-f]+)*)[ \\t]*;" + // (continued) - "\\s*(?:(SL)|(SA)|(ML)|(MA))" + // Match the table type - "[ \\t]*(?:#.*?)?$" + // Match any trailing #comment - "|^([ \\t]*(?:#.*?)?)$" + // OR match empty lines or lines with only a #comment - "|^(.*?)$"); // OR match any line, which catches illegal lines. + "[ \\t]*([0-9A-Fa-f]+" + // Match the replacement char(s) + "(?:[ \\t]+[0-9A-Fa-f]+)*)[ \\t]*;" + // (continued) + "\\s*(?:(SL)|(SA)|(ML)|(MA))" + // Match the table type + "[ \\t]*(?:#.*?)?$" + // Match any trailing #comment + "|^([ \\t]*(?:#.*?)?)$" + // OR match empty lines or lines with only a #comment + "|^(.*?)$"); // OR match any line, which catches illegal lines. // Regular expression for parsing a hex number out of a space-separated list of them. // Capture group 1 gets the number, with spaces removed. @@ -1060,8 +881,9 @@ public class SpoofChecker { if (matcher.start(8) >= 0) { // input file syntax error. // status = U_PARSE_ERROR; - throw new ParseException("Confusables, line " + fLineNum + ": Unrecognized Line: " - + matcher.group(8), matcher.start(8)); + throw new ParseException( + "Confusables, line " + fLineNum + ": Unrecognized Line: " + matcher.group(8), + matcher.start(8)); } // We have a good input line. Extract the key character and mapping @@ -1069,8 +891,9 @@ public class SpoofChecker { // put them into the appropriate mapping table. 
int keyChar = Integer.parseInt(matcher.group(1), 16); if (keyChar > 0x10ffff) { - throw new ParseException("Confusables, line " + fLineNum + ": Bad code point: " - + matcher.group(1), matcher.start(1)); + throw new ParseException( + "Confusables, line " + fLineNum + ": Bad code point: " + matcher.group(1), + matcher.start(1)); } Matcher m = fParseHexNum.matcher(matcher.group(2)); @@ -1078,8 +901,9 @@ public class SpoofChecker { while (m.find()) { int c = Integer.parseInt(m.group(1), 16); if (c > 0x10ffff) { - throw new ParseException("Confusables, line " + fLineNum + ": Bad code point: " - + Integer.toString(c, 16), matcher.start(2)); + throw new ParseException( + "Confusables, line " + fLineNum + ": Bad code point: " + Integer.toString(c, 16), + matcher.start(2)); } mapString.appendCodePoint(c); } @@ -1090,33 +914,10 @@ public class SpoofChecker { // eliminated. SPUString smapString = stringPool.addString(mapString.toString()); - // Add the char . string mapping to the appropriate table. - Hashtable table = - matcher.start(3) >= 0 ? fSLTable : - matcher.start(4) >= 0 ? fSATable : - matcher.start(5) >= 0 ? fMLTable : - matcher.start(6) >= 0 ? fMATable : - null; - assert (table != null); - + // Add the char . string mapping to the table. // For Unicode 8, the SL, SA and ML tables have been discontinued. - // All input data from confusables.txt is tagged MA. - // ICU spoof check functions should ignore the specified table and always - // use this MA Data. - // For now, implement by populating the MA data into all four tables, and - // keep the multiple table implementation in place, in case it comes back - // at some time in the future. - // There is no run time size penalty to keeping the four table implementation - - // the data is shared when it's the same betweeen tables. 
- - if (table != fMATable) { - throw new ParseException("Confusables, line " + fLineNum + ": Table must be 'MA'.", 0); - } - // table.put(keyChar, smapString); - fSLTable.put(keyChar, smapString); - fSATable.put(keyChar, smapString); - fMLTable.put(keyChar, smapString); - fMATable.put(keyChar, smapString); + // All input data from confusables.txt is tagged MA. + fTable.put(keyChar, smapString); fKeySet.add(keyChar); } @@ -1131,83 +932,62 @@ public class SpoofChecker { // Build up the string array, and record the index of each string therein // in the (build time only) string pool. // Strings of length one are not entered into the strings array. - // At the same time, build up the string lengths table, which records the - // position in the string table of the first string of each length >= 4. // (Strings in the table are sorted by length) stringPool.sort(); fStringTable = new StringBuffer(); - fStringLengthsTable = new ArrayList(); - int previousStringLength = 0; - int previousStringIndex = 0; int poolSize = stringPool.size(); int i; for (i = 0; i < poolSize; i++) { SPUString s = stringPool.getByIndex(i); int strLen = s.fStr.length(); int strIndex = fStringTable.length(); - assert (strLen >= previousStringLength); if (strLen == 1) { // strings of length one do not get an entry in the string table. // Keep the single string character itself here, which is the same // convention that is used in the final run-time string table index. - s.fStrTableIndex = s.fStr.charAt(0); + s.fCharOrStrTableIndex = s.fStr.charAt(0); } else { - if ((strLen > previousStringLength) && (previousStringLength >= 4)) { - fStringLengthsTable.add(previousStringIndex); - fStringLengthsTable.add(previousStringLength); - } - s.fStrTableIndex = strIndex; + s.fCharOrStrTableIndex = strIndex; fStringTable.append(s.fStr); } - previousStringLength = strLen; - previousStringIndex = strIndex; - } - // Make the final entry to the string lengths table. 
- // (it holds an entry for the _last_ string of each length, so adding - // the - // final one doesn't happen in the main loop because no longer string - // was encountered.) - if (previousStringLength >= 4) { - fStringLengthsTable.add(previousStringIndex); - fStringLengthsTable.add(previousStringLength); } - // Construct the compile-time Key and Value tables + // Construct the compile-time Key and Value table. // - // For each key code point, check which mapping tables it applies to, - // and create the final data for the key & value structures. + // The keys in the Key table follow the format described in uspoof.h for the + // Cfu confusables data structure. // - // The four logical mapping tables are conflated into one combined - // table. - // If multiple logical tables have the same mapping for some key, they - // share a single entry in the combined table. - // If more than one mapping exists for the same key code point, multiple - // entries will be created in the table - - for (String keyCharStr: fKeySet) { + // Starting in ICU 58, each code point has exactly one entry in the data + // structure. + + for (String keyCharStr : fKeySet) { int keyChar = keyCharStr.codePointAt(0); - addKeyEntry(keyChar, fSLTable, SpoofChecker.SL_TABLE_FLAG); - addKeyEntry(keyChar, fSATable, SpoofChecker.SA_TABLE_FLAG); - addKeyEntry(keyChar, fMLTable, SpoofChecker.ML_TABLE_FLAG); - addKeyEntry(keyChar, fMATable, SpoofChecker.MA_TABLE_FLAG); + SPUString targetMapping = fTable.get(keyChar); + assert targetMapping != null; + + int key = ConfusableDataUtils.codePointAndLengthToKey(keyChar, targetMapping.fStr.length()); + int value = targetMapping.fCharOrStrTableIndex; + + fKeyVec.add(key); + fValueVec.add(value); } // Put the assembled data into the destination SpoofData object. // The Key Table - // While copying the keys to the output array, - // also sanity check that the keys are sorted. 
- + // While copying the keys to the output array, + // also sanity check that the keys are sorted. int numKeys = fKeyVec.size(); dest.fCFUKeys = new int[numKeys]; - int previousKey = 0; - for (i=0; i= (previousKey & 0x00ffffff)); - assert ((key & 0xff000000) != 0); + int codePoint = ConfusableDataUtils.keyToCodePoint(key); + // strictly greater because there can be only one entry per code point + assert codePoint > previousCodePoint; dest.fCFUKeys[i] = key; - previousKey = key; + previousCodePoint = codePoint; } // The Value Table, parallels the key table @@ -1215,167 +995,24 @@ public class SpoofChecker { assert (numKeys == numValues); dest.fCFUValues = new short[numValues]; i = 0; - for (int value:fValueVec) { + for (int value : fValueVec) { assert (value < 0xffff); - dest.fCFUValues[i++] = (short)value; + dest.fCFUValues[i++] = (short) value; } // The Strings Table. - dest.fCFUStrings = fStringTable.toString(); - - - // The String Lengths Table. - - // While copying into the runtime array do some sanity checks on the values - // Each complete entry contains two fields, an index and an offset. - // Lengths should increase with each entry. - // Offsets should be less than the size of the string table. - - int lengthTableLength = fStringLengthsTable.size(); - int previousLength = 0; - - // Note: StringLengthsSize in the raw data is the number of complete entries, - // each consisting of a pair of 16 bit values, hence the divide by 2. 
- - int stringLengthsSize = lengthTableLength / 2; - dest.fCFUStringLengths = new SpoofData.SpoofStringLengthsElement[stringLengthsSize]; - for (i = 0; i < stringLengthsSize; i += 1) { - int offset = fStringLengthsTable.get(i*2); - int length = fStringLengthsTable.get(i*2 + 1); - assert (offset < dest.fCFUStrings.length()); - assert (length < 40); - assert (length > previousLength); - dest.fCFUStringLengths[i] = new SpoofData.SpoofStringLengthsElement(); - dest.fCFUStringLengths[i].fLastString = offset; - dest.fCFUStringLengths[i].fStrLength = length; - previousLength = length; - } - } - - // Add an entry to the key and value tables being built - // input: data from SLTable, MATable, etc. - // outut: entry added to fKeyVec and fValueVec - // addKeyEntry Construction of the confusable Key and Mapping Values tables. - // This is an intermediate point in the building process. - // We already have the mappings in the hash tables fSLTable, etc. - // This function builds corresponding run-time style table entries into - // fKeyVec and fValueVec - void addKeyEntry(int keyChar, // The key character - Hashtable table, // The table, one of SATable, - // MATable, etc. - int tableFlag) { // One of SA_TABLE_FLAG, etc. - SPUString targetMapping = table.get(keyChar); - if (targetMapping == null) { - // No mapping for this key character. - // (This function is called for all four tables for each key char - // that - // is seen anywhere, so this no entry cases are very much expected.) - return; - } - - // Check whether there is already an entry with the correct mapping. - // If so, simply set the flag in the keyTable saying that the existing - // entry - // applies to the table that we're doing now. 
- boolean keyHasMultipleValues = false; - int i; - for (i = fKeyVec.size() - 1; i >= 0; i--) { - int key = fKeyVec.get(i); - if ((key & 0x0ffffff) != keyChar) { - // We have now checked all existing key entries for this key - // char (if any) - // without finding one with the same mapping. - break; - } - String mapping = getMapping(i); - if (mapping.equals(targetMapping.fStr)) { - // The run time entry we are currently testing has the correct - // mapping. - // Set the flag in it indicating that it applies to the new - // table also. - key |= tableFlag; - fKeyVec.set(i, key); - return; - } - keyHasMultipleValues = true; - } - - // Need to add a new entry to the binary data being built for this - // mapping. - // Includes adding entries to both the key table and the parallel values - // table. - int newKey = keyChar | tableFlag; - if (keyHasMultipleValues) { - newKey |= SpoofChecker.KEY_MULTIPLE_VALUES; - } - int adjustedMappingLength = targetMapping.fStr.length() - 1; - if (adjustedMappingLength > 3) { - adjustedMappingLength = 3; - } - newKey |= adjustedMappingLength << SpoofChecker.KEY_LENGTH_SHIFT; - - int newData = targetMapping.fStrTableIndex; - - fKeyVec.add(newKey); - fValueVec.add(newData); - - // If the preceding key entry is for the same key character (but with a - // different mapping) - // set the multiple-values flag on it. - if (keyHasMultipleValues) { - int previousKeyIndex = fKeyVec.size() - 2; - int previousKey = fKeyVec.get(previousKeyIndex); - previousKey |= SpoofChecker.KEY_MULTIPLE_VALUES; - fKeyVec.set(previousKeyIndex, previousKey); - } } - // From an index into fKeyVec & fValueVec - // get a String with the corresponding mapping. 
- String getMapping(int index) { - int key = fKeyVec.get(index); - int value = fValueVec.get(index); - int length = SpoofChecker.getKeyLength(key); - int lastIndexWithLen; - switch (length) { - case 0: - char[] cs = { (char) value }; - return new String(cs); - case 1: - case 2: - return fStringTable.substring(value, value + length + 1); // Note: +1 as optimization - case 3: - length = 0; - int i; - for (i = 0; i < fStringLengthsTable.size(); i += 2) { - lastIndexWithLen = fStringLengthsTable.get(i); - if (value <= lastIndexWithLen) { - length = fStringLengthsTable.get(i + 1); - break; - } - } - assert (length >= 3); - return fStringTable.substring(value, value + length); - default: - assert (false); - } - return ""; - } - - - - - - public static void buildConfusableData(Reader confusables, SpoofData dest) throws java.io.IOException, - ParseException { + public static void buildConfusableData(Reader confusables, SpoofData dest) + throws java.io.IOException, ParseException { ConfusabledataBuilder builder = new ConfusabledataBuilder(); builder.build(confusables, dest); } /* - * ***************************************************************************** - * Internal classes for compiling confusable data into its binary (runtime) form. + * ***************************************************************************** Internal classes for + * compiling confusable data into its binary (runtime) form. * ***************************************************************************** */ // SPUString @@ -1385,13 +1022,13 @@ public class SpoofChecker { private static class SPUString { String fStr; // The actual string. - int fStrTableIndex; // Index into the final runtime data for this string. - // (or, for length 1, the single string char itself, - // there being no string table entry for it.) + int fCharOrStrTableIndex; // Index into the final runtime data for this string. + // (or, for length 1, the single string char itself, + // there being no string table entry for it.) 
SPUString(String s) { fStr = s; - fStrTableIndex = 0; + fCharOrStrTableIndex = 0; } } @@ -1412,6 +1049,8 @@ public class SpoofChecker { return sL.fStr.compareTo(sR.fStr); } } + + final static SPUStringComparator INSTANCE = new SPUStringComparator(); } // String Pool A utility class for holding the strings that are the result of @@ -1451,7 +1090,7 @@ public class SpoofChecker { // Sort the contents; affects the ordering of getByIndex(). public void sort() { - Collections.sort(fVec, new SPUStringComparator()); + Collections.sort(fVec, SPUStringComparator.INSTANCE); } private Vector fVec; // Elements are SPUString * @@ -1484,8 +1123,8 @@ public class SpoofChecker { } /** - * Get a read-only set of locales for the scripts that are acceptable in strings to be checked. If no limitations on scripts - * have been specified, an empty set will be returned. + * Get a read-only set of locales for the scripts that are acceptable in strings to be checked. If no limitations on + * scripts have been specified, an empty set will be returned. * * setAllowedChars() will reset the list of allowed locales to be empty. * @@ -1501,8 +1140,8 @@ public class SpoofChecker { } /** - * Get a set of {@link java.util.Locale} instances for the scripts that are acceptable in strings to be checked. If no - * limitations on scripts have been specified, an empty set will be returned. + * Get a set of {@link java.util.Locale} instances for the scripts that are acceptable in strings to be checked. If + * no limitations on scripts have been specified, an empty set will be returned. * * @return A set of locales corresponding to the acceptable scripts. * @stable ICU 54 @@ -1530,20 +1169,20 @@ public class SpoofChecker { } /** - * A struct-like class to hold the results of a Spoof Check operation. - * Tells which check(s) have failed. + * A struct-like class to hold the results of a Spoof Check operation. Tells which check(s) have failed. 
* * @stable ICU 4.6 */ public static class CheckResult { /** - * Indicate which of the spoof check(s) has failed. The value is a bitwise OR - * of the constants for the tests in question, SINGLE_SCRIPT_CONFUSABLE, - * MIXED_SCRIPT_CONFUSABLE, WHOLE_SCRIPT_CONFUSABLE, and so on. + * Indicates which of the spoof check(s) have failed. The value is a bitwise OR of the constants for the tests + * in question: RESTRICTION_LEVEL, CHAR_LIMIT, and so on. * * @stable ICU 4.6 + * @see Builder#setChecks */ public int checks; + /** * The index of the first string position that failed a check. * @@ -1551,26 +1190,27 @@ public class SpoofChecker { */ @Deprecated public int position; + /** * The numerics found in the string, if MIXED_NUMBERS was set; otherwise null; * - * @internal - * @deprecated This API is ICU internal only. + * @draft ICU 58 + * @provisional This API might change or be removed in a future release. */ - @Deprecated public UnicodeSet numerics; + /** * The restriction level that the text meets, if RESTRICTION_LEVEL is set; otherwise null. * - * @internal - * @deprecated This API is ICU internal only. + * @draft ICU 58 + * @provisional This API might change or be removed in a future release. */ - @Deprecated public RestrictionLevel restrictionLevel; /** - * Default constructor - * @stable ICU 4.6 + * Default constructor + * + * @stable ICU 4.6 */ public CheckResult() { checks = 0; @@ -1579,6 +1219,7 @@ public class SpoofChecker { /** * {@inheritDoc} + * * @stable ICU 4.6 */ @Override @@ -1629,8 +1270,7 @@ public class SpoofChecker { * @param text * A String to be checked for possible security issues. * @param checkResult - * Output parameter, indicates which specific tests failed. - * May be null if the information is not wanted. + * Output parameter, indicates which specific tests failed. May be null if the information is not wanted. * @return True there any issue is found with the input string. 
* @stable ICU 4.8 */ @@ -1644,15 +1284,8 @@ public class SpoofChecker { checkResult.restrictionLevel = null; } - // Allocate an identifier info if needed. - - IdentifierInfo identifierInfo = null; - if (0 != ((this.fChecks) & (RESTRICTION_LEVEL | MIXED_NUMBERS))) { - identifierInfo = getIdentifierInfo().setIdentifier(text).setIdentifierProfile(fAllowedCharsSet); - } - - if (0 != ((this.fChecks) & RESTRICTION_LEVEL)) { - RestrictionLevel textRestrictionLevel = identifierInfo.getRestrictionLevel(); + if (0 != (this.fChecks & RESTRICTION_LEVEL)) { + RestrictionLevel textRestrictionLevel = getRestrictionLevel(text); if (textRestrictionLevel.compareTo(fRestrictionLevel) > 0) { result |= RESTRICTION_LEVEL; } @@ -1661,8 +1294,9 @@ public class SpoofChecker { } } - if (0 != ((this.fChecks) & MIXED_NUMBERS)) { - UnicodeSet numerics = identifierInfo.getNumerics(); + if (0 != (this.fChecks & MIXED_NUMBERS)) { + UnicodeSet numerics = new UnicodeSet(); + getNumerics(text, numerics); if (numerics.size() > 1) { result |= MIXED_NUMBERS; } @@ -1685,86 +1319,49 @@ public class SpoofChecker { } } - if (0 != (this.fChecks & (WHOLE_SCRIPT_CONFUSABLE | MIXED_SCRIPT_CONFUSABLE | INVISIBLE))) { - // These are the checks that need to be done on NFD input + if (0 != (this.fChecks & INVISIBLE)) { + // This check needs to be done on NFD input String nfdText = nfdNormalizer.normalize(text); - if (0 != (this.fChecks & INVISIBLE)) { - - // scan for more than one occurence of the same non-spacing mark - // in a sequence of non-spacing marks. - int i; - int c; - int firstNonspacingMark = 0; - boolean haveMultipleMarks = false; - UnicodeSet marksSeenSoFar = new UnicodeSet(); // Set of combining marks in a - // single combining sequence. 
- for (i = 0; i < length;) { - c = Character.codePointAt(nfdText, i); - i = Character.offsetByCodePoints(nfdText, i, 1); - if (Character.getType(c) != UCharacterCategory.NON_SPACING_MARK) { - firstNonspacingMark = 0; - if (haveMultipleMarks) { - marksSeenSoFar.clear(); - haveMultipleMarks = false; - } - continue; - } - if (firstNonspacingMark == 0) { - firstNonspacingMark = c; - continue; - } - if (!haveMultipleMarks) { - marksSeenSoFar.add(firstNonspacingMark); - haveMultipleMarks = true; - } - if (marksSeenSoFar.contains(c)) { - // report the error, and stop scanning. - // No need to find more than the first failure. - result |= INVISIBLE; - break; + // scan for more than one occurrence of the same non-spacing mark + // in a sequence of non-spacing marks. + int i; + int c; + int firstNonspacingMark = 0; + boolean haveMultipleMarks = false; + UnicodeSet marksSeenSoFar = new UnicodeSet(); // Set of combining marks in a + // single combining sequence. + for (i = 0; i < length;) { + c = Character.codePointAt(nfdText, i); + i = Character.offsetByCodePoints(nfdText, i, 1); + if (Character.getType(c) != UCharacterCategory.NON_SPACING_MARK) { + firstNonspacingMark = 0; + if (haveMultipleMarks) { + marksSeenSoFar.clear(); + haveMultipleMarks = false; } - marksSeenSoFar.add(c); + continue; } - } - - if (0 != (this.fChecks & (WHOLE_SCRIPT_CONFUSABLE | MIXED_SCRIPT_CONFUSABLE))) { - // The basic test is the same for both whole and mixed script confusables. - // Compute the set of scripts that every input character has a confusable in. - // For this computation an input character is always considered to be - // confusable with itself in its own script. - // - // If the number of such scripts is two or more, and the input consisted of - // characters all from a single script, we have a whole script confusable. - // (The two scripts will be the original script and the one that is confusable). 
- - // If the number of such scripts >= one, and the original input contained characters from - // more than one script, we have a mixed script confusable. (We can transform - // some of the characters, and end up with a visually similar string all in one script.) - - if (identifierInfo == null) { - identifierInfo = getIdentifierInfo(); - identifierInfo.setIdentifier(text); + if (firstNonspacingMark == 0) { + firstNonspacingMark = c; + continue; } - int scriptCount = identifierInfo.getScriptCount(); - - ScriptSet scripts = new ScriptSet(); - this.wholeScriptCheck(nfdText, scripts); - int confusableScriptCount = scripts.countMembers(); - - if ((0 != (this.fChecks & WHOLE_SCRIPT_CONFUSABLE)) && confusableScriptCount >= 2 && scriptCount == 1) { - result |= WHOLE_SCRIPT_CONFUSABLE; + if (!haveMultipleMarks) { + marksSeenSoFar.add(firstNonspacingMark); + haveMultipleMarks = true; } - - if ((0 != (this.fChecks & MIXED_SCRIPT_CONFUSABLE)) && confusableScriptCount >= 1 && scriptCount > 1) { - result |= MIXED_SCRIPT_CONFUSABLE; + if (marksSeenSoFar.contains(c)) { + // report the error, and stop scanning. + // No need to find more than the first failure. + result |= INVISIBLE; + break; } + marksSeenSoFar.add(c); } } if (checkResult != null) { checkResult.checks = result; } - releaseIdentifierInfo(identifierInfo); return (0 != result); } @@ -1806,63 +1403,42 @@ public class SpoofChecker { // and for definitions of the types (single, whole, mixed-script) of confusables. // We only care about a few of the check flags. Ignore the others. - // If no tests relavant to this function have been specified, signal an error. + // If no tests relevant to this function have been specified, signal an error. // TODO: is this really the right thing to do? It's probably an error on // the caller's part, but logically we would just return 0 (no error). 
- if ((this.fChecks & (SINGLE_SCRIPT_CONFUSABLE | MIXED_SCRIPT_CONFUSABLE | WHOLE_SCRIPT_CONFUSABLE)) == 0) { + if ((this.fChecks & CONFUSABLE) == 0) { throw new IllegalArgumentException("No confusable checks are enabled."); } - int flagsForSkeleton = this.fChecks & ANY_CASE; - int result = 0; - IdentifierInfo identifierInfo = getIdentifierInfo(); - identifierInfo.setIdentifier(s1); - int s1ScriptCount = identifierInfo.getScriptCount(); - int s1FirstScript = identifierInfo.getScripts().nextSetBit(0); - identifierInfo.setIdentifier(s2); - int s2ScriptCount = identifierInfo.getScriptCount(); - int s2FirstScript = identifierInfo.getScripts().nextSetBit(0); - releaseIdentifierInfo(identifierInfo); - - if (0 != (this.fChecks & SINGLE_SCRIPT_CONFUSABLE)) { - // Do the Single Script compare. - if (s1ScriptCount <= 1 && s2ScriptCount <= 1 && s1FirstScript == s2FirstScript) { - flagsForSkeleton |= SINGLE_SCRIPT_CONFUSABLE; - String s1Skeleton = getSkeleton(flagsForSkeleton, s1); - String s2Skeleton = getSkeleton(flagsForSkeleton, s2); - if (s1Skeleton.equals(s2Skeleton)) { - result |= SINGLE_SCRIPT_CONFUSABLE; - } - } + // Compute the skeletons and check for confusability. + String s1Skeleton = getSkeleton(s1); + String s2Skeleton = getSkeleton(s2); + if (!s1Skeleton.equals(s2Skeleton)) { + return 0; } - if (0 != (result & SINGLE_SCRIPT_CONFUSABLE)) { - // If the two inputs are single script confusable they cannot also be - // mixed or whole script confusable, according to the UAX39 definitions. - // So we can skip those tests. - return result; - } + // If we get here, the strings are confusable. Now we just need to set the flags for the appropriate classes + // of confusables according to UTS 39 section 4. + // Start by computing the resolved script sets of s1 and s2. 
+ ScriptSet s1RSS = new ScriptSet(); + getResolvedScriptSet(s1, s1RSS); + ScriptSet s2RSS = new ScriptSet(); + getResolvedScriptSet(s2, s2RSS); - // Two identifiers are whole script confusable if each is of a single script - // and they are mixed script confusable. - boolean possiblyWholeScriptConfusables = s1ScriptCount <= 1 && s2ScriptCount <= 1 - && (0 != (this.fChecks & WHOLE_SCRIPT_CONFUSABLE)); - - // Mixed Script Check - if ((0 != (this.fChecks & MIXED_SCRIPT_CONFUSABLE)) || possiblyWholeScriptConfusables) { - // For getSkeleton(), resetting the SINGLE_SCRIPT_CONFUSABLE flag will get us - // the mixed script table skeleton, which is what we want. - // The Any Case / Lower Case bit in the skelton flags was set at the top of the function. - flagsForSkeleton &= ~SINGLE_SCRIPT_CONFUSABLE; - String s1Skeleton = getSkeleton(flagsForSkeleton, s1); - String s2Skeleton = getSkeleton(flagsForSkeleton, s2); - if (s1Skeleton.equals(s2Skeleton)) { - result |= MIXED_SCRIPT_CONFUSABLE; - if (possiblyWholeScriptConfusables) { - result |= WHOLE_SCRIPT_CONFUSABLE; - } + // Turn on all applicable flags + int result = 0; + if (s1RSS.intersects(s2RSS)) { + result |= SINGLE_SCRIPT_CONFUSABLE; + } else { + result |= MIXED_SCRIPT_CONFUSABLE; + if (!s1RSS.isEmpty() && !s2RSS.isEmpty()) { + result |= WHOLE_SCRIPT_CONFUSABLE; } } + + // Turn off flags that the user doesn't want + result &= fChecks; + return result; } @@ -1873,307 +1449,243 @@ public class SpoofChecker { * Using skeletons directly makes it possible to quickly check whether an identifier is confusable with any of some * large set of existing identifiers, by creating an efficiently searchable collection of the skeletons. * - * Skeletons are computed using the algorithm and data describe in Unicode UAX 39. - * The latest proposed update, UAX 39 Version 8 draft 1, says "the tables SL, SA, and ML - * were still problematic, and discouraged from use in [Uniocde] 7.0. 
- * They were thus removed from version 8.0" + * Skeletons are computed using the algorithm and data described in Unicode UAX 39. * - * In light of this, the default mapping data included with ICU 55 uses the - * Unicode 7 MA (Multi script Any case) table data for the other type options - * (Single Script, Any Case), (Single Script, Lower Case) and (Multi Script, Lower Case). - * - * @param type - * The type of skeleton, corresponding to which of the Unicode confusable data tables to use. The default - * is Mixed-Script, Lowercase. Allowed options are SINGLE_SCRIPT_CONFUSABLE and ANY_CASE_CONFUSABLE. The - * two flags may be ORed. - * @param id - * The input identifier whose skeleton will be genereated. + * @param str + * The input string whose skeleton will be generated. * @return The output skeleton string. * - * @stable ICU 4.6 + * @draft ICU 58 + * @provisional This API might change or be removed in a future release. */ - public String getSkeleton(int type, String id) { - int tableMask = 0; - switch (type) { - case 0: - tableMask = ML_TABLE_FLAG; - break; - case SINGLE_SCRIPT_CONFUSABLE: - tableMask = SL_TABLE_FLAG; - break; - case ANY_CASE: - tableMask = MA_TABLE_FLAG; - break; - case SINGLE_SCRIPT_CONFUSABLE | ANY_CASE: - tableMask = SA_TABLE_FLAG; - break; - default: - // *status = U_ILLEGAL_ARGUMENT_ERROR; - throw new IllegalArgumentException("SpoofChecker.getSkeleton(), bad type value."); - } - + public String getSkeleton(CharSequence str) { // Apply the skeleton mapping to the NFD normalized input string // Accumulate the skeleton, possibly unnormalized, in a String. 
- - String nfdId = nfdNormalizer.normalize(id); + String nfdId = nfdNormalizer.normalize(str); int normalizedLen = nfdId.length(); StringBuilder skelSB = new StringBuilder(); for (int inputIndex = 0; inputIndex < normalizedLen;) { int c = Character.codePointAt(nfdId, inputIndex); inputIndex += Character.charCount(c); - this.confusableLookup(c, tableMask, skelSB); + this.fSpoofData.confusableLookup(c, skelSB); } String skelStr = skelSB.toString(); skelStr = nfdNormalizer.normalize(skelStr); return skelStr; } - /** - * Equality function. Return true if the two SpoofChecker objects - * incorporate the same confusable data and have enabled the same - * set of checks. + * Calls {@link SpoofChecker#getSkeleton(CharSequence id)}. Starting with ICU 55, the "type" parameter has been + * ignored, and starting with ICU 58, this function has been deprecated. + * + * @param type + * No longer supported. Prior to ICU 55, was used to specify the mapping table SL, SA, ML, or MA. + * @param id + * The input identifier whose skeleton will be generated. + * @return The output skeleton string. * - * @param other the SpoofChecker being compared with. - * @return true if the two SpoofCheckers are equal. - * @internal - * @deprecated This API is ICU internal only. + * @deprecated ICU 58 */ @Deprecated + public String getSkeleton(int type, CharSequence id) { + return getSkeleton(id); + } + + /** + * Equality function. Return true if the two SpoofChecker objects incorporate the same confusable data and have + * enabled the same set of checks. + * + * @param other + * the SpoofChecker being compared with. + * @return true if the two SpoofCheckers are equal. + * @draft ICU 58 + * @provisional This API might change or be removed in a future release. 
+ */ @Override public boolean equals(Object other) { - if (!(other instanceof SpoofChecker)) {return false; } - SpoofChecker otherSC = (SpoofChecker)other; - if (fSpoofData != otherSC.fSpoofData && - fSpoofData != null && - !fSpoofData.equals(otherSC.fSpoofData)) { + if (!(other instanceof SpoofChecker)) { + return false; + } + SpoofChecker otherSC = (SpoofChecker) other; + if (fSpoofData != otherSC.fSpoofData && fSpoofData != null && !fSpoofData.equals(otherSC.fSpoofData)) { return false; } - if (fChecks != otherSC.fChecks) {return false; } - if (fAllowedLocales != otherSC.fAllowedLocales && - fAllowedLocales != null && - !fAllowedLocales.equals(otherSC.fAllowedLocales)) { + if (fChecks != otherSC.fChecks) { return false; } - if (fAllowedCharsSet != otherSC.fAllowedCharsSet && - fAllowedCharsSet != null && - !fAllowedCharsSet.equals(otherSC.fAllowedCharsSet)) { + if (fAllowedLocales != otherSC.fAllowedLocales && fAllowedLocales != null + && !fAllowedLocales.equals(otherSC.fAllowedLocales)) { + return false; + } + if (fAllowedCharsSet != otherSC.fAllowedCharsSet && fAllowedCharsSet != null + && !fAllowedCharsSet.equals(otherSC.fAllowedCharsSet)) { return false; } if (fRestrictionLevel != otherSC.fRestrictionLevel) { return false; } return true; - } + } /** - * This is a stub implementation and not designed for generic use. - * @internal - * @deprecated This API is ICU internal only. + * @draft ICU 58 + * @provisional This API might change or be removed in a future release. */ - @Deprecated @Override public int hashCode() { - assert false; // To make sure ICU implementation does not depend on this. - return 1234; // Any arbitrary value - for now, using 1234. + return fChecks + ^ fSpoofData.hashCode() + ^ fAllowedLocales.hashCode() + ^ fAllowedCharsSet.hashCode() + ^ fRestrictionLevel.ordinal(); } - /* - * Append the confusable skeleton transform for a single code point to a StringBuilder. - * The string to be appended will between 1 and 18 characters. 
- * - * This is the heart of the confusable skeleton generation implementation. - * - * @param tableMask bit flag specifying which confusable table to use. One of SL_TABLE_FLAG, MA_TABLE_FLAG, etc. + /** + * Computes the augmented script set for a code point, according to UTS 39 section 5.1. */ - private void confusableLookup(int inChar, int tableMask, StringBuilder dest) { - // Binary search the spoof data key table for the inChar - int low = 0; - int mid = 0; - int limit = fSpoofData.fCFUKeys.length; - int midc; - boolean foundChar = false; - // [low, limit), i.e low is inclusive, limit is exclusive - do { - int delta = (limit - low) / 2; - mid = low + delta; - midc = fSpoofData.fCFUKeys[mid] & 0x1fffff; - if (inChar == midc) { - foundChar = true; - break; - } else if (inChar < midc) { - limit = mid; // limit is exclusive - } else { - // we have checked mid is not the char we looking for, the next char - // we want to check is (mid + 1) - low = mid + 1; // low is inclusive - } - } while (low < limit); - if (!foundChar) { // Char not found. It maps to itself. - dest.appendCodePoint(inChar); - return; + private static void getAugmentedScriptSet(int codePoint, ScriptSet result) { + result.clear(); + UScript.getScriptExtensions(codePoint, result); + + // Section 5.1 step 1 + if (result.get(UScript.HAN)) { + result.set(UScript.HAN_WITH_BOPOMOFO); + result.set(UScript.JAPANESE); + result.set(UScript.KOREAN); + } + if (result.get(UScript.HIRAGANA)) { + result.set(UScript.JAPANESE); + } + if (result.get(UScript.KATAKANA)) { + result.set(UScript.JAPANESE); + } + if (result.get(UScript.HANGUL)) { + result.set(UScript.KOREAN); + } + if (result.get(UScript.BOPOMOFO)) { + result.set(UScript.HAN_WITH_BOPOMOFO); } - boolean foundKey = false; - int keyFlags = fSpoofData.fCFUKeys[mid] & 0xff000000; - if ((keyFlags & tableMask) == 0) { - // We found the right key char, but the entry doesn't pertain to the - // table we need. 
See if there is an adjacent key that does - if (0 != (keyFlags & SpoofChecker.KEY_MULTIPLE_VALUES)) { - int altMid; - for (altMid = mid - 1; (fSpoofData.fCFUKeys[altMid] & 0x00ffffff) == inChar; altMid--) { - keyFlags = fSpoofData.fCFUKeys[altMid] & 0xff000000; - if (0 != (keyFlags & tableMask)) { - mid = altMid; - foundKey = true; - break; - } - } - if (!foundKey) { - for (altMid = mid + 1; (fSpoofData.fCFUKeys[altMid] & 0x00ffffff) == inChar; altMid++) { - keyFlags = fSpoofData.fCFUKeys[altMid] & 0xff000000; - if (0 != (keyFlags & tableMask)) { - mid = altMid; - foundKey = true; - break; - } - } - } - } - if (!foundKey) { - // No key entry for this char & table. - // The input char maps to itself. - dest.appendCodePoint(inChar); - return; - } + // Section 5.1 step 2 + if (result.get(UScript.COMMON) || result.get(UScript.INHERITED)) { + result.setAll(); } + } - int stringLen = getKeyLength(keyFlags) + 1; - int keyTableIndex = mid; + /** + * Computes the resolved script set for a string, according to UTS 39 section 5.1. + */ + private void getResolvedScriptSet(CharSequence input, ScriptSet result) { + getResolvedScriptSetWithout(input, UScript.CODE_LIMIT, result); + } - // Value is either a char (for strings of length 1) or - // an index into the string table (for longer strings) - short value = fSpoofData.fCFUValues[keyTableIndex]; - if (stringLen == 1) { - dest.append((char) value); - return; - } + /** + * Computes the resolved script set for a string, omitting characters having the specified script. If + * UScript.CODE_LIMIT is passed as the second argument, all characters are included. + */ + private void getResolvedScriptSetWithout(CharSequence input, int script, ScriptSet result) { + result.setAll(); - // String length of 4 from the above lookup is used for all strings of - // length >= 4. - // For these, get the real length from the string lengths table, - // which maps string table indexes to lengths. 
- // All strings of the same length are stored contiguously in the string table. - // 'value' from the lookup above is the starting index for the desired string. + ScriptSet temp = new ScriptSet(); + for (int utf16Offset = 0; utf16Offset < input.length();) { + int codePoint = Character.codePointAt(input, utf16Offset); + utf16Offset += Character.charCount(codePoint); - if (stringLen == 4) { - boolean dataOK = false; - for (SpoofData.SpoofStringLengthsElement el: fSpoofData.fCFUStringLengths) { - if (el.fLastString >= value) { - stringLen = el.fStrLength; - dataOK = true; - break; - } + // Compute the augmented script set for the character + getAugmentedScriptSet(codePoint, temp); + + // Intersect the augmented script set with the resolved script set, but only if the character doesn't + // have the script specified in the function call + if (script == UScript.CODE_LIMIT || !temp.get(script)) { + result.and(temp); } - assert(dataOK); } - - dest.append(fSpoofData.fCFUStrings, value, value + stringLen); - return; } - // Implementation for Whole Script tests. - // Input text is already normalized to NFD - // Return the set of scripts, each of which can represent something that is - // confusable with the input text. The script of the input text - // is included; input consisting of characters from a single script will - // always produce a result consisting of a set containing that script. - private void wholeScriptCheck(CharSequence text, ScriptSet result) { - int inputIdx = 0; - int c; - - Trie2 table = (0 != (fChecks & ANY_CASE)) ? fSpoofData.fAnyCaseTrie : fSpoofData.fLowerCaseTrie; - result.setAll(); - while (inputIdx < text.length()) { - c = Character.codePointAt(text, inputIdx); - inputIdx = Character.offsetByCodePoints(text, inputIdx, 1); - int index = table.get(c); - if (index == 0) { - // No confusables in another script for this char. - // TODO: we should change the data to have sets with just the single script - // bit for the script of this char. 
Gets rid of this special case. - // Until then, grab the script from the char and intersect it with the set. - int cpScript = UScript.getScript(c); - assert (cpScript > UScript.INHERITED); - result.intersect(cpScript); - } else if (index == 1) { - // Script == Common or Inherited. Nothing to do. - } else { - result.intersect(fSpoofData.fScriptSets[index]); + /** + * Computes the set of numerics for a string, according to UTS 39 section 5.3. + */ + private void getNumerics(String input, UnicodeSet result) { + result.clear(); + + for (int utf16Offset = 0; utf16Offset < input.length();) { + int codePoint = Character.codePointAt(input, utf16Offset); + utf16Offset += Character.charCount(codePoint); + + // Store a representative character for each kind of decimal digit + if (UCharacter.getType(codePoint) == UCharacterCategory.DECIMAL_DIGIT_NUMBER) { + // Store the zero character as a representative for comparison. + // Unicode guarantees it is codePoint - value + result.add(codePoint - UCharacter.getNumericValue(codePoint)); } } } - // IdentifierInfo Cache. IdentifierInfo objects are somewhat expensive to create. - // Maintain a one-element cache, which is sufficient to avoid repeatedly - // creating new ones unless we get multi-thread concurrency collisions in spoof - // check operations, which should be statistically uncommon. - - private IdentifierInfo fCachedIdentifierInfo = null; // Do not use this directly. + /** + * Computes the restriction level of a string, according to UTS 39 section 5.2. 
+ */ + private RestrictionLevel getRestrictionLevel(String input) { + // Section 5.2 step 1: + if (!fAllowedCharsSet.containsAll(input)) { + return RestrictionLevel.UNRESTRICTIVE; + } - private IdentifierInfo getIdentifierInfo() { - IdentifierInfo returnIdInfo = null; - synchronized (this) { - returnIdInfo = fCachedIdentifierInfo; - fCachedIdentifierInfo = null; + // Section 5.2 step 2: + if (ASCII.containsAll(input)) { + return RestrictionLevel.ASCII; } - if (returnIdInfo == null) { - returnIdInfo = new IdentifierInfo(); + + // Section 5.2 steps 3: + ScriptSet resolvedScriptSet = new ScriptSet(); + getResolvedScriptSet(input, resolvedScriptSet); + + // Section 5.2 step 4: + if (!resolvedScriptSet.isEmpty()) { + return RestrictionLevel.SINGLE_SCRIPT_RESTRICTIVE; } - return returnIdInfo; - } + // Section 5.2 step 5: + ScriptSet resolvedNoLatn = new ScriptSet(); + getResolvedScriptSetWithout(input, UScript.LATIN, resolvedNoLatn); - private void releaseIdentifierInfo(IdentifierInfo idInfo) { - if (idInfo != null) { - synchronized (this) { - if (fCachedIdentifierInfo == null) { - fCachedIdentifierInfo = idInfo; - } - } + // Section 5.2 step 6: + if (resolvedNoLatn.get(UScript.HAN_WITH_BOPOMOFO) || resolvedNoLatn.get(UScript.JAPANESE) + || resolvedNoLatn.get(UScript.KOREAN)) { + return RestrictionLevel.HIGHLY_RESTRICTIVE; + } + + // Section 5.2 step 7: + if (!resolvedNoLatn.isEmpty() && !resolvedNoLatn.get(UScript.CYRILLIC) && !resolvedNoLatn.get(UScript.GREEK) + && !resolvedNoLatn.get(UScript.CHEROKEE)) { + return RestrictionLevel.MODERATELY_RESTRICTIVE; } - }; + + // Section 5.2 step 8: + return RestrictionLevel.MINIMALLY_RESTRICTIVE; + } // Data Members - private int fChecks; // Bit vector of checks to perform. + private int fChecks; // Bit vector of checks to perform. private SpoofData fSpoofData; - private Set fAllowedLocales; // The Set of allowed locales. - private UnicodeSet fAllowedCharsSet; // The UnicodeSet of allowed characters. 
+ private Set fAllowedLocales; // The Set of allowed locales. + private UnicodeSet fAllowedCharsSet; // The UnicodeSet of allowed characters. private RestrictionLevel fRestrictionLevel; private static Normalizer2 nfdNormalizer = Normalizer2.getNFDInstance(); - // Confusable Mappings Data Structures // // For the confusable data, we are essentially implementing a map, - // key: a code point - // value: a string. Most commonly one char in length, but can be more. + // key: a code point + // value: a string. Most commonly one char in length, but can be more. // // The keys are stored as a sorted array of 32 bit ints. - // bits 0-23 a code point value - // bits 24-31 flags - // 24: 1 if entry applies to SL table - // 25: 1 if entry applies to SA table - // 26: 1 if entry applies to ML table - // 27: 1 if entry applies to MA table - // 28: 1 if there are multiple entries for this code point. - // 29-30: length of value string, in UChars. - // values are (1, 2, 3, other) - // The key table is sorted in ascending code point order. (not on the - // 32 bit int value, the flag bits do not participate in the sorting.) + // bits 0-23 a code point value + // bits 24-31 length of value string, in UChars (between 1 and 256 UChars). + // The key table is sorted in ascending code point order. (not on the + // 32 bit int value, the flag bits do not participate in the sorting.) // - // Lookup is done by means of a binary search in the key table. + // Lookup is done by means of a binary search in the key table. // // The corresponding values are kept in a parallel array of 16 bit ints. // If the value string is of length 1, it is literally in the value array. @@ -2181,98 +1693,78 @@ public class SpoofChecker { // table. // // String Table: - // The strings table contains all of the value strings (those of length two or greater) - // concatentated together into one long char (UTF-16) array. 
+ // The strings table contains all of the value strings (those of length two or greater) + // concatentated together into one long char (UTF-16) array. // - // The array is arranged by length of the strings - all strings of the same length - // are stored together. The sections are ordered by length of the strings - - // all two char strings first, followed by all of the three Char strings, etc. + // The array is arranged by length of the strings - all strings of the same length + // are stored together. The sections are ordered by length of the strings - + // all two char strings first, followed by all of the three Char strings, etc. // - // There is no nul character or other mark between adjacent strings. - // - // String Lengths table - // The length of strings from 1 to 3 is flagged in the key table. - // For strings of length 4 or longer, the string length table provides a - // mapping between an index into the string table and the corresponding length. - // Strings of these lengths are rare, so lookup time is not an issue. - // Each entry consists of - // unsigned short index of the _last_ string with this length - // unsigned short the length - - // Flag bits in the Key entries - static final int SL_TABLE_FLAG = (1 << 24); - static final int SA_TABLE_FLAG = (1 << 25); - static final int ML_TABLE_FLAG = (1 << 26); - static final int MA_TABLE_FLAG = (1 << 27); - static final int KEY_MULTIPLE_VALUES = (1 << 28); - static final int KEY_LENGTH_SHIFT = 29; - - static final int getKeyLength(int x) { - return (((x) >> 29) & 3); - } + // There is no nul character or other mark between adjacent strings. 
+ private static final class ConfusableDataUtils { + public static final int FORMAT_VERSION = 2; // version for ICU 58 + + public static final int keyToCodePoint(int key) { + return key & 0x00ffffff; + } + public static final int keyToLength(int key) { + return ((key & 0xff000000) >> 24) + 1; + } + + public static final int codePointAndLengthToKey(int codePoint, int length) { + assert (codePoint & 0x00ffffff) == codePoint; + assert length <= 256; + return codePoint | ((length - 1) << 24); + } + } // ------------------------------------------------------------------------------------- // // SpoofData // - // This class corresonds to the ICU SpoofCheck data. + // This class corresponds to the ICU SpoofCheck data. // - // The data can originate with the Binary ICU data that is generated in ICU4C, - // or it can originate from source rules that are compiled in ICU4J. + // The data can originate with the Binary ICU data that is generated in ICU4C, + // or it can originate from source rules that are compiled in ICU4J. // - // This class does not include the set of checks to be performed, but only - // data that is serialized into the ICU binary data. + // This class does not include the set of checks to be performed, but only + // data that is serialized into the ICU binary data. // - // Because Java cannot easily wrap binaray data like ICU4C, the binary data is - // copied into Java structures that are convenient for use by the run time code. + // Because Java cannot easily wrap binary data like ICU4C, the binary data is + // copied into Java structures that are convenient for use by the run time code. // // --------------------------------------------------------------------------------------- private static class SpoofData { // The Confusable data, Java data structures for. 
- int[] fCFUKeys; - short[] fCFUValues; - SpoofStringLengthsElement[] fCFUStringLengths; - String fCFUStrings; - - // Whole Script Confusable Data - Trie2 fAnyCaseTrie; - Trie2 fLowerCaseTrie; - ScriptSet[] fScriptSets; - - static class SpoofStringLengthsElement { - int fLastString; // index in string table of last string with this length - int fStrLength; // Length of strings - @Override - public boolean equals(Object other) { - if (!(other instanceof SpoofStringLengthsElement)) { - return false; - } - SpoofStringLengthsElement otherEl = (SpoofStringLengthsElement)other; - return fLastString == otherEl.fLastString && - fStrLength == otherEl.fStrLength; - } - } + int[] fCFUKeys; + short[] fCFUValues; + String fCFUStrings; - private static final int DATA_FORMAT = 0x43667520; // "Cfu " + private static final int DATA_FORMAT = 0x43667520; // "Cfu " private static final class IsAcceptable implements Authenticate { - // @Override when we switch to Java 6 @Override public boolean isDataVersionAcceptable(byte version[]) { - return version[0] == 1; + return version[0] == ConfusableDataUtils.FORMAT_VERSION || version[1] != 0 || version[2] != 0 + || version[3] != 0; } } + private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable(); private static final class DefaultData { private static SpoofData INSTANCE = null; + private static IOException EXCEPTION = null; static { + // Note: Although this is static, the Java runtime can delay execution of this block until + // the data is actually requested via SpoofData.getDefault(). 
try { INSTANCE = new SpoofData(ICUBinary.getRequiredData("confusables.cfu")); - } catch (IOException ignored) { + } catch (IOException e) { + EXCEPTION = e; } } } @@ -2280,18 +1772,23 @@ public class SpoofChecker { /** * @return instance for Unicode standard data */ - static SpoofData getDefault() { + public static SpoofData getDefault() { + if (DefaultData.EXCEPTION != null) { + throw new MissingResourceException( + "Could not load default confusables data: " + DefaultData.EXCEPTION.getMessage(), + "SpoofChecker", ""); + } return DefaultData.INSTANCE; } // SpoofChecker Data constructor for use from data builder. // Initializes a new, empty data area that will be populated later. - SpoofData() { + private SpoofData() { } // Constructor for use when creating from prebuilt default data. // A ByteBuffer is what the ICU internal data loading functions provide. - SpoofData(ByteBuffer bytes) throws java.io.IOException { + private SpoofData(ByteBuffer bytes) throws java.io.IOException { ICUBinary.readHeader(bytes, DATA_FORMAT, IS_ACCEPTABLE); bytes.mark(); readData(bytes); @@ -2302,62 +1799,45 @@ public class SpoofChecker { if (!(other instanceof SpoofData)) { return false; } - SpoofData otherData = (SpoofData)other; - if (!Arrays.equals(fCFUKeys, otherData.fCFUKeys)) return false; - if (!Arrays.equals(fCFUValues, otherData.fCFUValues)) return false; - if (!Arrays.deepEquals(fCFUStringLengths, otherData.fCFUStringLengths)) return false; - if (fCFUStrings != otherData.fCFUStrings && - fCFUStrings != null && - !fCFUStrings.equals(otherData.fCFUStrings)) return false; - if (fAnyCaseTrie != otherData.fAnyCaseTrie && - fAnyCaseTrie != null && - !fAnyCaseTrie.equals(otherData.fAnyCaseTrie)) return false; - if (fLowerCaseTrie != otherData.fLowerCaseTrie && - fLowerCaseTrie != null && - !fLowerCaseTrie.equals(otherData.fLowerCaseTrie)) return false; - if (!Arrays.deepEquals(fScriptSets, otherData.fScriptSets)) return false; + SpoofData otherData = (SpoofData) other; + if 
(!Arrays.equals(fCFUKeys, otherData.fCFUKeys)) + return false; + if (!Arrays.equals(fCFUValues, otherData.fCFUValues)) + return false; + if (fCFUStrings != otherData.fCFUStrings && fCFUStrings != null + && !fCFUStrings.equals(otherData.fCFUStrings)) + return false; return true; } + @Override + public int hashCode() { + return Arrays.hashCode(fCFUKeys) + ^ Arrays.hashCode(fCFUValues) + ^ fCFUStrings.hashCode(); + } + // Set the SpoofChecker data from pre-built binary data in a byte buffer. // The binary data format is as described for ICU4C spoof data. // - void readData(ByteBuffer bytes) throws java.io.IOException { + private void readData(ByteBuffer bytes) throws java.io.IOException { int magic = bytes.getInt(); if (magic != 0x3845fdef) { throw new IllegalArgumentException("Bad Spoof Check Data."); } @SuppressWarnings("unused") - int dataFormatVersion = bytes.getInt(); + int dataFormatVersion = bytes.getInt(); @SuppressWarnings("unused") - int dataLength = bytes.getInt(); - - int CFUKeysOffset = bytes.getInt(); - int CFUKeysSize = bytes.getInt(); - - int CFUValuesOffset = bytes.getInt(); - int CFUValuesSize = bytes.getInt(); - - int CFUStringTableOffset = bytes.getInt(); - int CFUStringTableSize = bytes.getInt(); + int dataLength = bytes.getInt(); - int CFUStringLengthsOffset = bytes.getInt(); - int CFUStringLengthsSize = bytes.getInt(); + int CFUKeysOffset = bytes.getInt(); + int CFUKeysSize = bytes.getInt(); - int anyCaseTrieOffset = bytes.getInt(); - /*int anyCaseTrieSize =*/ bytes.getInt(); + int CFUValuesOffset = bytes.getInt(); + int CFUValuesSize = bytes.getInt(); - int lowerCaseTrieOffset = bytes.getInt(); - /*int lowerCaseTrieLength =*/ bytes.getInt(); - - int scriptSetsOffset = bytes.getInt(); - int scriptSetslength = bytes.getInt(); - - int i; - fCFUKeys = null; - fCFUValues = null; - fCFUStringLengths = null; - fCFUStrings = null; + int CFUStringTableOffset = bytes.getInt(); + int CFUStringTableSize = bytes.getInt(); // We have now read the file 
header, and obtained the position for each // of the data items. Now read each in turn, first seeking the @@ -2374,131 +1854,170 @@ public class SpoofChecker { bytes.reset(); ICUBinary.skipBytes(bytes, CFUStringTableOffset); fCFUStrings = ICUBinary.getString(bytes, CFUStringTableSize, 0); + } - bytes.reset(); - ICUBinary.skipBytes(bytes, CFUStringLengthsOffset); - fCFUStringLengths = new SpoofStringLengthsElement[CFUStringLengthsSize]; - for (i = 0; i < CFUStringLengthsSize; i++) { - fCFUStringLengths[i] = new SpoofStringLengthsElement(); - fCFUStringLengths[i].fLastString = bytes.getShort(); - fCFUStringLengths[i].fStrLength = bytes.getShort(); + /** + * Append the confusable skeleton transform for a single code point to a StringBuilder. The string to be + * appended will between 1 and 18 characters as of Unicode 9. + * + * This is the heart of the confusable skeleton generation implementation. + */ + public void confusableLookup(int inChar, StringBuilder dest) { + // Perform a binary search. + // [lo, hi), i.e lo is inclusive, hi is exclusive. + // The result after the loop will be in lo. + int lo = 0; + int hi = length(); + do { + int mid = (lo + hi) / 2; + if (codePointAt(mid) > inChar) { + hi = mid; + } else if (codePointAt(mid) < inChar) { + lo = mid; + } else { + // Found result. Break early. + lo = mid; + break; + } + } while (hi - lo > 1); + + // Did we find an entry? If not, the char maps to itself. + if (codePointAt(lo) != inChar) { + dest.appendCodePoint(inChar); + return; } - bytes.reset(); - ICUBinary.skipBytes(bytes, anyCaseTrieOffset); - fAnyCaseTrie = Trie2.createFromSerialized(bytes); + // Add the element to the string builder and return. + appendValueTo(lo, dest); + return; + } - bytes.reset(); - ICUBinary.skipBytes(bytes, lowerCaseTrieOffset); - fLowerCaseTrie = Trie2.createFromSerialized(bytes); + /** + * Return the number of confusable entries in this SpoofData. + * + * @return The number of entries. 
+ */ + public int length() { + return fCFUKeys.length; + } - bytes.reset(); - ICUBinary.skipBytes(bytes, scriptSetsOffset); - fScriptSets = new ScriptSet[scriptSetslength]; - for (i = 0; i < scriptSetslength; i++) { - fScriptSets[i] = new ScriptSet(bytes); - } + /** + * Return the code point (key) at the specified index. + * + * @param index + * The index within the SpoofData. + * @return The code point. + */ + public int codePointAt(int index) { + return ConfusableDataUtils.keyToCodePoint(fCFUKeys[index]); } + /** + * Append the confusable skeleton at the specified index to the StringBuilder dest. + * + * @param index + * The index within the SpoofData. + * @param dest + * The StringBuilder to which to append the skeleton. + */ + public void appendValueTo(int index, StringBuilder dest) { + int stringLength = ConfusableDataUtils.keyToLength(fCFUKeys[index]); + + // Value is either a char (for strings of length 1) or + // an index into the string table (for longer strings) + short value = fCFUValues[index]; + if (stringLength == 1) { + dest.append((char) value); + } else { + dest.append(fCFUStrings, value, value + stringLength); + } + } } // ------------------------------------------------------------------------------- // // ScriptSet - Script code bit sets. Used with the whole script confusable data. // Used both at data build and at run time. - // Could almost be a Java BitSet, except that the input and output would - // be awkward. + // Extends Java BitSet with input/output support and a few helper methods. + // Note: The I/O is not currently being used, so it has been commented out. If + // it is needed again, the code can be restored. 
// // ------------------------------------------------------------------------------- - static class ScriptSet { - public ScriptSet() { - } - - public ScriptSet(ByteBuffer bytes) throws java.io.IOException { - for (int j = 0; j < bits.length; j++) { - bits[j] = bytes.getInt(); - } - } - - public void output(DataOutputStream os) throws java.io.IOException { - for (int i = 0; i < bits.length; i++) { - os.writeInt(bits[i]); - } - } - - @Override - public boolean equals(Object other) { - if (!(other instanceof ScriptSet)) { - return false; - } - ScriptSet otherSet = (ScriptSet)other; - return Arrays.equals(bits, otherSet.bits); - } + static class ScriptSet extends BitSet { - public void Union(int script) { - int index = script / 32; - int bit = 1 << (script & 31); - assert (index < bits.length * 4 * 4); - bits[index] |= bit; - } + // Eclipse default value to quell warnings: + private static final long serialVersionUID = 1L; - @SuppressWarnings("unused") - public void Union(ScriptSet other) { - for (int i = 0; i < bits.length; i++) { - bits[i] |= other.bits[i]; - } - } - - public void intersect(ScriptSet other) { - for (int i = 0; i < bits.length; i++) { - bits[i] &= other.bits[i]; - } - } - - public void intersect(int script) { - int index = script / 32; - int bit = 1 << (script & 31); - assert (index < bits.length * 4 * 4); - int i; - for (i = 0; i < index; i++) { - bits[i] = 0; - } - bits[index] &= bit; - for (i = index + 1; i < bits.length; i++) { - bits[i] = 0; - } + // // The serialized version of this class can hold INT_CAPACITY * 32 scripts. 
+ // private static final int INT_CAPACITY = 6; + // private static final long serialVersionUID = INT_CAPACITY; + // static { + // assert ScriptSet.INT_CAPACITY * Integer.SIZE <= UScript.CODE_LIMIT; + // } + // + // public ScriptSet() { + // } + // + // public ScriptSet(ByteBuffer bytes) throws java.io.IOException { + // for (int i = 0; i < INT_CAPACITY; i++) { + // int bits = bytes.getInt(); + // for (int j = 0; j < Integer.SIZE; j++) { + // if ((bits & (1 << j)) != 0) { + // set(i * Integer.SIZE + j); + // } + // } + // } + // } + // + // public void output(DataOutputStream os) throws java.io.IOException { + // for (int i = 0; i < INT_CAPACITY; i++) { + // int bits = 0; + // for (int j = 0; j < Integer.SIZE; j++) { + // if (get(i * Integer.SIZE + j)) { + // bits |= (1 << j); + // } + // } + // os.writeInt(bits); + // } + // } + + public void and(int script) { + this.clear(0, script); + this.clear(script + 1, UScript.CODE_LIMIT); } public void setAll() { - for (int i = 0; i < bits.length; i++) { - bits[i] = 0xffffffff; - } + this.set(0, UScript.CODE_LIMIT); } - @SuppressWarnings("unused") - public void resetAll() { - for (int i = 0; i < bits.length; i++) { - bits[i] = 0; - } + public boolean isFull() { + return cardinality() == UScript.CODE_LIMIT; } - public int countMembers() { - // This bit counter is good for sparse numbers of '1's, which is - // very much the case that we will usually have. - int count = 0; - for (int i = 0; i < bits.length; i++) { - int x = bits[i]; - while (x != 0) { - count++; - x &= (x - 1); // AND off the least significant one bit. - // Note - Java integer over/underflow behavior is well defined. 
- // 0x80000000 - 1 = 0x7fffffff + public void appendStringTo(StringBuilder sb) { + sb.append("{ "); + if (isEmpty()) { + sb.append("- "); + } else if (isFull()) { + sb.append("* "); + } else { + for (int script = 0; script < UScript.CODE_LIMIT; script++) { + if (get(script)) { + sb.append(UScript.getShortName(script)); + sb.append(" "); + } } } - return count; + sb.append("}"); } - private int[] bits = new int[6]; + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append(""); + return sb.toString(); + } } } - diff --git a/icu4j/main/shared/data/icudata.jar b/icu4j/main/shared/data/icudata.jar index aff50ded99a..bfc42727953 100755 --- a/icu4j/main/shared/data/icudata.jar +++ b/icu4j/main/shared/data/icudata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:21fc240ab98201884ee8e49c44bab3ba7b71d7eba95c9c442d82db15cd4c68d4 -size 11788999 +oid sha256:c3615865e8068508cca380d3aa8f8079f051dfabd556f6cd5bafe0ae3f9de5d0 +size 11786200 diff --git a/icu4j/main/shared/data/icutzdata.jar b/icu4j/main/shared/data/icutzdata.jar index 4709edab781..5ec5a40a24c 100755 --- a/icu4j/main/shared/data/icutzdata.jar +++ b/icu4j/main/shared/data/icutzdata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:52eef4e7e50fdffa89d1246d8ddeb17e51146f7a586e451196080acdd76730e4 +oid sha256:aade3b2d8f0a6f46d0ee33eed27d0e682c1abc8b72f7a85676ffacfb5815e27a size 91127 diff --git a/icu4j/main/shared/data/testdata.jar b/icu4j/main/shared/data/testdata.jar index 9586bffc609..91fbba0b437 100755 --- a/icu4j/main/shared/data/testdata.jar +++ b/icu4j/main/shared/data/testdata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cb2c11d6d6d76e7fd31f99773a3ab7b6c13e0ddc748f00d94fcc048544f3043d +oid sha256:690b23f3bd2ea163e801126ba8e2c65709eae387011ad3ec863d8b7bad4cd571 size 811715 diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/text/SpoofCheckerTest.java 
b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/text/SpoofCheckerTest.java index d486e0989b4..f42c8c5469c 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/text/SpoofCheckerTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/text/SpoofCheckerTest.java @@ -13,13 +13,9 @@ import java.io.IOException; import java.io.Reader; import java.io.StringReader; import java.text.ParseException; -import java.util.Arrays; -import java.util.BitSet; -import java.util.Comparator; import java.util.HashSet; import java.util.LinkedHashSet; import java.util.Locale; -import java.util.Random; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -30,8 +26,6 @@ import com.ibm.icu.dev.test.TestFmwk; import com.ibm.icu.dev.test.TestUtil; import com.ibm.icu.dev.test.TestUtil.JavaVendor; import com.ibm.icu.impl.Utility; -import com.ibm.icu.lang.UScript; -import com.ibm.icu.text.IdentifierInfo; import com.ibm.icu.text.Normalizer2; import com.ibm.icu.text.SpoofChecker; import com.ibm.icu.text.SpoofChecker.CheckResult; @@ -88,7 +82,6 @@ public class SpoofCheckerTest extends TestFmwk { } String fileName; Reader confusables; - Reader confusablesWholeScript; try { SpoofChecker rsc = null; @@ -96,13 +89,7 @@ public class SpoofCheckerTest extends TestFmwk { fileName = "unicode/confusables.txt"; confusables = TestUtil.getDataReader(fileName, "UTF-8"); try { - fileName = "unicode/confusablesWholeScript.txt"; - confusablesWholeScript = TestUtil.getDataReader(fileName, "UTF-8"); - try { - rsc = new SpoofChecker.Builder().setData(confusables, confusablesWholeScript).build(); - } finally { - confusablesWholeScript.close(); - } + rsc = new SpoofChecker.Builder().setData(confusables).build(); } finally { confusables.close(); } @@ -120,17 +107,13 @@ public class SpoofCheckerTest extends TestFmwk { // The checker we just built from source rules should be equivalent to the // default checker created from prebuilt rules baked into the ICU data. 
SpoofChecker defaultChecker = new SpoofChecker.Builder().build(); - assertTrue("Checker built from rules equals default", defaultChecker.equals(rsc)); + assertEquals("Checker built from rules equals default", defaultChecker, rsc); + assertEquals("Checker built from rules has same hash code as default", defaultChecker.hashCode(), rsc.hashCode()); SpoofChecker optionChecker = new SpoofChecker.Builder(). setRestrictionLevel(RestrictionLevel.UNRESTRICTIVE).build(); assertFalse("", optionChecker.equals(rsc)); - // Stub source data to build into a test SpoofChecker - String stubWSConfusables = - "# Stub Whole Script Confusable data\n" + - "0561 ; Armn; Cyrl; L # (ա) ARMENIAN SMALL LETTER AYB\n"; - String stubConfusables = "# Stub confusables data\n" + "05AD ; 0596 ; MA # ( ֭ → ֖ ) HEBREW ACCENT DEHI → HEBREW ACCENT TIPEHA #\n"; @@ -143,7 +126,7 @@ public class SpoofCheckerTest extends TestFmwk { SpoofChecker testChecker1 = builder.build(); assertTrue("", testChecker1.equals(defaultChecker)); - builder.setData(new StringReader(stubConfusables), new StringReader(stubWSConfusables)); + builder.setData(new StringReader(stubConfusables)); builder.setRestrictionLevel(RestrictionLevel.UNRESTRICTIVE); builder.setChecks(SpoofChecker.SINGLE_SCRIPT_CONFUSABLE); SetallowedLocales = new HashSet(); @@ -190,14 +173,14 @@ public class SpoofCheckerTest extends TestFmwk { */ @Test public void TestGetSetAllowedChars() { - SpoofChecker sc = new SpoofChecker.Builder().build(); + SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).build(); UnicodeSet us; UnicodeSet uset; uset = sc.getAllowedChars(); assertTrue("", uset.isFrozen()); - us = new UnicodeSet((int) 0x41, (int) 0x5A); /* [A-Z] */ - sc = new SpoofChecker.Builder().setAllowedChars(us).build(); + us = new UnicodeSet(0x41, 0x5A); /* [A-Z] */ + sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).setAllowedChars(us).build(); assertEquals("", us, sc.getAllowedChars()); } @@ -232,7 +215,7 @@ public 
class SpoofCheckerTest extends TestFmwk { */ @Test public void TestAllowedLocales() { - SpoofChecker sc = new SpoofChecker.Builder().build(); + SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).build(); Set allowedLocales = null; Set allowedJavaLocales = null; boolean checkResults; @@ -250,7 +233,7 @@ public class SpoofCheckerTest extends TestFmwk { allowedLocales = new HashSet(); allowedLocales.add(enloc); allowedLocales.add(ruloc); - sc = new SpoofChecker.Builder().setAllowedLocales(allowedLocales).build(); + sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).setAllowedLocales(allowedLocales).build(); allowedLocales = sc.getAllowedLocales(); assertTrue("en in allowed locales", allowedLocales.contains(enloc)); assertTrue("ru_RU in allowed locales", allowedLocales.contains(ruloc)); @@ -258,14 +241,10 @@ public class SpoofCheckerTest extends TestFmwk { Locale frlocJ = new Locale("fr"); allowedJavaLocales = new HashSet(); allowedJavaLocales.add(frlocJ); - sc = new SpoofChecker.Builder().setAllowedJavaLocales(allowedJavaLocales).build(); + sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).setAllowedJavaLocales(allowedJavaLocales).build(); assertFalse("no en in allowed Java locales", allowedJavaLocales.contains(new Locale("en"))); assertTrue("fr in allowed Java locales", allowedJavaLocales.contains(frlocJ)); - /* - * Limit checks to SpoofChecker.CHAR_LIMIT. Some of the test data has whole script confusables also, which we - * don't want to see in this test. 
- */ sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).setAllowedLocales(allowedLocales).build(); SpoofChecker.CheckResult result = new SpoofChecker.CheckResult(); @@ -291,7 +270,7 @@ public class SpoofCheckerTest extends TestFmwk { */ @Test public void TestAllowedChars() { - SpoofChecker sc = new SpoofChecker.Builder().build(); + SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).build(); UnicodeSet set; UnicodeSet tmpSet; boolean checkResults; @@ -301,48 +280,40 @@ public class SpoofCheckerTest extends TestFmwk { tmpSet = new UnicodeSet(0, 0x10ffff); assertEquals("", tmpSet, set); - /* Setting the allowed chars should enable the check. */ - sc = new SpoofChecker.Builder().setChecks(SpoofChecker.ALL_CHECKS & ~SpoofChecker.CHAR_LIMIT).build(); - /* Remove a character that is in our good Latin test identifier from the allowed chars set. */ tmpSet.remove(goodLatin.charAt(1)); - sc = new SpoofChecker.Builder().setAllowedChars(tmpSet).build(); + sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).setAllowedChars(tmpSet).build(); /* Latin Identifier should now fail; other non-latin test cases should still be OK */ SpoofChecker.CheckResult result = new SpoofChecker.CheckResult(); checkResults = sc.failsChecks(goodLatin, result); assertTrue("", checkResults); - assertEquals("", SpoofChecker.CHAR_LIMIT | SpoofChecker.RESTRICTION_LEVEL, result.checks); - - checkResults = sc.failsChecks(goodGreek, result); - assertTrue("", checkResults); - assertEquals("", SpoofChecker.WHOLE_SCRIPT_CONFUSABLE, result.checks); + assertEquals("", SpoofChecker.CHAR_LIMIT, result.checks); } @Test public void TestCheck() { - SpoofChecker sc = new SpoofChecker.Builder().build(); + SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.ALL_CHECKS).build(); SpoofChecker.CheckResult result = new SpoofChecker.CheckResult(); boolean checkResults; result.position = 666; checkResults = sc.failsChecks(goodLatin, result); 
assertFalse("", checkResults); - assertEquals("", 0, result.position); + assertEquals("", 0, result.checks); checkResults = sc.failsChecks(goodCyrl, result); assertFalse("", checkResults); + assertEquals("", 0, result.checks); result.position = 666; checkResults = sc.failsChecks(scMixed, result); assertTrue("", checkResults); - assertEquals("", SpoofChecker.MIXED_SCRIPT_CONFUSABLE | SpoofChecker.SINGLE_SCRIPT, result.checks); - assertEquals("", 0, result.position); + assertEquals("", SpoofChecker.RESTRICTION_LEVEL, result.checks); result.position = 666; checkResults = sc.failsChecks(han_Hiragana, result); assertFalse("", checkResults); - assertEquals("", 0, result.position); assertEquals("", 0, result.checks); } @@ -351,18 +322,18 @@ public class SpoofCheckerTest extends TestFmwk { SpoofChecker sc = new SpoofChecker.Builder().build(); int checkResults; checkResults = sc.areConfusable(scLatin, scMixed); - assertEquals("", SpoofChecker.MIXED_SCRIPT_CONFUSABLE, checkResults); + assertEquals("Latin/Mixed is not MIXED_SCRIPT_CONFUSABLE", SpoofChecker.MIXED_SCRIPT_CONFUSABLE, checkResults); checkResults = sc.areConfusable(goodGreek, scLatin); - assertEquals("", 0, checkResults); + assertEquals("Greek/Latin is not unconfusable", 0, checkResults); checkResults = sc.areConfusable(lll_Latin_a, lll_Latin_b); - assertEquals("", SpoofChecker.SINGLE_SCRIPT_CONFUSABLE, checkResults); + assertEquals("Latin/Latin is not SINGLE_SCRIPT_CONFUSABLE", SpoofChecker.SINGLE_SCRIPT_CONFUSABLE, checkResults); } @Test public void TestGetSkeleton() { - SpoofChecker sc = new SpoofChecker.Builder().build(); + SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build(); String dest; dest = sc.getSkeleton(SpoofChecker.ANY_CASE, lll_Latin_a); assertEquals("", lll_Skel, dest); @@ -379,9 +350,8 @@ public class SpoofCheckerTest extends TestFmwk { */ @Test public void TestSpoofAPI() { - SpoofChecker sc = new SpoofChecker.Builder().build(); - String s = "xyz"; // Many latin 
ranges are whole-script confusable with other scripts. - // If this test starts failing, consult confusablesWholeScript.txt + SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.ALL_CHECKS).build(); + String s = "xyz"; SpoofChecker.CheckResult result = new SpoofChecker.CheckResult(); result.position = 666; boolean checkResults = sc.failsChecks(s, result); @@ -475,13 +445,13 @@ public class SpoofCheckerTest extends TestFmwk { actual = sc.getSkeleton(type, uInput); Throwable t = new Throwable(); int lineNumberOfTest = t.getStackTrace()[1].getLineNumber(); - + assertEquals(testName + " test at line " + lineNumberOfTest + " : Expected (escaped): " + expected, uExpected, actual); } @Test public void TestAreConfusable() { - SpoofChecker sc = new SpoofChecker.Builder().build(); + SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build(); String s1 = "A long string that will overflow stack buffers. A long string that will overflow stack buffers. " + "A long string that will overflow stack buffers. A long string that will overflow stack buffers. "; String s2 = "A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. " @@ -489,9 +459,68 @@ public class SpoofCheckerTest extends TestFmwk { assertEquals("", SpoofChecker.SINGLE_SCRIPT_CONFUSABLE, sc.areConfusable(s1, s2)); } + @Test + public void TestConfusableFlagVariants() { + // The spoof checker should only return those tests that the user requested. This test makes sure that + // the checker doesn't return anything the user doesn't want. This test started passing in ICU 58. 
+ + String latn = "desordenado"; + String cyrl = "ԁеѕогԁепаԁо"; + String mixed = "dеѕогdenаdo"; + + Object[][] tests = { + // string 1, string 2, checks for spoof checker, expected output + { latn, cyrl, + SpoofChecker.CONFUSABLE, + SpoofChecker.MIXED_SCRIPT_CONFUSABLE | SpoofChecker.WHOLE_SCRIPT_CONFUSABLE }, + { latn, cyrl, + SpoofChecker.MIXED_SCRIPT_CONFUSABLE | SpoofChecker.WHOLE_SCRIPT_CONFUSABLE, + SpoofChecker.MIXED_SCRIPT_CONFUSABLE | SpoofChecker.WHOLE_SCRIPT_CONFUSABLE }, + { latn, cyrl, + SpoofChecker.MIXED_SCRIPT_CONFUSABLE, + SpoofChecker.MIXED_SCRIPT_CONFUSABLE }, + { latn, cyrl, + SpoofChecker.WHOLE_SCRIPT_CONFUSABLE, + SpoofChecker.WHOLE_SCRIPT_CONFUSABLE }, + { latn, cyrl, + SpoofChecker.SINGLE_SCRIPT_CONFUSABLE, + 0 }, + { latn, mixed, + SpoofChecker.CONFUSABLE, + SpoofChecker.MIXED_SCRIPT_CONFUSABLE }, + { latn, mixed, + SpoofChecker.MIXED_SCRIPT_CONFUSABLE, + SpoofChecker.MIXED_SCRIPT_CONFUSABLE }, + { latn, mixed, + SpoofChecker.MIXED_SCRIPT_CONFUSABLE | SpoofChecker.WHOLE_SCRIPT_CONFUSABLE, + SpoofChecker.MIXED_SCRIPT_CONFUSABLE }, + { latn, mixed, + SpoofChecker.WHOLE_SCRIPT_CONFUSABLE, + 0 }, + { latn, latn, + SpoofChecker.CONFUSABLE, + SpoofChecker.SINGLE_SCRIPT_CONFUSABLE }, + }; + + for (Object[] test : tests) { + String s1 = (String) test[0]; + String s2 = (String) test[1]; + int checks = (Integer) test[2]; + int expectedResult = (Integer) test[3]; + + // Sanity check: expectedResult should be a subset of checks + assertEquals("Invalid test case", expectedResult & checks, expectedResult); + + SpoofChecker sc = new SpoofChecker.Builder().setChecks(checks).build(); + int actualResult = sc.areConfusable(s1, s2); + assertEquals("Comparing '" + s1 + "' and '" + s2 + "' with checks '" + checks + "'", + expectedResult, actualResult); + } + } + @Test public void TestInvisible() { - SpoofChecker sc = new SpoofChecker.Builder().build(); + SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.INVISIBLE).build(); String s = 
Utility.unescape("abcd\\u0301ef"); SpoofChecker.CheckResult result = new SpoofChecker.CheckResult(); result.position = -42; @@ -522,30 +551,40 @@ public class SpoofCheckerTest extends TestFmwk { {"aアー", RestrictionLevel.HIGHLY_RESTRICTIVE}, {"aऄ", RestrictionLevel.MODERATELY_RESTRICTIVE}, {"aγ", RestrictionLevel.MINIMALLY_RESTRICTIVE}, + {"a♥", RestrictionLevel.UNRESTRICTIVE}, + {"a\u303c", RestrictionLevel.HIGHLY_RESTRICTIVE}, + {"aー\u303c", RestrictionLevel.HIGHLY_RESTRICTIVE}, + {"aー\u303cア", RestrictionLevel.HIGHLY_RESTRICTIVE}, + { "アaー\u303c", RestrictionLevel.HIGHLY_RESTRICTIVE}, + {"a1١", RestrictionLevel.MODERATELY_RESTRICTIVE}, + {"a1١۱", RestrictionLevel.MODERATELY_RESTRICTIVE}, + {"١ー\u303caア1१۱", RestrictionLevel.MINIMALLY_RESTRICTIVE}, + {"aアー\u303c1१١۱", RestrictionLevel.MINIMALLY_RESTRICTIVE}, }; - IdentifierInfo idInfo = new IdentifierInfo().setIdentifierProfile(SpoofChecker.RECOMMENDED); + + UnicodeSet allowedChars = new UnicodeSet(); + // Allowed Identifier Characters. In addition to the Recommended Set, + // allow u303c, which has an interesting script extension of Hani Hira Kana. 
+ allowedChars.addAll(SpoofChecker.RECOMMENDED).add(0x303c); + CheckResult checkResult = new CheckResult(); for (Object[] test : tests) { String testString = (String) test[0]; RestrictionLevel expectedLevel = (RestrictionLevel) test[1]; - idInfo.setIdentifier(testString); - assertEquals("Testing restriction level for '" + testString + "'", expectedLevel, idInfo.getRestrictionLevel()); for (RestrictionLevel levelSetInSpoofChecker : RestrictionLevel.values()) { SpoofChecker sc = new SpoofChecker.Builder() - .setChecks(SpoofChecker.RESTRICTION_LEVEL) // only check this - .setAllowedChars(SpoofChecker.RECOMMENDED) - .setRestrictionLevel(levelSetInSpoofChecker) - .build(); + .setAllowedChars(allowedChars) + .setRestrictionLevel(levelSetInSpoofChecker) + .setChecks(SpoofChecker.RESTRICTION_LEVEL) // only check this + .build(); boolean actualValue = sc.failsChecks(testString, checkResult); + assertEquals("Testing restriction level for '" + testString + "'", + expectedLevel, checkResult.restrictionLevel); // we want to fail if the text is (say) MODERATE and the testLevel is ASCII - boolean expectedFailure = expectedLevel.compareTo(levelSetInSpoofChecker) > 0 || !SpoofChecker.RECOMMENDED.containsAll(testString); - boolean t = assertEquals("Testing spoof restriction level for '" + testString + "', " + levelSetInSpoofChecker, expectedFailure, actualValue); - if (!t) { // debugging - actualValue = sc.failsChecks(testString, checkResult); - // we want to fail if the text is (say) MODERATE and the testLevel is ASCII - expectedFailure = expectedLevel.compareTo(levelSetInSpoofChecker) > 0 || !SpoofChecker.RECOMMENDED.containsAll(testString); - } + boolean expectedFailure = expectedLevel.compareTo(levelSetInSpoofChecker) > 0; + assertEquals("Testing spoof restriction level for '" + testString + "', " + levelSetInSpoofChecker, + expectedFailure, actualValue); } } } @@ -557,157 +596,41 @@ public class SpoofCheckerTest extends TestFmwk { {"१", "[०]"}, {"1१", "[0०]"}, {"١۱", "[٠۰]"}, + 
{"a♥", "[]"}, + {"a\u303c", "[]"}, + {"aー\u303c", "[]"}, + {"aー\u303cア", "[]"}, + { "アaー\u303c", "[]"}, + {"a1١", "[0٠]"}, + {"a1١۱", "[0٠۰]"}, + {"١ー\u303caア1१۱", "[0٠۰०]"}, + {"aアー\u303c1१١۱", "[0٠۰०]"}, }; - IdentifierInfo idInfo = new IdentifierInfo(); CheckResult checkResult = new CheckResult(); for (Object[] test : tests) { String testString = (String) test[0]; UnicodeSet expected = new UnicodeSet((String)test[1]); - idInfo.setIdentifier(testString); - assertEquals("", expected, idInfo.getNumerics()); SpoofChecker sc = new SpoofChecker.Builder() .setChecks(SpoofChecker.MIXED_NUMBERS) // only check this .build(); boolean actualValue = sc.failsChecks(testString, checkResult); + assertEquals("", expected, checkResult.numerics); assertEquals("Testing spoof mixed numbers for '" + testString + "', ", expected.size() > 1, actualValue); } } - @Test - public void TestIdentifierInfo() { -// contains(BitSet, BitSet) - BitSet bitset12 = IdentifierInfo.set(new BitSet(), UScript.LATIN, UScript.HANGUL); - BitSet bitset2 = IdentifierInfo.set(new BitSet(), UScript.HANGUL); - assertTrue("", IdentifierInfo.contains(bitset12, bitset2)); - assertTrue("", IdentifierInfo.contains(bitset12, bitset12)); - assertTrue("", !IdentifierInfo.contains(bitset2, bitset12)); - - assertTrue("", IdentifierInfo.BITSET_COMPARATOR.compare( - IdentifierInfo.set(new BitSet(), UScript.ARABIC), - IdentifierInfo.set(new BitSet(), UScript.LATIN)) < 0); -// displayAlternates(Collection) -// displayScripts(BitSet) - String scriptString = IdentifierInfo.displayScripts(bitset12); - assertEquals("", "Hang Latn", scriptString); - Set alternates = new HashSet(Arrays.asList(bitset12, bitset2)); - String alternatesString = IdentifierInfo.displayAlternates(alternates); - assertEquals("", "Hang; Hang Latn", alternatesString); - -// parseAlternates(String) -// parseScripts(String) - assertEquals("", bitset12, IdentifierInfo.parseScripts(scriptString)); - assertEquals("", alternates, 
IdentifierInfo.parseAlternates(alternatesString)); - - String[][] tests = { - // String, restriction-level, numerics, scripts, alternates, common-alternates - {"a♥", "UNRESTRICTIVE", "[]", "Latn", "", ""}, - {"a\u303c", "HIGHLY_RESTRICTIVE", "[]", "Latn", "Hani Hira Kana", "Hani Hira Kana"}, - {"aー\u303c", "HIGHLY_RESTRICTIVE", "[]", "Latn", "Hira Kana", "Hira Kana"}, - {"aー\u303cア", "HIGHLY_RESTRICTIVE", "[]", "Latn Kana", "", ""}, - { "アaー\u303c", "HIGHLY_RESTRICTIVE", "[]", "Latn Kana", "", ""}, - {"a1١", "UNRESTRICTIVE", "[0٠]", "Latn", "Arab Thaa", "Arab Thaa"}, - {"a1١۱", "UNRESTRICTIVE", "[0٠۰]", "Latn Arab", "", ""}, - {"١ー\u303caア1१۱", "UNRESTRICTIVE", "[0٠۰०]", "Latn Kana Arab", "Deva Kthi Mahj", "Deva Kthi Mahj"}, - {"aアー\u303c1१١۱", "UNRESTRICTIVE", "[0٠۰०]", "Latn Kana Arab", "Deva Kthi Mahj", "Deva Kthi Mahj"}, - }; - for (String[] test : tests) { - String testString = test[0]; - IdentifierInfo idInfo = new IdentifierInfo(); - UnicodeSet allowedChars = new UnicodeSet(); - // Allowed Identifier Characters. In addition to the Recommended Set, - // allow u303c, which has an interesting script extension of Hani Hira Kana. 
- allowedChars.addAll(SpoofChecker.RECOMMENDED).add(0x303c); - idInfo.setIdentifierProfile(allowedChars); - idInfo.setIdentifier(testString); - assertEquals("Identifier " + testString, testString, idInfo.getIdentifier()); - - RestrictionLevel restrictionLevel = RestrictionLevel.valueOf(test[1]); - assertEquals("RestrictionLevel " + testString, restrictionLevel, idInfo.getRestrictionLevel()); - - UnicodeSet numerics = new UnicodeSet(test[2]); - assertEquals("Numerics " + testString, numerics, idInfo.getNumerics()); - - BitSet scripts = IdentifierInfo.parseScripts(test[3]); - assertEquals("Scripts " + testString, scripts, idInfo.getScripts()); - - Set alternates2 = IdentifierInfo.parseAlternates(test[4]); - assertEquals("Alternates " + testString, alternates2, idInfo.getAlternates()); - - BitSet commonAlternates = IdentifierInfo.parseScripts(test[5]); - assertEquals("Common Alternates " + testString, commonAlternates, idInfo.getCommonAmongAlternates()); - } - -// TODO -// getIdentifierProfile() -// setIdentifierProfile(UnicodeSet) - } - @Test public void TestBug11635() { // The bug was an error in iterating through supplementary characters in IdentifierInfo. // The three supplemental chars in the string are "123" from the mathematical bold digit range. // Common script, Nd general category, and no other restrictions on allowed characters - // leaves "ABC123" as SINGLE_SCRIPT_RESTRICTIVE. + // leaves "ABC123" as SINGLE_SCRIPT_RESTRICTIVE. 
String identifier = Utility.unescape("ABC\\U0001D7CF\\U0001D7D0\\U0001D7D1"); - IdentifierInfo idInfo = new IdentifierInfo(); - idInfo.setIdentifier(identifier); - assertEquals("", RestrictionLevel.SINGLE_SCRIPT_RESTRICTIVE, idInfo.getRestrictionLevel()); - } - - @Test - public void TestComparator() { - Random random = new Random(0); - for (int i = 0; i < 100; ++i) { - BitSet[] items = new BitSet[random.nextInt(5)+3]; - for (int j = 0; j < items.length; ++j) { - items[j] = new BitSet(); - int countInBitset = random.nextInt(5); - for (int k = 0; k < countInBitset; ++k) { - items[j].set(random.nextInt(10)); - } - } - checkComparator(IdentifierInfo.BITSET_COMPARATOR, items); - } - } - - // Dumb implementation for now - private void checkComparator(Comparator comparator, T... items) { - logln("Checking " + Arrays.asList(items)); - /* - * The relation is transitive: a < b and b < c implies a < c. We test here. - * The relation is trichotomous: exactly one of a < b, b < a and a = b is true. Guaranteed by comparator. 
- */ - for (int i = 0; i < items.length-2; ++i) { - T a = items[i]; - for (int j = i+1; j < items.length-1; ++j) { - T b = items[j]; - for (int k = j+1; k < items.length; ++k) { - T c = items[k]; - checkTransitivity(comparator, a, b, c); - checkTransitivity(comparator, a, c, b); - checkTransitivity(comparator, b, a, b); - checkTransitivity(comparator, b, c, a); - checkTransitivity(comparator, c, a, b); - checkTransitivity(comparator, c, b, a); - } - } - } - } - - private void checkTransitivity(Comparator comparator, T a, T b, T c) { - int ab = comparator.compare(a,b); - int bc = comparator.compare(b,c); - int ca = comparator.compare(c,a); - if (!assertFalse("Transitive: " + a + ", " + b + ", " + c, - ab < 0 && bc < 0 && ca <= 0)) { - // for debugging - comparator.compare(a,b); - comparator.compare(b,c); - comparator.compare(c,a); - assertFalse("Transitive: " + a + ", " + b + ", " + c, - ab < 0 && bc < 0 && ca <= 0); - } + CheckResult checkResult = new CheckResult(); + SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.RESTRICTION_LEVEL).build(); + sc.failsChecks(identifier, checkResult); + assertEquals("", RestrictionLevel.SINGLE_SCRIPT_RESTRICTIVE, checkResult.restrictionLevel); } private String parseHex(String in) { @@ -760,7 +683,7 @@ public class SpoofCheckerTest extends TestFmwk { // This regular expression matches lines and splits the fields into capture groups. 
// Capture group 1: map from chars // 2: map to chars - // 3: table type, SL, ML, SA or MA + // 3: table type, SL, ML, SA or MA (deprecated) // 4: Comment Lines Only // 5: Error Lines Only Matcher parseLine = Pattern.compile( @@ -793,20 +716,8 @@ public class SpoofCheckerTest extends TestFmwk { String rawExpected = parseHex(parseLine.group(2)); String expected = normalizer.normalize(rawExpected); - int skeletonType = 0; - String tableType = parseLine.group(3); - if (tableType.equals("SL")) { - skeletonType = SpoofChecker.SINGLE_SCRIPT_CONFUSABLE; - } else if (tableType.indexOf("SA") >= 0) { - skeletonType = SpoofChecker.SINGLE_SCRIPT_CONFUSABLE | SpoofChecker.ANY_CASE; - } else if (tableType.indexOf("ML") >= 0) { - skeletonType = 0; - } else if (tableType.indexOf("MA") >= 0) { - skeletonType = SpoofChecker.ANY_CASE; - } - String actual; - actual = sc.getSkeleton(skeletonType, from); + actual = sc.getSkeleton(from); if (!actual.equals(expected)) { errln("confusables.txt: " + lineNum + ": " + parseLine.group(0)); @@ -823,10 +734,45 @@ public class SpoofCheckerTest extends TestFmwk { public void TestCheckResultToString11447() { CheckResult checkResult = new CheckResult(); SpoofChecker sc = new SpoofChecker.Builder() - .setChecks(-1) + .setChecks(SpoofChecker.MIXED_NUMBERS) .build(); sc.failsChecks("1१", checkResult); assertTrue("CheckResult: ", checkResult.toString().contains("MIXED_NUMBERS")); } + @Test + public void TestDeprecated() { + // getSkeleton + SpoofChecker sc = new SpoofChecker.Builder().build(); + assertEquals("Deprecated version of getSkeleton method does not work", + sc.getSkeleton(SpoofChecker.ANY_CASE, scMixed), + sc.getSkeleton(scMixed)); + + // setData + try { + String fileName1 = "unicode/confusables.txt"; + String fileName2 = "unicode/confusablesWholeScript.txt"; + Reader reader1 = TestUtil.getDataReader(fileName1, "UTF-8"); + Reader reader2 = TestUtil.getDataReader(fileName2, "UTF-8"); + Reader reader3 = TestUtil.getDataReader(fileName1, 
"UTF-8"); + try { + SpoofChecker sc2 = new SpoofChecker.Builder() + .setData(reader1, reader2) + .build(); + SpoofChecker sc1 = new SpoofChecker.Builder() + .setData(reader3) + .build(); + assertEquals("Deprecated version of setData method does not work", sc1, sc2); + } finally { + reader1.close(); + reader2.close(); + reader3.close(); + } + } catch(IOException e) { + fail("Could not load confusables data"); + } catch (ParseException e) { + fail("Could not parse confusables data"); + } + } + } -- 2.40.0