From e876c898421a1915622cdcd33e4aa9bdcacebe9a Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Fri, 10 Jun 2016 00:22:34 +0000 Subject: [PATCH] ICU-12526 Unicode 9 data 2016-jun-09 X-SVN-Rev: 38821 --- icu4j/main/shared/data/icudata.jar | 4 +- .../CollationTest_NON_IGNORABLE_SHORT.txt | 2 +- .../dev/data/CollationTest_SHIFTED_SHORT.txt | 2 +- .../ibm/icu/dev/data/unicode/UnicodeData.txt | 8 ++-- .../ibm/icu/dev/test/rbbi/RBBITestMonkey.java | 39 +++++++++++-------- .../icu/dev/test/text/SpoofCheckerTest.java | 18 +++++---- 6 files changed, 42 insertions(+), 31 deletions(-) diff --git a/icu4j/main/shared/data/icudata.jar b/icu4j/main/shared/data/icudata.jar index 474dfd154a9..fd0889cce53 100755 --- a/icu4j/main/shared/data/icudata.jar +++ b/icu4j/main/shared/data/icudata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2bd9654a895ed4a0aff9e9145040495dc7994732d50a1907e7a89c4caf01eedd -size 11733166 +oid sha256:b882f018fe19d1d295817d91b5a99f1eb7a01d08e538adf83a6253e981ce95f5 +size 11732712 diff --git a/icu4j/main/tests/collate/src/com/ibm/icu/dev/data/CollationTest_NON_IGNORABLE_SHORT.txt b/icu4j/main/tests/collate/src/com/ibm/icu/dev/data/CollationTest_NON_IGNORABLE_SHORT.txt index 60c6721b222..17bb9e58a19 100644 --- a/icu4j/main/tests/collate/src/com/ibm/icu/dev/data/CollationTest_NON_IGNORABLE_SHORT.txt +++ b/icu4j/main/tests/collate/src/com/ibm/icu/dev/data/CollationTest_NON_IGNORABLE_SHORT.txt @@ -1,5 +1,5 @@ # CollationTest_CLDR_NON_IGNORABLE_SHORT.txt -# Date: 2016-05-06, 18:35:32 GMT +# Date: 2016-06-03, 18:31:07 GMT # © 2016 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see http://www.unicode.org/terms_of_use.html diff --git a/icu4j/main/tests/collate/src/com/ibm/icu/dev/data/CollationTest_SHIFTED_SHORT.txt b/icu4j/main/tests/collate/src/com/ibm/icu/dev/data/CollationTest_SHIFTED_SHORT.txt index 832434b0587..e4f11ce6708 100644 --- a/icu4j/main/tests/collate/src/com/ibm/icu/dev/data/CollationTest_SHIFTED_SHORT.txt +++ b/icu4j/main/tests/collate/src/com/ibm/icu/dev/data/CollationTest_SHIFTED_SHORT.txt @@ -1,5 +1,5 @@ # CollationTest_CLDR_SHIFTED_SHORT.txt -# Date: 2016-05-06, 18:35:34 GMT +# Date: 2016-06-03, 18:31:11 GMT # © 2016 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see http://www.unicode.org/terms_of_use.html diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/data/unicode/UnicodeData.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/data/unicode/UnicodeData.txt index caf087b47d6..a756976461b 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/data/unicode/UnicodeData.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/data/unicode/UnicodeData.txt @@ -20179,10 +20179,10 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 11C34;BHAIKSUKI VOWEL SIGN VOCALIC R;Mn;0;NSM;;;;;N;;;;; 11C35;BHAIKSUKI VOWEL SIGN VOCALIC RR;Mn;0;NSM;;;;;N;;;;; 11C36;BHAIKSUKI VOWEL SIGN VOCALIC L;Mn;0;NSM;;;;;N;;;;; -11C38;BHAIKSUKI VOWEL SIGN E;Mc;0;L;;;;;N;;;;; -11C39;BHAIKSUKI VOWEL SIGN AI;Mc;0;L;;;;;N;;;;; -11C3A;BHAIKSUKI VOWEL SIGN O;Mc;0;L;;;;;N;;;;; -11C3B;BHAIKSUKI VOWEL SIGN AU;Mc;0;L;;;;;N;;;;; +11C38;BHAIKSUKI VOWEL SIGN E;Mn;0;NSM;;;;;N;;;;; +11C39;BHAIKSUKI VOWEL SIGN AI;Mn;0;NSM;;;;;N;;;;; +11C3A;BHAIKSUKI VOWEL SIGN O;Mn;0;NSM;;;;;N;;;;; +11C3B;BHAIKSUKI VOWEL SIGN AU;Mn;0;NSM;;;;;N;;;;; 11C3C;BHAIKSUKI SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;; 11C3D;BHAIKSUKI SIGN ANUSVARA;Mn;0;NSM;;;;;N;;;;; 11C3E;BHAIKSUKI SIGN VISARGA;Mc;0;L;;;;;N;;;;; diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java index 537d1dc650e..82eb380c5ab 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java @@ -159,6 +159,7 @@ public class RBBITestMonkey extends TestFmwk { int breakPos = -1; int c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. + int cBase; // for (X Extend*) patterns, the X character. // Previous break at end of string. return DONE. if (prevPos >= fText.length()) { @@ -166,7 +167,7 @@ public class RBBITestMonkey extends TestFmwk { } /* p0 = */ p1 = p2 = p3 = prevPos; c3 = UTF16.charAt(fText, prevPos); - c0 = c1 = c2 = 0; + c0 = c1 = c2 = cBase = 0; // Loop runs once per "significant" character position in the input text. for (;;) { @@ -233,22 +234,11 @@ public class RBBITestMonkey extends TestFmwk { continue; } - // Rule (GB8a) Regional_Indicator x Regional_Indicator - // Note: The first if condition is a little tricky. We only need to force - // a break if there are three or more contiguous RIs. If there are - // only two, a break following will occur via other rules, and will include - // any trailing extend characters, which is needed behavior. - if (fRegionalIndicatorSet.contains(c0) && fRegionalIndicatorSet.contains(c1) - && fRegionalIndicatorSet.contains(c2)) { - break; - } - - if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) { - continue; - } - // Rule (GB9) x (Extend | ZWJ) if (fExtendSet.contains(c2) || fZWJSet.contains(c2)) { + if (!fExtendSet.contains(c1)) { + cBase = c1; + } continue; } @@ -261,16 +251,33 @@ public class RBBITestMonkey extends TestFmwk { if (fPrependSet.contains(c1)) { continue; } - // Rule (GB10) (Emoji_Base | EBG) x Emoji_Modifier + // Rule (GB10) (Emoji_Base | EBG) Extend* x Emoji_Modifier if ((fEmojiBaseSet.contains(c1) || fEBGSet.contains(c1)) && fEmojiModifierSet.contains(c2)) { continue; } + if ((fEmojiBaseSet.contains(cBase) || fEBGSet.contains(cBase)) && + fExtendSet.contains(c1) && fEmojiModifierSet.contains(c2)) { + continue; + } // Rule (GB11) ZWJ x (Glue_After_Zwj | EBG) if (fZWJSet.contains(c1) && (fGAZSet.contains(c2) || fEBGSet.contains(c2))) { continue; } + // Rule (GB12-13) Regional_Indicator x Regional_Indicator + // Note: The first if condition is a little tricky. We only need to force + // a break if there are three or more contiguous RIs. If there are + // only two, a break following will occur via other rules, and will include + // any trailing extend characters, which is needed behavior. + if (fRegionalIndicatorSet.contains(c0) && fRegionalIndicatorSet.contains(c1) + && fRegionalIndicatorSet.contains(c2)) { + break; + } + if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) { + continue; + } + // Rule (GB999) Any Any break; } diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/text/SpoofCheckerTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/text/SpoofCheckerTest.java index 6b600cb3192..b213ab9c66b 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/text/SpoofCheckerTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/text/SpoofCheckerTest.java @@ -600,19 +600,23 @@ public class SpoofCheckerTest extends TestFmwk { String[][] tests = { // String, restriction-level, numerics, scripts, alternates, common-alternates {"a♥", "UNRESTRICTIVE", "[]", "Latn", "", ""}, - {"a〆", "HIGHLY_RESTRICTIVE", "[]", "Latn", "Hani Hira Kana", "Hani Hira Kana"}, - {"aー〆", "HIGHLY_RESTRICTIVE", "[]", "Latn", "Hira Kana", "Hira Kana"}, - {"aー〆ア", "HIGHLY_RESTRICTIVE", "[]", "Latn Kana", "", ""}, - {"アaー〆", "HIGHLY_RESTRICTIVE", "[]", "Latn Kana", "", ""}, + {"a\u303c", "HIGHLY_RESTRICTIVE", "[]", "Latn", "Hani Hira Kana", "Hani Hira Kana"}, + {"aー\u303c", "HIGHLY_RESTRICTIVE", "[]", "Latn", "Hira Kana", "Hira Kana"}, + {"aー\u303cア", "HIGHLY_RESTRICTIVE", "[]", "Latn Kana", "", ""}, + { "アaー\u303c", "HIGHLY_RESTRICTIVE", "[]", "Latn Kana", "", ""}, {"a1١", "UNRESTRICTIVE", "[0٠]", "Latn", "Arab Thaa", "Arab Thaa"}, {"a1١۱", "UNRESTRICTIVE", "[0٠۰]", "Latn Arab", "", ""}, - {"١ー〆aア1१۱", "UNRESTRICTIVE", "[0٠۰०]", "Latn Kana Arab", "Deva Kthi Mahj", "Deva Kthi Mahj"}, - {"aアー〆1१١۱", "UNRESTRICTIVE", "[0٠۰०]", "Latn Kana Arab", "Deva Kthi Mahj", "Deva Kthi Mahj"}, + {"١ー\u303caア1१۱", "UNRESTRICTIVE", "[0٠۰०]", "Latn Kana Arab", "Deva Kthi Mahj", "Deva Kthi Mahj"}, + {"aアー\u303c1१١۱", "UNRESTRICTIVE", "[0٠۰०]", "Latn Kana Arab", "Deva Kthi Mahj", "Deva Kthi Mahj"}, }; for (String[] test : tests) { String testString = test[0]; IdentifierInfo idInfo = new IdentifierInfo(); - idInfo.setIdentifierProfile(SpoofChecker.RECOMMENDED); + UnicodeSet allowedChars = new UnicodeSet(); + // Allowed Identifier Characters. In addition to the Recommended Set, + // allow u303c, which has an interesting script extension of Hani Hira Kana. + allowedChars.addAll(SpoofChecker.RECOMMENDED).add(0x303c); + idInfo.setIdentifierProfile(allowedChars); idInfo.setIdentifier(testString); assertEquals("Identifier " + testString, testString, idInfo.getIdentifier()); -- 2.40.0