ICU-12526 Unicode 9 data 2016-jun-09

author Markus Scherer <markus.icu@gmail.com>

Fri, 10 Jun 2016 00:22:34 +0000 (00:22 +0000)

committer Markus Scherer <markus.icu@gmail.com>

Fri, 10 Jun 2016 00:22:34 +0000 (00:22 +0000)
author Markus Scherer <markus.icu@gmail.com>
Fri, 10 Jun 2016 00:22:34 +0000 (00:22 +0000)
committer Markus Scherer <markus.icu@gmail.com>
Fri, 10 Jun 2016 00:22:34 +0000 (00:22 +0000)
diff --git a/icu4j/main/shared/data/icudata.jar b/icu4j/main/shared/data/icudata.jar

index 474dfd154a9f09430b8589d17841796ecc599ba7..fd0889cce53da712e519dff62cd84ba2506b2f6c 100755 (executable)
--- a/icu4j/main/shared/data/icudata.jar
+++ b/icu4j/main/shared/data/icudata.jar
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
-oid sha256:2bd9654a895ed4a0aff9e9145040495dc7994732d50a1907e7a89c4caf01eedd
-size 11733166
+oid sha256:b882f018fe19d1d295817d91b5a99f1eb7a01d08e538adf83a6253e981ce95f5
+size 11732712
diff --git a/icu4j/main/tests/collate/src/com/ibm/icu/dev/data/CollationTest_NON_IGNORABLE_SHORT.txt b/icu4j/main/tests/collate/src/com/ibm/icu/dev/data/CollationTest_NON_IGNORABLE_SHORT.txt

index 60c6721b22286ff89b1544e97635cd5e356ecf58..17bb9e58a19992d43901abef6f7b6b9a2243f226 100644 (file)
--- a/icu4j/main/tests/collate/src/com/ibm/icu/dev/data/CollationTest_NON_IGNORABLE_SHORT.txt
+++ b/icu4j/main/tests/collate/src/com/ibm/icu/dev/data/CollationTest_NON_IGNORABLE_SHORT.txt
@@ -1,5 +1,5 @@
  # CollationTest_CLDR_NON_IGNORABLE_SHORT.txt
-# Date: 2016-05-06, 18:35:32 GMT
+# Date: 2016-06-03, 18:31:07 GMT
  # © 2016 Unicode®, Inc.
  # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
  # For terms of use, see http://www.unicode.org/terms_of_use.html
diff --git a/icu4j/main/tests/collate/src/com/ibm/icu/dev/data/CollationTest_SHIFTED_SHORT.txt b/icu4j/main/tests/collate/src/com/ibm/icu/dev/data/CollationTest_SHIFTED_SHORT.txt

index 832434b0587b7f614a5d8b892cf4bc60b300e514..e4f11ce67088e61dc2c268ae3b56b719668ac14a 100644 (file)
--- a/icu4j/main/tests/collate/src/com/ibm/icu/dev/data/CollationTest_SHIFTED_SHORT.txt
+++ b/icu4j/main/tests/collate/src/com/ibm/icu/dev/data/CollationTest_SHIFTED_SHORT.txt
@@ -1,5 +1,5 @@
  # CollationTest_CLDR_SHIFTED_SHORT.txt
-# Date: 2016-05-06, 18:35:34 GMT
+# Date: 2016-06-03, 18:31:11 GMT
  # © 2016 Unicode®, Inc.
  # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
  # For terms of use, see http://www.unicode.org/terms_of_use.html
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/data/unicode/UnicodeData.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/data/unicode/UnicodeData.txt

index caf087b47d652421e9727a24c9122bbc00b77b47..a756976461bf93b672723a4dc68ee7a709377d29 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/data/unicode/UnicodeData.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/data/unicode/UnicodeData.txt
@@ -20179,10 +20179,10 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
  11C34;BHAIKSUKI VOWEL SIGN VOCALIC R;Mn;0;NSM;;;;;N;;;;;
  11C35;BHAIKSUKI VOWEL SIGN VOCALIC RR;Mn;0;NSM;;;;;N;;;;;
  11C36;BHAIKSUKI VOWEL SIGN VOCALIC L;Mn;0;NSM;;;;;N;;;;;
-11C38;BHAIKSUKI VOWEL SIGN E;Mc;0;L;;;;;N;;;;;
-11C39;BHAIKSUKI VOWEL SIGN AI;Mc;0;L;;;;;N;;;;;
-11C3A;BHAIKSUKI VOWEL SIGN O;Mc;0;L;;;;;N;;;;;
-11C3B;BHAIKSUKI VOWEL SIGN AU;Mc;0;L;;;;;N;;;;;
+11C38;BHAIKSUKI VOWEL SIGN E;Mn;0;NSM;;;;;N;;;;;
+11C39;BHAIKSUKI VOWEL SIGN AI;Mn;0;NSM;;;;;N;;;;;
+11C3A;BHAIKSUKI VOWEL SIGN O;Mn;0;NSM;;;;;N;;;;;
+11C3B;BHAIKSUKI VOWEL SIGN AU;Mn;0;NSM;;;;;N;;;;;
  11C3C;BHAIKSUKI SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;;
  11C3D;BHAIKSUKI SIGN ANUSVARA;Mn;0;NSM;;;;;N;;;;;
  11C3E;BHAIKSUKI SIGN VISARGA;Mc;0;L;;;;;N;;;;;
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java

index 537d1dc650e0b9e22e379756d0822bfa4502f667..82eb380c5ab7691f4b462c8d4bf27602e66f0fbc 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java
@@ -159,6 +159,7 @@ public class RBBITestMonkey extends TestFmwk {
              int     breakPos = -1;
  
              int   c0, c1, c2, c3;     // The code points at p0, p1, p2 & p3.
+            int   cBase;              // for (X Extend*) patterns, the X character.
  
              // Previous break at end of string.  return DONE.
              if (prevPos >= fText.length()) {
@@ -166,7 +167,7 @@ public class RBBITestMonkey extends TestFmwk {
              }
              /* p0 = */ p1 = p2 = p3 = prevPos;
              c3 =  UTF16.charAt(fText, prevPos);
-            c0 = c1 = c2 = 0;
+            c0 = c1 = c2 = cBase = 0;
  
              // Loop runs once per "significant" character position in the input text.
              for (;;) {
@@ -233,22 +234,11 @@ public class RBBITestMonkey extends TestFmwk {
                      continue;
                  }
  
-                // Rule (GB8a)   Regional_Indicator x Regional_Indicator
-                //                Note: The first if condition is a little tricky. We only need to force
-                //                      a break if there are three or more contiguous RIs. If there are
-                //                      only two, a break following will occur via other rules, and will include
-                //                      any trailing extend characters, which is needed behavior.
-                if (fRegionalIndicatorSet.contains(c0) && fRegionalIndicatorSet.contains(c1)
-                        && fRegionalIndicatorSet.contains(c2)) {
-                    break;
-                }
-
-                if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) {
-                    continue;
-                }
-
                  // Rule (GB9)    x (Extend | ZWJ)
                  if (fExtendSet.contains(c2) || fZWJSet.contains(c2))  {
+                    if (!fExtendSet.contains(c1)) {
+                        cBase = c1;
+                    }
                      continue;
                  }
  
@@ -261,16 +251,33 @@ public class RBBITestMonkey extends TestFmwk {
                  if (fPrependSet.contains(c1)) {
                      continue;
                  }
-                // Rule (GB10)   (Emoji_Base | EBG) x Emoji_Modifier
+                // Rule (GB10)   (Emoji_Base | EBG) Extend* x Emoji_Modifier
                  if ((fEmojiBaseSet.contains(c1) || fEBGSet.contains(c1)) && fEmojiModifierSet.contains(c2)) {
                      continue;
                  }
+                if ((fEmojiBaseSet.contains(cBase) || fEBGSet.contains(cBase)) &&
+                        fExtendSet.contains(c1) && fEmojiModifierSet.contains(c2)) {
+                    continue;
+                }
  
                  // Rule (GB11)   ZWJ x (Glue_After_Zwj | EBG)
                  if (fZWJSet.contains(c1) && (fGAZSet.contains(c2) || fEBGSet.contains(c2))) {
                      continue;
                  }
  
+                // Rule (GB12-13)   Regional_Indicator x Regional_Indicator
+                //                  Note: The first if condition is a little tricky. We only need to force
+                //                      a break if there are three or more contiguous RIs. If there are
+                //                      only two, a break following will occur via other rules, and will include
+                //                      any trailing extend characters, which is needed behavior.
+                if (fRegionalIndicatorSet.contains(c0) && fRegionalIndicatorSet.contains(c1)
+                        && fRegionalIndicatorSet.contains(c2)) {
+                    break;
+                }
+                if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) {
+                    continue;
+                }
+
                  // Rule (GB999)  Any  <break>  Any
                  break;
              }
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/text/SpoofCheckerTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/text/SpoofCheckerTest.java

index 6b600cb31927b4a248d1a24e9ca092e57e8b0844..b213ab9c66b5f33c63934e1cf5d7276e31847c65 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/text/SpoofCheckerTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/text/SpoofCheckerTest.java
@@ -600,19 +600,23 @@ public class SpoofCheckerTest extends TestFmwk {
          String[][] tests = {
                  // String, restriction-level, numerics, scripts, alternates, common-alternates
                  {"a♥",  "UNRESTRICTIVE", "[]", "Latn", "", ""},
-                {"a〆",  "HIGHLY_RESTRICTIVE", "[]", "Latn", "Hani Hira Kana", "Hani Hira Kana"},
-                {"aー〆",  "HIGHLY_RESTRICTIVE", "[]", "Latn", "Hira Kana", "Hira Kana"},
-                {"aー〆ア",  "HIGHLY_RESTRICTIVE", "[]", "Latn Kana", "", ""},
-                {"アaー〆",  "HIGHLY_RESTRICTIVE", "[]", "Latn Kana", "", ""},
+                {"a\u303c",  "HIGHLY_RESTRICTIVE", "[]", "Latn", "Hani Hira Kana", "Hani Hira Kana"},
+                {"aー\u303c",  "HIGHLY_RESTRICTIVE", "[]", "Latn", "Hira Kana", "Hira Kana"},
+                {"aー\u303cア",  "HIGHLY_RESTRICTIVE", "[]", "Latn Kana", "", ""},
+                { "アaー\u303c",  "HIGHLY_RESTRICTIVE", "[]", "Latn Kana", "", ""},
                  {"a1١",  "UNRESTRICTIVE", "[0٠]", "Latn", "Arab Thaa", "Arab Thaa"},
                  {"a1١۱",  "UNRESTRICTIVE", "[0٠۰]", "Latn Arab", "", ""},
-                {"١ー〆aア1१۱",  "UNRESTRICTIVE", "[0٠۰०]", "Latn Kana Arab", "Deva Kthi Mahj", "Deva Kthi Mahj"},
-                {"aアー〆1१١۱",  "UNRESTRICTIVE", "[0٠۰०]", "Latn Kana Arab", "Deva Kthi Mahj", "Deva Kthi Mahj"},
+                {"١ー\u303caア1१۱",  "UNRESTRICTIVE", "[0٠۰०]", "Latn Kana Arab", "Deva Kthi Mahj", "Deva Kthi Mahj"},
+                {"aアー\u303c1१١۱",  "UNRESTRICTIVE", "[0٠۰०]", "Latn Kana Arab", "Deva Kthi Mahj", "Deva Kthi Mahj"},
          };
          for (String[] test : tests) {
              String testString = test[0];
              IdentifierInfo idInfo = new IdentifierInfo();
-            idInfo.setIdentifierProfile(SpoofChecker.RECOMMENDED);
+            UnicodeSet allowedChars = new UnicodeSet();
+            // Allowed Identifier Characters. In addition to the Recommended Set,
+            //    allow U+303C, which has an interesting script extension of Hani Hira Kana.
+            allowedChars.addAll(SpoofChecker.RECOMMENDED).add(0x303c);
+            idInfo.setIdentifierProfile(allowedChars);
              idInfo.setIdentifier(testString);
              assertEquals("Identifier " + testString, testString, idInfo.getIdentifier());
author	Markus Scherer <markus.icu@gmail.com>
	Fri, 10 Jun 2016 00:22:34 +0000 (00:22 +0000)
committer	Markus Scherer <markus.icu@gmail.com>
	Fri, 10 Jun 2016 00:22:34 +0000 (00:22 +0000)
icu4j/main/shared/data/icudata.jar		patch | blob | history
icu4j/main/tests/collate/src/com/ibm/icu/dev/data/CollationTest_NON_IGNORABLE_SHORT.txt		patch | blob | history
icu4j/main/tests/collate/src/com/ibm/icu/dev/data/CollationTest_SHIFTED_SHORT.txt		patch | blob | history
icu4j/main/tests/core/src/com/ibm/icu/dev/data/unicode/UnicodeData.txt		patch | blob | history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java		patch | blob | history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/text/SpoofCheckerTest.java		patch | blob | history