ICU-10176 No line break in $SY $HL; update tests, including old missing updates for...

author Peter Edberg <pedberg@unicode.org>

Fri, 30 Aug 2013 06:39:01 +0000 (06:39 +0000)

committer Peter Edberg <pedberg@unicode.org>

Fri, 30 Aug 2013 06:39:01 +0000 (06:39 +0000)
author Peter Edberg <pedberg@unicode.org>
Fri, 30 Aug 2013 06:39:01 +0000 (06:39 +0000)
committer Peter Edberg <pedberg@unicode.org>
Fri, 30 Aug 2013 06:39:01 +0000 (06:39 +0000)
diff --git a/icu4j/main/shared/data/icudata.jar b/icu4j/main/shared/data/icudata.jar

index 87eb3b7ec3ca73a1459443b188067ab1f6101859..8c6f95b248df1fca37d17ef062a1488d1ac65842 100755 (executable)
--- a/icu4j/main/shared/data/icudata.jar
+++ b/icu4j/main/shared/data/icudata.jar
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
-oid sha256:ec461f02d26e5167a56539e4bf2aabb81c038cec0498eb2bbbc44378977b719b
-size 10966492
+oid sha256:0281eb436d3f76c50252cc66bbe357ba00aeb6db06839224cfefa18d386f3338
+size 10966706
diff --git a/icu4j/main/shared/data/testdata.jar b/icu4j/main/shared/data/testdata.jar

index 18a33e3b107c5cf48ba8054988d86820c3136c69..099c6e2cb1257d0bee4bd03697843bf5c31ba9c3 100755 (executable)
--- a/icu4j/main/shared/data/testdata.jar
+++ b/icu4j/main/shared/data/testdata.jar
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
-oid sha256:d08aec73aa20b1669db6e144039aae4bd6357e39e8445986c02be057c169e7f1
+oid sha256:45587463e8dcef07be8a580b224bfbc9ccafd9e9f14d435f401c7b08294c36d9
  size 725607
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java

index b4ab8b353e5ba4c8bbc85341d6f7e1b64921609a..25e2d067890c443c0724619f6f602767b382aa06 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java
@@ -557,6 +557,7 @@ public class RBBITestMonkey extends TestFmwk {
          UnicodeSet  fSY;
          UnicodeSet  fAI;
          UnicodeSet  fAL;
+        UnicodeSet  fHL;
          UnicodeSet  fID;
          UnicodeSet  fSA;
          UnicodeSet  fJL;
@@ -605,6 +606,7 @@ public class RBBITestMonkey extends TestFmwk {
              fSY    = new UnicodeSet("[\\p{Line_break=SY}]");
              fAI    = new UnicodeSet("[\\p{Line_break=AI}]");
              fAL    = new UnicodeSet("[\\p{Line_break=AL}]");
+            fHL    = new UnicodeSet("[\\p{Line_break=HL}]");
              fID    = new UnicodeSet("[\\p{Line_break=ID}]");
              fSA    = new UnicodeSet("[\\p{Line_break=SA}]");
              fJL    = new UnicodeSet("[\\p{Line_break=JL}]");
@@ -657,6 +659,7 @@ public class RBBITestMonkey extends TestFmwk {
              fSets.add(fSY);
              fSets.add(fAI);
              fSets.add(fAL);
+            fSets.add(fHL);
              fSets.add(fID);
              fSets.add(fWJ);
              fSets.add(fSA);
@@ -679,6 +682,7 @@ public class RBBITestMonkey extends TestFmwk {
              int    prevChar;  //  Character at above position.  Note that prevChar
                                //   and thisChar may not be adjacent because combining
                                //   characters between them will be ignored.
+            int    prevCharX2; //  Character before prevChar, more context for LB 21a
              
              int    nextPos;   //  Index of the next character following pos.
                                //     Usually skips over combining marks.
@@ -695,7 +699,7 @@ public class RBBITestMonkey extends TestFmwk {
              //                           while the invalid values shift out and the "this" and
              //                           "prev" positions are filled in with good values.
              pos      = prevPos   = -1;    // Invalid value, serves as flag for initial loop iteration.
-            thisChar = prevChar  = 0;
+            thisChar = prevChar  = prevCharX2 = 0;
              nextPos  = startPos;
              
              
@@ -706,6 +710,7 @@ public class RBBITestMonkey extends TestFmwk {
              //  "prevPos" can be arbitrarily far before "pos".
              for (;;) {
                  // Advance to the next position to be tested.
+                prevCharX2 = prevChar;
                  prevPos   = pos;
                  prevChar  = thisChar;
                  pos       = nextPos;
@@ -920,8 +925,19 @@ public class RBBITestMonkey extends TestFmwk {
                      continue;
                  }
                  
-                // LB 22
+                 // LB 21a, HL (HY | BA) x
+                if (fHL.contains(prevCharX2) && (fHY.contains(prevChar) || fBA.contains(prevChar))) {
+                    continue;
+                }
+
+                 // LB 21b, SY x HL
+                if (fSY.contains(prevChar) && fHL.contains(thisChar)) {
+                    continue;
+                }
+                
+               // LB 22
                  if (fAL.contains(prevChar) && fIN.contains(thisChar) ||
+                        fHL.contains(prevChar) && fIN.contains(thisChar) ||
                          fID.contains(prevChar) && fIN.contains(thisChar) ||
                          fIN.contains(prevChar) && fIN.contains(thisChar) ||
                          fNU.contains(prevChar) && fIN.contains(thisChar) )   {
@@ -934,8 +950,10 @@ public class RBBITestMonkey extends TestFmwk {
                  //          NU x AL
                  if (fID.contains(prevChar) && fPO.contains(thisChar) ||
                          fAL.contains(prevChar) && fNU.contains(thisChar) ||
-                        fNU.contains(prevChar) && fAL.contains(thisChar) )   {
-                    continue;
+                        fHL.contains(prevChar) && fNU.contains(thisChar) ||
+                        fNU.contains(prevChar) && fAL.contains(thisChar) ||
+                        fNU.contains(prevChar) && fHL.contains(thisChar) )   {
+                   continue;
                  }
                  
                  // LB 24  Do not break between prefix and letters or ideographs.
@@ -943,8 +961,8 @@ public class RBBITestMonkey extends TestFmwk {
                  //        PR x AL
                  //        PO x AL
                  if (fPR.contains(prevChar) && fID.contains(thisChar) ||
-                    fPR.contains(prevChar) && fAL.contains(thisChar) ||
-                    fPO.contains(prevChar) && fAL.contains(thisChar))  {
+                    fPR.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar)) ||
+                    fPO.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar)))  {
                      continue;
                  }
                  
@@ -1011,22 +1029,22 @@ public class RBBITestMonkey extends TestFmwk {
                  
                  
                  // LB 28 Do not break between alphabetics
-                if (fAL.contains(prevChar) && fAL.contains(thisChar)) {
+                if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && (fAL.contains(thisChar) || fHL.contains(thisChar))) {
                      continue;
                  }
                  
                  // LB 29  Do not break between numeric punctuation and alphabetics
-                if (fIS.contains(prevChar) && fAL.contains(thisChar)) {
+                if (fIS.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) {
                      continue;
                  }
                  
                  // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
                  //          (AL | NU) x OP
                  //          CP x (AL | NU)
-                if ((fAL.contains(prevChar) || fNU.contains(prevChar)) && fOP.contains(thisChar)) {
+                if ((fAL.contains(prevChar) || fHL.contains(prevChar) || fNU.contains(prevChar)) && fOP.contains(thisChar)) {
                      continue;
                  }
-                if (fCP.contains(prevChar) && (fAL.contains(thisChar) || fNU.contains(thisChar))) {
+                if (fCP.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar) || fNU.contains(thisChar))) {
                      continue;
                  }
  
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt

index 0b8d0758e82dc21303ba47cea93d5806009cb0b9..e53071249454c590aee82fb43ce1e7dbe9f8b851 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt
@@ -550,7 +550,14 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
  <data>•\ufffc•\u30e3\u000c<100>\u1b39\u300a\u002f\u203a\u200b•\ufffc•\uaf64•\udcfb•</data>
  <data>•\u114d\u31f3•\ube44\u002d•\u0362\u24e2\u276e\u2014\u205f\ufe16•\uc877•\u0fd0\u000a<100>\u20a3•</data>
  <data>•\u080a\u215b\U0001d7d3\u002c•\u2025\U000e012e•\u02df\u118d\u0029\ua8d6\u0085<100>\u6cc4\u2024\u202f\ufffc•</data>
- 
+  
+# Test for #10176 (in root)
+<line>
+<data>•abc/•s •def•</data>
+<data>•abc/\u05D9 •def•</data>
+<data>•\u05E7\u05D7/\u05D9 •\u05DE\u05E2\u05D9\u05DC•</data>
+<data>•\u05D3\u05E8\u05D5\u05E9\u05D9\u05DD •\u05E9\u05D7\u05E7\u05E0\u05D9\u05DD/\u05D9\u05D5\u05EA•</data>
+
  
  ########################################################################################
  #
@@ -696,6 +703,13 @@ Bangkok)•</data>
  <word>
  <data>•私<400>達<400>に<400>一<400>〇<400>〇〇<400>の<400>コンピュータ<400>が<400>ある<400>。<0>奈<400>々<400>は<400>ワ<400>ー<400>ドで<400>あ<400>る<400>。•</data>
  
+# Test for #10176 (in ja)
+<line>
+<data>•abc/•s •def•</data>
+<data>•abc/\u05D9 •def•</data>
+<data>•\u05E7\u05D7/\u05D9 •\u05DE\u05E2\u05D9\u05DC•</data>
+<data>•\u05D3\u05E8\u05D5\u05E9\u05D9\u05DD •\u05E9\u05D7\u05E7\u05E0\u05D9\u05DD/\u05D9\u05D5\u05EA•</data>
+
  <locale root>
  <word>
  <data>•私<400>達<400>に<400>一<400>〇<400>〇〇<400>の<400>コンピュータ<400>が<400>ある<400>。<0>奈<400>々<400>は<400>ワ<400>ー<400>ドで<400>あ<400>る<400>。•</data>
@@ -772,3 +786,10 @@ Bangkok)•</data>
  
  <data>•abc •- •def    •abc •-def    •abc- •def   •</data>   # With ASCII hyphen
  <data>•abc •‐ •def    •abc •‐def    •abc‐ •def   •</data>   # With Unicode u2010 hyphen
+
+# Test for #10176 (in fi)
+<line>
+<data>•abc/•s •def•</data>
+<data>•abc/\u05D9 •def•</data>
+<data>•\u05E7\u05D7/\u05D9 •\u05DE\u05E2\u05D9\u05DC•</data>
+<data>•\u05D3\u05E8\u05D5\u05E9\u05D9\u05DD •\u05E9\u05D7\u05E7\u05E0\u05D9\u05DD/\u05D9\u05D5\u05EA•</data>
author	Peter Edberg <pedberg@unicode.org>
	Fri, 30 Aug 2013 06:39:01 +0000 (06:39 +0000)
committer	Peter Edberg <pedberg@unicode.org>
	Fri, 30 Aug 2013 06:39:01 +0000 (06:39 +0000)
icu4j/main/shared/data/icudata.jar		patch \| blob \| history
icu4j/main/shared/data/testdata.jar		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt		patch \| blob \| history