ICU-13770 RBBI LB8a rule update for ICU4J.

author Andy Heninger <andy.heninger@gmail.com>

Mon, 21 May 2018 22:40:17 +0000 (22:40 +0000)

committer Andy Heninger <andy.heninger@gmail.com>

Mon, 21 May 2018 22:40:17 +0000 (22:40 +0000)
author Andy Heninger <andy.heninger@gmail.com>
Mon, 21 May 2018 22:40:17 +0000 (22:40 +0000)
committer Andy Heninger <andy.heninger@gmail.com>
Mon, 21 May 2018 22:40:17 +0000 (22:40 +0000)
diff --git a/icu4j/main/shared/data/icudata.jar b/icu4j/main/shared/data/icudata.jar

index 3fec4b557450ec32496bb3f4ff2a5273366f7f4f..5ef426b64beecc43b7a421ac9847cedd22023575 100755 (executable)
--- a/icu4j/main/shared/data/icudata.jar
+++ b/icu4j/main/shared/data/icudata.jar
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
-oid sha256:4bd87b532fc7ad362740dde413999961c7f372cbbce5fb54160d201b783fec33
-size 12503004
+oid sha256:e9ffd3c1d1fa55ec8819eee15483f7f1b4c4520a62a9ae3d0b3f971b9d06e18c
+size 12500142
diff --git a/icu4j/main/shared/data/icutzdata.jar b/icu4j/main/shared/data/icutzdata.jar

index cbe7c569f6591fd26a47447bfb982607049dcde1..fdf0df049f3be209968b085aea2fbcec51c97aa5 100755 (executable)
--- a/icu4j/main/shared/data/icutzdata.jar
+++ b/icu4j/main/shared/data/icutzdata.jar
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
-oid sha256:5c25f29e8e9f5b7244a63ddc48dd7d69f56612b310e1de8351c9ea80a84afc6f
+oid sha256:88f00fc2ffbd0fcae8531cffdcc5b405876d3d89036e5cb8e077b0e817b88d9f
  size 92867
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java

index c986f04df9e915fa279f59ebdd0764a4757a8d72..333da86099c1a6fb7fce31d273584f02ff5f0fd8 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java
@@ -646,8 +646,6 @@ public class RBBITestMonkey extends TestFmwk {
          UnicodeSet  fEB;
          UnicodeSet  fEM;
          UnicodeSet  fZWJ;
-        UnicodeSet  fExtendedPict;
-        UnicodeSet  fEmojiNRK;
  
          StringBuffer  fText;
          int           fOrigPositions;
@@ -701,9 +699,6 @@ public class RBBITestMonkey extends TestFmwk {
              fEB    = new UnicodeSet("[\\p{Line_break=EB}]");
              fEM    = new UnicodeSet("[\\p{Line_break=EM}]");
              fZWJ   = new UnicodeSet("[\\p{Line_break=ZWJ}]");
-            fEmojiNRK = new UnicodeSet("[[\\p{Emoji}]-[\\p{Line_break=RI}*#0-9©®™〰〽]]");
-            fExtendedPict = new UnicodeSet("[:Extended_Pictographic:]");
-
  
              // Remove dictionary characters.
              // The monkey test reference implementation of line break does not replicate the dictionary behavior,
@@ -760,8 +755,6 @@ public class RBBITestMonkey extends TestFmwk {
              fSets.add(fEB);
              fSets.add(fEM);
              fSets.add(fZWJ);
-            fSets.add(fExtendedPict);
-            fSets.add(fEmojiNRK);
          }
  
          @Override
@@ -897,13 +890,39 @@ public class RBBITestMonkey extends TestFmwk {
                      break;
                  }
  
+                // LB 25    Numbers
+                //          This check is done here, ahead of LB 8a, because a number can match a longer
+                //          sequence that would also match LB 8a, e.g. NU ZWJ IS PO  (the ZWJ acts like a CM).
+                matchVals = LBNumberCheck(fText, prevPos, matchVals);
+                if (matchVals[0] != -1) {
+                    // Matched a number.  But could have been just a single digit, which would
+                    //    not represent a "no break here" between prevChar and thisChar
+                    int numEndIdx = matchVals[1];  // idx of first char following num
+                    if (numEndIdx > pos) {
+                        // Number match includes at least the two chars being checked
+                        if (numEndIdx > nextPos) {
+                            // Number match includes additional chars.  Update pos and nextPos
+                            //   so that next loop iteration will continue at the end of the number,
+                            //   checking for breaks between last char in number & whatever follows.
+                            nextPos = numEndIdx;
+                            pos     = numEndIdx;
+                            do {
+                                pos = moveIndex32(fText, pos, -1);
+                                thisChar = UTF16.charAt(fText, pos);
+                            }
+                            while (fCM.contains(thisChar));
+                        }
+                        continue;
+                    }
+                }
+
                  // LB 8a:  ZWJ x (ID | Extended_Pictographic | Emoji)
                  //       The monkey test's way of ignoring combining characters doesn't work
                  //       for this rule. ZWJ is also a CM. Need to get the actual character
                  //       preceding "thisChar", not ignoring combining marks, possibly ZWJ.
                  {
                      int prevC = fText.codePointBefore(pos);
-                    if (fZWJ.contains(prevC) && (fID.contains(thisChar) || fExtendedPict.contains(thisChar) || fEmojiNRK.contains(thisChar))) {
+                    if (fZWJ.contains(prevC)) {
                          continue;
                      }
                  }
@@ -1088,31 +1107,7 @@ public class RBBITestMonkey extends TestFmwk {
                      continue;
                  }
  
-
-                // LB 25    Numbers
-                matchVals = LBNumberCheck(fText, prevPos, matchVals);
-                if (matchVals[0] != -1) {
-                    // Matched a number.  But could have been just a single digit, which would
-                    //    not represent a "no break here" between prevChar and thisChar
-                    int numEndIdx = matchVals[1];  // idx of first char following num
-                    if (numEndIdx > pos) {
-                        // Number match includes at least the two chars being checked
-                        if (numEndIdx > nextPos) {
-                            // Number match includes additional chars.  Update pos and nextPos
-                            //   so that next loop iteration will continue at the end of the number,
-                            //   checking for breaks between last char in number & whatever follows.
-                            nextPos = numEndIdx;
-                            pos     = numEndIdx;
-                            do {
-                                pos = moveIndex32(fText, pos, -1);
-                                thisChar = UTF16.charAt(fText, pos);
-                            }
-                            while (fCM.contains(thisChar));
-                        }
-                        continue;
-                    }
-                }
-
+                // LB 25  Numbers:  this check was moved up; it is now performed before LB 8a.
  
                  // LB 26  Do not break Korean Syllables
                  if (fJL.contains(prevChar) && (fJL.contains(thisChar) ||
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line.txt

index 7a244bcea1266f01bafa9958a8643aaba2467e19..b478bf9b8e4fcee37fb70a4af0af18281693ce64 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line.txt
@@ -5,7 +5,8 @@
  
  # file: line.txt
  #
-# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
+# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
+# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
  #
  # Note: Rule syntax and the monkey test itself are still a work in progress.
  #       They are expected to change with review and the addition of support for rule tailoring.
@@ -24,7 +25,7 @@ B2 = [:LineBreak =  Break_Both:];
  CB = [:LineBreak =  Contingent_Break:];
  CJ = [:LineBreak =  Conditional_Japanese_Starter:];
  CL = [:LineBreak =  Close_Punctuation:];
-CM = [:LineBreak =  Combining_Mark:];
+CM_ = [:LineBreak =  Combining_Mark:];
  CP = [:LineBreak =  Close_Parenthesis:];
  CR = [:LineBreak =  Carriage_Return:];
  EB = [:LineBreak =  EB:];
@@ -59,16 +60,13 @@ XX = [:LineBreak =  Unknown:];
  ZW = [:LineBreak =  ZWSpace:];
  ZWJ = [:LineBreak =  ZWJ:];
  
-EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
-Extended_Pict = [:ExtPict:];
-
  # LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
  AL = [AL AI SG XX ];
  dictionary = SA;
  
  # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
  #         list it in the numerous rules that use CM.
-CM = [CM ZWJ];
+CM = [CM_ ZWJ];
  
  LB4:        BK ÷;
  LB5:        CR LF;
@@ -97,8 +95,10 @@ LB7.2:      [ZW SP] [SP ZW];
  LB8:        ZW ÷;
  
  # LB8a
-#      ZWJ x (ID | Extended_Pict | EmojiNRK)
-LB8a:       ZWJ (ID | Extended_Pict | EmojiNRK);
+#      ZWJ x
+#      Don't match a CM on the right - let other rules pick up CM sequences, where
+#      the ZWJ behaves as just another generic CM.
+LB8a:       ZWJ [^CM];
  
  
  # LB9:  X CM -> X
@@ -107,7 +107,7 @@ LB8a:       ZWJ (ID | Extended_Pict | EmojiNRK);
  #LB11:       × WJ;
  #            WJ ×
  
-LB11.1:      [^BK CR LF NL SP ZW] CM* WJ;
+LB11.1:      [^SP] CM* WJ;
  LB11.2:      SP WJ;
  LB11.3:      WJ CM* [^CM];
  
@@ -133,12 +133,14 @@ LB19:        . CM* QU;
  LB19.1:      QU CM* [^CM];
  
  # LB 20   Break before and after CB.
-#         Interaction with LB8a:  ZWJ x ID is tricky because CM includes ZWJ.
+#         Interaction with LB8a:  ZWJ x . is tricky because CM includes ZWJ.
  #                                 ZWJ acts like a CM to the left, combining with CB.
-#                                 ZWJ acts independently to the right, no break from ID by LB8a.
-LB20:        . CM* ÷ CB;
-LB20.1a:     CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
-LB20.1b:      CB CM* ÷;
+#                                 ZWJ acts independently to the right, no break after by LB8a.
+LB20.1:      . CM* ZWJ CB;
+LB20.2:      . CM* ÷ CB;
+
+LB20.3:      CB CM* ZWJ [^CM];
+LB20.4:      CB CM* ÷;
  
  # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
  #       not picking up the continuing match after the BA from 21a.
@@ -185,15 +187,15 @@ LB29:        IS CM* (AL | HL);
  LB30.1:      (AL | CM | HL | NU) CM* OP;
  LB30.2:      CP CM* (AL | HL | NU);
  
-# LB31  keep pairs of RI together.
-LB30a.1:     RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
-LB30a.2:     RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
-LB30a.3:     RI CM* RI CM* ÷;
+# LB30a  keep pairs of RI together.
+LB30a.1:     RI CM* RI              ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2:     RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.3:     RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
  
  # LB30b Do not break between Emoji Base and Emoji Modifier
  LB30b:       EB CM* EM;
  
  # LB31 Break Everywhere Else.
  #      Include combining marks
-LB31.1:        . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
+LB31.1:        . CM* ZWJ [^CM];
  LB31.2:        . CM* ÷;
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt

index 45de73b5eb99718a0452060becfc74cbea07f1e3..1ef43f9fa2761b64e4155889d4255a3c21f09995 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt
@@ -5,7 +5,8 @@
  #
  #  file:  line_loose.txt
  #
-# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
+# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
+# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
  #
  # Note: Rule syntax and the monkey test itself are still a work in progress.
  #       They are expected to change with review and the addition of support for rule tailoring.
@@ -31,7 +32,7 @@ B2 = [:LineBreak =  Break_Both:];
  CB = [:LineBreak =  Contingent_Break:];
  CJ = [:LineBreak =  Conditional_Japanese_Starter:];
  CL = [:LineBreak =  Close_Punctuation:];
-CM = [:LineBreak =  Combining_Mark:];
+CM_ = [:LineBreak =  Combining_Mark:];
  CP = [:LineBreak =  Close_Parenthesis:];
  CR = [:LineBreak =  Carriage_Return:];
  EB = [:LineBreak =  EB:];
@@ -67,16 +68,13 @@ XX = [:LineBreak =  Unknown:];
  ZW = [:LineBreak =  ZWSpace:];
  ZWJ = [:LineBreak =  ZWJ:];
  
-EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
-Extended_Pict = [:ExtPict:];
-
  # LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
  AL = [AL AI SG XX ];
  dictionary = SA;
  
  # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
  #         list it in the numerous rules that use CM.
-CM = [CM ZWJ];
+CM = [CM_ ZWJ];
  
  LB4:        BK ÷;
  LB5:        CR LF;
@@ -105,8 +103,10 @@ LB7.2:      [ZW SP] [SP ZW];
  LB8:        ZW ÷;
  
  # LB8a
-#      ZWJ x (ID | Extended_Pict | EmojiNRK)
-LB8a:       ZWJ (ID | Extended_Pict | EmojiNRK);
+#      ZWJ x
+#      Don't match a CM on the right - let other rules pick up CM sequences, where
+#      the ZWJ behaves as just another generic CM.
+LB8a:       ZWJ [^CM];
  
  
  # LB9:  X CM -> X
@@ -115,7 +115,7 @@ LB8a:       ZWJ (ID | Extended_Pict | EmojiNRK);
  #LB11:       × WJ;
  #            WJ ×
  
-LB11.1:      [^BK CR LF NL SP ZW] CM* WJ;
+LB11.1:      [^SP] CM* WJ;
  LB11.2:      SP WJ;
  LB11.3:      WJ CM* [^CM];
  
@@ -141,12 +141,14 @@ LB19:        . CM* QU;
  LB19.1:      QU CM* [^CM];
  
  # LB 20   Break before and after CB.
-#         Interaction with LB8a:  ZWJ x ID is tricky because CM includes ZWJ.
+#         Interaction with LB8a:  ZWJ x . is tricky because CM includes ZWJ.
  #                                 ZWJ acts like a CM to the left, combining with CB.
-#                                 ZWJ acts independently to the right, no break from ID by LB8a.
-LB20:        . CM* ÷ CB;
-LB20.1a:     CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
-LB20.1b:      CB CM* ÷;
+#                                 ZWJ acts independently to the right, no break after by LB8a.
+LB20.1:      . CM* ZWJ CB;
+LB20.2:      . CM* ÷ CB;
+
+LB20.3:      CB CM* ZWJ [^CM];
+LB20.4:      CB CM* ÷;
  
  # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
  #       not picking up the continuing match after the BA from 21a.
@@ -193,15 +195,15 @@ LB29:        IS CM* (AL | HL);
  LB30.1:      (AL | CM | HL | NU) CM* OP;
  LB30.2:      CP CM* (AL | HL | NU);
  
-# LB31  keep pairs of RI together.
-LB30a.1:     RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
-LB30a.2:     RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
-LB30a.3:     RI CM* RI CM* ÷;
+# LB30a  keep pairs of RI together.
+LB30a.1:     RI CM* RI              ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2:     RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.3:     RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
  
  # LB30b Do not break between Emoji Base and Emoji Modifier
  LB30b:       EB CM* EM;
  
  # LB31 Break Everywhere Else.
  #      Include combining marks
-LB31.1:        . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
+LB31.1:        . CM* ZWJ [^CM];
  LB31.2:        . CM* ÷;
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt

index 63a244e22cdd0040e60fc1008fabfc6343d9f613..4227de8d3bb3be62f50338861a15da816cc0ba74 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt
@@ -5,7 +5,8 @@
  #
  #  file:  line_loose_cj.txt
  #
-# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
+# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
+# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
  #
  # Note: Rule syntax and the monkey test itself are still a work in progress.
  #       They are expected to change with review and the addition of support for rule tailoring.
@@ -45,7 +46,7 @@ B2 = [:LineBreak =  Break_Both:];
  CB = [:LineBreak =  Contingent_Break:];
  CJ = [:LineBreak =  Conditional_Japanese_Starter:];
  CL = [:LineBreak =  Close_Punctuation:];
-CM = [:LineBreak =  Combining_Mark:];
+CM_ = [:LineBreak =  Combining_Mark:];
  CP = [:LineBreak =  Close_Parenthesis:];
  CR = [:LineBreak =  Carriage_Return:];
  EB = [:LineBreak =  EB:];
@@ -84,16 +85,13 @@ XX = [:LineBreak =  Unknown:];
  ZW = [:LineBreak =  ZWSpace:];
  ZWJ = [:LineBreak =  ZWJ:];
  
-EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
-Extended_Pict = [:ExtPict:];
-
  # LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
  AL = [AL AI SG XX ];
  dictionary = SA;
  
  # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
  #         list it in the numerous rules that use CM.
-CM = [CM ZWJ];
+CM = [CM_ ZWJ];
  
  LB4:        BK ÷;
  LB5:        CR LF;
@@ -122,8 +120,10 @@ LB7.2:      [ZW SP] [SP ZW];
  LB8:        ZW ÷;
  
  # LB8a
-#      ZWJ x (ID | Extended_Pict | EmojiNRK)
-LB8a:       ZWJ (ID | Extended_Pict | EmojiNRK);
+#      ZWJ x
+#      Don't match a CM on the right - let other rules pick up CM sequences, where
+#      the ZWJ behaves as just another generic CM.
+LB8a:       ZWJ [^CM];
  
  
  # LB9:  X CM -> X
@@ -132,7 +132,7 @@ LB8a:       ZWJ (ID | Extended_Pict | EmojiNRK);
  #LB11:       × WJ;
  #            WJ ×
  
-LB11.1:      [^BK CR LF NL SP ZW] CM* WJ;
+LB11.1:      [^SP] CM* WJ;
  LB11.2:      SP WJ;
  LB11.3:      WJ CM* [^CM];
  
@@ -158,12 +158,14 @@ LB19:        . CM* QU;
  LB19.1:      QU CM* [^CM];
  
  # LB 20   Break before and after CB.
-#         Interaction with LB8a:  ZWJ x ID is tricky because CM includes ZWJ.
+#         Interaction with LB8a:  ZWJ x . is tricky because CM includes ZWJ.
  #                                 ZWJ acts like a CM to the left, combining with CB.
-#                                 ZWJ acts independently to the right, no break from ID by LB8a.
-LB20:        . CM* ÷ CB;
-LB20.1a:     CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
-LB20.1b:      CB CM* ÷;
+#                                 ZWJ acts independently to the right, no break after by LB8a.
+LB20.1:      . CM* ZWJ CB;
+LB20.2:      . CM* ÷ CB;
+
+LB20.3:      CB CM* ZWJ [^CM];
+LB20.4:      CB CM* ÷;
  
  # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
  #       not picking up the continuing match after the BA from 21a.
@@ -214,15 +216,15 @@ LB29:        IS CM* (AL | HL);
  LB30.1:      (AL | CM | HL | NU) CM* OP;
  LB30.2:      CP CM* (AL | HL | NU);
  
-# LB31  keep pairs of RI together.
-LB30a.1:     RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
-LB30a.2:     RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
-LB30a.3:     RI CM* RI CM* ÷;
+# LB30a  keep pairs of RI together.
+LB30a.1:     RI CM* RI              ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2:     RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.3:     RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
  
  # LB30b Do not break between Emoji Base and Emoji Modifier
  LB30b:       EB CM* EM;
  
  # LB31 Break Everywhere Else.
  #      Include combining marks
-LB31.1:        . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
+LB31.1:        . CM* ZWJ [^CM];
  LB31.2:        . CM* ÷;
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt

index 3fa5e235ba01e398c26134099a73503ccaba9c37..5952d5bc4498f770b9b9b549ed55c2e11d003536 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt
@@ -5,7 +5,8 @@
  #
  # file: line_normal.txt
  #
-# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
+# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
+# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
  #
  # Note: Rule syntax and the monkey test itself are still a work in progress.
  #       They are expected to change with review and the addition of support for rule tailoring.
@@ -38,7 +39,7 @@ B2 = [:LineBreak =  Break_Both:];
  CB = [:LineBreak =  Contingent_Break:];
  CJ = [:LineBreak =  Conditional_Japanese_Starter:];
  CL = [:LineBreak =  Close_Punctuation:];
-CM = [:LineBreak =  Combining_Mark:];
+CM_ = [:LineBreak =  Combining_Mark:];
  CP = [:LineBreak =  Close_Parenthesis:];
  CR = [:LineBreak =  Carriage_Return:];
  EB = [:LineBreak =  EB:];
@@ -73,16 +74,13 @@ XX = [:LineBreak =  Unknown:];
  ZW = [:LineBreak =  ZWSpace:];
  ZWJ = [:LineBreak =  ZWJ:];
  
-EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
-Extended_Pict = [:ExtPict:];
-
  # LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
  AL = [AL AI SG XX ];
  dictionary = SA;
  
  # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
  #         list it in the numerous rules that use CM.
-CM = [CM ZWJ];
+CM = [CM_ ZWJ];
  
  LB4:        BK ÷;
  LB5:        CR LF;
@@ -111,8 +109,10 @@ LB7.2:      [ZW SP] [SP ZW];
  LB8:        ZW ÷;
  
  # LB8a
-#      ZWJ x (ID | Extended_Pict | EmojiNRK)
-LB8a:       ZWJ (ID | Extended_Pict | EmojiNRK);
+#      ZWJ x
+#      Don't match a CM on the right - let other rules pick up CM sequences, where
+#      the ZWJ behaves as just another generic CM.
+LB8a:       ZWJ [^CM];
  
  
  # LB9:  X CM -> X
@@ -121,7 +121,7 @@ LB8a:       ZWJ (ID | Extended_Pict | EmojiNRK);
  #LB11:       × WJ;
  #            WJ ×
  
-LB11.1:      [^BK CR LF NL SP ZW] CM* WJ;
+LB11.1:      [^SP] CM* WJ;
  LB11.2:      SP WJ;
  LB11.3:      WJ CM* [^CM];
  
@@ -147,12 +147,14 @@ LB19:        . CM* QU;
  LB19.1:      QU CM* [^CM];
  
  # LB 20   Break before and after CB.
-#         Interaction with LB8a:  ZWJ x ID is tricky because CM includes ZWJ.
+#         Interaction with LB8a:  ZWJ x . is tricky because CM includes ZWJ.
  #                                 ZWJ acts like a CM to the left, combining with CB.
-#                                 ZWJ acts independently to the right, no break from ID by LB8a.
-LB20:        . CM* ÷ CB;
-LB20.1a:     CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
-LB20.1b:      CB CM* ÷;
+#                                 ZWJ acts independently to the right, no break after by LB8a.
+LB20.1:      . CM* ZWJ CB;
+LB20.2:      . CM* ÷ CB;
+
+LB20.3:      CB CM* ZWJ [^CM];
+LB20.4:      CB CM* ÷;
  
  # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
  #       not picking up the continuing match after the BA from 21a.
@@ -199,15 +201,15 @@ LB29:        IS CM* (AL | HL);
  LB30.1:      (AL | CM | HL | NU) CM* OP;
  LB30.2:      CP CM* (AL | HL | NU);
  
-# LB31  keep pairs of RI together.
-LB30a.1:     RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
-LB30a.2:     RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
-LB30a.3:     RI CM* RI CM* ÷;
+# LB30a  keep pairs of RI together.
+LB30a.1:     RI CM* RI              ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2:     RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.3:     RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
  
  # LB30b Do not break between Emoji Base and Emoji Modifier
  LB30b:       EB CM* EM;
  
  # LB31 Break Everywhere Else.
  #      Include combining marks
-LB31.1:        . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
+LB31.1:        . CM* ZWJ [^CM];
  LB31.2:        . CM* ÷;
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt

index 5a47d8ea7c1b894e68594bb674ac68bb225426cc..0a1772e5d35a55924d581bfaf4a47d7047005fc1 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt
@@ -5,7 +5,8 @@
  #
  #  file:  line_normal_cj.txt
  #
-# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
+# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
+# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
  #
  # Note: Rule syntax and the monkey test itself are still a work in progress.
  #       They are expected to change with review and the addition of support for rule tailoring.
@@ -39,7 +40,7 @@ B2 = [:LineBreak =  Break_Both:];
  CB = [:LineBreak =  Contingent_Break:];
  CJ = [:LineBreak =  Conditional_Japanese_Starter:];
  CL = [:LineBreak =  Close_Punctuation:];
-CM = [:LineBreak =  Combining_Mark:];
+CM_ = [:LineBreak =  Combining_Mark:];
  CP = [:LineBreak =  Close_Parenthesis:];
  CR = [:LineBreak =  Carriage_Return:];
  EB = [:LineBreak =  EB:];
@@ -75,16 +76,13 @@ XX = [:LineBreak =  Unknown:];
  ZW = [:LineBreak =  ZWSpace:];
  ZWJ = [:LineBreak =  ZWJ:];
  
-EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
-Extended_Pict = [:ExtPict:];
-
  # LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
  AL = [AL AI SG XX ];
  dictionary = SA;
  
  # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
  #         list it in the numerous rules that use CM.
-CM = [CM ZWJ];
+CM = [CM_ ZWJ];
  
  LB4:        BK ÷;
  LB5:        CR LF;
@@ -116,8 +114,10 @@ LB7.2:      [ZW SP] [SP ZW];
  LB8:        ZW ÷;
  
  # LB8a
-#      ZWJ x (ID | Extended_Pict | EmojiNRK)
-LB8a:       ZWJ (ID | Extended_Pict | EmojiNRK);
+#      ZWJ x
+#      Don't match a CM on the right - let other rules pick up CM sequences, where
+#      the ZWJ behaves as just another generic CM.
+LB8a:       ZWJ [^CM];
  
  
  # LB9:  X CM -> X
@@ -126,7 +126,7 @@ LB8a:       ZWJ (ID | Extended_Pict | EmojiNRK);
  #LB11:       × WJ;
  #            WJ ×
  
-LB11.1:      [^BK CR LF NL SP ZW] CM* WJ;
+LB11.1:      [^SP] CM* WJ;
  LB11.2:      SP WJ;
  LB11.3:      WJ CM* [^CM];
  
@@ -152,12 +152,14 @@ LB19:        . CM* QU;
  LB19.1:      QU CM* [^CM];
  
  # LB 20   Break before and after CB.
-#         Interaction with LB8a:  ZWJ x ID is tricky because CM includes ZWJ.
+#         Interaction with LB8a:  ZWJ x . is tricky because CM includes ZWJ.
  #                                 ZWJ acts like a CM to the left, combining with CB.
-#                                 ZWJ acts independently to the right, no break from ID by LB8a.
-LB20:        . CM* ÷ CB;
-LB20.1a:     CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
-LB20.1b:      CB CM* ÷;
+#                                 ZWJ acts independently to the right, no break after by LB8a.
+LB20.1:      . CM* ZWJ CB;
+LB20.2:      . CM* ÷ CB;
+
+LB20.3:      CB CM* ZWJ [^CM];
+LB20.4:      CB CM* ÷;
  
  # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
  #       not picking up the continuing match after the BA from 21a.
@@ -208,15 +210,15 @@ LB29:        IS CM* (AL | HL);
  LB30.1:      (AL | CM | HL | NU) CM* OP;
  LB30.2:      CP CM* (AL | HL | NU);
  
-# LB31  keep pairs of RI together.
-LB30a.1:     RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
-LB30a.2:     RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
-LB30a.3:     RI CM* RI CM* ÷;
+# LB30a  keep pairs of RI together.
+LB30a.1:     RI CM* RI              ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2:     RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.3:     RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
  
  # LB30b Do not break between Emoji Base and Emoji Modifier
  LB30b:       EB CM* EM;
  
  # LB31 Break Everywhere Else.
  #      Include combining marks
-LB31.1:        . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
+LB31.1:        . CM* ZWJ [^CM];
  LB31.2:        . CM* ÷;
author	Andy Heninger <andy.heninger@gmail.com>
	Mon, 21 May 2018 22:40:17 +0000 (22:40 +0000)
committer	Andy Heninger <andy.heninger@gmail.com>
	Mon, 21 May 2018 22:40:17 +0000 (22:40 +0000)
icu4j/main/shared/data/icudata.jar		patch \| blob \| history
icu4j/main/shared/data/icutzdata.jar		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line.txt		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt		patch \| blob \| history