ICU-7270 Line Break rule LB8 updated for UAX-14 conformance. (#41)

author Andy Heninger <andy.heninger@gmail.com>

Thu, 9 Aug 2018 18:28:55 +0000 (11:28 -0700)

committer Shane Carr <shane@unicode.org>

Thu, 27 Sep 2018 21:27:38 +0000 (14:27 -0700)
author Andy Heninger <andy.heninger@gmail.com>
Thu, 9 Aug 2018 18:28:55 +0000 (11:28 -0700)
committer Shane Carr <shane@unicode.org>
Thu, 27 Sep 2018 21:27:38 +0000 (14:27 -0700)
diff --git a/icu4c/source/common/rbbirb.cpp b/icu4c/source/common/rbbirb.cpp

index a4b9a718682540b0ed0b91f002e42db010d51a10..08c577696c22549f905b0b890c083f6f418436c9 100644 (file)
--- a/icu4c/source/common/rbbirb.cpp
+++ b/icu4c/source/common/rbbirb.cpp
@@ -303,17 +303,24 @@ RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
  }
  
  void RBBIRuleBuilder::optimizeTables() {
+    bool didSomething;
+    do {
+        didSomething = false;
+
+        // Begin looking for duplicates with char class 3.
+        // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
+        // and should not have other categories merged into them.
+        IntPair duplPair = {3, 0};
+        while (fForwardTable->findDuplCharClassFrom(&duplPair)) {
+            fSetBuilder->mergeCategories(duplPair);
+            fForwardTable->removeColumn(duplPair.second);
+            didSomething = true;
+        }
  
-    // Begin looking for duplicates with char class 3.
-    // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
-    // and should not have other categories merged into them.
-    IntPair duplPair = {3, 0};
-
-    while (fForwardTable->findDuplCharClassFrom(&duplPair)) {
-        fSetBuilder->mergeCategories(duplPair);
-        fForwardTable->removeColumn(duplPair.second);
-    }
-    fForwardTable->removeDuplicateStates();
+        while (fForwardTable->removeDuplicateStates() > 0) {
+            didSomething = true;
+        }
+    } while (didSomething);
  }
  
  U_NAMESPACE_END
diff --git a/icu4c/source/common/rbbitblb.cpp b/icu4c/source/common/rbbitblb.cpp

index 8a6f7c792f33b9339125460f6459515c1b44ff4a..18da5231b97be5a061ddcdd82e3d6b7f0dcf95cb 100644 (file)
--- a/icu4c/source/common/rbbitblb.cpp
+++ b/icu4c/source/common/rbbitblb.cpp
@@ -1245,12 +1245,16 @@ void RBBITableBuilder::removeSafeState(IntPair duplStates) {
  /*
   * RemoveDuplicateStates
   */
-void RBBITableBuilder::removeDuplicateStates() {
+int32_t RBBITableBuilder::removeDuplicateStates() {
      IntPair dupls = {3, 0};
+    int32_t numStatesRemoved = 0;
+
      while (findDuplicateState(&dupls)) {
          // printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
          removeState(dupls);
+        ++numStatesRemoved;
      }
+    return numStatesRemoved;
  }
  
  
diff --git a/icu4c/source/common/rbbitblb.h b/icu4c/source/common/rbbitblb.h

index eea243e4cdd6c36bef958838cc2c3f5997729bfc..844f7ecaab0370af309b5ca0a6324bea61d9a22b 100644 (file)
--- a/icu4c/source/common/rbbitblb.h
+++ b/icu4c/source/common/rbbitblb.h
@@ -66,8 +66,11 @@ public:
       */
      void     removeColumn(int32_t column);
  
-    /** Check for, and remove dupicate states (table rows). */
-    void     removeDuplicateStates();
+    /**
+     * Check for, and remove duplicate states (table rows).
+     * @return the number of states removed.
+     */
+    int32_t  removeDuplicateStates();
  
      /** Build the safe reverse table from the already-constructed forward table. */
      void     buildSafeReverseTable(UErrorCode &status);
diff --git a/icu4c/source/data/brkitr/rules/line.txt b/icu4c/source/data/brkitr/rules/line.txt

index cf945b46f579d902de41fbca392615933d8f165d..9ad81e6fc7da4158d875cf1b074e70e824a5e675 100644 (file)
--- a/icu4c/source/data/brkitr/rules/line.txt
+++ b/icu4c/source/data/brkitr/rules/line.txt
@@ -132,12 +132,11 @@ $CAN_CM $CM*  [$SP $ZW];
  
  #
  # LB 8         Break after zero width space
-#              TODO:  ZW SP* <break>
-#              An engine change is required to write the reverse rule for this.
-#              For now, leave the Unicode 5.2 rule, ZW <break>
+#              ZW SP* ÷
  #
  $LB8Breaks    = [$LB4Breaks $ZW];
  $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+$ZW $SP* / [^$SP $ZW $LB4Breaks];
  
  # LB 8a        ZWJ x            Do not break Emoji ZWJ sequences.
  #
diff --git a/icu4c/source/data/brkitr/rules/line_fi.txt b/icu4c/source/data/brkitr/rules/line_fi.txt

index 92eefffd91be01060e6ad9ff3950d6bc6a657d63..9c26945e58066437190cc702638e1fa6b1db9259 100644 (file)
--- a/icu4c/source/data/brkitr/rules/line_fi.txt
+++ b/icu4c/source/data/brkitr/rules/line_fi.txt
@@ -138,12 +138,11 @@ $CAN_CM $CM*  [$SP $ZW];
  
  #
  # LB 8         Break after zero width space
-#              TODO:  ZW SP* <break>
-#              An engine change is required to write the reverse rule for this.
-#              For now, leave the Unicode 5.2 rule, ZW <break>
+#              ZW SP* ÷
  #
  $LB8Breaks    = [$LB4Breaks $ZW];
  $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+$ZW $SP* / [^$SP $ZW $LB4Breaks];
  
  # LB 8a        ZWJ x            Do not break Emoji ZWJ sequences.
  #
diff --git a/icu4c/source/data/brkitr/rules/line_loose.txt b/icu4c/source/data/brkitr/rules/line_loose.txt

index 5e8f7a167030dd5de02ad4c4038d0e79a113fc6d..2d72fdfa90742b69ab0aa8fd1791a32439aba820 100644 (file)
--- a/icu4c/source/data/brkitr/rules/line_loose.txt
+++ b/icu4c/source/data/brkitr/rules/line_loose.txt
@@ -141,12 +141,11 @@ $CAN_CM $CM*  [$SP $ZW];
  
  #
  # LB 8         Break after zero width space
-#              TODO:  ZW SP* <break>
-#              An engine change is required to write the reverse rule for this.
-#              For now, leave the Unicode 5.2 rule, ZW <break>
+#              ZW SP* ÷
  #
  $LB8Breaks    = [$LB4Breaks $ZW];
  $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+$ZW $SP* / [^$SP $ZW $LB4Breaks];
  
  # LB 8a        ZWJ x            Do not break Emoji ZWJ sequences.
  #
diff --git a/icu4c/source/data/brkitr/rules/line_loose_cj.txt b/icu4c/source/data/brkitr/rules/line_loose_cj.txt

index 651924120a98c1cf6686194b2d7b629635fbb6ca..024e68ebc77ce161575e5c7c4ea12aa2f237fc4e 100644 (file)
--- a/icu4c/source/data/brkitr/rules/line_loose_cj.txt
+++ b/icu4c/source/data/brkitr/rules/line_loose_cj.txt
@@ -151,12 +151,11 @@ $CAN_CM $CM*  [$SP $ZW];
  
  #
  # LB 8         Break after zero width space
-#              TODO:  ZW SP* <break>
-#              An engine change is required to write the reverse rule for this.
-#              For now, leave the Unicode 5.2 rule, ZW <break>
+#              ZW SP* ÷
  #
  $LB8Breaks    = [$LB4Breaks $ZW];
  $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+$ZW $SP* / [^$SP $ZW $LB4Breaks];
  
  # LB 8a        ZWJ x            Do not break Emoji ZWJ sequences.
  #
diff --git a/icu4c/source/data/brkitr/rules/line_loose_fi.txt b/icu4c/source/data/brkitr/rules/line_loose_fi.txt

index 3775b7f9ac9632f0e7add415b29f285a224a8b8a..0c34b00cf38a7b63c308f88bc269cd756bde9031 100644 (file)
--- a/icu4c/source/data/brkitr/rules/line_loose_fi.txt
+++ b/icu4c/source/data/brkitr/rules/line_loose_fi.txt
@@ -137,12 +137,11 @@ $CAN_CM $CM*  [$SP $ZW];
  
  #
  # LB 8         Break after zero width space
-#              TODO:  ZW SP* <break>
-#              An engine change is required to write the reverse rule for this.
-#              For now, leave the Unicode 5.2 rule, ZW <break>
+#              ZW SP* ÷
  #
  $LB8Breaks    = [$LB4Breaks $ZW];
  $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+$ZW $SP* / [^$SP $ZW $LB4Breaks];
  
  # LB 8a        ZWJ x            Do not break Emoji ZWJ sequences.
  #
diff --git a/icu4c/source/data/brkitr/rules/line_normal.txt b/icu4c/source/data/brkitr/rules/line_normal.txt

index 5bfbe8e8d9e58e89a1cb8bfdf13d08319f55df39..b2472177e497430f2ce4634e7b0b8eff81ff720f 100644 (file)
--- a/icu4c/source/data/brkitr/rules/line_normal.txt
+++ b/icu4c/source/data/brkitr/rules/line_normal.txt
@@ -136,12 +136,11 @@ $CAN_CM $CM*  [$SP $ZW];
  
  #
  # LB 8         Break after zero width space
-#              TODO:  ZW SP* <break>
-#              An engine change is required to write the reverse rule for this.
-#              For now, leave the Unicode 5.2 rule, ZW <break>
+#              ZW SP* ÷
  #
  $LB8Breaks    = [$LB4Breaks $ZW];
  $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+$ZW $SP* / [^$SP $ZW $LB4Breaks];
  
  # LB 8a        ZWJ x            Do not break Emoji ZWJ sequences.
  #
diff --git a/icu4c/source/data/brkitr/rules/line_normal_cj.txt b/icu4c/source/data/brkitr/rules/line_normal_cj.txt

index 4576b205e1563bcea24425cd9a44114e605fa457..b4fcf029e727fc51f531dc9bf16ad57e13a9b2bd 100644 (file)
--- a/icu4c/source/data/brkitr/rules/line_normal_cj.txt
+++ b/icu4c/source/data/brkitr/rules/line_normal_cj.txt
@@ -139,12 +139,11 @@ $CAN_CM $CM*  [$SP $ZW];
  
  #
  # LB 8         Break after zero width space
-#              TODO:  ZW SP* <break>
-#              An engine change is required to write the reverse rule for this.
-#              For now, leave the Unicode 5.2 rule, ZW <break>
+#              ZW SP* ÷
  #
  $LB8Breaks    = [$LB4Breaks $ZW];
  $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+$ZW $SP* / [^$SP $ZW $LB4Breaks];
  
  # LB 8a        ZWJ x            Do not break Emoji ZWJ sequences.
  #
diff --git a/icu4c/source/data/brkitr/rules/line_normal_fi.txt b/icu4c/source/data/brkitr/rules/line_normal_fi.txt

index b5efc152aeabf0174eab59b0a28f33ada5cdc6e2..a3eccf2c5b6325b9a6b3a22b4751215a1763df2e 100644 (file)
--- a/icu4c/source/data/brkitr/rules/line_normal_fi.txt
+++ b/icu4c/source/data/brkitr/rules/line_normal_fi.txt
@@ -136,12 +136,11 @@ $CAN_CM $CM*  [$SP $ZW];
  
  #
  # LB 8         Break after zero width space
-#              TODO:  ZW SP* <break>
-#              An engine change is required to write the reverse rule for this.
-#              For now, leave the Unicode 5.2 rule, ZW <break>
+#              ZW SP* ÷
  #
  $LB8Breaks    = [$LB4Breaks $ZW];
  $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+$ZW $SP* / [^$SP $ZW $LB4Breaks];
  
  # LB 8a        ZWJ x            Do not break Emoji ZWJ sequences.
  #
diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp

index 5463dad247181dd96ddc4607b3bb168d216af527..acf6a57779cfd8d5ec5bf9e46de6cd0a02a1665c 100644 (file)
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@@ -1283,35 +1283,28 @@ void RBBITest::TestUnicodeFiles() {
  
  
  // Check for test cases from the Unicode test data files that are known to fail
-// and should be skipped because ICU is not yet able to fully implement the spec.
-// See ticket #7270.
+// and should be skipped as known issues because ICU does not fully implement
+// the Unicode specifications.
+//
+// Test cases are identified by the test data sequence, which tends to be more stable
+// across Unicode versions than the test file line numbers.
+//
+// The test case with ticket "10666" is a dummy, included as an example.
  
  UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
      static struct TestCase {
+        const char *fTicketNum;
          const char *fFileName;
          const UChar *fString;
-    } badTestCases[] = {                                // Line Numbers from Unicode 7.0.0 file.
-        {"LineBreakTest.txt", u"\u200B\u0020}"},        // Line 5198
-        {"LineBreakTest.txt", u"\u200B\u0020)"},        // Line 5202
-        {"LineBreakTest.txt", u"\u200B\u0020!"},        // Line 5214
-        {"LineBreakTest.txt", u"\u200B\u0020,"},        // Line 5246
-        {"LineBreakTest.txt", u"\u200B\u0020/"},        // Line 5298
-        {"LineBreakTest.txt", u"\u200B\u0020\u2060"},   // Line 5302
-                                                        // Line Numbers from pre-release verion of GraphemeBreakTest-10.0.0.txt
-        {"GraphemeBreakTest.txt", u"\u200D\u2640"},     // Line 656, old GB 11 test ZWJ x GAZ
-        {"GraphemeBreakTest.txt", u"\u200D\U0001F466"}, // Line 658, old GB 11 test ZWJ x EBG
-        {"GraphemeBreakTest.txt", u"\u200D\U0001F466\U0001F3FB"}, // Line 842, old GB 11 test ZWJ x EBG x EModifier
-
-                                                        // Line Numbers from pre-release verion of WordBreakTest-10.0.0.txt
-        {"WordBreakTest.txt", u"\u200D\u261D"},         // Line 1356, ZWJ x EmojiNRK
-        {"WordBreakTest.txt", u"\u200D\U0001F3FB"},     // Line 1358, ZWJ x EmojiNRK
+    } badTestCases[] = {
+        {"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"}    // Fake example, for illustration.
      };
  
      for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
          const TestCase &badCase = badTestCases[n];
          if (!strcmp(fileName, badCase.fFileName) &&
                  testCase == UnicodeString(badCase.fString)) {
-            return logKnownIssue("7270");
+            return logKnownIssue(badCase.fTicketNum);
          }
      }
      return FALSE;
@@ -2550,7 +2543,7 @@ private:
      UnicodeSet  *fXX;
      UnicodeSet  *fEB;
      UnicodeSet  *fEM;
-    UnicodeSet  *fZJ;
+    UnicodeSet  *fZWJ;
  
      BreakIterator        *fCharBI;
      const UnicodeString  *fText;
@@ -2615,7 +2608,7 @@ RBBILineMonkey::RBBILineMonkey() :
      fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
      fEB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EB}]"), status);
      fEM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
-    fZJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
+    fZWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
  
      if (U_FAILURE(status)) {
          deferredStatus = status;
@@ -2627,7 +2620,7 @@ RBBILineMonkey::RBBILineMonkey() :
      fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
  
      fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
-    fCM->addAll(*fZJ);     // ZWJ behaves as a CM.
+    fCM->addAll(*fZWJ);     // ZWJ behaves as a CM.
  
      fSets->addElement(fBK, status);
      fSets->addElement(fCR, status);
@@ -2669,7 +2662,7 @@ RBBILineMonkey::RBBILineMonkey() :
      fSets->addElement(fSG, status);
      fSets->addElement(fEB, status);
      fSets->addElement(fEM, status);
-    fSets->addElement(fZJ, status);
+    fSets->addElement(fZWJ, status);
  
  
      const char *rules =
@@ -2853,7 +2846,13 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
          }
  
          // LB 8  Break after zero width space
-        if (fZW->contains(prevChar)) {
+        //       ZW SP* ÷
+        //       Scan backwards from prevChar for SP* ZW
+        tPos = prevPos;
+        while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
+            tPos = fText->moveIndex32(tPos, -1);
+        }
+        if (fZW->contains(fText->char32At(tPos))) {
              break;
          }
  
@@ -2890,7 +2889,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
          {
              int32_t prevIdx = fText->moveIndex32(pos, -1);
              UChar32 prevC = fText->char32At(prevIdx);
-            if (fZJ->contains(prevC)) {
+            if (fZWJ->contains(prevC)) {
                  continue;
              }
          }
@@ -3148,12 +3147,16 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
              continue;
          }
  
-        // LB30a    RI RI <break> RI
-        //             RI    x    RI
+        // LB30a    RI RI  ÷  RI
+        //             RI  x  RI
          if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
              break;
          }
          if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
+            // Two Regional Indicators have been paired.
+            // Over-write the trailing one (thisChar) to prevent it from forming another pair with a
+            // following RI. This is a hack.
+            thisChar = -1;
              continue;
          }
  
@@ -3220,7 +3223,7 @@ RBBILineMonkey::~RBBILineMonkey() {
      delete fXX;
      delete fEB;
      delete fEM;
-    delete fZJ;
+    delete fZWJ;
  
      delete fCharBI;
      delete fNumberMatcher;
diff --git a/icu4c/source/test/testdata/break_rules/line.txt b/icu4c/source/test/testdata/break_rules/line.txt

index 31d27b90dd4531ec586c050ca1c41be35e017e27..d0f9abe88a1edb32dd75a04f279b0ba698d710a8 100644 (file)
--- a/icu4c/source/test/testdata/break_rules/line.txt
+++ b/icu4c/source/test/testdata/break_rules/line.txt
@@ -25,7 +25,7 @@ B2 = [:LineBreak =  Break_Both:];
  CB = [:LineBreak =  Contingent_Break:];
  CJ = [:LineBreak =  Conditional_Japanese_Starter:];
  CL = [:LineBreak =  Close_Punctuation:];
-CM = [:LineBreak =  Combining_Mark:];
+CMS = [:LineBreak =  Combining_Mark:];
  CP = [:LineBreak =  Close_Parenthesis:];
  CR = [:LineBreak =  Carriage_Return:];
  EB = [:LineBreak =  EB:];
@@ -66,7 +66,7 @@ dictionary = SA;
  
  # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
  #         list it in the numerous rules that use CM.
-CM = [CM ZWJ];
+CM = [CMS ZWJ];
  
  LB4:        BK ÷;
  LB5:        CR LF;
@@ -86,14 +86,16 @@ LB15:        QU CM* SP* OP;
  LB16:        (CL | CP)CM* SP* NS;
  LB17:        B2 CM* SP* B2;
  
+# LB8, break after ZW SP*, precedes LB7 because both rules match sequences like ZW SP,
+# and LB8 should take precedence.
+
+LB8:        ZW SP* ÷ [^ZW SP BK CR LF NL];
+
+# LB7 Do not break before spaces or zero width space.
+
  LB7.1:      [^ZW SP] CM* [SP ZW];
  LB7.2:      [ZW SP] [SP ZW];
  
-# LB8, ICU differs from UAX-14,
-#    ICU:    ZW ÷;
-#    UAX 14: ZW SP* ÷;
-LB8:        ZW ÷;
-
  # LB8a
  #      ZWJ x
  #      Don't match a CM on the right - let other rules pick up CM sequences, where
@@ -188,8 +190,8 @@ LB30.1:      (AL | CM | HL | NU) CM* OP;
  LB30.2:      CP CM* (AL | HL | NU);
  
  # LB30a  keep pairs of RI together.
-LB30a.1:     RI CM* RI              ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
-LB30a.2:     RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.1:     RI CM* RI         ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2:     RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
  LB30a.3:     RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
  
  # LB30b Do not break between Emoji Base and Emoji Modifier
diff --git a/icu4c/source/test/testdata/break_rules/line_loose.txt b/icu4c/source/test/testdata/break_rules/line_loose.txt

index 00552fec7c06dd4c4fd3fea30acb4698edce42f4..2384fa296c87c8857b1ffd7812efcd3f8fc11795 100644 (file)
--- a/icu4c/source/test/testdata/break_rules/line_loose.txt
+++ b/icu4c/source/test/testdata/break_rules/line_loose.txt
@@ -32,7 +32,7 @@ B2 = [:LineBreak =  Break_Both:];
  CB = [:LineBreak =  Contingent_Break:];
  CJ = [:LineBreak =  Conditional_Japanese_Starter:];
  CL = [:LineBreak =  Close_Punctuation:];
-CM = [:LineBreak =  Combining_Mark:];
+CMS = [:LineBreak =  Combining_Mark:];
  CP = [:LineBreak =  Close_Parenthesis:];
  CR = [:LineBreak =  Carriage_Return:];
  EB = [:LineBreak =  EB:];
@@ -74,7 +74,7 @@ dictionary = SA;
  
  # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
  #         list it in the numerous rules that use CM.
-CM = [CM ZWJ];
+CM = [CMS ZWJ];
  
  LB4:        BK ÷;
  LB5:        CR LF;
@@ -94,14 +94,16 @@ LB15:        QU CM* SP* OP;
  LB16:        (CL | CP)CM* SP* NS;
  LB17:        B2 CM* SP* B2;
  
+# LB8, break after ZW SP*, precedes LB7 because both rules match sequences like ZW SP,
+# and LB8 should take precedence.
+
+LB8:        ZW SP* ÷ [^ZW SP BK CR LF NL];
+
+# LB7 Do not break before spaces or zero width space.
+
  LB7.1:      [^ZW SP] CM* [SP ZW];
  LB7.2:      [ZW SP] [SP ZW];
  
-# LB8, ICU differs from UAX-14,
-#    ICU:    ZW ÷;
-#    UAX 14: ZW SP* ÷;
-LB8:        ZW ÷;
-
  # LB8a
  #      ZWJ x
  #      Don't match a CM on the right - let other rules pick up CM sequences, where
@@ -196,8 +198,8 @@ LB30.1:      (AL | CM | HL | NU) CM* OP;
  LB30.2:      CP CM* (AL | HL | NU);
  
  # LB30a  keep pairs of RI together.
-LB30a.1:     RI CM* RI              ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
-LB30a.2:     RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.1:     RI CM* RI         ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2:     RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
  LB30a.3:     RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
  
  # LB30b Do not break between Emoji Base and Emoji Modifier
diff --git a/icu4c/source/test/testdata/break_rules/line_loose_cj.txt b/icu4c/source/test/testdata/break_rules/line_loose_cj.txt

index 59c21fd3d9a37de1fffe62fdc0235d7672ce68c3..8b92561dbd8fa6b49a0286eb643b038ecc3860d7 100644 (file)
--- a/icu4c/source/test/testdata/break_rules/line_loose_cj.txt
+++ b/icu4c/source/test/testdata/break_rules/line_loose_cj.txt
@@ -46,7 +46,7 @@ B2 = [:LineBreak =  Break_Both:];
  CB = [:LineBreak =  Contingent_Break:];
  CJ = [:LineBreak =  Conditional_Japanese_Starter:];
  CL = [:LineBreak =  Close_Punctuation:];
-CM = [:LineBreak =  Combining_Mark:];
+CMS = [:LineBreak =  Combining_Mark:];
  CP = [:LineBreak =  Close_Parenthesis:];
  CR = [:LineBreak =  Carriage_Return:];
  EB = [:LineBreak =  EB:];
@@ -91,7 +91,7 @@ dictionary = SA;
  
  # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
  #         list it in the numerous rules that use CM.
-CM = [CM ZWJ];
+CM = [CMS ZWJ];
  
  LB4:        BK ÷;
  LB5:        CR LF;
@@ -111,14 +111,16 @@ LB15:        QU CM* SP* OP;
  LB16:        (CL | CP)CM* SP* NS;
  LB17:        B2 CM* SP* B2;
  
+# LB8, break after ZW SP*, precedes LB7 because both rules match sequences like ZW SP,
+# and LB8 should take precedence.
+
+LB8:        ZW SP* ÷ [^ZW SP BK CR LF NL];
+
+# LB7 Do not break before spaces or zero width space.
+
  LB7.1:      [^ZW SP] CM* [SP ZW];
  LB7.2:      [ZW SP] [SP ZW];
  
-# LB8, ICU differs from UAX-14,
-#    ICU:    ZW ÷;
-#    UAX 14: ZW SP* ÷;
-LB8:        ZW ÷;
-
  # LB8a
  #      ZWJ x
  #      Don't match a CM on the right - let other rules pick up CM sequences, where
@@ -217,8 +219,8 @@ LB30.1:      (AL | CM | HL | NU) CM* OP;
  LB30.2:      CP CM* (AL | HL | NU);
  
  # LB30a  keep pairs of RI together.
-LB30a.1:     RI CM* RI              ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
-LB30a.2:     RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.1:     RI CM* RI         ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2:     RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
  LB30a.3:     RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
  
  # LB30b Do not break between Emoji Base and Emoji Modifier
diff --git a/icu4c/source/test/testdata/break_rules/line_normal.txt b/icu4c/source/test/testdata/break_rules/line_normal.txt

index 7cf2613a85511feac0a59cad20846375cad7cc9c..65804d83f9bf787a2d561e8f670929c7f40dc584 100644 (file)
--- a/icu4c/source/test/testdata/break_rules/line_normal.txt
+++ b/icu4c/source/test/testdata/break_rules/line_normal.txt
@@ -39,7 +39,7 @@ B2 = [:LineBreak =  Break_Both:];
  CB = [:LineBreak =  Contingent_Break:];
  CJ = [:LineBreak =  Conditional_Japanese_Starter:];
  CL = [:LineBreak =  Close_Punctuation:];
-CM = [:LineBreak =  Combining_Mark:];
+CMS = [:LineBreak =  Combining_Mark:];
  CP = [:LineBreak =  Close_Parenthesis:];
  CR = [:LineBreak =  Carriage_Return:];
  EB = [:LineBreak =  EB:];
@@ -80,7 +80,7 @@ dictionary = SA;
  
  # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
  #         list it in the numerous rules that use CM.
-CM = [CM ZWJ];
+CM = [CMS ZWJ];
  
  LB4:        BK ÷;
  LB5:        CR LF;
@@ -100,14 +100,16 @@ LB15:        QU CM* SP* OP;
  LB16:        (CL | CP)CM* SP* NS;
  LB17:        B2 CM* SP* B2;
  
+# LB8, break after ZW SP*, precedes LB7 because both rules match sequences like ZW SP,
+# and LB8 should take precedence.
+
+LB8:        ZW SP* ÷ [^ZW SP BK CR LF NL];
+
+# LB7 Do not break before spaces or zero width space.
+
  LB7.1:      [^ZW SP] CM* [SP ZW];
  LB7.2:      [ZW SP] [SP ZW];
  
-# LB8, ICU differs from UAX-14,
-#    ICU:    ZW ÷;
-#    UAX 14: ZW SP* ÷;
-LB8:        ZW ÷;
-
  # LB8a
  #      ZWJ x
  #      Don't match a CM on the right - let other rules pick up CM sequences, where
@@ -202,8 +204,8 @@ LB30.1:      (AL | CM | HL | NU) CM* OP;
  LB30.2:      CP CM* (AL | HL | NU);
  
  # LB30a  keep pairs of RI together.
-LB30a.1:     RI CM* RI              ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
-LB30a.2:     RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.1:     RI CM* RI         ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2:     RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
  LB30a.3:     RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
  
  # LB30b Do not break between Emoji Base and Emoji Modifier
diff --git a/icu4c/source/test/testdata/break_rules/line_normal_cj.txt b/icu4c/source/test/testdata/break_rules/line_normal_cj.txt

index 3af4a582db1cd72225cf9a21b483e8e74e1e2b61..b50219282b26c9a131ef577f07e40c455deb1fcd 100644 (file)
--- a/icu4c/source/test/testdata/break_rules/line_normal_cj.txt
+++ b/icu4c/source/test/testdata/break_rules/line_normal_cj.txt
@@ -40,7 +40,7 @@ B2 = [:LineBreak =  Break_Both:];
  CB = [:LineBreak =  Contingent_Break:];
  CJ = [:LineBreak =  Conditional_Japanese_Starter:];
  CL = [:LineBreak =  Close_Punctuation:];
-CM = [:LineBreak =  Combining_Mark:];
+CMS = [:LineBreak =  Combining_Mark:];
  CP = [:LineBreak =  Close_Parenthesis:];
  CR = [:LineBreak =  Carriage_Return:];
  EB = [:LineBreak =  EB:];
@@ -82,7 +82,7 @@ dictionary = SA;
  
  # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
  #         list it in the numerous rules that use CM.
-CM = [CM ZWJ];
+CM = [CMS ZWJ];
  
  LB4:        BK ÷;
  LB5:        CR LF;
@@ -105,14 +105,16 @@ LB15:        QU CM* SP* OP;
  LB16:        (CL | CP)CM* SP* NS;
  LB17:        B2 CM* SP* B2;
  
+# LB8, break after ZW SP*, precedes LB7 because both rules match sequences like ZW SP,
+# and LB8 should take precedence.
+
+LB8:        ZW SP* ÷ [^ZW SP BK CR LF NL];
+
+# LB7 Do not break before spaces or zero width space.
+
  LB7.1:      [^ZW SP] CM* [SP ZW];
  LB7.2:      [ZW SP] [SP ZW];
  
-# LB8, ICU differs from UAX-14,
-#    ICU:    ZW ÷;
-#    UAX 14: ZW SP* ÷;
-LB8:        ZW ÷;
-
  # LB8a
  #      ZWJ x
  #      Don't match a CM on the right - let other rules pick up CM sequences, where
@@ -211,8 +213,8 @@ LB30.1:      (AL | CM | HL | NU) CM* OP;
  LB30.2:      CP CM* (AL | HL | NU);
  
  # LB30a  keep pairs of RI together.
-LB30a.1:     RI CM* RI              ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
-LB30a.2:     RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.1:     RI CM* RI         ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2:     RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
  LB30a.3:     RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
  
  # LB30b Do not break between Emoji Base and Emoji Modifier
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java

index f83b295146b3e22e98cf3d267a88ccfe4e1d29e6..293856471f49cf8c640bfacae90136951262583f 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java
@@ -331,14 +331,21 @@ class RBBIRuleBuilder {
      }
  
      void optimizeTables() {
-        // Begin looking for duplicates with char class 3.
-        // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
-        // and should not have other categories merged into them.
-        IntPair duplPair = new IntPair(3, 0);
-        while (fForwardTable.findDuplCharClassFrom(duplPair)) {
-            fSetBuilder.mergeCategories(duplPair);
-            fForwardTable.removeColumn(duplPair.second);
-        }
-        fForwardTable.removeDuplicateStates();
+        boolean didSomething;
+        do {
+            didSomething = false;
+            // Begin looking for duplicates with char class 3.
+            // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
+            // and should not have other categories merged into them.
+            IntPair duplPair = new IntPair(3, 0);
+            while (fForwardTable.findDuplCharClassFrom(duplPair)) {
+                fSetBuilder.mergeCategories(duplPair);
+                fForwardTable.removeColumn(duplPair.second);
+                didSomething = true;
+            }
+            while (fForwardTable.removeDuplicateStates() > 0) {
+                didSomething = true;
+            };
+        } while (didSomething);
      }
  }
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java

index dad7aacd8d8ea7543a67a4fde672c7ca62177faa..9ccafe8d7ffa5cffd1a7a722c02a393911f7e9c7 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java
@@ -1032,14 +1032,19 @@ class RBBITableBuilder {
  
         /**
          *  Check for, and remove duplicate states (table rows).
+        *  @return the number of states removed.
          *  @internal
          */
-       void removeDuplicateStates() {
+       int removeDuplicateStates() {
             IntPair dupls = new IntPair(3, 0);
+           int numStatesRemoved = 0;
+
             while (findDuplicateState(dupls)) {
                 // System.out.printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
                 removeState(dupls);
+               ++numStatesRemoved;
             }
+           return numStatesRemoved;
         }
  
  
diff --git a/icu4j/main/shared/data/icudata.jar b/icu4j/main/shared/data/icudata.jar

index 15fb7409ef6f7252cb47d3674b45e6f08476621a..93c64fdb11fb491677b3d03edfc5122c7e5d2d32 100755 (executable)
--- a/icu4j/main/shared/data/icudata.jar
+++ b/icu4j/main/shared/data/icudata.jar
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
-oid sha256:2cb8f12bbfbffe8a36d10f9d227668fb5468ccee6380b990d41cfa81e34ef2e0
-size 12508534
+oid sha256:70c249360d5cc010c75203f5add8040cbcc4f33229e1d82d34b6185d69832143
+size 12510210
diff --git a/icu4j/main/shared/data/icutzdata.jar b/icu4j/main/shared/data/icutzdata.jar

index 5aed7afb443bcc8efbe579744c307ba7b69c7600..8b02fe62204a96c2a0b41168879ab750887ef3cf 100755 (executable)
--- a/icu4j/main/shared/data/icutzdata.jar
+++ b/icu4j/main/shared/data/icutzdata.jar
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
-oid sha256:c2fa72ee8523fcb52b31b81106e399e6caecb1e51167f84b31ba96670e15efac
+oid sha256:93a0bf4221a173b33aeda78f4646092caad816a6832310a89278de249ec18634
  size 92857
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java

index 333da86099c1a6fb7fce31d273584f02ff5f0fd8..fa72431dd075eb7b391be61292fd8ea7bb208895 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java
@@ -651,54 +651,68 @@ public class RBBITestMonkey extends TestFmwk {
          int           fOrigPositions;
  
  
+        // XUnicodeSet is like UnicodeSet, except that the method contains(int codePoint) does not
+        // throw exceptions on out-of-range codePoints. This matches ICU4C behavior.
+        // The LineMonkey test (ported from ICU4C) relies on this behavior; it uses a value of -1
+        // to represent a non-codepoint that is not included in any of the property sets.
+        // This happens for rule 30a.
+
+        class XUnicodeSet extends UnicodeSet {
+            XUnicodeSet(String pattern) { super(pattern); }
+            @Override
+            public boolean contains(int codePoint) {
+                return codePoint < UnicodeSet.MIN_VALUE || codePoint > UnicodeSet.MAX_VALUE ?
+                        false : super.contains(codePoint);
+            }
+        }
  
          RBBILineMonkey()
          {
              fCharProperty  = UProperty.LINE_BREAK;
              fSets          = new ArrayList();
  
-            fBK    = new UnicodeSet("[\\p{Line_Break=BK}]");
-            fCR    = new UnicodeSet("[\\p{Line_break=CR}]");
-            fLF    = new UnicodeSet("[\\p{Line_break=LF}]");
-            fCM    = new UnicodeSet("[\\p{Line_break=CM}]");
-            fNL    = new UnicodeSet("[\\p{Line_break=NL}]");
-            fSG    = new UnicodeSet("[\\ud800-\\udfff]");
-            fWJ    = new UnicodeSet("[\\p{Line_break=WJ}]");
-            fZW    = new UnicodeSet("[\\p{Line_break=ZW}]");
-            fGL    = new UnicodeSet("[\\p{Line_break=GL}]");
-            fSP    = new UnicodeSet("[\\p{Line_break=SP}]");
-            fB2    = new UnicodeSet("[\\p{Line_break=B2}]");
-            fBA    = new UnicodeSet("[\\p{Line_break=BA}]");
-            fBB    = new UnicodeSet("[\\p{Line_break=BB}]");
-            fHY    = new UnicodeSet("[\\p{Line_break=HY}]");
-            fCB    = new UnicodeSet("[\\p{Line_break=CB}]");
-            fCL    = new UnicodeSet("[\\p{Line_break=CL}]");
-            fCP    = new UnicodeSet("[\\p{Line_break=CP}]");
-            fEX    = new UnicodeSet("[\\p{Line_break=EX}]");
-            fIN    = new UnicodeSet("[\\p{Line_break=IN}]");
-            fNS    = new UnicodeSet("[\\p{Line_break=NS}]");
-            fOP    = new UnicodeSet("[\\p{Line_break=OP}]");
-            fQU    = new UnicodeSet("[\\p{Line_break=QU}]");
-            fIS    = new UnicodeSet("[\\p{Line_break=IS}]");
-            fNU    = new UnicodeSet("[\\p{Line_break=NU}]");
-            fPO    = new UnicodeSet("[\\p{Line_break=PO}]");
-            fPR    = new UnicodeSet("[\\p{Line_break=PR}]");
-            fSY    = new UnicodeSet("[\\p{Line_break=SY}]");
-            fAI    = new UnicodeSet("[\\p{Line_break=AI}]");
-            fAL    = new UnicodeSet("[\\p{Line_break=AL}]");
-            fCJ    = new UnicodeSet("[\\p{Line_break=CJ}]");
-            fH2    = new UnicodeSet("[\\p{Line_break=H2}]");
-            fH3    = new UnicodeSet("[\\p{Line_break=H3}]");
-            fHL    = new UnicodeSet("[\\p{Line_break=HL}]");
-            fID    = new UnicodeSet("[\\p{Line_break=ID}]");
-            fJL    = new UnicodeSet("[\\p{Line_break=JL}]");
-            fJV    = new UnicodeSet("[\\p{Line_break=JV}]");
-            fJT    = new UnicodeSet("[\\p{Line_break=JT}]");
-            fRI    = new UnicodeSet("[\\p{Line_break=RI}]");
-            fXX    = new UnicodeSet("[\\p{Line_break=XX}]");
-            fEB    = new UnicodeSet("[\\p{Line_break=EB}]");
-            fEM    = new UnicodeSet("[\\p{Line_break=EM}]");
-            fZWJ   = new UnicodeSet("[\\p{Line_break=ZWJ}]");
+            fBK    = new XUnicodeSet("[\\p{Line_Break=BK}]");
+            fCR    = new XUnicodeSet("[\\p{Line_break=CR}]");
+            fLF    = new XUnicodeSet("[\\p{Line_break=LF}]");
+            fCM    = new XUnicodeSet("[\\p{Line_break=CM}]");
+            fNL    = new XUnicodeSet("[\\p{Line_break=NL}]");
+            fSG    = new XUnicodeSet("[\\ud800-\\udfff]");
+            fWJ    = new XUnicodeSet("[\\p{Line_break=WJ}]");
+            fZW    = new XUnicodeSet("[\\p{Line_break=ZW}]");
+            fGL    = new XUnicodeSet("[\\p{Line_break=GL}]");
+            fSP    = new XUnicodeSet("[\\p{Line_break=SP}]");
+            fB2    = new XUnicodeSet("[\\p{Line_break=B2}]");
+            fBA    = new XUnicodeSet("[\\p{Line_break=BA}]");
+            fBB    = new XUnicodeSet("[\\p{Line_break=BB}]");
+            fHY    = new XUnicodeSet("[\\p{Line_break=HY}]");
+            fCB    = new XUnicodeSet("[\\p{Line_break=CB}]");
+            fCL    = new XUnicodeSet("[\\p{Line_break=CL}]");
+            fCP    = new XUnicodeSet("[\\p{Line_break=CP}]");
+            fEX    = new XUnicodeSet("[\\p{Line_break=EX}]");
+            fIN    = new XUnicodeSet("[\\p{Line_break=IN}]");
+            fNS    = new XUnicodeSet("[\\p{Line_break=NS}]");
+            fOP    = new XUnicodeSet("[\\p{Line_break=OP}]");
+            fQU    = new XUnicodeSet("[\\p{Line_break=QU}]");
+            fIS    = new XUnicodeSet("[\\p{Line_break=IS}]");
+            fNU    = new XUnicodeSet("[\\p{Line_break=NU}]");
+            fPO    = new XUnicodeSet("[\\p{Line_break=PO}]");
+            fPR    = new XUnicodeSet("[\\p{Line_break=PR}]");
+            fSY    = new XUnicodeSet("[\\p{Line_break=SY}]");
+            fAI    = new XUnicodeSet("[\\p{Line_break=AI}]");
+            fAL    = new XUnicodeSet("[\\p{Line_break=AL}]");
+            fCJ    = new XUnicodeSet("[\\p{Line_break=CJ}]");
+            fH2    = new XUnicodeSet("[\\p{Line_break=H2}]");
+            fH3    = new XUnicodeSet("[\\p{Line_break=H3}]");
+            fHL    = new XUnicodeSet("[\\p{Line_break=HL}]");
+            fID    = new XUnicodeSet("[\\p{Line_break=ID}]");
+            fJL    = new XUnicodeSet("[\\p{Line_break=JL}]");
+            fJV    = new XUnicodeSet("[\\p{Line_break=JV}]");
+            fJT    = new XUnicodeSet("[\\p{Line_break=JT}]");
+            fRI    = new XUnicodeSet("[\\p{Line_break=RI}]");
+            fXX    = new XUnicodeSet("[\\p{Line_break=XX}]");
+            fEB    = new XUnicodeSet("[\\p{Line_break=EB}]");
+            fEM    = new XUnicodeSet("[\\p{Line_break=EM}]");
+            fZWJ   = new XUnicodeSet("[\\p{Line_break=ZWJ}]");
  
              // Remove dictionary characters.
              // The monkey test reference implementation of line break does not replicate the dictionary behavior,
@@ -886,7 +900,13 @@ public class RBBITestMonkey extends TestFmwk {
                  }
  
                  // LB 8  Break after zero width space
-                if (fZW.contains(prevChar)) {
+                //       ZW SP* ÷
+                //       Scan backwards from prevChar for SP* ZW
+                tPos = prevPos;
+                while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
+                    tPos = moveIndex32(fText, tPos, -1);
+                }
+                if (fZW.contains(UTF16.charAt(fText, tPos))) {
                      break;
                  }
  
@@ -1166,12 +1186,16 @@ public class RBBITestMonkey extends TestFmwk {
                  }
  
                  // LB 30a   Break between pairs of Regional Indicators.
-                //             RI RI <break> RI
-                //             RI    x    RI
+                //             RI RI  ÷  RI
+                //                RI  x  RI
                  if (fRI.contains(prevCharX2) && fRI.contains(prevChar) && fRI.contains(thisChar)) {
                      break;
                  }
                  if (fRI.contains(prevChar) && fRI.contains(thisChar)) {
+                    // Two Regional Indicators have been paired.
+                    // Over-write the trailing one (thisChar) to prevent it from forming another pair with a
+                    // following RI. This is a hack.
+                    thisChar = -1;
                      continue;
                  }
  
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line.txt

index b478bf9b8e4fcee37fb70a4af0af18281693ce64..d0f9abe88a1edb32dd75a04f279b0ba698d710a8 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line.txt
@@ -25,7 +25,7 @@ B2 = [:LineBreak =  Break_Both:];
  CB = [:LineBreak =  Contingent_Break:];
  CJ = [:LineBreak =  Conditional_Japanese_Starter:];
  CL = [:LineBreak =  Close_Punctuation:];
-CM_ = [:LineBreak =  Combining_Mark:];
+CMS = [:LineBreak =  Combining_Mark:];
  CP = [:LineBreak =  Close_Parenthesis:];
  CR = [:LineBreak =  Carriage_Return:];
  EB = [:LineBreak =  EB:];
@@ -66,7 +66,7 @@ dictionary = SA;
  
  # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
  #         list it in the numerous rules that use CM.
-CM = [CM_ ZWJ];
+CM = [CMS ZWJ];
  
  LB4:        BK ÷;
  LB5:        CR LF;
@@ -86,14 +86,16 @@ LB15:        QU CM* SP* OP;
  LB16:        (CL | CP)CM* SP* NS;
  LB17:        B2 CM* SP* B2;
  
+# LB8, break after ZW SP*, precedes LB7 because they will both match the sequences like ZW SP,
+# and LB8 should take precedence.
+
+LB8:        ZW SP* ÷ [^ZW SP BK CR LF NL];
+
+# LB7 Do not break before spaces or zero width space.
+
  LB7.1:      [^ZW SP] CM* [SP ZW];
  LB7.2:      [ZW SP] [SP ZW];
  
-# LB8, ICU differs from UAX-14,
-#    ICU:    ZW ÷;
-#    UAX 14: ZW SP* ÷;
-LB8:        ZW ÷;
-
  # LB8a
  #      ZWJ x
  #      Don't match a CM on the right - let other rules pick up CM sequences, where
@@ -188,8 +190,8 @@ LB30.1:      (AL | CM | HL | NU) CM* OP;
  LB30.2:      CP CM* (AL | HL | NU);
  
  # LB30a  keep pairs of RI together.
-LB30a.1:     RI CM* RI              ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
-LB30a.2:     RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.1:     RI CM* RI         ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2:     RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
  LB30a.3:     RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
  
  # LB30b Do not break between Emoji Base and Emoji Modifier
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt

index 1ef43f9fa2761b64e4155889d4255a3c21f09995..2384fa296c87c8857b1ffd7812efcd3f8fc11795 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt
@@ -32,7 +32,7 @@ B2 = [:LineBreak =  Break_Both:];
  CB = [:LineBreak =  Contingent_Break:];
  CJ = [:LineBreak =  Conditional_Japanese_Starter:];
  CL = [:LineBreak =  Close_Punctuation:];
-CM_ = [:LineBreak =  Combining_Mark:];
+CMS = [:LineBreak =  Combining_Mark:];
  CP = [:LineBreak =  Close_Parenthesis:];
  CR = [:LineBreak =  Carriage_Return:];
  EB = [:LineBreak =  EB:];
@@ -74,7 +74,7 @@ dictionary = SA;
  
  # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
  #         list it in the numerous rules that use CM.
-CM = [CM_ ZWJ];
+CM = [CMS ZWJ];
  
  LB4:        BK ÷;
  LB5:        CR LF;
@@ -94,14 +94,16 @@ LB15:        QU CM* SP* OP;
  LB16:        (CL | CP)CM* SP* NS;
  LB17:        B2 CM* SP* B2;
  
+# LB8, break after ZW SP*, precedes LB7 because they will both match the sequences like ZW SP,
+# and LB8 should take precedence.
+
+LB8:        ZW SP* ÷ [^ZW SP BK CR LF NL];
+
+# LB7 Do not break before spaces or zero width space.
+
  LB7.1:      [^ZW SP] CM* [SP ZW];
  LB7.2:      [ZW SP] [SP ZW];
  
-# LB8, ICU differs from UAX-14,
-#    ICU:    ZW ÷;
-#    UAX 14: ZW SP* ÷;
-LB8:        ZW ÷;
-
  # LB8a
  #      ZWJ x
  #      Don't match a CM on the right - let other rules pick up CM sequences, where
@@ -196,8 +198,8 @@ LB30.1:      (AL | CM | HL | NU) CM* OP;
  LB30.2:      CP CM* (AL | HL | NU);
  
  # LB30a  keep pairs of RI together.
-LB30a.1:     RI CM* RI              ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
-LB30a.2:     RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.1:     RI CM* RI         ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2:     RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
  LB30a.3:     RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
  
  # LB30b Do not break between Emoji Base and Emoji Modifier
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt

index 4227de8d3bb3be62f50338861a15da816cc0ba74..8b92561dbd8fa6b49a0286eb643b038ecc3860d7 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt
@@ -46,7 +46,7 @@ B2 = [:LineBreak =  Break_Both:];
  CB = [:LineBreak =  Contingent_Break:];
  CJ = [:LineBreak =  Conditional_Japanese_Starter:];
  CL = [:LineBreak =  Close_Punctuation:];
-CM_ = [:LineBreak =  Combining_Mark:];
+CMS = [:LineBreak =  Combining_Mark:];
  CP = [:LineBreak =  Close_Parenthesis:];
  CR = [:LineBreak =  Carriage_Return:];
  EB = [:LineBreak =  EB:];
@@ -91,7 +91,7 @@ dictionary = SA;
  
  # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
  #         list it in the numerous rules that use CM.
-CM = [CM_ ZWJ];
+CM = [CMS ZWJ];
  
  LB4:        BK ÷;
  LB5:        CR LF;
@@ -111,14 +111,16 @@ LB15:        QU CM* SP* OP;
  LB16:        (CL | CP)CM* SP* NS;
  LB17:        B2 CM* SP* B2;
  
+# LB8, break after ZW SP*, precedes LB7 because they will both match the sequences like ZW SP,
+# and LB8 should take precedence.
+
+LB8:        ZW SP* ÷ [^ZW SP BK CR LF NL];
+
+# LB7 Do not break before spaces or zero width space.
+
  LB7.1:      [^ZW SP] CM* [SP ZW];
  LB7.2:      [ZW SP] [SP ZW];
  
-# LB8, ICU differs from UAX-14,
-#    ICU:    ZW ÷;
-#    UAX 14: ZW SP* ÷;
-LB8:        ZW ÷;
-
  # LB8a
  #      ZWJ x
  #      Don't match a CM on the right - let other rules pick up CM sequences, where
@@ -217,8 +219,8 @@ LB30.1:      (AL | CM | HL | NU) CM* OP;
  LB30.2:      CP CM* (AL | HL | NU);
  
  # LB30a  keep pairs of RI together.
-LB30a.1:     RI CM* RI              ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
-LB30a.2:     RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.1:     RI CM* RI         ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2:     RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
  LB30a.3:     RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
  
  # LB30b Do not break between Emoji Base and Emoji Modifier
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt

index 5952d5bc4498f770b9b9b549ed55c2e11d003536..65804d83f9bf787a2d561e8f670929c7f40dc584 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt
@@ -39,7 +39,7 @@ B2 = [:LineBreak =  Break_Both:];
  CB = [:LineBreak =  Contingent_Break:];
  CJ = [:LineBreak =  Conditional_Japanese_Starter:];
  CL = [:LineBreak =  Close_Punctuation:];
-CM_ = [:LineBreak =  Combining_Mark:];
+CMS = [:LineBreak =  Combining_Mark:];
  CP = [:LineBreak =  Close_Parenthesis:];
  CR = [:LineBreak =  Carriage_Return:];
  EB = [:LineBreak =  EB:];
@@ -80,7 +80,7 @@ dictionary = SA;
  
  # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
  #         list it in the numerous rules that use CM.
-CM = [CM_ ZWJ];
+CM = [CMS ZWJ];
  
  LB4:        BK ÷;
  LB5:        CR LF;
@@ -100,14 +100,16 @@ LB15:        QU CM* SP* OP;
  LB16:        (CL | CP)CM* SP* NS;
  LB17:        B2 CM* SP* B2;
  
+# LB8, break after ZW SP*, precedes LB7 because they will both match the sequences like ZW SP,
+# and LB8 should take precedence.
+
+LB8:        ZW SP* ÷ [^ZW SP BK CR LF NL];
+
+# LB7 Do not break before spaces or zero width space.
+
  LB7.1:      [^ZW SP] CM* [SP ZW];
  LB7.2:      [ZW SP] [SP ZW];
  
-# LB8, ICU differs from UAX-14,
-#    ICU:    ZW ÷;
-#    UAX 14: ZW SP* ÷;
-LB8:        ZW ÷;
-
  # LB8a
  #      ZWJ x
  #      Don't match a CM on the right - let other rules pick up CM sequences, where
@@ -202,8 +204,8 @@ LB30.1:      (AL | CM | HL | NU) CM* OP;
  LB30.2:      CP CM* (AL | HL | NU);
  
  # LB30a  keep pairs of RI together.
-LB30a.1:     RI CM* RI              ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
-LB30a.2:     RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.1:     RI CM* RI         ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2:     RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
  LB30a.3:     RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
  
  # LB30b Do not break between Emoji Base and Emoji Modifier
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt

index 0a1772e5d35a55924d581bfaf4a47d7047005fc1..b50219282b26c9a131ef577f07e40c455deb1fcd 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt
@@ -40,7 +40,7 @@ B2 = [:LineBreak =  Break_Both:];
  CB = [:LineBreak =  Contingent_Break:];
  CJ = [:LineBreak =  Conditional_Japanese_Starter:];
  CL = [:LineBreak =  Close_Punctuation:];
-CM_ = [:LineBreak =  Combining_Mark:];
+CMS = [:LineBreak =  Combining_Mark:];
  CP = [:LineBreak =  Close_Parenthesis:];
  CR = [:LineBreak =  Carriage_Return:];
  EB = [:LineBreak =  EB:];
@@ -82,7 +82,7 @@ dictionary = SA;
  
  # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
  #         list it in the numerous rules that use CM.
-CM = [CM_ ZWJ];
+CM = [CMS ZWJ];
  
  LB4:        BK ÷;
  LB5:        CR LF;
@@ -105,14 +105,16 @@ LB15:        QU CM* SP* OP;
  LB16:        (CL | CP)CM* SP* NS;
  LB17:        B2 CM* SP* B2;
  
+# LB8, break after ZW SP*, precedes LB7 because they will both match the sequences like ZW SP,
+# and LB8 should take precedence.
+
+LB8:        ZW SP* ÷ [^ZW SP BK CR LF NL];
+
+# LB7 Do not break before spaces or zero width space.
+
  LB7.1:      [^ZW SP] CM* [SP ZW];
  LB7.2:      [ZW SP] [SP ZW];
  
-# LB8, ICU differs from UAX-14,
-#    ICU:    ZW ÷;
-#    UAX 14: ZW SP* ÷;
-LB8:        ZW ÷;
-
  # LB8a
  #      ZWJ x
  #      Don't match a CM on the right - let other rules pick up CM sequences, where
@@ -211,8 +213,8 @@ LB30.1:      (AL | CM | HL | NU) CM* OP;
  LB30.2:      CP CM* (AL | HL | NU);
  
  # LB30a  keep pairs of RI together.
-LB30a.1:     RI CM* RI              ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
-LB30a.2:     RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.1:     RI CM* RI         ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2:     RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
  LB30a.3:     RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
  
  # LB30b Do not break between Emoji Base and Emoji Modifier
author	Andy Heninger <andy.heninger@gmail.com>
	Thu, 9 Aug 2018 18:28:55 +0000 (11:28 -0700)
committer	Shane Carr <shane@unicode.org>
	Thu, 27 Sep 2018 21:27:38 +0000 (14:27 -0700)
icu4c/source/common/rbbirb.cpp		patch \| blob \| history
icu4c/source/common/rbbitblb.cpp		patch \| blob \| history
icu4c/source/common/rbbitblb.h		patch \| blob \| history
icu4c/source/data/brkitr/rules/line.txt		patch \| blob \| history
icu4c/source/data/brkitr/rules/line_fi.txt		patch \| blob \| history
icu4c/source/data/brkitr/rules/line_loose.txt		patch \| blob \| history
icu4c/source/data/brkitr/rules/line_loose_cj.txt		patch \| blob \| history
icu4c/source/data/brkitr/rules/line_loose_fi.txt		patch \| blob \| history
icu4c/source/data/brkitr/rules/line_normal.txt		patch \| blob \| history
icu4c/source/data/brkitr/rules/line_normal_cj.txt		patch \| blob \| history
icu4c/source/data/brkitr/rules/line_normal_fi.txt		patch \| blob \| history
icu4c/source/test/intltest/rbbitst.cpp		patch \| blob \| history
icu4c/source/test/testdata/break_rules/line.txt		patch \| blob \| history
icu4c/source/test/testdata/break_rules/line_loose.txt		patch \| blob \| history
icu4c/source/test/testdata/break_rules/line_loose_cj.txt		patch \| blob \| history
icu4c/source/test/testdata/break_rules/line_normal.txt		patch \| blob \| history
icu4c/source/test/testdata/break_rules/line_normal_cj.txt		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java		patch \| blob \| history
icu4j/main/shared/data/icudata.jar		patch \| blob \| history
icu4j/main/shared/data/icutzdata.jar		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line.txt		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt		patch \| blob \| history