From fa5ae3dc455cf273907d5b52eb8cef49eb59a917 Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Thu, 9 Aug 2018 11:28:55 -0700 Subject: [PATCH] ICU-7270 Line Break rule LB8 updated for UAX-14 conformance. (#41) Includes all line break tailorings. Corresponding updates to monkey test rules. State table builder, fix missed table optimization, uncovered by new rule. --- icu4c/source/common/rbbirb.cpp | 27 +++-- icu4c/source/common/rbbitblb.cpp | 6 +- icu4c/source/common/rbbitblb.h | 7 +- icu4c/source/data/brkitr/rules/line.txt | 5 +- icu4c/source/data/brkitr/rules/line_fi.txt | 5 +- icu4c/source/data/brkitr/rules/line_loose.txt | 5 +- .../data/brkitr/rules/line_loose_cj.txt | 5 +- .../data/brkitr/rules/line_loose_fi.txt | 5 +- .../source/data/brkitr/rules/line_normal.txt | 5 +- .../data/brkitr/rules/line_normal_cj.txt | 5 +- .../data/brkitr/rules/line_normal_fi.txt | 5 +- icu4c/source/test/intltest/rbbitst.cpp | 57 ++++----- .../source/test/testdata/break_rules/line.txt | 20 +-- .../test/testdata/break_rules/line_loose.txt | 20 +-- .../testdata/break_rules/line_loose_cj.txt | 20 +-- .../test/testdata/break_rules/line_normal.txt | 20 +-- .../testdata/break_rules/line_normal_cj.txt | 20 +-- .../src/com/ibm/icu/text/RBBIRuleBuilder.java | 25 ++-- .../com/ibm/icu/text/RBBITableBuilder.java | 7 +- icu4j/main/shared/data/icudata.jar | 4 +- icu4j/main/shared/data/icutzdata.jar | 2 +- .../ibm/icu/dev/test/rbbi/RBBITestMonkey.java | 114 +++++++++++------- .../icu/dev/test/rbbi/break_rules/line.txt | 20 +-- .../dev/test/rbbi/break_rules/line_loose.txt | 20 +-- .../test/rbbi/break_rules/line_loose_cj.txt | 20 +-- .../dev/test/rbbi/break_rules/line_normal.txt | 20 +-- .../test/rbbi/break_rules/line_normal_cj.txt | 20 +-- 27 files changed, 277 insertions(+), 212 deletions(-) diff --git a/icu4c/source/common/rbbirb.cpp b/icu4c/source/common/rbbirb.cpp index a4b9a718682..08c577696c2 100644 --- a/icu4c/source/common/rbbirb.cpp +++ b/icu4c/source/common/rbbirb.cpp @@ -303,17 +303,24 @@ RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) { } void RBBIRuleBuilder::optimizeTables() { + bool didSomething; + do { + didSomething = false; + + // Begin looking for duplicates with char class 3. + // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively, + // and should not have other categories merged into them. + IntPair duplPair = {3, 0}; + while (fForwardTable->findDuplCharClassFrom(&duplPair)) { + fSetBuilder->mergeCategories(duplPair); + fForwardTable->removeColumn(duplPair.second); + didSomething = true; + } - // Begin looking for duplicates with char class 3. - // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively, - // and should not have other categories merged into them. - IntPair duplPair = {3, 0}; - - while (fForwardTable->findDuplCharClassFrom(&duplPair)) { - fSetBuilder->mergeCategories(duplPair); - fForwardTable->removeColumn(duplPair.second); - } - fForwardTable->removeDuplicateStates(); + while (fForwardTable->removeDuplicateStates() > 0) { + didSomething = true; + } + } while (didSomething); } U_NAMESPACE_END diff --git a/icu4c/source/common/rbbitblb.cpp b/icu4c/source/common/rbbitblb.cpp index 8a6f7c792f3..18da5231b97 100644 --- a/icu4c/source/common/rbbitblb.cpp +++ b/icu4c/source/common/rbbitblb.cpp @@ -1245,12 +1245,16 @@ void RBBITableBuilder::removeSafeState(IntPair duplStates) { /* * RemoveDuplicateStates */ -void RBBITableBuilder::removeDuplicateStates() { +int32_t RBBITableBuilder::removeDuplicateStates() { IntPair dupls = {3, 0}; + int32_t numStatesRemoved = 0; + while (findDuplicateState(&dupls)) { // printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second); removeState(dupls); + ++numStatesRemoved; } + return numStatesRemoved; } diff --git a/icu4c/source/common/rbbitblb.h b/icu4c/source/common/rbbitblb.h index eea243e4cdd..844f7ecaab0 100644 --- a/icu4c/source/common/rbbitblb.h +++ b/icu4c/source/common/rbbitblb.h @@ -66,8 +66,11 @@ public: */ void removeColumn(int32_t column); - /** Check for, and remove dupicate states (table rows). */ - void removeDuplicateStates(); + /** + * Check for, and remove dupicate states (table rows). + * @return the number of states removed. + */ + int32_t removeDuplicateStates(); /** Build the safe reverse table from the already-constructed forward table. */ void buildSafeReverseTable(UErrorCode &status); diff --git a/icu4c/source/data/brkitr/rules/line.txt b/icu4c/source/data/brkitr/rules/line.txt index cf945b46f57..9ad81e6fc7d 100644 --- a/icu4c/source/data/brkitr/rules/line.txt +++ b/icu4c/source/data/brkitr/rules/line.txt @@ -132,12 +132,11 @@ $CAN_CM $CM* [$SP $ZW]; # # LB 8 Break after zero width space -# TODO: ZW SP* -# An engine change is required to write the reverse rule for this. -# For now, leave the Unicode 5.2 rule, ZW +# ZW SP* ÷ # $LB8Breaks = [$LB4Breaks $ZW]; $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]]; +$ZW $SP* / [^$SP $ZW $LB4Breaks]; # LB 8a ZWJ x Do not break Emoji ZWJ sequences. # diff --git a/icu4c/source/data/brkitr/rules/line_fi.txt b/icu4c/source/data/brkitr/rules/line_fi.txt index 92eefffd91b..9c26945e580 100644 --- a/icu4c/source/data/brkitr/rules/line_fi.txt +++ b/icu4c/source/data/brkitr/rules/line_fi.txt @@ -138,12 +138,11 @@ $CAN_CM $CM* [$SP $ZW]; # # LB 8 Break after zero width space -# TODO: ZW SP* -# An engine change is required to write the reverse rule for this. -# For now, leave the Unicode 5.2 rule, ZW +# ZW SP* ÷ # $LB8Breaks = [$LB4Breaks $ZW]; $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]]; +$ZW $SP* / [^$SP $ZW $LB4Breaks]; # LB 8a ZWJ x Do not break Emoji ZWJ sequences. # diff --git a/icu4c/source/data/brkitr/rules/line_loose.txt b/icu4c/source/data/brkitr/rules/line_loose.txt index 5e8f7a16703..2d72fdfa907 100644 --- a/icu4c/source/data/brkitr/rules/line_loose.txt +++ b/icu4c/source/data/brkitr/rules/line_loose.txt @@ -141,12 +141,11 @@ $CAN_CM $CM* [$SP $ZW]; # # LB 8 Break after zero width space -# TODO: ZW SP* -# An engine change is required to write the reverse rule for this. -# For now, leave the Unicode 5.2 rule, ZW +# ZW SP* ÷ # $LB8Breaks = [$LB4Breaks $ZW]; $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]]; +$ZW $SP* / [^$SP $ZW $LB4Breaks]; # LB 8a ZWJ x Do not break Emoji ZWJ sequences. # diff --git a/icu4c/source/data/brkitr/rules/line_loose_cj.txt b/icu4c/source/data/brkitr/rules/line_loose_cj.txt index 651924120a9..024e68ebc77 100644 --- a/icu4c/source/data/brkitr/rules/line_loose_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_loose_cj.txt @@ -151,12 +151,11 @@ $CAN_CM $CM* [$SP $ZW]; # # LB 8 Break after zero width space -# TODO: ZW SP* -# An engine change is required to write the reverse rule for this. -# For now, leave the Unicode 5.2 rule, ZW +# ZW SP* ÷ # $LB8Breaks = [$LB4Breaks $ZW]; $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]]; +$ZW $SP* / [^$SP $ZW $LB4Breaks]; # LB 8a ZWJ x Do not break Emoji ZWJ sequences. # diff --git a/icu4c/source/data/brkitr/rules/line_loose_fi.txt b/icu4c/source/data/brkitr/rules/line_loose_fi.txt index 3775b7f9ac9..0c34b00cf38 100644 --- a/icu4c/source/data/brkitr/rules/line_loose_fi.txt +++ b/icu4c/source/data/brkitr/rules/line_loose_fi.txt @@ -137,12 +137,11 @@ $CAN_CM $CM* [$SP $ZW]; # # LB 8 Break after zero width space -# TODO: ZW SP* -# An engine change is required to write the reverse rule for this. -# For now, leave the Unicode 5.2 rule, ZW +# ZW SP* ÷ # $LB8Breaks = [$LB4Breaks $ZW]; $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]]; +$ZW $SP* / [^$SP $ZW $LB4Breaks]; # LB 8a ZWJ x Do not break Emoji ZWJ sequences. # diff --git a/icu4c/source/data/brkitr/rules/line_normal.txt b/icu4c/source/data/brkitr/rules/line_normal.txt index 5bfbe8e8d9e..b2472177e49 100644 --- a/icu4c/source/data/brkitr/rules/line_normal.txt +++ b/icu4c/source/data/brkitr/rules/line_normal.txt @@ -136,12 +136,11 @@ $CAN_CM $CM* [$SP $ZW]; # # LB 8 Break after zero width space -# TODO: ZW SP* -# An engine change is required to write the reverse rule for this. -# For now, leave the Unicode 5.2 rule, ZW +# ZW SP* ÷ # $LB8Breaks = [$LB4Breaks $ZW]; $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]]; +$ZW $SP* / [^$SP $ZW $LB4Breaks]; # LB 8a ZWJ x Do not break Emoji ZWJ sequences. # diff --git a/icu4c/source/data/brkitr/rules/line_normal_cj.txt b/icu4c/source/data/brkitr/rules/line_normal_cj.txt index 4576b205e15..b4fcf029e72 100644 --- a/icu4c/source/data/brkitr/rules/line_normal_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_normal_cj.txt @@ -139,12 +139,11 @@ $CAN_CM $CM* [$SP $ZW]; # # LB 8 Break after zero width space -# TODO: ZW SP* -# An engine change is required to write the reverse rule for this. -# For now, leave the Unicode 5.2 rule, ZW +# ZW SP* ÷ # $LB8Breaks = [$LB4Breaks $ZW]; $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]]; +$ZW $SP* / [^$SP $ZW $LB4Breaks]; # LB 8a ZWJ x Do not break Emoji ZWJ sequences. # diff --git a/icu4c/source/data/brkitr/rules/line_normal_fi.txt b/icu4c/source/data/brkitr/rules/line_normal_fi.txt index b5efc152aea..a3eccf2c5b6 100644 --- a/icu4c/source/data/brkitr/rules/line_normal_fi.txt +++ b/icu4c/source/data/brkitr/rules/line_normal_fi.txt @@ -136,12 +136,11 @@ $CAN_CM $CM* [$SP $ZW]; # # LB 8 Break after zero width space -# TODO: ZW SP* -# An engine change is required to write the reverse rule for this. -# For now, leave the Unicode 5.2 rule, ZW +# ZW SP* ÷ # $LB8Breaks = [$LB4Breaks $ZW]; $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]]; +$ZW $SP* / [^$SP $ZW $LB4Breaks]; # LB 8a ZWJ x Do not break Emoji ZWJ sequences. # diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index 5463dad2471..acf6a57779c 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -1283,35 +1283,28 @@ void RBBITest::TestUnicodeFiles() { // Check for test cases from the Unicode test data files that are known to fail -// and should be skipped because ICU is not yet able to fully implement the spec. -// See ticket #7270. +// and should be skipped as known issues because ICU does not fully implement +// the Unicode specifications. +// +// Test cases are identified by the test data sequence, which tends to be more stable +// across Unicode versions than the test file line numbers. +// +// The test case with ticket "10666" is a dummy, included as an example. UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) { static struct TestCase { + const char *fTicketNum; const char *fFileName; const UChar *fString; - } badTestCases[] = { // Line Numbers from Unicode 7.0.0 file. - {"LineBreakTest.txt", u"\u200B\u0020}"}, // Line 5198 - {"LineBreakTest.txt", u"\u200B\u0020)"}, // Line 5202 - {"LineBreakTest.txt", u"\u200B\u0020!"}, // Line 5214 - {"LineBreakTest.txt", u"\u200B\u0020,"}, // Line 5246 - {"LineBreakTest.txt", u"\u200B\u0020/"}, // Line 5298 - {"LineBreakTest.txt", u"\u200B\u0020\u2060"}, // Line 5302 - // Line Numbers from pre-release verion of GraphemeBreakTest-10.0.0.txt - {"GraphemeBreakTest.txt", u"\u200D\u2640"}, // Line 656, old GB 11 test ZWJ x GAZ - {"GraphemeBreakTest.txt", u"\u200D\U0001F466"}, // Line 658, old GB 11 test ZWJ x EBG - {"GraphemeBreakTest.txt", u"\u200D\U0001F466\U0001F3FB"}, // Line 842, old GB 11 test ZWJ x EBG x EModifier - - // Line Numbers from pre-release verion of WordBreakTest-10.0.0.txt - {"WordBreakTest.txt", u"\u200D\u261D"}, // Line 1356, ZWJ x EmojiNRK - {"WordBreakTest.txt", u"\u200D\U0001F3FB"}, // Line 1358, ZWJ x EmojiNRK + } badTestCases[] = { + {"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"} // Fake example, for illustration. }; for (int n=0; naddAll(*fSG); // Default behavior for SG is identical to AL. fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS. - fCM->addAll(*fZJ); // ZWJ behaves as a CM. + fCM->addAll(*fZWJ); // ZWJ behaves as a CM. fSets->addElement(fBK, status); fSets->addElement(fCR, status); @@ -2669,7 +2662,7 @@ RBBILineMonkey::RBBILineMonkey() : fSets->addElement(fSG, status); fSets->addElement(fEB, status); fSets->addElement(fEM, status); - fSets->addElement(fZJ, status); + fSets->addElement(fZWJ, status); const char *rules = @@ -2853,7 +2846,13 @@ int32_t RBBILineMonkey::next(int32_t startPos) { } // LB 8 Break after zero width space - if (fZW->contains(prevChar)) { + // ZW SP* ÷ + // Scan backwards from prevChar for SP* ZW + tPos = prevPos; + while (tPos>0 && fSP->contains(fText->char32At(tPos))) { + tPos = fText->moveIndex32(tPos, -1); + } + if (fZW->contains(fText->char32At(tPos))) { break; } @@ -2890,7 +2889,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) { { int32_t prevIdx = fText->moveIndex32(pos, -1); UChar32 prevC = fText->char32At(prevIdx); - if (fZJ->contains(prevC)) { + if (fZWJ->contains(prevC)) { continue; } } @@ -3148,12 +3147,16 @@ int32_t RBBILineMonkey::next(int32_t startPos) { continue; } - // LB30a RI RI RI - // RI x RI + // LB30a RI RI ÷ RI + // RI x RI if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) { break; } if (fRI->contains(prevChar) && fRI->contains(thisChar)) { + // Two Regional Indicators have been paired. + // Over-write the trailing one (thisChar) to prevent it from forming another pair with a + // following RI. This is a hack. + thisChar = -1; continue; } @@ -3220,7 +3223,7 @@ RBBILineMonkey::~RBBILineMonkey() { delete fXX; delete fEB; delete fEM; - delete fZJ; + delete fZWJ; delete fCharBI; delete fNumberMatcher; diff --git a/icu4c/source/test/testdata/break_rules/line.txt b/icu4c/source/test/testdata/break_rules/line.txt index 31d27b90dd4..d0f9abe88a1 100644 --- a/icu4c/source/test/testdata/break_rules/line.txt +++ b/icu4c/source/test/testdata/break_rules/line.txt @@ -25,7 +25,7 @@ B2 = [:LineBreak = Break_Both:]; CB = [:LineBreak = Contingent_Break:]; CJ = [:LineBreak = Conditional_Japanese_Starter:]; CL = [:LineBreak = Close_Punctuation:]; -CM = [:LineBreak = Combining_Mark:]; +CMS = [:LineBreak = Combining_Mark:]; CP = [:LineBreak = Close_Parenthesis:]; CR = [:LineBreak = Carriage_Return:]; EB = [:LineBreak = EB:]; @@ -66,7 +66,7 @@ dictionary = SA; # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly # list it in the numerous rules that use CM. -CM = [CM ZWJ]; +CM = [CMS ZWJ]; LB4: BK ÷; LB5: CR LF; @@ -86,14 +86,16 @@ LB15: QU CM* SP* OP; LB16: (CL | CP)CM* SP* NS; LB17: B2 CM* SP* B2; +# LB8, break after ZW SP*, precedes LB7 because they will both match the sequences like ZW SP, +# and LB8 should take precedence. + +LB8: ZW SP* ÷ [^ZW SP BK CR LF NL]; + +# LB7 Do not break before spaces or zero width space. + LB7.1: [^ZW SP] CM* [SP ZW]; LB7.2: [ZW SP] [SP ZW]; -# LB8, ICU differs from UAX-14, -# ICU: ZW ÷; -# UAX 14: ZW SP* ÷; -LB8: ZW ÷; - # LB8a # ZWJ x # Don't match a CM on the right - let other rules pick up CM sequences, where @@ -188,8 +190,8 @@ LB30.1: (AL | CM | HL | NU) CM* OP; LB30.2: CP CM* (AL | HL | NU); # LB30a keep pairs of RI together. -LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; -LB30a.2: RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; +LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; +LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?; # LB30b Do not break between Emoji Base and Emoji Modifier diff --git a/icu4c/source/test/testdata/break_rules/line_loose.txt b/icu4c/source/test/testdata/break_rules/line_loose.txt index 00552fec7c0..2384fa296c8 100644 --- a/icu4c/source/test/testdata/break_rules/line_loose.txt +++ b/icu4c/source/test/testdata/break_rules/line_loose.txt @@ -32,7 +32,7 @@ B2 = [:LineBreak = Break_Both:]; CB = [:LineBreak = Contingent_Break:]; CJ = [:LineBreak = Conditional_Japanese_Starter:]; CL = [:LineBreak = Close_Punctuation:]; -CM = [:LineBreak = Combining_Mark:]; +CMS = [:LineBreak = Combining_Mark:]; CP = [:LineBreak = Close_Parenthesis:]; CR = [:LineBreak = Carriage_Return:]; EB = [:LineBreak = EB:]; @@ -74,7 +74,7 @@ dictionary = SA; # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly # list it in the numerous rules that use CM. -CM = [CM ZWJ]; +CM = [CMS ZWJ]; LB4: BK ÷; LB5: CR LF; @@ -94,14 +94,16 @@ LB15: QU CM* SP* OP; LB16: (CL | CP)CM* SP* NS; LB17: B2 CM* SP* B2; +# LB8, break after ZW SP*, precedes LB7 because they will both match the sequences like ZW SP, +# and LB8 should take precedence. + +LB8: ZW SP* ÷ [^ZW SP BK CR LF NL]; + +# LB7 Do not break before spaces or zero width space. + LB7.1: [^ZW SP] CM* [SP ZW]; LB7.2: [ZW SP] [SP ZW]; -# LB8, ICU differs from UAX-14, -# ICU: ZW ÷; -# UAX 14: ZW SP* ÷; -LB8: ZW ÷; - # LB8a # ZWJ x # Don't match a CM on the right - let other rules pick up CM sequences, where @@ -196,8 +198,8 @@ LB30.1: (AL | CM | HL | NU) CM* OP; LB30.2: CP CM* (AL | HL | NU); # LB30a keep pairs of RI together. -LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; -LB30a.2: RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; +LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; +LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?; # LB30b Do not break between Emoji Base and Emoji Modifier diff --git a/icu4c/source/test/testdata/break_rules/line_loose_cj.txt b/icu4c/source/test/testdata/break_rules/line_loose_cj.txt index 59c21fd3d9a..8b92561dbd8 100644 --- a/icu4c/source/test/testdata/break_rules/line_loose_cj.txt +++ b/icu4c/source/test/testdata/break_rules/line_loose_cj.txt @@ -46,7 +46,7 @@ B2 = [:LineBreak = Break_Both:]; CB = [:LineBreak = Contingent_Break:]; CJ = [:LineBreak = Conditional_Japanese_Starter:]; CL = [:LineBreak = Close_Punctuation:]; -CM = [:LineBreak = Combining_Mark:]; +CMS = [:LineBreak = Combining_Mark:]; CP = [:LineBreak = Close_Parenthesis:]; CR = [:LineBreak = Carriage_Return:]; EB = [:LineBreak = EB:]; @@ -91,7 +91,7 @@ dictionary = SA; # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly # list it in the numerous rules that use CM. -CM = [CM ZWJ]; +CM = [CMS ZWJ]; LB4: BK ÷; LB5: CR LF; @@ -111,14 +111,16 @@ LB15: QU CM* SP* OP; LB16: (CL | CP)CM* SP* NS; LB17: B2 CM* SP* B2; +# LB8, break after ZW SP*, precedes LB7 because they will both match the sequences like ZW SP, +# and LB8 should take precedence. + +LB8: ZW SP* ÷ [^ZW SP BK CR LF NL]; + +# LB7 Do not break before spaces or zero width space. + LB7.1: [^ZW SP] CM* [SP ZW]; LB7.2: [ZW SP] [SP ZW]; -# LB8, ICU differs from UAX-14, -# ICU: ZW ÷; -# UAX 14: ZW SP* ÷; -LB8: ZW ÷; - # LB8a # ZWJ x # Don't match a CM on the right - let other rules pick up CM sequences, where @@ -217,8 +219,8 @@ LB30.1: (AL | CM | HL | NU) CM* OP; LB30.2: CP CM* (AL | HL | NU); # LB30a keep pairs of RI together. -LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; -LB30a.2: RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; +LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; +LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?; # LB30b Do not break between Emoji Base and Emoji Modifier diff --git a/icu4c/source/test/testdata/break_rules/line_normal.txt b/icu4c/source/test/testdata/break_rules/line_normal.txt index 7cf2613a855..65804d83f9b 100644 --- a/icu4c/source/test/testdata/break_rules/line_normal.txt +++ b/icu4c/source/test/testdata/break_rules/line_normal.txt @@ -39,7 +39,7 @@ B2 = [:LineBreak = Break_Both:]; CB = [:LineBreak = Contingent_Break:]; CJ = [:LineBreak = Conditional_Japanese_Starter:]; CL = [:LineBreak = Close_Punctuation:]; -CM = [:LineBreak = Combining_Mark:]; +CMS = [:LineBreak = Combining_Mark:]; CP = [:LineBreak = Close_Parenthesis:]; CR = [:LineBreak = Carriage_Return:]; EB = [:LineBreak = EB:]; @@ -80,7 +80,7 @@ dictionary = SA; # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly # list it in the numerous rules that use CM. -CM = [CM ZWJ]; +CM = [CMS ZWJ]; LB4: BK ÷; LB5: CR LF; @@ -100,14 +100,16 @@ LB15: QU CM* SP* OP; LB16: (CL | CP)CM* SP* NS; LB17: B2 CM* SP* B2; +# LB8, break after ZW SP*, precedes LB7 because they will both match the sequences like ZW SP, +# and LB8 should take precedence. + +LB8: ZW SP* ÷ [^ZW SP BK CR LF NL]; + +# LB7 Do not break before spaces or zero width space. + LB7.1: [^ZW SP] CM* [SP ZW]; LB7.2: [ZW SP] [SP ZW]; -# LB8, ICU differs from UAX-14, -# ICU: ZW ÷; -# UAX 14: ZW SP* ÷; -LB8: ZW ÷; - # LB8a # ZWJ x # Don't match a CM on the right - let other rules pick up CM sequences, where @@ -202,8 +204,8 @@ LB30.1: (AL | CM | HL | NU) CM* OP; LB30.2: CP CM* (AL | HL | NU); # LB30a keep pairs of RI together. -LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; -LB30a.2: RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; +LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; +LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?; # LB30b Do not break between Emoji Base and Emoji Modifier diff --git a/icu4c/source/test/testdata/break_rules/line_normal_cj.txt b/icu4c/source/test/testdata/break_rules/line_normal_cj.txt index 3af4a582db1..b50219282b2 100644 --- a/icu4c/source/test/testdata/break_rules/line_normal_cj.txt +++ b/icu4c/source/test/testdata/break_rules/line_normal_cj.txt @@ -40,7 +40,7 @@ B2 = [:LineBreak = Break_Both:]; CB = [:LineBreak = Contingent_Break:]; CJ = [:LineBreak = Conditional_Japanese_Starter:]; CL = [:LineBreak = Close_Punctuation:]; -CM = [:LineBreak = Combining_Mark:]; +CMS = [:LineBreak = Combining_Mark:]; CP = [:LineBreak = Close_Parenthesis:]; CR = [:LineBreak = Carriage_Return:]; EB = [:LineBreak = EB:]; @@ -82,7 +82,7 @@ dictionary = SA; # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly # list it in the numerous rules that use CM. -CM = [CM ZWJ]; +CM = [CMS ZWJ]; LB4: BK ÷; LB5: CR LF; @@ -105,14 +105,16 @@ LB15: QU CM* SP* OP; LB16: (CL | CP)CM* SP* NS; LB17: B2 CM* SP* B2; +# LB8, break after ZW SP*, precedes LB7 because they will both match the sequences like ZW SP, +# and LB8 should take precedence. + +LB8: ZW SP* ÷ [^ZW SP BK CR LF NL]; + +# LB7 Do not break before spaces or zero width space. + LB7.1: [^ZW SP] CM* [SP ZW]; LB7.2: [ZW SP] [SP ZW]; -# LB8, ICU differs from UAX-14, -# ICU: ZW ÷; -# UAX 14: ZW SP* ÷; -LB8: ZW ÷; - # LB8a # ZWJ x # Don't match a CM on the right - let other rules pick up CM sequences, where @@ -211,8 +213,8 @@ LB30.1: (AL | CM | HL | NU) CM* OP; LB30.2: CP CM* (AL | HL | NU); # LB30a keep pairs of RI together. -LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; -LB30a.2: RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; +LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; +LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?; # LB30b Do not break between Emoji Base and Emoji Modifier diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java index f83b295146b..293856471f4 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java @@ -331,14 +331,21 @@ class RBBIRuleBuilder { } void optimizeTables() { - // Begin looking for duplicates with char class 3. - // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively, - // and should not have other categories merged into them. - IntPair duplPair = new IntPair(3, 0); - while (fForwardTable.findDuplCharClassFrom(duplPair)) { - fSetBuilder.mergeCategories(duplPair); - fForwardTable.removeColumn(duplPair.second); - } - fForwardTable.removeDuplicateStates(); + boolean didSomething; + do { + didSomething = false; + // Begin looking for duplicates with char class 3. + // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively, + // and should not have other categories merged into them. + IntPair duplPair = new IntPair(3, 0); + while (fForwardTable.findDuplCharClassFrom(duplPair)) { + fSetBuilder.mergeCategories(duplPair); + fForwardTable.removeColumn(duplPair.second); + didSomething = true; + } + while (fForwardTable.removeDuplicateStates() > 0) { + didSomething = true; + }; + } while (didSomething); } } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java index dad7aacd8d8..9ccafe8d7ff 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java @@ -1032,14 +1032,19 @@ class RBBITableBuilder { /** * Check for, and remove duplicate states (table rows). + * @return the number of states removed. * @internal */ - void removeDuplicateStates() { + int removeDuplicateStates() { IntPair dupls = new IntPair(3, 0); + int numStatesRemoved = 0; + while (findDuplicateState(dupls)) { // System.out.printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second); removeState(dupls); + ++numStatesRemoved; } + return numStatesRemoved; } diff --git a/icu4j/main/shared/data/icudata.jar b/icu4j/main/shared/data/icudata.jar index 15fb7409ef6..93c64fdb11f 100755 --- a/icu4j/main/shared/data/icudata.jar +++ b/icu4j/main/shared/data/icudata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2cb8f12bbfbffe8a36d10f9d227668fb5468ccee6380b990d41cfa81e34ef2e0 -size 12508534 +oid sha256:70c249360d5cc010c75203f5add8040cbcc4f33229e1d82d34b6185d69832143 +size 12510210 diff --git a/icu4j/main/shared/data/icutzdata.jar b/icu4j/main/shared/data/icutzdata.jar index 5aed7afb443..8b02fe62204 100755 --- a/icu4j/main/shared/data/icutzdata.jar +++ b/icu4j/main/shared/data/icutzdata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c2fa72ee8523fcb52b31b81106e399e6caecb1e51167f84b31ba96670e15efac +oid sha256:93a0bf4221a173b33aeda78f4646092caad816a6832310a89278de249ec18634 size 92857 diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java index 333da86099c..fa72431dd07 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java @@ -651,54 +651,68 @@ public class RBBITestMonkey extends TestFmwk { int fOrigPositions; + // XUnicodeSet is like UnicodeSet, except that the method contains(int codePoint) does not + // throw exceptions on out-of-range codePoints. This matches ICU4C behavior. + // The LineMonkey test (ported from ICU4C) relies on this behavior, it uses a value of -1 + // to represent a non-codepoint that is not included in any of the property sets. + // This happens for rule 30a. + + class XUnicodeSet extends UnicodeSet { + XUnicodeSet(String pattern) { super(pattern); } + @Override + public boolean contains(int codePoint) { + return codePoint < UnicodeSet.MIN_VALUE || codePoint > UnicodeSet.MAX_VALUE ? + false : super.contains(codePoint); + } + } RBBILineMonkey() { fCharProperty = UProperty.LINE_BREAK; fSets = new ArrayList(); - fBK = new UnicodeSet("[\\p{Line_Break=BK}]"); - fCR = new UnicodeSet("[\\p{Line_break=CR}]"); - fLF = new UnicodeSet("[\\p{Line_break=LF}]"); - fCM = new UnicodeSet("[\\p{Line_break=CM}]"); - fNL = new UnicodeSet("[\\p{Line_break=NL}]"); - fSG = new UnicodeSet("[\\ud800-\\udfff]"); - fWJ = new UnicodeSet("[\\p{Line_break=WJ}]"); - fZW = new UnicodeSet("[\\p{Line_break=ZW}]"); - fGL = new UnicodeSet("[\\p{Line_break=GL}]"); - fSP = new UnicodeSet("[\\p{Line_break=SP}]"); - fB2 = new UnicodeSet("[\\p{Line_break=B2}]"); - fBA = new UnicodeSet("[\\p{Line_break=BA}]"); - fBB = new UnicodeSet("[\\p{Line_break=BB}]"); - fHY = new UnicodeSet("[\\p{Line_break=HY}]"); - fCB = new UnicodeSet("[\\p{Line_break=CB}]"); - fCL = new UnicodeSet("[\\p{Line_break=CL}]"); - fCP = new UnicodeSet("[\\p{Line_break=CP}]"); - fEX = new UnicodeSet("[\\p{Line_break=EX}]"); - fIN = new UnicodeSet("[\\p{Line_break=IN}]"); - fNS = new UnicodeSet("[\\p{Line_break=NS}]"); - fOP = new UnicodeSet("[\\p{Line_break=OP}]"); - fQU = new UnicodeSet("[\\p{Line_break=QU}]"); - fIS = new UnicodeSet("[\\p{Line_break=IS}]"); - fNU = new UnicodeSet("[\\p{Line_break=NU}]"); - fPO = new UnicodeSet("[\\p{Line_break=PO}]"); - fPR = new UnicodeSet("[\\p{Line_break=PR}]"); - fSY = new UnicodeSet("[\\p{Line_break=SY}]"); - fAI = new UnicodeSet("[\\p{Line_break=AI}]"); - fAL = new UnicodeSet("[\\p{Line_break=AL}]"); - fCJ = new UnicodeSet("[\\p{Line_break=CJ}]"); - fH2 = new UnicodeSet("[\\p{Line_break=H2}]"); - fH3 = new UnicodeSet("[\\p{Line_break=H3}]"); - fHL = new UnicodeSet("[\\p{Line_break=HL}]"); - fID = new UnicodeSet("[\\p{Line_break=ID}]"); - fJL = new UnicodeSet("[\\p{Line_break=JL}]"); - fJV = new UnicodeSet("[\\p{Line_break=JV}]"); - fJT = new UnicodeSet("[\\p{Line_break=JT}]"); - fRI = new UnicodeSet("[\\p{Line_break=RI}]"); - fXX = new UnicodeSet("[\\p{Line_break=XX}]"); - fEB = new UnicodeSet("[\\p{Line_break=EB}]"); - fEM = new UnicodeSet("[\\p{Line_break=EM}]"); - fZWJ = new UnicodeSet("[\\p{Line_break=ZWJ}]"); + fBK = new XUnicodeSet("[\\p{Line_Break=BK}]"); + fCR = new XUnicodeSet("[\\p{Line_break=CR}]"); + fLF = new XUnicodeSet("[\\p{Line_break=LF}]"); + fCM = new XUnicodeSet("[\\p{Line_break=CM}]"); + fNL = new XUnicodeSet("[\\p{Line_break=NL}]"); + fSG = new XUnicodeSet("[\\ud800-\\udfff]"); + fWJ = new XUnicodeSet("[\\p{Line_break=WJ}]"); + fZW = new XUnicodeSet("[\\p{Line_break=ZW}]"); + fGL = new XUnicodeSet("[\\p{Line_break=GL}]"); + fSP = new XUnicodeSet("[\\p{Line_break=SP}]"); + fB2 = new XUnicodeSet("[\\p{Line_break=B2}]"); + fBA = new XUnicodeSet("[\\p{Line_break=BA}]"); + fBB = new XUnicodeSet("[\\p{Line_break=BB}]"); + fHY = new XUnicodeSet("[\\p{Line_break=HY}]"); + fCB = new XUnicodeSet("[\\p{Line_break=CB}]"); + fCL = new XUnicodeSet("[\\p{Line_break=CL}]"); + fCP = new XUnicodeSet("[\\p{Line_break=CP}]"); + fEX = new XUnicodeSet("[\\p{Line_break=EX}]"); + fIN = new XUnicodeSet("[\\p{Line_break=IN}]"); + fNS = new XUnicodeSet("[\\p{Line_break=NS}]"); + fOP = new XUnicodeSet("[\\p{Line_break=OP}]"); + fQU = new XUnicodeSet("[\\p{Line_break=QU}]"); + fIS = new XUnicodeSet("[\\p{Line_break=IS}]"); + fNU = new XUnicodeSet("[\\p{Line_break=NU}]"); + fPO = new XUnicodeSet("[\\p{Line_break=PO}]"); + fPR = new XUnicodeSet("[\\p{Line_break=PR}]"); + fSY = new XUnicodeSet("[\\p{Line_break=SY}]"); + fAI = new XUnicodeSet("[\\p{Line_break=AI}]"); + fAL = new XUnicodeSet("[\\p{Line_break=AL}]"); + fCJ = new XUnicodeSet("[\\p{Line_break=CJ}]"); + fH2 = new XUnicodeSet("[\\p{Line_break=H2}]"); + fH3 = new XUnicodeSet("[\\p{Line_break=H3}]"); + fHL = new XUnicodeSet("[\\p{Line_break=HL}]"); + fID = new XUnicodeSet("[\\p{Line_break=ID}]"); + fJL = new XUnicodeSet("[\\p{Line_break=JL}]"); + fJV = new XUnicodeSet("[\\p{Line_break=JV}]"); + fJT = new XUnicodeSet("[\\p{Line_break=JT}]"); + fRI = new XUnicodeSet("[\\p{Line_break=RI}]"); + fXX = new XUnicodeSet("[\\p{Line_break=XX}]"); + fEB = new XUnicodeSet("[\\p{Line_break=EB}]"); + fEM = new XUnicodeSet("[\\p{Line_break=EM}]"); + fZWJ = new XUnicodeSet("[\\p{Line_break=ZWJ}]"); // Remove dictionary characters. // The monkey test reference implementation of line break does not replicate the dictionary behavior, @@ -886,7 +900,13 @@ public class RBBITestMonkey extends TestFmwk { } // LB 8 Break after zero width space - if (fZW.contains(prevChar)) { + // ZW SP* ÷ + // Scan backwards from prevChar for SP* ZW + tPos = prevPos; + while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) { + tPos = moveIndex32(fText, tPos, -1); + } + if (fZW.contains(UTF16.charAt(fText, tPos))) { break; } @@ -1166,12 +1186,16 @@ public class RBBITestMonkey extends TestFmwk { } // LB 30a Break between pairs of Regional Indicators. - // RI RI RI - // RI x RI + // RI RI ÷ RI + // RI x RI if (fRI.contains(prevCharX2) && fRI.contains(prevChar) && fRI.contains(thisChar)) { break; } if (fRI.contains(prevChar) && fRI.contains(thisChar)) { + // Two Regional Indicators have been paired. + // Over-write the trailing one (thisChar) to prevent it from forming another pair with a + // following RI. This is a hack. + thisChar = -1; continue; } diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line.txt index b478bf9b8e4..d0f9abe88a1 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line.txt @@ -25,7 +25,7 @@ B2 = [:LineBreak = Break_Both:]; CB = [:LineBreak = Contingent_Break:]; CJ = [:LineBreak = Conditional_Japanese_Starter:]; CL = [:LineBreak = Close_Punctuation:]; -CM_ = [:LineBreak = Combining_Mark:]; +CMS = [:LineBreak = Combining_Mark:]; CP = [:LineBreak = Close_Parenthesis:]; CR = [:LineBreak = Carriage_Return:]; EB = [:LineBreak = EB:]; @@ -66,7 +66,7 @@ dictionary = SA; # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly # list it in the numerous rules that use CM. -CM = [CM_ ZWJ]; +CM = [CMS ZWJ]; LB4: BK ÷; LB5: CR LF; @@ -86,14 +86,16 @@ LB15: QU CM* SP* OP; LB16: (CL | CP)CM* SP* NS; LB17: B2 CM* SP* B2; +# LB8, break after ZW SP*, precedes LB7 because they will both match the sequences like ZW SP, +# and LB8 should take precedence. + +LB8: ZW SP* ÷ [^ZW SP BK CR LF NL]; + +# LB7 Do not break before spaces or zero width space. + LB7.1: [^ZW SP] CM* [SP ZW]; LB7.2: [ZW SP] [SP ZW]; -# LB8, ICU differs from UAX-14, -# ICU: ZW ÷; -# UAX 14: ZW SP* ÷; -LB8: ZW ÷; - # LB8a # ZWJ x # Don't match a CM on the right - let other rules pick up CM sequences, where @@ -188,8 +190,8 @@ LB30.1: (AL | CM | HL | NU) CM* OP; LB30.2: CP CM* (AL | HL | NU); # LB30a keep pairs of RI together. -LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; -LB30a.2: RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; +LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; +LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?; # LB30b Do not break between Emoji Base and Emoji Modifier diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt index 1ef43f9fa27..2384fa296c8 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt @@ -32,7 +32,7 @@ B2 = [:LineBreak = Break_Both:]; CB = [:LineBreak = Contingent_Break:]; CJ = [:LineBreak = Conditional_Japanese_Starter:]; CL = [:LineBreak = Close_Punctuation:]; -CM_ = [:LineBreak = Combining_Mark:]; +CMS = [:LineBreak = Combining_Mark:]; CP = [:LineBreak = Close_Parenthesis:]; CR = [:LineBreak = Carriage_Return:]; EB = [:LineBreak = EB:]; @@ -74,7 +74,7 @@ dictionary = SA; # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly # list it in the numerous rules that use CM. -CM = [CM_ ZWJ]; +CM = [CMS ZWJ]; LB4: BK ÷; LB5: CR LF; @@ -94,14 +94,16 @@ LB15: QU CM* SP* OP; LB16: (CL | CP)CM* SP* NS; LB17: B2 CM* SP* B2; +# LB8, break after ZW SP*, precedes LB7 because they will both match the sequences like ZW SP, +# and LB8 should take precedence. + +LB8: ZW SP* ÷ [^ZW SP BK CR LF NL]; + +# LB7 Do not break before spaces or zero width space. + LB7.1: [^ZW SP] CM* [SP ZW]; LB7.2: [ZW SP] [SP ZW]; -# LB8, ICU differs from UAX-14, -# ICU: ZW ÷; -# UAX 14: ZW SP* ÷; -LB8: ZW ÷; - # LB8a # ZWJ x # Don't match a CM on the right - let other rules pick up CM sequences, where @@ -196,8 +198,8 @@ LB30.1: (AL | CM | HL | NU) CM* OP; LB30.2: CP CM* (AL | HL | NU); # LB30a keep pairs of RI together. -LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; -LB30a.2: RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; +LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; +LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?; # LB30b Do not break between Emoji Base and Emoji Modifier diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt index 4227de8d3bb..8b92561dbd8 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt @@ -46,7 +46,7 @@ B2 = [:LineBreak = Break_Both:]; CB = [:LineBreak = Contingent_Break:]; CJ = [:LineBreak = Conditional_Japanese_Starter:]; CL = [:LineBreak = Close_Punctuation:]; -CM_ = [:LineBreak = Combining_Mark:]; +CMS = [:LineBreak = Combining_Mark:]; CP = [:LineBreak = Close_Parenthesis:]; CR = [:LineBreak = Carriage_Return:]; EB = [:LineBreak = EB:]; @@ -91,7 +91,7 @@ dictionary = SA; # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly # list it in the numerous rules that use CM. -CM = [CM_ ZWJ]; +CM = [CMS ZWJ]; LB4: BK ÷; LB5: CR LF; @@ -111,14 +111,16 @@ LB15: QU CM* SP* OP; LB16: (CL | CP)CM* SP* NS; LB17: B2 CM* SP* B2; +# LB8, break after ZW SP*, precedes LB7 because they will both match the sequences like ZW SP, +# and LB8 should take precedence. + +LB8: ZW SP* ÷ [^ZW SP BK CR LF NL]; + +# LB7 Do not break before spaces or zero width space. + LB7.1: [^ZW SP] CM* [SP ZW]; LB7.2: [ZW SP] [SP ZW]; -# LB8, ICU differs from UAX-14, -# ICU: ZW ÷; -# UAX 14: ZW SP* ÷; -LB8: ZW ÷; - # LB8a # ZWJ x # Don't match a CM on the right - let other rules pick up CM sequences, where @@ -217,8 +219,8 @@ LB30.1: (AL | CM | HL | NU) CM* OP; LB30.2: CP CM* (AL | HL | NU); # LB30a keep pairs of RI together. -LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; -LB30a.2: RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; +LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; +LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?; # LB30b Do not break between Emoji Base and Emoji Modifier diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt index 5952d5bc449..65804d83f9b 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt @@ -39,7 +39,7 @@ B2 = [:LineBreak = Break_Both:]; CB = [:LineBreak = Contingent_Break:]; CJ = [:LineBreak = Conditional_Japanese_Starter:]; CL = [:LineBreak = Close_Punctuation:]; -CM_ = [:LineBreak = Combining_Mark:]; +CMS = [:LineBreak = Combining_Mark:]; CP = [:LineBreak = Close_Parenthesis:]; CR = [:LineBreak = Carriage_Return:]; EB = [:LineBreak = EB:]; @@ -80,7 +80,7 @@ dictionary = SA; # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly # list it in the numerous rules that use CM. -CM = [CM_ ZWJ]; +CM = [CMS ZWJ]; LB4: BK ÷; LB5: CR LF; @@ -100,14 +100,16 @@ LB15: QU CM* SP* OP; LB16: (CL | CP)CM* SP* NS; LB17: B2 CM* SP* B2; +# LB8, break after ZW SP*, precedes LB7 because they will both match the sequences like ZW SP, +# and LB8 should take precedence. + +LB8: ZW SP* ÷ [^ZW SP BK CR LF NL]; + +# LB7 Do not break before spaces or zero width space. + LB7.1: [^ZW SP] CM* [SP ZW]; LB7.2: [ZW SP] [SP ZW]; -# LB8, ICU differs from UAX-14, -# ICU: ZW ÷; -# UAX 14: ZW SP* ÷; -LB8: ZW ÷; - # LB8a # ZWJ x # Don't match a CM on the right - let other rules pick up CM sequences, where @@ -202,8 +204,8 @@ LB30.1: (AL | CM | HL | NU) CM* OP; LB30.2: CP CM* (AL | HL | NU); # LB30a keep pairs of RI together. -LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; -LB30a.2: RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; +LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; +LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?; # LB30b Do not break between Emoji Base and Emoji Modifier diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt index 0a1772e5d35..b50219282b2 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt @@ -40,7 +40,7 @@ B2 = [:LineBreak = Break_Both:]; CB = [:LineBreak = Contingent_Break:]; CJ = [:LineBreak = Conditional_Japanese_Starter:]; CL = [:LineBreak = Close_Punctuation:]; -CM_ = [:LineBreak = Combining_Mark:]; +CMS = [:LineBreak = Combining_Mark:]; CP = [:LineBreak = Close_Parenthesis:]; CR = [:LineBreak = Carriage_Return:]; EB = [:LineBreak = EB:]; @@ -82,7 +82,7 @@ dictionary = SA; # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly # list it in the numerous rules that use CM. -CM = [CM_ ZWJ]; +CM = [CMS ZWJ]; LB4: BK ÷; LB5: CR LF; @@ -105,14 +105,16 @@ LB15: QU CM* SP* OP; LB16: (CL | CP)CM* SP* NS; LB17: B2 CM* SP* B2; +# LB8, break after ZW SP*, precedes LB7 because they will both match the sequences like ZW SP, +# and LB8 should take precedence. + +LB8: ZW SP* ÷ [^ZW SP BK CR LF NL]; + +# LB7 Do not break before spaces or zero width space. + LB7.1: [^ZW SP] CM* [SP ZW]; LB7.2: [ZW SP] [SP ZW]; -# LB8, ICU differs from UAX-14, -# ICU: ZW ÷; -# UAX 14: ZW SP* ÷; -LB8: ZW ÷; - # LB8a # ZWJ x # Don't match a CM on the right - let other rules pick up CM sequences, where @@ -211,8 +213,8 @@ LB30.1: (AL | CM | HL | NU) CM* OP; LB30.2: CP CM* (AL | HL | NU); # LB30a keep pairs of RI together. -LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; -LB30a.2: RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; +LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; +LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?; # LB30b Do not break between Emoji Base and Emoji Modifier -- 2.40.0