Includes all line break tailorings.
Corresponding updates to monkey test rules.
State table builder: fix a missed table optimization that was uncovered by the new rule.
}
void RBBIRuleBuilder::optimizeTables() {
+ bool didSomething;
+ do {
+ didSomething = false;
+
+ // Begin looking for duplicates with char class 3.
+ // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
+ // and should not have other categories merged into them.
+ IntPair duplPair = {3, 0};
+ while (fForwardTable->findDuplCharClassFrom(&duplPair)) {
+ fSetBuilder->mergeCategories(duplPair);
+ fForwardTable->removeColumn(duplPair.second);
+ didSomething = true;
+ }
- // Begin looking for duplicates with char class 3.
- // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
- // and should not have other categories merged into them.
- IntPair duplPair = {3, 0};
-
- while (fForwardTable->findDuplCharClassFrom(&duplPair)) {
- fSetBuilder->mergeCategories(duplPair);
- fForwardTable->removeColumn(duplPair.second);
- }
- fForwardTable->removeDuplicateStates();
+ while (fForwardTable->removeDuplicateStates() > 0) {
+ didSomething = true;
+ }
+ } while (didSomething);
}
U_NAMESPACE_END
/*
* RemoveDuplicateStates
*/
-void RBBITableBuilder::removeDuplicateStates() {
+int32_t RBBITableBuilder::removeDuplicateStates() {
IntPair dupls = {3, 0};
+ int32_t numStatesRemoved = 0;
+
while (findDuplicateState(&dupls)) {
// printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
removeState(dupls);
+ ++numStatesRemoved;
}
+ return numStatesRemoved;
}
*/
void removeColumn(int32_t column);
- /** Check for, and remove dupicate states (table rows). */
- void removeDuplicateStates();
+ /**
+ * Check for, and remove duplicate states (table rows).
+ * @return the number of states removed.
+ */
+ int32_t removeDuplicateStates();
/** Build the safe reverse table from the already-constructed forward table. */
void buildSafeReverseTable(UErrorCode &status);
#
# LB 8 Break after zero width space
-# TODO: ZW SP* <break>
-# An engine change is required to write the reverse rule for this.
-# For now, leave the Unicode 5.2 rule, ZW <break>
+# ZW SP* ÷
#
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+$ZW $SP* / [^$SP $ZW $LB4Breaks];
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
#
#
# LB 8 Break after zero width space
-# TODO: ZW SP* <break>
-# An engine change is required to write the reverse rule for this.
-# For now, leave the Unicode 5.2 rule, ZW <break>
+# ZW SP* ÷
#
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+$ZW $SP* / [^$SP $ZW $LB4Breaks];
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
#
#
# LB 8 Break after zero width space
-# TODO: ZW SP* <break>
-# An engine change is required to write the reverse rule for this.
-# For now, leave the Unicode 5.2 rule, ZW <break>
+# ZW SP* ÷
#
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+$ZW $SP* / [^$SP $ZW $LB4Breaks];
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
#
#
# LB 8 Break after zero width space
-# TODO: ZW SP* <break>
-# An engine change is required to write the reverse rule for this.
-# For now, leave the Unicode 5.2 rule, ZW <break>
+# ZW SP* ÷
#
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+$ZW $SP* / [^$SP $ZW $LB4Breaks];
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
#
#
# LB 8 Break after zero width space
-# TODO: ZW SP* <break>
-# An engine change is required to write the reverse rule for this.
-# For now, leave the Unicode 5.2 rule, ZW <break>
+# ZW SP* ÷
#
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+$ZW $SP* / [^$SP $ZW $LB4Breaks];
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
#
#
# LB 8 Break after zero width space
-# TODO: ZW SP* <break>
-# An engine change is required to write the reverse rule for this.
-# For now, leave the Unicode 5.2 rule, ZW <break>
+# ZW SP* ÷
#
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+$ZW $SP* / [^$SP $ZW $LB4Breaks];
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
#
#
# LB 8 Break after zero width space
-# TODO: ZW SP* <break>
-# An engine change is required to write the reverse rule for this.
-# For now, leave the Unicode 5.2 rule, ZW <break>
+# ZW SP* ÷
#
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+$ZW $SP* / [^$SP $ZW $LB4Breaks];
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
#
#
# LB 8 Break after zero width space
-# TODO: ZW SP* <break>
-# An engine change is required to write the reverse rule for this.
-# For now, leave the Unicode 5.2 rule, ZW <break>
+# ZW SP* ÷
#
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
+$ZW $SP* / [^$SP $ZW $LB4Breaks];
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
#
// Check for test cases from the Unicode test data files that are known to fail
-// and should be skipped because ICU is not yet able to fully implement the spec.
-// See ticket #7270.
+// and should be skipped as known issues because ICU does not fully implement
+// the Unicode specifications.
+//
+// Test cases are identified by the test data sequence, which tends to be more stable
+// across Unicode versions than the test file line numbers.
+//
+// The test case with ticket "10666" is a dummy, included as an example.
UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
static struct TestCase {
+ const char *fTicketNum;
const char *fFileName;
const UChar *fString;
- } badTestCases[] = { // Line Numbers from Unicode 7.0.0 file.
- {"LineBreakTest.txt", u"\u200B\u0020}"}, // Line 5198
- {"LineBreakTest.txt", u"\u200B\u0020)"}, // Line 5202
- {"LineBreakTest.txt", u"\u200B\u0020!"}, // Line 5214
- {"LineBreakTest.txt", u"\u200B\u0020,"}, // Line 5246
- {"LineBreakTest.txt", u"\u200B\u0020/"}, // Line 5298
- {"LineBreakTest.txt", u"\u200B\u0020\u2060"}, // Line 5302
- // Line Numbers from pre-release verion of GraphemeBreakTest-10.0.0.txt
- {"GraphemeBreakTest.txt", u"\u200D\u2640"}, // Line 656, old GB 11 test ZWJ x GAZ
- {"GraphemeBreakTest.txt", u"\u200D\U0001F466"}, // Line 658, old GB 11 test ZWJ x EBG
- {"GraphemeBreakTest.txt", u"\u200D\U0001F466\U0001F3FB"}, // Line 842, old GB 11 test ZWJ x EBG x EModifier
-
- // Line Numbers from pre-release verion of WordBreakTest-10.0.0.txt
- {"WordBreakTest.txt", u"\u200D\u261D"}, // Line 1356, ZWJ x EmojiNRK
- {"WordBreakTest.txt", u"\u200D\U0001F3FB"}, // Line 1358, ZWJ x EmojiNRK
+ } badTestCases[] = {
+ {"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"} // Fake example, for illustration.
};
for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
const TestCase &badCase = badTestCases[n];
if (!strcmp(fileName, badCase.fFileName) &&
testCase == UnicodeString(badCase.fString)) {
- return logKnownIssue("7270");
+ return logKnownIssue(badCase.fTicketNum);
}
}
return FALSE;
UnicodeSet *fXX;
UnicodeSet *fEB;
UnicodeSet *fEM;
- UnicodeSet *fZJ;
+ UnicodeSet *fZWJ;
BreakIterator *fCharBI;
const UnicodeString *fText;
fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
fEB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EB}]"), status);
fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
- fZJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
+ fZWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
if (U_FAILURE(status)) {
deferredStatus = status;
fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS.
- fCM->addAll(*fZJ); // ZWJ behaves as a CM.
+ fCM->addAll(*fZWJ); // ZWJ behaves as a CM.
fSets->addElement(fBK, status);
fSets->addElement(fCR, status);
fSets->addElement(fSG, status);
fSets->addElement(fEB, status);
fSets->addElement(fEM, status);
- fSets->addElement(fZJ, status);
+ fSets->addElement(fZWJ, status);
const char *rules =
}
// LB 8 Break after zero width space
- if (fZW->contains(prevChar)) {
+ // ZW SP* ÷
+ // Scan backwards from prevChar for SP* ZW
+ tPos = prevPos;
+ while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
+ tPos = fText->moveIndex32(tPos, -1);
+ }
+ if (fZW->contains(fText->char32At(tPos))) {
break;
}
{
int32_t prevIdx = fText->moveIndex32(pos, -1);
UChar32 prevC = fText->char32At(prevIdx);
- if (fZJ->contains(prevC)) {
+ if (fZWJ->contains(prevC)) {
continue;
}
}
continue;
}
- // LB30a RI RI <break> RI
- // RI x RI
+ // LB30a RI RI ÷ RI
+ // RI x RI
if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
break;
}
if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
+ // Two Regional Indicators have been paired.
+ // Over-write the trailing one (thisChar) to prevent it from forming another pair with a
+ // following RI. This is a hack.
+ thisChar = -1;
continue;
}
delete fXX;
delete fEB;
delete fEM;
- delete fZJ;
+ delete fZWJ;
delete fCharBI;
delete fNumberMatcher;
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
-CM = [:LineBreak = Combining_Mark:];
+CMS = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [:LineBreak = EB:];
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
-CM = [CM ZWJ];
+CM = [CMS ZWJ];
LB4: BK ÷;
LB5: CR LF;
LB16: (CL | CP)CM* SP* NS;
LB17: B2 CM* SP* B2;
+# LB8, break after ZW SP*, precedes LB7 because they will both match sequences like ZW SP,
+# and LB8 should take precedence.
+
+LB8: ZW SP* ÷ [^ZW SP BK CR LF NL];
+
+# LB7 Do not break before spaces or zero width space.
+
LB7.1: [^ZW SP] CM* [SP ZW];
LB7.2: [ZW SP] [SP ZW];
-# LB8, ICU differs from UAX-14,
-# ICU: ZW ÷;
-# UAX 14: ZW SP* ÷;
-LB8: ZW ÷;
-
# LB8a
# ZWJ x
# Don't match a CM on the right - let other rules pick up CM sequences, where
LB30.2: CP CM* (AL | HL | NU);
# LB30a keep pairs of RI together.
-LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
-LB30a.2: RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
-CM = [:LineBreak = Combining_Mark:];
+CMS = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [:LineBreak = EB:];
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
-CM = [CM ZWJ];
+CM = [CMS ZWJ];
LB4: BK ÷;
LB5: CR LF;
LB16: (CL | CP)CM* SP* NS;
LB17: B2 CM* SP* B2;
+# LB8, break after ZW SP*, precedes LB7 because they will both match sequences like ZW SP,
+# and LB8 should take precedence.
+
+LB8: ZW SP* ÷ [^ZW SP BK CR LF NL];
+
+# LB7 Do not break before spaces or zero width space.
+
LB7.1: [^ZW SP] CM* [SP ZW];
LB7.2: [ZW SP] [SP ZW];
-# LB8, ICU differs from UAX-14,
-# ICU: ZW ÷;
-# UAX 14: ZW SP* ÷;
-LB8: ZW ÷;
-
# LB8a
# ZWJ x
# Don't match a CM on the right - let other rules pick up CM sequences, where
LB30.2: CP CM* (AL | HL | NU);
# LB30a keep pairs of RI together.
-LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
-LB30a.2: RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
-CM = [:LineBreak = Combining_Mark:];
+CMS = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [:LineBreak = EB:];
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
-CM = [CM ZWJ];
+CM = [CMS ZWJ];
LB4: BK ÷;
LB5: CR LF;
LB16: (CL | CP)CM* SP* NS;
LB17: B2 CM* SP* B2;
+# LB8, break after ZW SP*, precedes LB7 because they will both match sequences like ZW SP,
+# and LB8 should take precedence.
+
+LB8: ZW SP* ÷ [^ZW SP BK CR LF NL];
+
+# LB7 Do not break before spaces or zero width space.
+
LB7.1: [^ZW SP] CM* [SP ZW];
LB7.2: [ZW SP] [SP ZW];
-# LB8, ICU differs from UAX-14,
-# ICU: ZW ÷;
-# UAX 14: ZW SP* ÷;
-LB8: ZW ÷;
-
# LB8a
# ZWJ x
# Don't match a CM on the right - let other rules pick up CM sequences, where
LB30.2: CP CM* (AL | HL | NU);
# LB30a keep pairs of RI together.
-LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
-LB30a.2: RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
-CM = [:LineBreak = Combining_Mark:];
+CMS = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [:LineBreak = EB:];
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
-CM = [CM ZWJ];
+CM = [CMS ZWJ];
LB4: BK ÷;
LB5: CR LF;
LB16: (CL | CP)CM* SP* NS;
LB17: B2 CM* SP* B2;
+# LB8, break after ZW SP*, precedes LB7 because they will both match sequences like ZW SP,
+# and LB8 should take precedence.
+
+LB8: ZW SP* ÷ [^ZW SP BK CR LF NL];
+
+# LB7 Do not break before spaces or zero width space.
+
LB7.1: [^ZW SP] CM* [SP ZW];
LB7.2: [ZW SP] [SP ZW];
-# LB8, ICU differs from UAX-14,
-# ICU: ZW ÷;
-# UAX 14: ZW SP* ÷;
-LB8: ZW ÷;
-
# LB8a
# ZWJ x
# Don't match a CM on the right - let other rules pick up CM sequences, where
LB30.2: CP CM* (AL | HL | NU);
# LB30a keep pairs of RI together.
-LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
-LB30a.2: RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
-CM = [:LineBreak = Combining_Mark:];
+CMS = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [:LineBreak = EB:];
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
-CM = [CM ZWJ];
+CM = [CMS ZWJ];
LB4: BK ÷;
LB5: CR LF;
LB16: (CL | CP)CM* SP* NS;
LB17: B2 CM* SP* B2;
+# LB8, break after ZW SP*, precedes LB7 because they will both match sequences like ZW SP,
+# and LB8 should take precedence.
+
+LB8: ZW SP* ÷ [^ZW SP BK CR LF NL];
+
+# LB7 Do not break before spaces or zero width space.
+
LB7.1: [^ZW SP] CM* [SP ZW];
LB7.2: [ZW SP] [SP ZW];
-# LB8, ICU differs from UAX-14,
-# ICU: ZW ÷;
-# UAX 14: ZW SP* ÷;
-LB8: ZW ÷;
-
# LB8a
# ZWJ x
# Don't match a CM on the right - let other rules pick up CM sequences, where
LB30.2: CP CM* (AL | HL | NU);
# LB30a keep pairs of RI together.
-LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
-LB30a.2: RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
}
void optimizeTables() {
- // Begin looking for duplicates with char class 3.
- // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
- // and should not have other categories merged into them.
- IntPair duplPair = new IntPair(3, 0);
- while (fForwardTable.findDuplCharClassFrom(duplPair)) {
- fSetBuilder.mergeCategories(duplPair);
- fForwardTable.removeColumn(duplPair.second);
- }
- fForwardTable.removeDuplicateStates();
+ boolean didSomething;
+ do {
+ didSomething = false;
+ // Begin looking for duplicates with char class 3.
+ // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
+ // and should not have other categories merged into them.
+ IntPair duplPair = new IntPair(3, 0);
+ while (fForwardTable.findDuplCharClassFrom(duplPair)) {
+ fSetBuilder.mergeCategories(duplPair);
+ fForwardTable.removeColumn(duplPair.second);
+ didSomething = true;
+ }
+ while (fForwardTable.removeDuplicateStates() > 0) {
+ didSomething = true;
+ };
+ } while (didSomething);
}
}
/**
* Check for, and remove duplicate states (table rows).
+ * @return the number of states removed.
* @internal
*/
- void removeDuplicateStates() {
+ int removeDuplicateStates() {
IntPair dupls = new IntPair(3, 0);
+ int numStatesRemoved = 0;
+
while (findDuplicateState(dupls)) {
// System.out.printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
removeState(dupls);
+ ++numStatesRemoved;
}
+ return numStatesRemoved;
}
version https://git-lfs.github.com/spec/v1
-oid sha256:2cb8f12bbfbffe8a36d10f9d227668fb5468ccee6380b990d41cfa81e34ef2e0
-size 12508534
+oid sha256:70c249360d5cc010c75203f5add8040cbcc4f33229e1d82d34b6185d69832143
+size 12510210
version https://git-lfs.github.com/spec/v1
-oid sha256:c2fa72ee8523fcb52b31b81106e399e6caecb1e51167f84b31ba96670e15efac
+oid sha256:93a0bf4221a173b33aeda78f4646092caad816a6832310a89278de249ec18634
size 92857
int fOrigPositions;
+ // XUnicodeSet is like UnicodeSet, except that the method contains(int codePoint) does not
+ // throw exceptions on out-of-range codePoints. This matches ICU4C behavior.
+ // The LineMonkey test (ported from ICU4C) relies on this behavior; it uses a value of -1
+ // to represent a non-codepoint that is not included in any of the property sets.
+ // This happens for rule 30a.
+
+ class XUnicodeSet extends UnicodeSet {
+ XUnicodeSet(String pattern) { super(pattern); }
+ @Override
+ public boolean contains(int codePoint) {
+ return codePoint < UnicodeSet.MIN_VALUE || codePoint > UnicodeSet.MAX_VALUE ?
+ false : super.contains(codePoint);
+ }
+ }
RBBILineMonkey()
{
fCharProperty = UProperty.LINE_BREAK;
fSets = new ArrayList();
- fBK = new UnicodeSet("[\\p{Line_Break=BK}]");
- fCR = new UnicodeSet("[\\p{Line_break=CR}]");
- fLF = new UnicodeSet("[\\p{Line_break=LF}]");
- fCM = new UnicodeSet("[\\p{Line_break=CM}]");
- fNL = new UnicodeSet("[\\p{Line_break=NL}]");
- fSG = new UnicodeSet("[\\ud800-\\udfff]");
- fWJ = new UnicodeSet("[\\p{Line_break=WJ}]");
- fZW = new UnicodeSet("[\\p{Line_break=ZW}]");
- fGL = new UnicodeSet("[\\p{Line_break=GL}]");
- fSP = new UnicodeSet("[\\p{Line_break=SP}]");
- fB2 = new UnicodeSet("[\\p{Line_break=B2}]");
- fBA = new UnicodeSet("[\\p{Line_break=BA}]");
- fBB = new UnicodeSet("[\\p{Line_break=BB}]");
- fHY = new UnicodeSet("[\\p{Line_break=HY}]");
- fCB = new UnicodeSet("[\\p{Line_break=CB}]");
- fCL = new UnicodeSet("[\\p{Line_break=CL}]");
- fCP = new UnicodeSet("[\\p{Line_break=CP}]");
- fEX = new UnicodeSet("[\\p{Line_break=EX}]");
- fIN = new UnicodeSet("[\\p{Line_break=IN}]");
- fNS = new UnicodeSet("[\\p{Line_break=NS}]");
- fOP = new UnicodeSet("[\\p{Line_break=OP}]");
- fQU = new UnicodeSet("[\\p{Line_break=QU}]");
- fIS = new UnicodeSet("[\\p{Line_break=IS}]");
- fNU = new UnicodeSet("[\\p{Line_break=NU}]");
- fPO = new UnicodeSet("[\\p{Line_break=PO}]");
- fPR = new UnicodeSet("[\\p{Line_break=PR}]");
- fSY = new UnicodeSet("[\\p{Line_break=SY}]");
- fAI = new UnicodeSet("[\\p{Line_break=AI}]");
- fAL = new UnicodeSet("[\\p{Line_break=AL}]");
- fCJ = new UnicodeSet("[\\p{Line_break=CJ}]");
- fH2 = new UnicodeSet("[\\p{Line_break=H2}]");
- fH3 = new UnicodeSet("[\\p{Line_break=H3}]");
- fHL = new UnicodeSet("[\\p{Line_break=HL}]");
- fID = new UnicodeSet("[\\p{Line_break=ID}]");
- fJL = new UnicodeSet("[\\p{Line_break=JL}]");
- fJV = new UnicodeSet("[\\p{Line_break=JV}]");
- fJT = new UnicodeSet("[\\p{Line_break=JT}]");
- fRI = new UnicodeSet("[\\p{Line_break=RI}]");
- fXX = new UnicodeSet("[\\p{Line_break=XX}]");
- fEB = new UnicodeSet("[\\p{Line_break=EB}]");
- fEM = new UnicodeSet("[\\p{Line_break=EM}]");
- fZWJ = new UnicodeSet("[\\p{Line_break=ZWJ}]");
+ fBK = new XUnicodeSet("[\\p{Line_Break=BK}]");
+ fCR = new XUnicodeSet("[\\p{Line_break=CR}]");
+ fLF = new XUnicodeSet("[\\p{Line_break=LF}]");
+ fCM = new XUnicodeSet("[\\p{Line_break=CM}]");
+ fNL = new XUnicodeSet("[\\p{Line_break=NL}]");
+ fSG = new XUnicodeSet("[\\ud800-\\udfff]");
+ fWJ = new XUnicodeSet("[\\p{Line_break=WJ}]");
+ fZW = new XUnicodeSet("[\\p{Line_break=ZW}]");
+ fGL = new XUnicodeSet("[\\p{Line_break=GL}]");
+ fSP = new XUnicodeSet("[\\p{Line_break=SP}]");
+ fB2 = new XUnicodeSet("[\\p{Line_break=B2}]");
+ fBA = new XUnicodeSet("[\\p{Line_break=BA}]");
+ fBB = new XUnicodeSet("[\\p{Line_break=BB}]");
+ fHY = new XUnicodeSet("[\\p{Line_break=HY}]");
+ fCB = new XUnicodeSet("[\\p{Line_break=CB}]");
+ fCL = new XUnicodeSet("[\\p{Line_break=CL}]");
+ fCP = new XUnicodeSet("[\\p{Line_break=CP}]");
+ fEX = new XUnicodeSet("[\\p{Line_break=EX}]");
+ fIN = new XUnicodeSet("[\\p{Line_break=IN}]");
+ fNS = new XUnicodeSet("[\\p{Line_break=NS}]");
+ fOP = new XUnicodeSet("[\\p{Line_break=OP}]");
+ fQU = new XUnicodeSet("[\\p{Line_break=QU}]");
+ fIS = new XUnicodeSet("[\\p{Line_break=IS}]");
+ fNU = new XUnicodeSet("[\\p{Line_break=NU}]");
+ fPO = new XUnicodeSet("[\\p{Line_break=PO}]");
+ fPR = new XUnicodeSet("[\\p{Line_break=PR}]");
+ fSY = new XUnicodeSet("[\\p{Line_break=SY}]");
+ fAI = new XUnicodeSet("[\\p{Line_break=AI}]");
+ fAL = new XUnicodeSet("[\\p{Line_break=AL}]");
+ fCJ = new XUnicodeSet("[\\p{Line_break=CJ}]");
+ fH2 = new XUnicodeSet("[\\p{Line_break=H2}]");
+ fH3 = new XUnicodeSet("[\\p{Line_break=H3}]");
+ fHL = new XUnicodeSet("[\\p{Line_break=HL}]");
+ fID = new XUnicodeSet("[\\p{Line_break=ID}]");
+ fJL = new XUnicodeSet("[\\p{Line_break=JL}]");
+ fJV = new XUnicodeSet("[\\p{Line_break=JV}]");
+ fJT = new XUnicodeSet("[\\p{Line_break=JT}]");
+ fRI = new XUnicodeSet("[\\p{Line_break=RI}]");
+ fXX = new XUnicodeSet("[\\p{Line_break=XX}]");
+ fEB = new XUnicodeSet("[\\p{Line_break=EB}]");
+ fEM = new XUnicodeSet("[\\p{Line_break=EM}]");
+ fZWJ = new XUnicodeSet("[\\p{Line_break=ZWJ}]");
// Remove dictionary characters.
// The monkey test reference implementation of line break does not replicate the dictionary behavior,
}
// LB 8 Break after zero width space
- if (fZW.contains(prevChar)) {
+ // ZW SP* ÷
+ // Scan backwards from prevChar for SP* ZW
+ tPos = prevPos;
+ while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
+ tPos = moveIndex32(fText, tPos, -1);
+ }
+ if (fZW.contains(UTF16.charAt(fText, tPos))) {
break;
}
}
// LB 30a Break between pairs of Regional Indicators.
- // RI RI <break> RI
- // RI x RI
+ // RI RI ÷ RI
+ // RI x RI
if (fRI.contains(prevCharX2) && fRI.contains(prevChar) && fRI.contains(thisChar)) {
break;
}
if (fRI.contains(prevChar) && fRI.contains(thisChar)) {
+ // Two Regional Indicators have been paired.
+ // Over-write the trailing one (thisChar) to prevent it from forming another pair with a
+ // following RI. This is a hack.
+ thisChar = -1;
continue;
}
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
-CM_ = [:LineBreak = Combining_Mark:];
+CMS = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [:LineBreak = EB:];
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
-CM = [CM_ ZWJ];
+CM = [CMS ZWJ];
LB4: BK ÷;
LB5: CR LF;
LB16: (CL | CP)CM* SP* NS;
LB17: B2 CM* SP* B2;
+# LB8, break after ZW SP*, precedes LB7 because they will both match sequences like ZW SP,
+# and LB8 should take precedence.
+
+LB8: ZW SP* ÷ [^ZW SP BK CR LF NL];
+
+# LB7 Do not break before spaces or zero width space.
+
LB7.1: [^ZW SP] CM* [SP ZW];
LB7.2: [ZW SP] [SP ZW];
-# LB8, ICU differs from UAX-14,
-# ICU: ZW ÷;
-# UAX 14: ZW SP* ÷;
-LB8: ZW ÷;
-
# LB8a
# ZWJ x
# Don't match a CM on the right - let other rules pick up CM sequences, where
LB30.2: CP CM* (AL | HL | NU);
# LB30a keep pairs of RI together.
-LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
-LB30a.2: RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
-CM_ = [:LineBreak = Combining_Mark:];
+CMS = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [:LineBreak = EB:];
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
-CM = [CM_ ZWJ];
+CM = [CMS ZWJ];
LB4: BK ÷;
LB5: CR LF;
LB16: (CL | CP)CM* SP* NS;
LB17: B2 CM* SP* B2;
+# LB8, break after ZW SP*, precedes LB7 because they will both match sequences like ZW SP,
+# and LB8 should take precedence.
+
+LB8: ZW SP* ÷ [^ZW SP BK CR LF NL];
+
+# LB7 Do not break before spaces or zero width space.
+
LB7.1: [^ZW SP] CM* [SP ZW];
LB7.2: [ZW SP] [SP ZW];
-# LB8, ICU differs from UAX-14,
-# ICU: ZW ÷;
-# UAX 14: ZW SP* ÷;
-LB8: ZW ÷;
-
# LB8a
# ZWJ x
# Don't match a CM on the right - let other rules pick up CM sequences, where
LB30.2: CP CM* (AL | HL | NU);
# LB30a keep pairs of RI together.
-LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
-LB30a.2: RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
-CM_ = [:LineBreak = Combining_Mark:];
+CMS = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [:LineBreak = EB:];
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
-CM = [CM_ ZWJ];
+CM = [CMS ZWJ];
LB4: BK ÷;
LB5: CR LF;
LB16: (CL | CP)CM* SP* NS;
LB17: B2 CM* SP* B2;
+# LB8, break after ZW SP*, precedes LB7 because they will both match sequences like ZW SP,
+# and LB8 should take precedence.
+
+LB8: ZW SP* ÷ [^ZW SP BK CR LF NL];
+
+# LB7 Do not break before spaces or zero width space.
+
LB7.1: [^ZW SP] CM* [SP ZW];
LB7.2: [ZW SP] [SP ZW];
-# LB8, ICU differs from UAX-14,
-# ICU: ZW ÷;
-# UAX 14: ZW SP* ÷;
-LB8: ZW ÷;
-
# LB8a
# ZWJ x
# Don't match a CM on the right - let other rules pick up CM sequences, where
LB30.2: CP CM* (AL | HL | NU);
# LB30a keep pairs of RI together.
-LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
-LB30a.2: RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
-CM_ = [:LineBreak = Combining_Mark:];
+CMS = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [:LineBreak = EB:];
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
-CM = [CM_ ZWJ];
+CM = [CMS ZWJ];
LB4: BK ÷;
LB5: CR LF;
LB16: (CL | CP)CM* SP* NS;
LB17: B2 CM* SP* B2;
+# LB8, break after ZW SP*, precedes LB7 because they will both match sequences like ZW SP,
+# and LB8 should take precedence.
+
+LB8: ZW SP* ÷ [^ZW SP BK CR LF NL];
+
+# LB7 Do not break before spaces or zero width space.
+
LB7.1: [^ZW SP] CM* [SP ZW];
LB7.2: [ZW SP] [SP ZW];
-# LB8, ICU differs from UAX-14,
-# ICU: ZW ÷;
-# UAX 14: ZW SP* ÷;
-LB8: ZW ÷;
-
# LB8a
# ZWJ x
# Don't match a CM on the right - let other rules pick up CM sequences, where
LB30.2: CP CM* (AL | HL | NU);
# LB30a keep pairs of RI together.
-LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
-LB30a.2: RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
-CM_ = [:LineBreak = Combining_Mark:];
+CMS = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [:LineBreak = EB:];
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
-CM = [CM_ ZWJ];
+CM = [CMS ZWJ];
LB4: BK ÷;
LB5: CR LF;
LB16: (CL | CP)CM* SP* NS;
LB17: B2 CM* SP* B2;
+# LB8, break after ZW SP*, precedes LB7 because they will both match sequences like ZW SP,
+# and LB8 should take precedence.
+
+LB8: ZW SP* ÷ [^ZW SP BK CR LF NL];
+
+# LB7 Do not break before spaces or zero width space.
+
LB7.1: [^ZW SP] CM* [SP ZW];
LB7.2: [ZW SP] [SP ZW];
-# LB8, ICU differs from UAX-14,
-# ICU: ZW ÷;
-# UAX 14: ZW SP* ÷;
-LB8: ZW ÷;
-
# LB8a
# ZWJ x
# Don't match a CM on the right - let other rules pick up CM sequences, where
LB30.2: CP CM* (AL | HL | NU);
# LB30a keep pairs of RI together.
-LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
-LB30a.2: RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier