version https://git-lfs.github.com/spec/v1
-oid sha256:4bd87b532fc7ad362740dde413999961c7f372cbbce5fb54160d201b783fec33
-size 12503004
+oid sha256:e9ffd3c1d1fa55ec8819eee15483f7f1b4c4520a62a9ae3d0b3f971b9d06e18c
+size 12500142
version https://git-lfs.github.com/spec/v1
-oid sha256:5c25f29e8e9f5b7244a63ddc48dd7d69f56612b310e1de8351c9ea80a84afc6f
+oid sha256:88f00fc2ffbd0fcae8531cffdcc5b405876d3d89036e5cb8e077b0e817b88d9f
size 92867
UnicodeSet fEB;
UnicodeSet fEM;
UnicodeSet fZWJ;
- UnicodeSet fExtendedPict;
- UnicodeSet fEmojiNRK;
StringBuffer fText;
int fOrigPositions;
fEB = new UnicodeSet("[\\p{Line_break=EB}]");
fEM = new UnicodeSet("[\\p{Line_break=EM}]");
fZWJ = new UnicodeSet("[\\p{Line_break=ZWJ}]");
- fEmojiNRK = new UnicodeSet("[[\\p{Emoji}]-[\\p{Line_break=RI}*#0-9©®™〰〽]]");
- fExtendedPict = new UnicodeSet("[:Extended_Pictographic:]");
-
// Remove dictionary characters.
// The monkey test reference implementation of line break does not replicate the dictionary behavior,
fSets.add(fEB);
fSets.add(fEM);
fSets.add(fZWJ);
- fSets.add(fExtendedPict);
- fSets.add(fEmojiNRK);
}
@Override
break;
}
+ // LB 25 Numbers
+ // Move this test up, before LB8a, because numbers can match a longer sequence that would
+ // also match 8a. e.g. NU ZWJ IS PO (ZWJ acts like CM)
+ matchVals = LBNumberCheck(fText, prevPos, matchVals);
+ if (matchVals[0] != -1) {
+ // Matched a number. But could have been just a single digit, which would
+ // not represent a "no break here" between prevChar and thisChar
+ int numEndIdx = matchVals[1]; // idx of first char following num
+ if (numEndIdx > pos) {
+ // Number match includes at least the two chars being checked
+ if (numEndIdx > nextPos) {
+ // Number match includes additional chars. Update pos and nextPos
+ // so that next loop iteration will continue at the end of the number,
+ // checking for breaks between last char in number & whatever follows.
+ nextPos = numEndIdx;
+ pos = numEndIdx;
+ do {
+ pos = moveIndex32(fText, pos, -1);
+ thisChar = UTF16.charAt(fText, pos);
+ }
+ while (fCM.contains(thisChar));
+ }
+ continue;
+ }
+ }
+
// LB 8a: ZWJ x   (do not break after a ZWJ, whatever character follows)
// The monkey test's way of ignoring combining characters doesn't work
// for this rule. ZWJ is also a CM. Need to get the actual character
// preceding "thisChar", not ignoring combining marks, possibly ZWJ.
{
int prevC = fText.codePointBefore(pos);
- if (fZWJ.contains(prevC) && (fID.contains(thisChar) || fExtendedPict.contains(thisChar) || fEmojiNRK.contains(thisChar))) {
+ if (fZWJ.contains(prevC)) {
continue;
}
}
continue;
}
-
- // LB 25 Numbers
- matchVals = LBNumberCheck(fText, prevPos, matchVals);
- if (matchVals[0] != -1) {
- // Matched a number. But could have been just a single digit, which would
- // not represent a "no break here" between prevChar and thisChar
- int numEndIdx = matchVals[1]; // idx of first char following num
- if (numEndIdx > pos) {
- // Number match includes at least the two chars being checked
- if (numEndIdx > nextPos) {
- // Number match includes additional chars. Update pos and nextPos
- // so that next loop iteration will continue at the end of the number,
- // checking for breaks between last char in number & whatever follows.
- nextPos = numEndIdx;
- pos = numEndIdx;
- do {
- pos = moveIndex32(fText, pos, -1);
- thisChar = UTF16.charAt(fText, pos);
- }
- while (fCM.contains(thisChar));
- }
- continue;
- }
- }
-
+ // LB 25 Numbers match, moved up, before LB 8a.
// LB 26 Do not break Korean Syllables
if (fJL.contains(prevChar) && (fJL.contains(thisChar) ||
# file: line.txt
#
-# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
+# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
+# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
-CM = [:LineBreak = Combining_Mark:];
+CM_ = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [:LineBreak = EB:];
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
-EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
-Extended_Pict = [:ExtPict:];
-
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
-CM = [CM ZWJ];
+CM = [CM_ ZWJ];
LB4: BK ÷;
LB5: CR LF;
LB8: ZW ÷;
# LB8a
-# ZWJ x (ID | Extended_Pict | EmojiNRK)
-LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
+# ZWJ x
+# Don't match a CM on the right - let other rules pick up CM sequences, where
+# the ZWJ behaves as just another generic CM.
+LB8a: ZWJ [^CM];
# LB9: X CM -> X
#LB11: × WJ;
# WJ ×
-LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
+LB11.1: [^SP] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
LB19.1: QU CM* [^CM];
# LB 20 Break before and after CB.
-# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ.
+# Interaction with LB8a: ZWJ x . is tricky because CM includes ZWJ.
# ZWJ acts like a CM to the left, combining with CB.
-# ZWJ acts independently to the right, no break from ID by LB8a.
-LB20: . CM* ÷ CB;
-LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
-LB20.1b: CB CM* ÷;
+# ZWJ acts independently to the right: no break after it, by LB8a.
+LB20.1: . CM* ZWJ CB;
+LB20.2: . CM* ÷ CB;
+
+LB20.3: CB CM* ZWJ [^CM];
+LB20.4: CB CM* ÷;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
-# LB31 keep pairs of RI together.
-LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
-LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
-LB30a.3: RI CM* RI CM* ÷;
+# LB30a keep pairs of RI together.
+LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2: RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;
# LB31 Break Everywhere Else.
# Include combining marks
-LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
+LB31.1: . CM* ZWJ [^CM];
LB31.2: . CM* ÷;
#
# file: line_loose.txt
#
-# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
+# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
+# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
-CM = [:LineBreak = Combining_Mark:];
+CM_ = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [:LineBreak = EB:];
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
-EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
-Extended_Pict = [:ExtPict:];
-
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
-CM = [CM ZWJ];
+CM = [CM_ ZWJ];
LB4: BK ÷;
LB5: CR LF;
LB8: ZW ÷;
# LB8a
-# ZWJ x (ID | Extended_Pict | EmojiNRK)
-LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
+# ZWJ x
+# Don't match a CM on the right - let other rules pick up CM sequences, where
+# the ZWJ behaves as just another generic CM.
+LB8a: ZWJ [^CM];
# LB9: X CM -> X
#LB11: × WJ;
# WJ ×
-LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
+LB11.1: [^SP] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
LB19.1: QU CM* [^CM];
# LB 20 Break before and after CB.
-# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ.
+# Interaction with LB8a: ZWJ x . is tricky because CM includes ZWJ.
# ZWJ acts like a CM to the left, combining with CB.
-# ZWJ acts independently to the right, no break from ID by LB8a.
-LB20: . CM* ÷ CB;
-LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
-LB20.1b: CB CM* ÷;
+# ZWJ acts independently to the right: no break after it, by LB8a.
+LB20.1: . CM* ZWJ CB;
+LB20.2: . CM* ÷ CB;
+
+LB20.3: CB CM* ZWJ [^CM];
+LB20.4: CB CM* ÷;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
-# LB31 keep pairs of RI together.
-LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
-LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
-LB30a.3: RI CM* RI CM* ÷;
+# LB30a keep pairs of RI together.
+LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2: RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;
# LB31 Break Everywhere Else.
# Include combining marks
-LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
+LB31.1: . CM* ZWJ [^CM];
LB31.2: . CM* ÷;
#
# file: line_loose_cj.txt
#
-# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
+# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
+# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
-CM = [:LineBreak = Combining_Mark:];
+CM_ = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [:LineBreak = EB:];
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
-EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
-Extended_Pict = [:ExtPict:];
-
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
-CM = [CM ZWJ];
+CM = [CM_ ZWJ];
LB4: BK ÷;
LB5: CR LF;
LB8: ZW ÷;
# LB8a
-# ZWJ x (ID | Extended_Pict | EmojiNRK)
-LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
+# ZWJ x
+# Don't match a CM on the right - let other rules pick up CM sequences, where
+# the ZWJ behaves as just another generic CM.
+LB8a: ZWJ [^CM];
# LB9: X CM -> X
#LB11: × WJ;
# WJ ×
-LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
+LB11.1: [^SP] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
LB19.1: QU CM* [^CM];
# LB 20 Break before and after CB.
-# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ.
+# Interaction with LB8a: ZWJ x . is tricky because CM includes ZWJ.
# ZWJ acts like a CM to the left, combining with CB.
-# ZWJ acts independently to the right, no break from ID by LB8a.
-LB20: . CM* ÷ CB;
-LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
-LB20.1b: CB CM* ÷;
+# ZWJ acts independently to the right: no break after it, by LB8a.
+LB20.1: . CM* ZWJ CB;
+LB20.2: . CM* ÷ CB;
+
+LB20.3: CB CM* ZWJ [^CM];
+LB20.4: CB CM* ÷;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
-# LB31 keep pairs of RI together.
-LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
-LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
-LB30a.3: RI CM* RI CM* ÷;
+# LB30a keep pairs of RI together.
+LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2: RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;
# LB31 Break Everywhere Else.
# Include combining marks
-LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
+LB31.1: . CM* ZWJ [^CM];
LB31.2: . CM* ÷;
#
# file: line_normal.txt
#
-# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
+# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
+# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
-CM = [:LineBreak = Combining_Mark:];
+CM_ = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [:LineBreak = EB:];
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
-EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
-Extended_Pict = [:ExtPict:];
-
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
-CM = [CM ZWJ];
+CM = [CM_ ZWJ];
LB4: BK ÷;
LB5: CR LF;
LB8: ZW ÷;
# LB8a
-# ZWJ x (ID | Extended_Pict | EmojiNRK)
-LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
+# ZWJ x
+# Don't match a CM on the right - let other rules pick up CM sequences, where
+# the ZWJ behaves as just another generic CM.
+LB8a: ZWJ [^CM];
# LB9: X CM -> X
#LB11: × WJ;
# WJ ×
-LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
+LB11.1: [^SP] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
LB19.1: QU CM* [^CM];
# LB 20 Break before and after CB.
-# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ.
+# Interaction with LB8a: ZWJ x . is tricky because CM includes ZWJ.
# ZWJ acts like a CM to the left, combining with CB.
-# ZWJ acts independently to the right, no break from ID by LB8a.
-LB20: . CM* ÷ CB;
-LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
-LB20.1b: CB CM* ÷;
+# ZWJ acts independently to the right: no break after it, by LB8a.
+LB20.1: . CM* ZWJ CB;
+LB20.2: . CM* ÷ CB;
+
+LB20.3: CB CM* ZWJ [^CM];
+LB20.4: CB CM* ÷;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
-# LB31 keep pairs of RI together.
-LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
-LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
-LB30a.3: RI CM* RI CM* ÷;
+# LB30a keep pairs of RI together.
+LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2: RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;
# LB31 Break Everywhere Else.
# Include combining marks
-LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
+LB31.1: . CM* ZWJ [^CM];
LB31.2: . CM* ÷;
#
# file: line_normal_cj.txt
#
-# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
+# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
+# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
-CM = [:LineBreak = Combining_Mark:];
+CM_ = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [:LineBreak = EB:];
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
-EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
-Extended_Pict = [:ExtPict:];
-
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
-CM = [CM ZWJ];
+CM = [CM_ ZWJ];
LB4: BK ÷;
LB5: CR LF;
LB8: ZW ÷;
# LB8a
-# ZWJ x (ID | Extended_Pict | EmojiNRK)
-LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
+# ZWJ x
+# Don't match a CM on the right - let other rules pick up CM sequences, where
+# the ZWJ behaves as just another generic CM.
+LB8a: ZWJ [^CM];
# LB9: X CM -> X
#LB11: × WJ;
# WJ ×
-LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
+LB11.1: [^SP] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
LB19.1: QU CM* [^CM];
# LB 20 Break before and after CB.
-# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ.
+# Interaction with LB8a: ZWJ x . is tricky because CM includes ZWJ.
# ZWJ acts like a CM to the left, combining with CB.
-# ZWJ acts independently to the right, no break from ID by LB8a.
-LB20: . CM* ÷ CB;
-LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
-LB20.1b: CB CM* ÷;
+# ZWJ acts independently to the right: no break after it, by LB8a.
+LB20.1: . CM* ZWJ CB;
+LB20.2: . CM* ÷ CB;
+
+LB20.3: CB CM* ZWJ [^CM];
+LB20.4: CB CM* ÷;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
-# LB31 keep pairs of RI together.
-LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
-LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
-LB30a.3: RI CM* RI CM* ÷;
+# LB30a keep pairs of RI together.
+LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2: RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;
# LB31 Break Everywhere Else.
# Include combining marks
-LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
+LB31.1: . CM* ZWJ [^CM];
LB31.2: . CM* ÷;