#
# Line Breaking Rules
# Implement default line breaking as defined by
-# Unicode Standard Annex #14 Revision 37 for Unicode 9.0
+# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
# http://www.unicode.org/reports/tr14/
# tailored as noted in 2nd paragraph below.
#
-# Includes extensions to the handling of emoji ZWJ sequences from
-# https://goo.gl/cluFCn
-#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
# This is only because of a limitation of ICU break engine implementation,
# not because the older behavior is desirable.
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];
-$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]];
-$Extended_Pict = [:ExtPict:];
-
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
# By LB1, SA characters with general category of Mn or Mc also resolve to CM.
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
-# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences.
+# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
#
-$ZWJ ($ID | $Extended_Pict | $EmojiNRK);
+$ZWJ [^$CM];
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
$CP $CM* ($ALPlus | $HL | $NU);
# LB 30a Do not break between regional indicators. Break after pairs of them.
-# Tricky interaction with LB8a: ZWJ x ID
-$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
-$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}];
-$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
-
-$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
-$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
+# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
+$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
+$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
+$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}];
+# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
+# because of the chain-out behavior difference. The rule must chain out only from the [set characters],
+# not from the preceding $RI or $CM, which it would be able to do if the set were optional.
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;
#
# Line Breaking Rules
# Implement default line breaking as defined by
-# Unicode Standard Annex #14 Revision 37 for Unicode 9.0
+# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
# http://www.unicode.org/reports/tr14/
#
-# Includes extensions to the handling of emoji ZWJ sequences from
-# https://goo.gl/cluFCn
-#
# tailored as noted in 2nd paragraph below.
#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];
-$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]];
-$Extended_Pict = [:ExtPict:];
-
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
# By LB1, SA characters with general category of Mn or Mc also resolve to CM.
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
-# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences.
+# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
#
-$ZWJ ($ID | $Extended_Pict | $EmojiNRK);
+$ZWJ [^$CM];
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
$CP $CM* ($ALPlus | $HL | $NU);
# LB 30a Do not break between regional indicators. Break after pairs of them.
-# Tricky interaction with LB8a: ZWJ x ID
-$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
-$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}];
-$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
-
-$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
-$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
+# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
+$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
+$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
+$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}];
+# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
+# because of the chain-out behavior difference. The rule must chain out only from the [set characters],
+# not from the preceding $RI or $CM, which it would be able to do if the set were optional.
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;
#
# Line Breaking Rules
# Implement default line breaking as defined by
-# Unicode Standard Annex #14 Revision 37 for Unicode 9.0
+# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
# http://www.unicode.org/reports/tr14/
#
-# Includes extensions to the handling of emoji ZWJ sequences from
-# https://goo.gl/cluFCn
-#
# tailored as noted in 2nd paragraph below.
#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];
-$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]];
-$Extended_Pict = [:ExtPict:];
-
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
# By LB1, SA characters with general category of Mn or Mc also resolve to CM.
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
-# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences.
+# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
#
-$ZWJ ($ID | $Extended_Pict | $EmojiNRK);
+$ZWJ [^$CM];
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
$CP $CM* ($ALPlus | $HL | $NU);
# LB 30a Do not break between regional indicators. Break after pairs of them.
-# Tricky interaction with LB8a: ZWJ x ID
-$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
-$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}];
-$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
-
-$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
-$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
+# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
+$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
+$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
+$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}];
+# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
+# because of the chain-out behavior difference. The rule must chain out only from the [set characters],
+# not from the preceding $RI or $CM, which it would be able to do if the set were optional.
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;
#
# Line Breaking Rules
# Implement default line breaking as defined by
-# Unicode Standard Annex #14 Revision 37 for Unicode 9.0
+# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
# http://www.unicode.org/reports/tr14/
# tailored as noted in 3rd paragraph below.
#
-# Includes extensions to the handling of emoji ZWJ sequences from
-# https://goo.gl/cluFCn
-#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
# This is only because of a limitation of ICU break engine implementation,
# not because the older behavior is desirable.
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];
-$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]];
-$Extended_Pict = [:ExtPict:];
-
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
# By LB1, SA characters with general category of Mn or Mc also resolve to CM.
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
-# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences.
+# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
#
-$ZWJ ($ID | $Extended_Pict | $EmojiNRK);
+$ZWJ [^$CM];
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
$CP $CM* ($ALPlus | $HL | $NU);
# LB 30a Do not break between regional indicators. Break after pairs of them.
-# Tricky interaction with LB8a: ZWJ x ID
-$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
-$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}];
-$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
-
-$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
-$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
+# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
+$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
+$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
+$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}];
+# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
+# because of the chain-out behavior difference. The rule must chain out only from the [set characters],
+# not from the preceding $RI or $CM, which it would be able to do if the set were optional.
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;
#
# Line Breaking Rules
# Implement default line breaking as defined by
-# Unicode Standard Annex #14 Revision 37 for Unicode 9.0
+# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
# http://www.unicode.org/reports/tr14/
#
-# Includes extensions to the handling of emoji ZWJ sequences from
-# https://goo.gl/cluFCn
-#
# tailored as noted in 2nd paragraph below.
#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];
-$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]];
-$Extended_Pict = [:ExtPict:];
-
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
# By LB1, SA characters with general category of Mn or Mc also resolve to CM.
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
-# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences.
+# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
#
-$ZWJ ($ID | $Extended_Pict | $EmojiNRK);
+$ZWJ [^$CM];
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
$CP $CM* ($ALPlus | $HL | $NU);
# LB 30a Do not break between regional indicators. Break after pairs of them.
-# Tricky interaction with LB8a: ZWJ x ID
-$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
-$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}];
-$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
-
-$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
-$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
+# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
+$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
+$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
+$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}];
+# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
+# because of the chain-out behavior difference. The rule must chain out only from the [set characters],
+# not from the preceding $RI or $CM, which it would be able to do if the set were optional.
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;
#
# Line Breaking Rules
# Implement default line breaking as defined by
-# Unicode Standard Annex #14 Revision 37 for Unicode 9.0
+# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
# http://www.unicode.org/reports/tr14/
#
-# Includes extensions to the handling of emoji ZWJ sequences from
-# https://goo.gl/cluFCn
-#
# tailored as noted in 2nd paragraph below.
#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];
-$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]];
-$Extended_Pict = [:ExtPict:];
-
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
# By LB1, SA characters with general category of Mn or Mc also resolve to CM.
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
-# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences.
+# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
#
-$ZWJ ($ID | $Extended_Pict | $EmojiNRK);
+$ZWJ [^$CM];
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
$CP $CM* ($ALPlus | $HL | $NU);
# LB 30a Do not break between regional indicators. Break after pairs of them.
-# Tricky interaction with LB8a: ZWJ x ID
-$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
-$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}];
-$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
-
-$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
-$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
+# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
+$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
+$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
+$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}];
+# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
+# because of the chain-out behavior difference. The rule must chain out only from the [set characters],
+# not from the preceding $RI or $CM, which it would be able to do if the set were optional.
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;
#
# Line Breaking Rules
# Implement default line breaking as defined by
-# Unicode Standard Annex #14 Revision 37 for Unicode 9.0
+# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
# http://www.unicode.org/reports/tr14/
# tailored as noted in 3rd paragraph below.
#
-# Includes extensions to the handling of emoji ZWJ sequences from
-# https://goo.gl/cluFCn
-#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
# This is only because of a limitation of ICU break engine implementation,
# not because the older behavior is desirable.
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];
-$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]];
-$Extended_Pict = [:ExtPict:];
-
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
# By LB1, SA characters with general category of Mn or Mc also resolve to CM.
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
-# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences.
+# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
#
-$ZWJ ($ID | $Extended_Pict | $EmojiNRK);
+$ZWJ [^$CM];
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
$CP $CM* ($ALPlus | $HL | $NU);
# LB 30a Do not break between regional indicators. Break after pairs of them.
-# Tricky interaction with LB8a: ZWJ x ID
-$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
-$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}];
-$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
-
-$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
-$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
+# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
+$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
+$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
+$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}];
+# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
+# because of the chain-out behavior difference. The rule must chain out only from the [set characters],
+# not from the preceding $RI or $CM, which it would be able to do if the set were optional.
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;
#
# file: line_loose.txt
#
-# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
+# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
+# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
-EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
-Extended_Pict = [:ExtPict:];
-
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
LB8: ZW ÷;
# LB8a
-# ZWJ x (ID | Extended_Pict | EmojiNRK)
-LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
+# ZWJ x
+# Don't match a CM on the right - let other rules pick up CM sequences, where
+# the ZWJ behaves as just another generic CM.
+LB8a: ZWJ [^CM];
# LB9: X CM -> X
#LB11: × WJ;
# WJ ×
-LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
+LB11.1: [^SP] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
LB19.1: QU CM* [^CM];
# LB 20 Break before and after CB.
-# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ.
+# Interaction with LB8a: ZWJ x . is tricky because CM includes ZWJ.
# ZWJ acts like a CM to the left, combining with CB.
-# ZWJ acts independently to the right, no break from ID by LB8a.
-LB20: . CM* ÷ CB;
-LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
-LB20.1b: CB CM* ÷;
+# ZWJ acts independently to the right, no break after it, by LB8a.
+LB20.1: . CM* ZWJ CB;
+LB20.2: . CM* ÷ CB;
+
+LB20.3: CB CM* ZWJ [^CM];
+LB20.4: CB CM* ÷;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
-# LB31 keep pairs of RI together.
-LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
-LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
-LB30a.3: RI CM* RI CM* ÷;
+# LB30a keep pairs of RI together.
+LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2: RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;
# LB31 Break Everywhere Else.
# Include combining marks
-LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
+LB31.1: . CM* ZWJ [^CM];
LB31.2: . CM* ÷;
#
# file: line_loose_cj.txt
#
-# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
+# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
+# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
-EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
-Extended_Pict = [:ExtPict:];
-
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
LB8: ZW ÷;
# LB8a
-# ZWJ x (ID | Extended_Pict | EmojiNRK)
-LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
+# ZWJ x
+# Don't match a CM on the right - let other rules pick up CM sequences, where
+# the ZWJ behaves as just another generic CM.
+LB8a: ZWJ [^CM];
# LB9: X CM -> X
#LB11: × WJ;
# WJ ×
-LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
+LB11.1: [^SP] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
LB19.1: QU CM* [^CM];
# LB 20 Break before and after CB.
-# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ.
+# Interaction with LB8a: ZWJ x . is tricky because CM includes ZWJ.
# ZWJ acts like a CM to the left, combining with CB.
-# ZWJ acts independently to the right, no break from ID by LB8a.
-LB20: . CM* ÷ CB;
-LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
-LB20.1b: CB CM* ÷;
+# ZWJ acts independently to the right, no break after it, by LB8a.
+LB20.1: . CM* ZWJ CB;
+LB20.2: . CM* ÷ CB;
+
+LB20.3: CB CM* ZWJ [^CM];
+LB20.4: CB CM* ÷;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
-# LB31 keep pairs of RI together.
-LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
-LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
-LB30a.3: RI CM* RI CM* ÷;
+# LB30a keep pairs of RI together.
+LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2: RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;
# LB31 Break Everywhere Else.
# Include combining marks
-LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
+LB31.1: . CM* ZWJ [^CM];
LB31.2: . CM* ÷;
#
# file: line_normal.txt
#
-# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
+# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
+# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
-EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
-Extended_Pict = [:ExtPict:];
-
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
LB8: ZW ÷;
# LB8a
-# ZWJ x (ID | Extended_Pict | EmojiNRK)
-LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
+# ZWJ x
+# Don't match a CM on the right - let other rules pick up CM sequences, where
+# the ZWJ behaves as just another generic CM.
+LB8a: ZWJ [^CM];
# LB9: X CM -> X
#LB11: × WJ;
# WJ ×
-LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
+LB11.1: [^SP] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
LB19.1: QU CM* [^CM];
# LB 20 Break before and after CB.
-# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ.
+# Interaction with LB8a: ZWJ x . is tricky because CM includes ZWJ.
# ZWJ acts like a CM to the left, combining with CB.
-# ZWJ acts independently to the right, no break from ID by LB8a.
-LB20: . CM* ÷ CB;
-LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
-LB20.1b: CB CM* ÷;
+# ZWJ acts independently to the right, no break after it, by LB8a.
+LB20.1: . CM* ZWJ CB;
+LB20.2: . CM* ÷ CB;
+
+LB20.3: CB CM* ZWJ [^CM];
+LB20.4: CB CM* ÷;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
-# LB31 keep pairs of RI together.
-LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
-LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
-LB30a.3: RI CM* RI CM* ÷;
+# LB30a keep pairs of RI together.
+LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2: RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;
# LB31 Break Everywhere Else.
# Include combining marks
-LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
+LB31.1: . CM* ZWJ [^CM];
LB31.2: . CM* ÷;
#
# file: line_normal_cj.txt
#
-# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
+# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
+# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
-EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
-Extended_Pict = [:ExtPict:];
-
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
LB8: ZW ÷;
# LB8a
-# ZWJ x (ID | Extended_Pict | EmojiNRK)
-LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
+# ZWJ x
+# Don't match a CM on the right - let other rules pick up CM sequences, where
+# the ZWJ behaves as just another generic CM.
+LB8a: ZWJ [^CM];
# LB9: X CM -> X
#LB11: × WJ;
# WJ ×
-LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
+LB11.1: [^SP] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
LB19.1: QU CM* [^CM];
# LB 20 Break before and after CB.
-# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ.
+# Interaction with LB8a: ZWJ x . is tricky because CM includes ZWJ.
# ZWJ acts like a CM to the left, combining with CB.
-# ZWJ acts independently to the right, no break from ID by LB8a.
-LB20: . CM* ÷ CB;
-LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
-LB20.1b: CB CM* ÷;
+# ZWJ acts independently to the right, no break after it, by LB8a.
+LB20.1: . CM* ZWJ CB;
+LB20.2: . CM* ÷ CB;
+
+LB20.3: CB CM* ZWJ [^CM];
+LB20.4: CB CM* ÷;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
-# LB31 keep pairs of RI together.
-LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
-LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
-LB30a.3: RI CM* RI CM* ÷;
+# LB30a keep pairs of RI together.
+LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.2: RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
+LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;
# LB31 Break Everywhere Else.
# Include combining marks
-LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
+LB31.1: . CM* ZWJ [^CM];
LB31.2: . CM* ÷;