From 50fc9ce8784a5fc79fa844b5588d10348b86ef52 Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Fri, 18 May 2018 17:47:03 +0000 Subject: [PATCH] ICU-13770 Line Break Rules for Unicode 11, tailorings updated. X-SVN-Rev: 41410 --- icu4c/source/data/brkitr/rules/line_fi.txt | 26 ++++++-------- icu4c/source/data/brkitr/rules/line_loose.txt | 26 ++++++-------- .../data/brkitr/rules/line_loose_cj.txt | 26 ++++++-------- .../data/brkitr/rules/line_loose_fi.txt | 26 ++++++-------- .../source/data/brkitr/rules/line_normal.txt | 26 ++++++-------- .../data/brkitr/rules/line_normal_cj.txt | 26 ++++++-------- .../data/brkitr/rules/line_normal_fi.txt | 26 ++++++-------- .../test/testdata/break_rules/line_loose.txt | 36 ++++++++++--------- .../testdata/break_rules/line_loose_cj.txt | 36 ++++++++++--------- .../test/testdata/break_rules/line_normal.txt | 36 ++++++++++--------- .../testdata/break_rules/line_normal_cj.txt | 36 ++++++++++--------- 11 files changed, 146 insertions(+), 180 deletions(-) diff --git a/icu4c/source/data/brkitr/rules/line_fi.txt b/icu4c/source/data/brkitr/rules/line_fi.txt index 82d8cc202db..92eefffd91b 100644 --- a/icu4c/source/data/brkitr/rules/line_fi.txt +++ b/icu4c/source/data/brkitr/rules/line_fi.txt @@ -7,13 +7,10 @@ # # Line Breaking Rules # Implement default line breaking as defined by -# Unicode Standard Annex #14 Revision 37 for Unicode 9.0 +# Unicode Standard Annex #14 Revision 40 for Unicode 11.0 # http://www.unicode.org/reports/tr14/ # tailored as noted in 2nd paragraph below. # -# Includes extensions to the handling of emoji ZWJ sequences from -# https://goo.gl/cluFCn -# # TODO: Rule LB 8 remains as it was in Unicode 5.2 # This is only because of a limitation of ICU break engine implementation, # not because the older behavior is desirable. @@ -78,9 +75,6 @@ $XX = [:LineBreak = Unknown:]; $ZW = [:LineBreak = ZWSpace:]; $ZWJ = [:LineBreak = ZWJ:]; -$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]]; -$Extended_Pict = [:ExtPict:]; - # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly # list it in the numerous rules that use CM. # By LB1, SA characters with general categor of Mn or Mc also resolve to CM. @@ -151,9 +145,9 @@ $CAN_CM $CM* [$SP $ZW]; $LB8Breaks = [$LB4Breaks $ZW]; $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]]; -# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences. +# LB 8a ZWJ x Do not break Emoji ZWJ sequences. # -$ZWJ ($ID | $Extended_Pict | $EmojiNRK); +$ZWJ [^$CM]; # LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL # $CM not covered by the above needs to behave like $AL @@ -330,13 +324,13 @@ $IS $CM* ($ALPlus | $HL); $CP $CM* ($ALPlus | $HL | $NU); # LB 30a Do not break between regional indicators. Break after pairs of them. -# Tricky interaction with LB8a: ZWJ x ID -$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}]; -$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}]; -$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}]; - -$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}]; -$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK); +# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM. +$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]]; +$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]]; +$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}]; +# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?' +# because of the chain-out behavior difference. The rule must chain out only from the [set characters], +# not from the preceding $RI or $CM, which it would be able to do if the set were optional. # LB 30b Do not break between an Emoji Base and an Emoji Modifier $EB $CM* $EM; diff --git a/icu4c/source/data/brkitr/rules/line_loose.txt b/icu4c/source/data/brkitr/rules/line_loose.txt index 2e937808b78..5e8f7a16703 100644 --- a/icu4c/source/data/brkitr/rules/line_loose.txt +++ b/icu4c/source/data/brkitr/rules/line_loose.txt @@ -8,12 +8,9 @@ # # Line Breaking Rules # Implement default line breaking as defined by -# Unicode Standard Annex #14 Revision 37 for Unicode 9.0 +# Unicode Standard Annex #14 Revision 40 for Unicode 11.0 # http://www.unicode.org/reports/tr14/ # -# Includes extensions to the handling of emoji ZWJ sequences from -# https://goo.gl/cluFCn -# # tailored as noted in 2nd paragraph below. # # TODO: Rule LB 8 remains as it was in Unicode 5.2 @@ -81,9 +78,6 @@ $XX = [:LineBreak = Unknown:]; $ZW = [:LineBreak = ZWSpace:]; $ZWJ = [:LineBreak = ZWJ:]; -$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]]; -$Extended_Pict = [:ExtPict:]; - # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly # list it in the numerous rules that use CM. # By LB1, SA characters with general categor of Mn or Mc also resolve to CM. @@ -154,9 +148,9 @@ $CAN_CM $CM* [$SP $ZW]; $LB8Breaks = [$LB4Breaks $ZW]; $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]]; -# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences. +# LB 8a ZWJ x Do not break Emoji ZWJ sequences. # -$ZWJ ($ID | $Extended_Pict | $EmojiNRK); +$ZWJ [^$CM]; # LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL # $CM not covered by the above needs to behave like $AL @@ -333,13 +327,13 @@ $IS $CM* ($ALPlus | $HL); $CP $CM* ($ALPlus | $HL | $NU); # LB 30a Do not break between regional indicators. Break after pairs of them. -# Tricky interaction with LB8a: ZWJ x ID -$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}]; -$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}]; -$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}]; - -$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}]; -$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK); +# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM. +$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]]; +$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]]; +$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}]; +# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?' +# because of the chain-out behavior difference. The rule must chain out only from the [set characters], +# not from the preceding $RI or $CM, which it would be able to do if the set were optional. # LB 30b Do not break between an Emoji Base and an Emoji Modifier $EB $CM* $EM; diff --git a/icu4c/source/data/brkitr/rules/line_loose_cj.txt b/icu4c/source/data/brkitr/rules/line_loose_cj.txt index 1f9dbcac85e..651924120a9 100644 --- a/icu4c/source/data/brkitr/rules/line_loose_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_loose_cj.txt @@ -7,12 +7,9 @@ # # Line Breaking Rules # Implement default line breaking as defined by -# Unicode Standard Annex #14 Revision 37 for Unicode 9.0 +# Unicode Standard Annex #14 Revision 40 for Unicode 11.0 # http://www.unicode.org/reports/tr14/ # -# Includes extensions to the handling of emoji ZWJ sequences from -# https://goo.gl/cluFCn -# # tailored as noted in 2nd paragraph below. # # TODO: Rule LB 8 remains as it was in Unicode 5.2 @@ -91,9 +88,6 @@ $XX = [:LineBreak = Unknown:]; $ZW = [:LineBreak = ZWSpace:]; $ZWJ = [:LineBreak = ZWJ:]; -$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]]; -$Extended_Pict = [:ExtPict:]; - # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly # list it in the numerous rules that use CM. # By LB1, SA characters with general categor of Mn or Mc also resolve to CM. @@ -164,9 +158,9 @@ $CAN_CM $CM* [$SP $ZW]; $LB8Breaks = [$LB4Breaks $ZW]; $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]]; -# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences. +# LB 8a ZWJ x Do not break Emoji ZWJ sequences. # -$ZWJ ($ID | $Extended_Pict | $EmojiNRK); +$ZWJ [^$CM]; # LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL # $CM not covered by the above needs to behave like $AL @@ -347,13 +341,13 @@ $IS $CM* ($ALPlus | $HL); $CP $CM* ($ALPlus | $HL | $NU); # LB 30a Do not break between regional indicators. Break after pairs of them. -# Tricky interaction with LB8a: ZWJ x ID -$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}]; -$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}]; -$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}]; - -$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}]; -$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK); +# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM. +$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]]; +$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]]; +$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}]; +# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?' +# because of the chain-out behavior difference. The rule must chain out only from the [set characters], +# not from the preceding $RI or $CM, which it would be able to do if the set were optional. # LB 30b Do not break between an Emoji Base and an Emoji Modifier $EB $CM* $EM; diff --git a/icu4c/source/data/brkitr/rules/line_loose_fi.txt b/icu4c/source/data/brkitr/rules/line_loose_fi.txt index f1161d1a6ef..3775b7f9ac9 100644 --- a/icu4c/source/data/brkitr/rules/line_loose_fi.txt +++ b/icu4c/source/data/brkitr/rules/line_loose_fi.txt @@ -7,13 +7,10 @@ # # Line Breaking Rules # Implement default line breaking as defined by -# Unicode Standard Annex #14 Revision 37 for Unicode 9.0 +# Unicode Standard Annex #14 Revision 40 for Unicode 11.0 # http://www.unicode.org/reports/tr14/ # tailored as noted in 3rd paragraph below. # -# Includes extensions to the handling of emoji ZWJ sequences from -# https://goo.gl/cluFCn -# # TODO: Rule LB 8 remains as it was in Unicode 5.2 # This is only because of a limitation of ICU break engine implementation, # not because the older behavior is desirable. @@ -77,9 +74,6 @@ $XX = [:LineBreak = Unknown:]; $ZW = [:LineBreak = ZWSpace:]; $ZWJ = [:LineBreak = ZWJ:]; -$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]]; -$Extended_Pict = [:ExtPict:]; - # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly # list it in the numerous rules that use CM. # By LB1, SA characters with general categor of Mn or Mc also resolve to CM. @@ -150,9 +144,9 @@ $CAN_CM $CM* [$SP $ZW]; $LB8Breaks = [$LB4Breaks $ZW]; $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]]; -# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences. +# LB 8a ZWJ x Do not break Emoji ZWJ sequences. # -$ZWJ ($ID | $Extended_Pict | $EmojiNRK); +$ZWJ [^$CM]; # LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL # $CM not covered by the above needs to behave like $AL @@ -332,13 +326,13 @@ $IS $CM* ($ALPlus | $HL); $CP $CM* ($ALPlus | $HL | $NU); # LB 30a Do not break between regional indicators. Break after pairs of them. -# Tricky interaction with LB8a: ZWJ x ID -$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}]; -$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}]; -$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}]; - -$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}]; -$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK); +# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM. +$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]]; +$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]]; +$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}]; +# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?' +# because of the chain-out behavior difference. The rule must chain out only from the [set characters], +# not from the preceding $RI or $CM, which it would be able to do if the set were optional. # LB 30b Do not break between an Emoji Base and an Emoji Modifier $EB $CM* $EM; diff --git a/icu4c/source/data/brkitr/rules/line_normal.txt b/icu4c/source/data/brkitr/rules/line_normal.txt index 708082dd261..5bfbe8e8d9e 100644 --- a/icu4c/source/data/brkitr/rules/line_normal.txt +++ b/icu4c/source/data/brkitr/rules/line_normal.txt @@ -7,12 +7,9 @@ # # Line Breaking Rules # Implement default line breaking as defined by -# Unicode Standard Annex #14 Revision 37 for Unicode 9.0 +# Unicode Standard Annex #14 Revision 40 for Unicode 11.0 # http://www.unicode.org/reports/tr14/ # -# Includes extensions to the handling of emoji ZWJ sequences from -# https://goo.gl/cluFCn -# # tailored as noted in 2nd paragraph below. # # TODO: Rule LB 8 remains as it was in Unicode 5.2 @@ -76,9 +73,6 @@ $XX = [:LineBreak = Unknown:]; $ZW = [:LineBreak = ZWSpace:]; $ZWJ = [:LineBreak = ZWJ:]; -$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]]; -$Extended_Pict = [:ExtPict:]; - # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly # list it in the numerous rules that use CM. # By LB1, SA characters with general categor of Mn or Mc also resolve to CM. @@ -149,9 +143,9 @@ $CAN_CM $CM* [$SP $ZW]; $LB8Breaks = [$LB4Breaks $ZW]; $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]]; -# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences. +# LB 8a ZWJ x Do not break Emoji ZWJ sequences. # -$ZWJ ($ID | $Extended_Pict | $EmojiNRK); +$ZWJ [^$CM]; # LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL # $CM not covered by the above needs to behave like $AL @@ -325,13 +319,13 @@ $IS $CM* ($ALPlus | $HL); $CP $CM* ($ALPlus | $HL | $NU); # LB 30a Do not break between regional indicators. Break after pairs of them. -# Tricky interaction with LB8a: ZWJ x ID -$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}]; -$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}]; -$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}]; - -$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}]; -$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK); +# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM. +$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]]; +$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]]; +$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}]; +# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?' +# because of the chain-out behavior difference. The rule must chain out only from the [set characters], +# not from the preceding $RI or $CM, which it would be able to do if the set were optional. # LB 30b Do not break between an Emoji Base and an Emoji Modifier $EB $CM* $EM; diff --git a/icu4c/source/data/brkitr/rules/line_normal_cj.txt b/icu4c/source/data/brkitr/rules/line_normal_cj.txt index 58aa2fc9d80..4576b205e15 100644 --- a/icu4c/source/data/brkitr/rules/line_normal_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_normal_cj.txt @@ -7,12 +7,9 @@ # # Line Breaking Rules # Implement default line breaking as defined by -# Unicode Standard Annex #14 Revision 37 for Unicode 9.0 +# Unicode Standard Annex #14 Revision 40 for Unicode 11.0 # http://www.unicode.org/reports/tr14/ # -# Includes extensions to the handling of emoji ZWJ sequences from -# https://goo.gl/cluFCn -# # tailored as noted in 2nd paragraph below. # # TODO: Rule LB 8 remains as it was in Unicode 5.2 @@ -79,9 +76,6 @@ $XX = [:LineBreak = Unknown:]; $ZW = [:LineBreak = ZWSpace:]; $ZWJ = [:LineBreak = ZWJ:]; -$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]]; -$Extended_Pict = [:ExtPict:]; - # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly # list it in the numerous rules that use CM. # By LB1, SA characters with general categor of Mn or Mc also resolve to CM. @@ -152,9 +146,9 @@ $CAN_CM $CM* [$SP $ZW]; $LB8Breaks = [$LB4Breaks $ZW]; $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]]; -# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences. +# LB 8a ZWJ x Do not break Emoji ZWJ sequences. # -$ZWJ ($ID | $Extended_Pict | $EmojiNRK); +$ZWJ [^$CM]; # LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL # $CM not covered by the above needs to behave like $AL @@ -331,13 +325,13 @@ $IS $CM* ($ALPlus | $HL); $CP $CM* ($ALPlus | $HL | $NU); # LB 30a Do not break between regional indicators. Break after pairs of them. -# Tricky interaction with LB8a: ZWJ x ID -$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}]; -$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}]; -$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}]; - -$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}]; -$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK); +# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM. +$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]]; +$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]]; +$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}]; +# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?' +# because of the chain-out behavior difference. The rule must chain out only from the [set characters], +# not from the preceding $RI or $CM, which it would be able to do if the set were optional. # LB 30b Do not break between an Emoji Base and an Emoji Modifier $EB $CM* $EM; diff --git a/icu4c/source/data/brkitr/rules/line_normal_fi.txt b/icu4c/source/data/brkitr/rules/line_normal_fi.txt index d6f36bb1e2e..b5efc152aea 100644 --- a/icu4c/source/data/brkitr/rules/line_normal_fi.txt +++ b/icu4c/source/data/brkitr/rules/line_normal_fi.txt @@ -7,13 +7,10 @@ # # Line Breaking Rules # Implement default line breaking as defined by -# Unicode Standard Annex #14 Revision 37 for Unicode 9.0 +# Unicode Standard Annex #14 Revision 40 for Unicode 11.0 # http://www.unicode.org/reports/tr14/ # tailored as noted in 3rd paragraph below. # -# Includes extensions to the handling of emoji ZWJ sequences from -# https://goo.gl/cluFCn -# # TODO: Rule LB 8 remains as it was in Unicode 5.2 # This is only because of a limitation of ICU break engine implementation, # not because the older behavior is desirable. @@ -76,9 +73,6 @@ $XX = [:LineBreak = Unknown:]; $ZW = [:LineBreak = ZWSpace:]; $ZWJ = [:LineBreak = ZWJ:]; -$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]]; -$Extended_Pict = [:ExtPict:]; - # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly # list it in the numerous rules that use CM. # By LB1, SA characters with general categor of Mn or Mc also resolve to CM. @@ -149,9 +143,9 @@ $CAN_CM $CM* [$SP $ZW]; $LB8Breaks = [$LB4Breaks $ZW]; $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]]; -# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences. +# LB 8a ZWJ x Do not break Emoji ZWJ sequences. # -$ZWJ ($ID | $Extended_Pict | $EmojiNRK); +$ZWJ [^$CM]; # LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL # $CM not covered by the above needs to behave like $AL @@ -328,13 +322,13 @@ $IS $CM* ($ALPlus | $HL); $CP $CM* ($ALPlus | $HL | $NU); # LB 30a Do not break between regional indicators. Break after pairs of them. -# Tricky interaction with LB8a: ZWJ x ID -$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}]; -$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}]; -$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}]; - -$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}]; -$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK); +# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM. +$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]]; +$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]]; +$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}]; +# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?' +# because of the chain-out behavior difference. The rule must chain out only from the [set characters], +# not from the preceding $RI or $CM, which it would be able to do if the set were optional. # LB 30b Do not break between an Emoji Base and an Emoji Modifier $EB $CM* $EM; diff --git a/icu4c/source/test/testdata/break_rules/line_loose.txt b/icu4c/source/test/testdata/break_rules/line_loose.txt index 45de73b5eb9..00552fec7c0 100644 --- a/icu4c/source/test/testdata/break_rules/line_loose.txt +++ b/icu4c/source/test/testdata/break_rules/line_loose.txt @@ -5,7 +5,8 @@ # # file: line_loose.txt # -# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest +# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest. +# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0 # # Note: Rule syntax and the monkey test itself are still a work in progress. # They are expected to change with review and the addition of support for rule tailoring. @@ -67,9 +68,6 @@ XX = [:LineBreak = Unknown:]; ZW = [:LineBreak = ZWSpace:]; ZWJ = [:LineBreak = ZWJ:]; -EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]]; -Extended_Pict = [:ExtPict:]; - # LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes AL = [AL AI SG XX ]; dictionary = SA; @@ -105,8 +103,10 @@ LB7.2: [ZW SP] [SP ZW]; LB8: ZW ÷; # LB8a -# ZWJ x (ID | Extended_Pict | EmojiNRK) -LB8a: ZWJ (ID | Extended_Pict | EmojiNRK); +# ZWJ x +# Don't match a CM on the right - let other rules pick up CM sequences, where +# the ZWJ behaves as just another generic CM. +LB8a: ZWJ [^CM]; # LB9: X CM -> X @@ -115,7 +115,7 @@ LB8a: ZWJ (ID | Extended_Pict | EmojiNRK); #LB11: × WJ; # WJ × -LB11.1: [^BK CR LF NL SP ZW] CM* WJ; +LB11.1: [^SP] CM* WJ; LB11.2: SP WJ; LB11.3: WJ CM* [^CM]; @@ -141,12 +141,14 @@ LB19: . CM* QU; LB19.1: QU CM* [^CM]; # LB 20 Break before and after CB. -# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ. +# Interaction with LB8a: ZWJ x . is tricky because CM includes ZWJ. # ZWJ acts like a CM to the left, combining with CB. -# ZWJ acts independently to the right, no break from ID by LB8a. -LB20: . CM* ÷ CB; -LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK); -LB20.1b: CB CM* ÷; +# ZWJ acts independently to the right, no break after by LB8a. +LB20.1: . CM* ZWJ CB; +LB20.2: . CM* ÷ CB; + +LB20.3: CB CM* ZWJ [^CM]; +LB20.4: CB CM* ÷; # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then # not picking up the continuing match after the BA from 21a. @@ -193,15 +195,15 @@ LB29: IS CM* (AL | HL); LB30.1: (AL | CM | HL | NU) CM* OP; LB30.2: CP CM* (AL | HL | NU); -# LB31 keep pairs of RI together. -LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS]; -LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK); -LB30a.3: RI CM* RI CM* ÷; +# LB30a keep pairs of RI together. +LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; +LB30a.2: RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; +LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?; # LB30b Do not break between Emoji Base and Emoji Modifier LB30b: EB CM* EM; # LB31 Break Everywhere Else. # Include combining marks -LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK); +LB31.1: . CM* ZWJ [^CM]; LB31.2: . CM* ÷; diff --git a/icu4c/source/test/testdata/break_rules/line_loose_cj.txt b/icu4c/source/test/testdata/break_rules/line_loose_cj.txt index 63a244e22cd..59c21fd3d9a 100644 --- a/icu4c/source/test/testdata/break_rules/line_loose_cj.txt +++ b/icu4c/source/test/testdata/break_rules/line_loose_cj.txt @@ -5,7 +5,8 @@ # # file: line_loose_cj.txt # -# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest +# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest. +# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0 # # Note: Rule syntax and the monkey test itself are still a work in progress. # They are expected to change with review and the addition of support for rule tailoring. @@ -84,9 +85,6 @@ XX = [:LineBreak = Unknown:]; ZW = [:LineBreak = ZWSpace:]; ZWJ = [:LineBreak = ZWJ:]; -EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]]; -Extended_Pict = [:ExtPict:]; - # LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes AL = [AL AI SG XX ]; dictionary = SA; @@ -122,8 +120,10 @@ LB7.2: [ZW SP] [SP ZW]; LB8: ZW ÷; # LB8a -# ZWJ x (ID | Extended_Pict | EmojiNRK) -LB8a: ZWJ (ID | Extended_Pict | EmojiNRK); +# ZWJ x +# Don't match a CM on the right - let other rules pick up CM sequences, where +# the ZWJ behaves as just another generic CM. +LB8a: ZWJ [^CM]; # LB9: X CM -> X @@ -132,7 +132,7 @@ LB8a: ZWJ (ID | Extended_Pict | EmojiNRK); #LB11: × WJ; # WJ × -LB11.1: [^BK CR LF NL SP ZW] CM* WJ; +LB11.1: [^SP] CM* WJ; LB11.2: SP WJ; LB11.3: WJ CM* [^CM]; @@ -158,12 +158,14 @@ LB19: . CM* QU; LB19.1: QU CM* [^CM]; # LB 20 Break before and after CB. -# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ. +# Interaction with LB8a: ZWJ x . is tricky because CM includes ZWJ. # ZWJ acts like a CM to the left, combining with CB. -# ZWJ acts independently to the right, no break from ID by LB8a. -LB20: . CM* ÷ CB; -LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK); -LB20.1b: CB CM* ÷; +# ZWJ acts independently to the right, no break after by LB8a. +LB20.1: . CM* ZWJ CB; +LB20.2: . CM* ÷ CB; + +LB20.3: CB CM* ZWJ [^CM]; +LB20.4: CB CM* ÷; # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then # not picking up the continuing match after the BA from 21a. @@ -214,15 +216,15 @@ LB29: IS CM* (AL | HL); LB30.1: (AL | CM | HL | NU) CM* OP; LB30.2: CP CM* (AL | HL | NU); -# LB31 keep pairs of RI together. -LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS]; -LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK); -LB30a.3: RI CM* RI CM* ÷; +# LB30a keep pairs of RI together. +LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; +LB30a.2: RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; +LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?; # LB30b Do not break between Emoji Base and Emoji Modifier LB30b: EB CM* EM; # LB31 Break Everywhere Else. # Include combining marks -LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK); +LB31.1: . CM* ZWJ [^CM]; LB31.2: . CM* ÷; diff --git a/icu4c/source/test/testdata/break_rules/line_normal.txt b/icu4c/source/test/testdata/break_rules/line_normal.txt index 3fa5e235ba0..7cf2613a855 100644 --- a/icu4c/source/test/testdata/break_rules/line_normal.txt +++ b/icu4c/source/test/testdata/break_rules/line_normal.txt @@ -5,7 +5,8 @@ # # file: line_normal.txt # -# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest +# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest. +# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0 # # Note: Rule syntax and the monkey test itself are still a work in progress. # They are expected to change with review and the addition of support for rule tailoring. @@ -73,9 +74,6 @@ XX = [:LineBreak = Unknown:]; ZW = [:LineBreak = ZWSpace:]; ZWJ = [:LineBreak = ZWJ:]; -EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]]; -Extended_Pict = [:ExtPict:]; - # LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes AL = [AL AI SG XX ]; dictionary = SA; @@ -111,8 +109,10 @@ LB7.2: [ZW SP] [SP ZW]; LB8: ZW ÷; # LB8a -# ZWJ x (ID | Extended_Pict | EmojiNRK) -LB8a: ZWJ (ID | Extended_Pict | EmojiNRK); +# ZWJ x +# Don't match a CM on the right - let other rules pick up CM sequences, where +# the ZWJ behaves as just another generic CM. +LB8a: ZWJ [^CM]; # LB9: X CM -> X @@ -121,7 +121,7 @@ LB8a: ZWJ (ID | Extended_Pict | EmojiNRK); #LB11: × WJ; # WJ × -LB11.1: [^BK CR LF NL SP ZW] CM* WJ; +LB11.1: [^SP] CM* WJ; LB11.2: SP WJ; LB11.3: WJ CM* [^CM]; @@ -147,12 +147,14 @@ LB19: . CM* QU; LB19.1: QU CM* [^CM]; # LB 20 Break before and after CB. -# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ. +# Interaction with LB8a: ZWJ x . is tricky because CM includes ZWJ. # ZWJ acts like a CM to the left, combining with CB. -# ZWJ acts independently to the right, no break from ID by LB8a. -LB20: . CM* ÷ CB; -LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK); -LB20.1b: CB CM* ÷; +# ZWJ acts independently to the right, no break after by LB8a. +LB20.1: . CM* ZWJ CB; +LB20.2: . CM* ÷ CB; + +LB20.3: CB CM* ZWJ [^CM]; +LB20.4: CB CM* ÷; # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then # not picking up the continuing match after the BA from 21a. @@ -199,15 +201,15 @@ LB29: IS CM* (AL | HL); LB30.1: (AL | CM | HL | NU) CM* OP; LB30.2: CP CM* (AL | HL | NU); -# LB31 keep pairs of RI together. -LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS]; -LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK); -LB30a.3: RI CM* RI CM* ÷; +# LB30a keep pairs of RI together. +LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; +LB30a.2: RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; +LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?; # LB30b Do not break between Emoji Base and Emoji Modifier LB30b: EB CM* EM; # LB31 Break Everywhere Else. # Include combining marks -LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK); +LB31.1: . CM* ZWJ [^CM]; LB31.2: . CM* ÷; diff --git a/icu4c/source/test/testdata/break_rules/line_normal_cj.txt b/icu4c/source/test/testdata/break_rules/line_normal_cj.txt index 5a47d8ea7c1..3af4a582db1 100644 --- a/icu4c/source/test/testdata/break_rules/line_normal_cj.txt +++ b/icu4c/source/test/testdata/break_rules/line_normal_cj.txt @@ -5,7 +5,8 @@ # # file: line_normal_cj.txt # -# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest +# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest. +# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0 # # Note: Rule syntax and the monkey test itself are still a work in progress. # They are expected to change with review and the addition of support for rule tailoring. @@ -75,9 +76,6 @@ XX = [:LineBreak = Unknown:]; ZW = [:LineBreak = ZWSpace:]; ZWJ = [:LineBreak = ZWJ:]; -EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]]; -Extended_Pict = [:ExtPict:]; - # LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes AL = [AL AI SG XX ]; dictionary = SA; @@ -116,8 +114,10 @@ LB7.2: [ZW SP] [SP ZW]; LB8: ZW ÷; # LB8a -# ZWJ x (ID | Extended_Pict | EmojiNRK) -LB8a: ZWJ (ID | Extended_Pict | EmojiNRK); +# ZWJ x +# Don't match a CM on the right - let other rules pick up CM sequences, where +# the ZWJ behaves as just another generic CM. +LB8a: ZWJ [^CM]; # LB9: X CM -> X @@ -126,7 +126,7 @@ LB8a: ZWJ (ID | Extended_Pict | EmojiNRK); #LB11: × WJ; # WJ × -LB11.1: [^BK CR LF NL SP ZW] CM* WJ; +LB11.1: [^SP] CM* WJ; LB11.2: SP WJ; LB11.3: WJ CM* [^CM]; @@ -152,12 +152,14 @@ LB19: . CM* QU; LB19.1: QU CM* [^CM]; # LB 20 Break before and after CB. -# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ. +# Interaction with LB8a: ZWJ x . is tricky because CM includes ZWJ. # ZWJ acts like a CM to the left, combining with CB. -# ZWJ acts independently to the right, no break from ID by LB8a. -LB20: . CM* ÷ CB; -LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK); -LB20.1b: CB CM* ÷; +# ZWJ acts independently to the right, no break after by LB8a. +LB20.1: . CM* ZWJ CB; +LB20.2: . CM* ÷ CB; + +LB20.3: CB CM* ZWJ [^CM]; +LB20.4: CB CM* ÷; # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then # not picking up the continuing match after the BA from 21a. @@ -208,15 +210,15 @@ LB29: IS CM* (AL | HL); LB30.1: (AL | CM | HL | NU) CM* OP; LB30.2: CP CM* (AL | HL | NU); -# LB31 keep pairs of RI together. -LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS]; -LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK); -LB30a.3: RI CM* RI CM* ÷; +# LB30a keep pairs of RI together. +LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; +LB30a.2: RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; +LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?; # LB30b Do not break between Emoji Base and Emoji Modifier LB30b: EB CM* EM; # LB31 Break Everywhere Else. # Include combining marks -LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK); +LB31.1: . CM* ZWJ [^CM]; LB31.2: . CM* ÷; -- 2.40.0