From 6e40252eb3b16e98c9e76f9e10e8f13411ab176a Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Thu, 17 May 2018 18:07:43 +0000 Subject: [PATCH] ICU-13770 RBBI Line break rules update, work in progress. X-SVN-Rev: 41404 --- icu4c/source/data/brkitr/rules/line.txt | 10 +++++----- icu4c/source/test/testdata/break_rules/line.txt | 3 ++- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/icu4c/source/data/brkitr/rules/line.txt b/icu4c/source/data/brkitr/rules/line.txt index ce6e149d2a0..cf945b46f57 100644 --- a/icu4c/source/data/brkitr/rules/line.txt +++ b/icu4c/source/data/brkitr/rules/line.txt @@ -7,12 +7,9 @@ # # Line Breaking Rules # Implement default line breaking as defined by -# Unicode Standard Annex #14 Revision 37 for Unicode 9.0 +# Unicode Standard Annex #14 Revision 40 for Unicode 11.0 # http://www.unicode.org/reports/tr14/ # -# Includes extensions to the handling of emoji ZWJ sequences from -# https://goo.gl/cluFCn -# # TODO: Rule LB 8 remains as it was in Unicode 5.2 # This is only because of a limitation of ICU break engine implementation, # not because the older behavior is desirable. @@ -321,7 +318,10 @@ $CP $CM* ($ALPlus | $HL | $NU); # Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM. $RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]]; $RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]]; -$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ]?; +$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}]; +# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?' +# because of the chain-out behavior difference. The rule must chain out only from the [set characters], +# not from the preceding $RI or $CM, which it would be able to do if the set were optional. # LB 30b Do not break between an Emoji Base and an Emoji Modifier $EB $CM* $EM; diff --git a/icu4c/source/test/testdata/break_rules/line.txt b/icu4c/source/test/testdata/break_rules/line.txt index 5ccc8b6febc..31d27b90dd4 100644 --- a/icu4c/source/test/testdata/break_rules/line.txt +++ b/icu4c/source/test/testdata/break_rules/line.txt @@ -5,7 +5,8 @@ # file: line.txt # -# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest +# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest. +# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0 # # Note: Rule syntax and the monkey test itself are still a work in progress. # They are expected to change with review and the addition of support for rule tailoring. -- 2.40.0