ICU-13770 RBBI Line break rules update, work in progress.

author Andy Heninger <andy.heninger@gmail.com>

Thu, 17 May 2018 18:07:43 +0000 (18:07 +0000)

committer Andy Heninger <andy.heninger@gmail.com>

Thu, 17 May 2018 18:07:43 +0000 (18:07 +0000)
author Andy Heninger <andy.heninger@gmail.com>
Thu, 17 May 2018 18:07:43 +0000 (18:07 +0000)
committer Andy Heninger <andy.heninger@gmail.com>
Thu, 17 May 2018 18:07:43 +0000 (18:07 +0000)
diff --git a/icu4c/source/data/brkitr/rules/line.txt b/icu4c/source/data/brkitr/rules/line.txt

index ce6e149d2a0a8d0874328b50bcfa06a31defeaf0..cf945b46f579d902de41fbca392615933d8f165d 100644 (file)
--- a/icu4c/source/data/brkitr/rules/line.txt
+++ b/icu4c/source/data/brkitr/rules/line.txt
@@ -7,12 +7,9 @@
  #
  #         Line Breaking Rules
  #         Implement default line breaking as defined by
-#         Unicode Standard Annex #14 Revision 37 for Unicode 9.0
+#         Unicode Standard Annex #14 Revision 40 for Unicode 11.0
  #         http://www.unicode.org/reports/tr14/
  #
-#         Includes extensions to the handling of emoji ZWJ sequences from
-#         https://goo.gl/cluFCn
-#
  #         TODO:  Rule LB 8 remains as it was in Unicode 5.2
  #         This is only because of a limitation of ICU break engine implementation,
  #         not because the older behavior is desirable.
@@ -321,7 +318,10 @@ $CP $CM* ($ALPlus | $HL | $NU);
  #         Tricky interaction with LB8a: ZWJ x .   together with ZWJ acting like a CM.
  $RI $CM* $RI                 / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
  $RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
-$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ]?;
+$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}];
+# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
+#       because of the chain-out behavior difference. The rule must chain out only from the [set characters],
+#       not from the preceding $RI or $CM, which it would be able to do if the set were optional.
  
  # LB 30b Do not break between an Emoji Base and an Emoji Modifier
  $EB $CM* $EM;
diff --git a/icu4c/source/test/testdata/break_rules/line.txt b/icu4c/source/test/testdata/break_rules/line.txt

index 5ccc8b6febc2970ecbbbb8d7a19538cf969aa5de..31d27b90dd4531ec586c050ca1c41be35e017e27 100644 (file)
--- a/icu4c/source/test/testdata/break_rules/line.txt
+++ b/icu4c/source/test/testdata/break_rules/line.txt
@@ -5,7 +5,8 @@
  
  # file: line.txt
  #
-# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
+# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
+# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
  #
  # Note: Rule syntax and the monkey test itself are still a work in progress.
  #       They are expected to change with review and the addition of support for rule tailoring.
author	Andy Heninger <andy.heninger@gmail.com>
	Thu, 17 May 2018 18:07:43 +0000 (18:07 +0000)
committer	Andy Heninger <andy.heninger@gmail.com>
	Thu, 17 May 2018 18:07:43 +0000 (18:07 +0000)
icu4c/source/data/brkitr/rules/line.txt		patch | blob | history
icu4c/source/test/testdata/break_rules/line.txt		patch | blob | history