From: Andy Heninger <andy.heninger@gmail.com>
Date: Thu, 7 Feb 2019 02:01:04 +0000 (-0800)
Subject: ICU-20401 rbbi break rules, update comments to match current UAX versions.
X-Git-Tag: release-64-rc~107
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=b50f97a58a61e9a85c7d0ad522c68590ba1fedec;p=icu

ICU-20401 rbbi break rules, update comments to match current UAX versions.
---

diff --git a/icu4c/source/data/brkitr/rules/char.txt b/icu4c/source/data/brkitr/rules/char.txt
index 21c6852470b..973207a6e45 100644
--- a/icu4c/source/data/brkitr/rules/char.txt
+++ b/icu4c/source/data/brkitr/rules/char.txt
@@ -6,11 +6,9 @@
 #
 #   file:  char.txt
 #
-#   ICU Character Break Rules, also known as Grapheme Cluster Boundaries
-#      See Unicode Standard Annex #29.
-#      These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
-#      Plus revisions to rule GB 11 from http://unicode.org/cldr/trac/ticket/10088
-#      Plus additional characters introduces with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
+#   ICU Character Break Rules
+#      These rules are based on the Extended Grapheme Cluster rules from
+#      Unicode UAX #29 Revision 34 for Unicode Version 12.0
 
 !!quoted_literals_only;
 
@@ -20,9 +18,6 @@
 $CR          = [\p{Grapheme_Cluster_Break = CR}];
 $LF          = [\p{Grapheme_Cluster_Break = LF}];
 $Control     = [[\p{Grapheme_Cluster_Break = Control}]];
-# TODO: Enable Virama & LinkingConsonant definitions once rule builder allows empty sets.
-#$Virama      = [[\p{Grapheme_Cluster_Break = Virama}]];
-#$LinkingConsonant = [[\p{Grapheme_Cluster_Break = LinkingConsonant}]];
 $Extend      = [[\p{Grapheme_Cluster_Break = Extend}]];
 $ZWJ         = [\p{Grapheme_Cluster_Break = ZWJ}];
 $Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
@@ -56,7 +51,7 @@ $L ($L | $V | $LV | $LVT);
 # GB 9
 [^$Control $CR $LF] ($Extend | $ZWJ);
 
-# GB 9a (only for extended grapheme clusters)
+# GB 9a
 [^$Control $CR $LF] $SpacingMark;
 
 # GB 9b
diff --git a/icu4c/source/data/brkitr/rules/line.txt b/icu4c/source/data/brkitr/rules/line.txt
index cca19772dba..17f648afdec 100644
--- a/icu4c/source/data/brkitr/rules/line.txt
+++ b/icu4c/source/data/brkitr/rules/line.txt
@@ -7,7 +7,7 @@
 #
 #         Line Breaking Rules
 #         Implement default line breaking as defined by
-#         Unicode Standard Annex #14 Revision 40 for Unicode 11.0
+#         Unicode Standard Annex #14 Revision 42 for Unicode 12.0
 #         http://www.unicode.org/reports/tr14/, with the following modification:
 #
 #         Boundaries between hyphens and following letters are suppressed when
diff --git a/icu4c/source/data/brkitr/rules/line_cj.txt b/icu4c/source/data/brkitr/rules/line_cj.txt
index b59059e04dd..e120bf45c74 100644
--- a/icu4c/source/data/brkitr/rules/line_cj.txt
+++ b/icu4c/source/data/brkitr/rules/line_cj.txt
@@ -7,7 +7,7 @@
 #
 #         Line Breaking Rules
 #         Implement default line breaking as defined by
-#         Unicode Standard Annex #14 Revision 40 for Unicode 11.0
+#         Unicode Standard Annex #14 Revision 42 for Unicode 12.0
 #         http://www.unicode.org/reports/tr14/, with the following modification:
 #
 #         Boundaries between hyphens and following letters are suppressed when
diff --git a/icu4c/source/data/brkitr/rules/line_loose.txt b/icu4c/source/data/brkitr/rules/line_loose.txt
index 1d46d625c67..bf36b205969 100644
--- a/icu4c/source/data/brkitr/rules/line_loose.txt
+++ b/icu4c/source/data/brkitr/rules/line_loose.txt
@@ -8,7 +8,7 @@
 #
 #         Line Breaking Rules
 #         Implement default line breaking as defined by
-#         Unicode Standard Annex #14 Revision 40 for Unicode 11.0
+#         Unicode Standard Annex #14 Revision 42 for Unicode 12.0
 #         http://www.unicode.org/reports/tr14/, with the following modification:
 #
 #         Boundaries between hyphens and following letters are suppressed when
diff --git a/icu4c/source/data/brkitr/rules/line_loose_cj.txt b/icu4c/source/data/brkitr/rules/line_loose_cj.txt
index b70538f7c05..ec60fed90c2 100644
--- a/icu4c/source/data/brkitr/rules/line_loose_cj.txt
+++ b/icu4c/source/data/brkitr/rules/line_loose_cj.txt
@@ -7,7 +7,7 @@
 #
 #         Line Breaking Rules
 #         Implement default line breaking as defined by
-#         Unicode Standard Annex #14 Revision 40 for Unicode 11.0
+#         Unicode Standard Annex #14 Revision 42 for Unicode 12.0
 #         http://www.unicode.org/reports/tr14/, with the following modification:
 #
 #         Boundaries between hyphens and following letters are suppressed when
diff --git a/icu4c/source/data/brkitr/rules/line_normal.txt b/icu4c/source/data/brkitr/rules/line_normal.txt
index b753a4c0011..e280c32186a 100644
--- a/icu4c/source/data/brkitr/rules/line_normal.txt
+++ b/icu4c/source/data/brkitr/rules/line_normal.txt
@@ -7,7 +7,7 @@
 #
 #         Line Breaking Rules
 #         Implement default line breaking as defined by
-#         Unicode Standard Annex #14 Revision 40 for Unicode 11.0
+#         Unicode Standard Annex #14 Revision 42 for Unicode 12.0
 #         http://www.unicode.org/reports/tr14/, with the following modification:
 #
 #         Boundaries between hyphens and following letters are suppressed when
diff --git a/icu4c/source/data/brkitr/rules/line_normal_cj.txt b/icu4c/source/data/brkitr/rules/line_normal_cj.txt
index ebaf3aee0a0..aee65ddb8f7 100644
--- a/icu4c/source/data/brkitr/rules/line_normal_cj.txt
+++ b/icu4c/source/data/brkitr/rules/line_normal_cj.txt
@@ -7,7 +7,7 @@
 #
 #         Line Breaking Rules
 #         Implement default line breaking as defined by
-#         Unicode Standard Annex #14 Revision 40 for Unicode 11.0
+#         Unicode Standard Annex #14 Revision 42 for Unicode 12.0
 #         http://www.unicode.org/reports/tr14/, with the following modification:
 #
 #         Boundaries between hyphens and following letters are suppressed when
diff --git a/icu4c/source/data/brkitr/rules/sent.txt b/icu4c/source/data/brkitr/rules/sent.txt
index 41fd3fcff90..eb1224ea5ee 100644
--- a/icu4c/source/data/brkitr/rules/sent.txt
+++ b/icu4c/source/data/brkitr/rules/sent.txt
@@ -8,7 +8,7 @@
 #
 #   ICU Sentence Break Rules
 #      See Unicode Standard Annex #29.
-#      These rules are based on UAX #29 Revision 26 for Unicode Version 8.0
+#      These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
 #
 
 !!quoted_literals_only;
@@ -34,7 +34,7 @@ $Close     = [\p{Sentence_Break = Close}];
 #
 # Define extended forms of the character classes,
 #   incorporate trailing Extend or Format chars.
-#   Rules 4 and 5.  
+#   Rules 4 and 5.
 
 $SpEx       = $Sp      ($Extend | $Format)*;
 $LowerEx    = $Lower   ($Extend | $Format)*;
@@ -78,6 +78,6 @@ $ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower;
 #Rule 9, 10, 11
 ($STermEx | $ATermEx) $CloseEx* $SpEx* ($Sep | $CR | $LF)?;
 
-#Rule 12
+#Rule 998
 [[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* .;
 [[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep $LF $CR {eof}] | $CR $LF){100};
diff --git a/icu4c/source/data/brkitr/rules/sent_el.txt b/icu4c/source/data/brkitr/rules/sent_el.txt
index 07969825124..632887f74bb 100644
--- a/icu4c/source/data/brkitr/rules/sent_el.txt
+++ b/icu4c/source/data/brkitr/rules/sent_el.txt
@@ -1,7 +1,6 @@
 # Copyright (C) 2016 and later: Unicode, Inc. and others.
 # License & terms of use: http://www.unicode.org/copyright.html
 #
-#
 #   Copyright (C) 2002-2015, International Business Machines Corporation and others.
 #       All Rights Reserved.
 #
@@ -9,7 +8,7 @@
 #
 #   ICU Sentence Break Rules
 #      See Unicode Standard Annex #29.
-#      These rules are based on UAX #29 Revision 26 for Unicode Version 8.0
+#      These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
 #
 
 !!quoted_literals_only;
@@ -35,7 +34,7 @@ $Close     = [\p{Sentence_Break = Close}];
 #
 # Define extended forms of the character classes,
 #   incorporate trailing Extend or Format chars.
-#   Rules 4 and 5.  
+#   Rules 4 and 5.
 
 $SpEx       = $Sp      ($Extend | $Format)*;
 $LowerEx    = $Lower   ($Extend | $Format)*;
@@ -79,6 +78,6 @@ $ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower;
 #Rule 9, 10, 11
 ($STermEx | $ATermEx) $CloseEx* $SpEx* ($Sep | $CR | $LF)?;
 
-#Rule 12
+#Rule 998
 [[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* .;
 [[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep $LF $CR {eof}] | $CR $LF){100};
diff --git a/icu4c/source/data/brkitr/rules/word.txt b/icu4c/source/data/brkitr/rules/word.txt
index 3027574d25d..d6fb5adc133 100644
--- a/icu4c/source/data/brkitr/rules/word.txt
+++ b/icu4c/source/data/brkitr/rules/word.txt
@@ -8,9 +8,7 @@
 #
 # ICU Word Break Rules
 #      See Unicode Standard Annex #29.
-#      These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
-#      with additions for Emoji Sequences from https://goo.gl/cluFCn
-#      Plus additional characters introduces with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
+#      These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
 #
 # Note:  Updates to word.txt will usually need to be merged into
 #        word_POSIX.txt also.
@@ -58,7 +56,7 @@ $Hiragana           = [:Hiragana:];
 #   5.0 or later as the definition of Complex_Context was corrected to include all
 #   characters requiring dictionary break.
 
-$Control        = [\p{Grapheme_Cluster_Break = Control}]; 
+$Control        = [\p{Grapheme_Cluster_Break = Control}];
 $HangulSyllable = [\uac00-\ud7a3];
 $ComplexContext = [:LineBreak = Complex_Context:];
 $KanaKanji      = [$Han $Hiragana $Katakana];
@@ -70,7 +68,7 @@ $ALetterPlus  = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
 
 
 #
-#  Rules 4    Ignore Format and Extend characters, 
+#  Rules 4    Ignore Format and Extend characters,
 #             except when they appear at the beginning of a region of text.
 #
 # TODO: check if handling of katakana in dictionary makes rules incorrect/void
@@ -148,7 +146,7 @@ $NumericEx $NumericEx {100};
 
 $NumericEx ($ALetterEx | $Hebrew_LetterEx) {200};
 
-# rule 11 and 12 
+# rule 11 and 12
 
 $NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100};
 
@@ -180,7 +178,7 @@ $ExtendNumLetEx  $KatakanaEx     {400};    #  (13b)
 
 # special handling for CJK characters: chain for later dictionary segmentation
 $HangulSyllable $HangulSyllable {200};
-$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found 
+$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
 
 # Rule 999
 #     Match a single code point if no other rule applies.
diff --git a/icu4c/source/data/brkitr/rules/word_POSIX.txt b/icu4c/source/data/brkitr/rules/word_POSIX.txt
index bcf127a42aa..bf6013951d4 100644
--- a/icu4c/source/data/brkitr/rules/word_POSIX.txt
+++ b/icu4c/source/data/brkitr/rules/word_POSIX.txt
@@ -8,9 +8,7 @@
 #
 # ICU Word Break Rules, POSIX locale.
 #      See Unicode Standard Annex #29.
-#      These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
-#      with additions for Emoji Sequences from https://goo.gl/cluFCn
-#      Plus additional characters introduces with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
+#      These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
 #
 # Note:  Updates to word.txt will usually need to be merged into
 #        word_POSIX.txt also.