ICU-20401 rbbi break rules, update comments to match current UAX versions.

author Andy Heninger <andy.heninger@gmail.com>
Thu, 7 Feb 2019 02:01:04 +0000 (18:01 -0800)
committer Andy Heninger <andy.heninger@gmail.com>
Fri, 8 Feb 2019 20:53:58 +0000 (12:53 -0800)
diff --git a/icu4c/source/data/brkitr/rules/char.txt b/icu4c/source/data/brkitr/rules/char.txt

index 21c6852470b9850fef1abaf55bbcddd49f8a1f2d..973207a6e450d73b1610a4d6389cd34c1a772e31 100644 (file)
--- a/icu4c/source/data/brkitr/rules/char.txt
+++ b/icu4c/source/data/brkitr/rules/char.txt
@@ -6,11 +6,9 @@
  #
  #   file:  char.txt
  #
-#   ICU Character Break Rules, also known as Grapheme Cluster Boundaries
-#      See Unicode Standard Annex #29.
-#      These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
-#      Plus revisions to rule GB 11 from http://unicode.org/cldr/trac/ticket/10088
-#      Plus additional characters introduces with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
+#   ICU Character Break Rules
+#      These rules are based on the Extended Grapheme Cluster rules from
+#      Unicode UAX #29 Revision 34 for Unicode Version 12.0
  
  !!quoted_literals_only;
  
@@ -20,9 +18,6 @@
  $CR          = [\p{Grapheme_Cluster_Break = CR}];
  $LF          = [\p{Grapheme_Cluster_Break = LF}];
  $Control     = [[\p{Grapheme_Cluster_Break = Control}]];
-# TODO: Enable Virama & LinkingConsonant definitions once rule builder allows empty sets.
-#$Virama      = [[\p{Grapheme_Cluster_Break = Virama}]];
-#$LinkingConsonant = [[\p{Grapheme_Cluster_Break = LinkingConsonant}]];
  $Extend      = [[\p{Grapheme_Cluster_Break = Extend}]];
  $ZWJ         = [\p{Grapheme_Cluster_Break = ZWJ}];
  $Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
@@ -56,7 +51,7 @@ $L ($L | $V | $LV | $LVT);
  # GB 9
  [^$Control $CR $LF] ($Extend | $ZWJ);
  
-# GB 9a (only for extended grapheme clusters)
+# GB 9a
  [^$Control $CR $LF] $SpacingMark;
  
  # GB 9b
diff --git a/icu4c/source/data/brkitr/rules/line.txt b/icu4c/source/data/brkitr/rules/line.txt

index cca19772dbaace3cd394d414e38a60704dafc057..17f648afdec23d65292dd6c267f160e3ddbe5842 100644 (file)
--- a/icu4c/source/data/brkitr/rules/line.txt
+++ b/icu4c/source/data/brkitr/rules/line.txt
@@ -7,7 +7,7 @@
  #
  #         Line Breaking Rules
  #         Implement default line breaking as defined by
-#         Unicode Standard Annex #14 Revision 40 for Unicode 11.0
+#         Unicode Standard Annex #14 Revision 42 for Unicode 12.0
  #         http://www.unicode.org/reports/tr14/, with the following modification:
  #
  #         Boundaries between hyphens and following letters are suppressed when
diff --git a/icu4c/source/data/brkitr/rules/line_cj.txt b/icu4c/source/data/brkitr/rules/line_cj.txt

index b59059e04dd7f3a54521d6ecb8435ad4b19a17a9..e120bf45c746f4a4eb84cdf9524f25441620afdd 100644 (file)
--- a/icu4c/source/data/brkitr/rules/line_cj.txt
+++ b/icu4c/source/data/brkitr/rules/line_cj.txt
@@ -7,7 +7,7 @@
  #
  #         Line Breaking Rules
  #         Implement default line breaking as defined by
-#         Unicode Standard Annex #14 Revision 40 for Unicode 11.0
+#         Unicode Standard Annex #14 Revision 42 for Unicode 12.0
  #         http://www.unicode.org/reports/tr14/, with the following modification:
  #
  #         Boundaries between hyphens and following letters are suppressed when
diff --git a/icu4c/source/data/brkitr/rules/line_loose.txt b/icu4c/source/data/brkitr/rules/line_loose.txt

index 1d46d625c6778e4600daeeb3178cdf9530844eae..bf36b205969a324e6c5d0f00595023ba73f57487 100644 (file)
--- a/icu4c/source/data/brkitr/rules/line_loose.txt
+++ b/icu4c/source/data/brkitr/rules/line_loose.txt
@@ -8,7 +8,7 @@
  #
  #         Line Breaking Rules
  #         Implement default line breaking as defined by
-#         Unicode Standard Annex #14 Revision 40 for Unicode 11.0
+#         Unicode Standard Annex #14 Revision 42 for Unicode 12.0
  #         http://www.unicode.org/reports/tr14/, with the following modification:
  #
  #         Boundaries between hyphens and following letters are suppressed when
diff --git a/icu4c/source/data/brkitr/rules/line_loose_cj.txt b/icu4c/source/data/brkitr/rules/line_loose_cj.txt

index b70538f7c0570f442ef1029be40456d63d0c1643..ec60fed90c2a4c2f5bcbaeb4dddf91ec596fd922 100644 (file)
--- a/icu4c/source/data/brkitr/rules/line_loose_cj.txt
+++ b/icu4c/source/data/brkitr/rules/line_loose_cj.txt
@@ -7,7 +7,7 @@
  #
  #         Line Breaking Rules
  #         Implement default line breaking as defined by
-#         Unicode Standard Annex #14 Revision 40 for Unicode 11.0
+#         Unicode Standard Annex #14 Revision 42 for Unicode 12.0
  #         http://www.unicode.org/reports/tr14/, with the following modification:
  #
  #         Boundaries between hyphens and following letters are suppressed when
diff --git a/icu4c/source/data/brkitr/rules/line_normal.txt b/icu4c/source/data/brkitr/rules/line_normal.txt

index b753a4c00119da273f53d33399b675f4ba8874dc..e280c32186a2527191532bad3eea1ba0c5552be8 100644 (file)
--- a/icu4c/source/data/brkitr/rules/line_normal.txt
+++ b/icu4c/source/data/brkitr/rules/line_normal.txt
@@ -7,7 +7,7 @@
  #
  #         Line Breaking Rules
  #         Implement default line breaking as defined by
-#         Unicode Standard Annex #14 Revision 40 for Unicode 11.0
+#         Unicode Standard Annex #14 Revision 42 for Unicode 12.0
  #         http://www.unicode.org/reports/tr14/, with the following modification:
  #
  #         Boundaries between hyphens and following letters are suppressed when
diff --git a/icu4c/source/data/brkitr/rules/line_normal_cj.txt b/icu4c/source/data/brkitr/rules/line_normal_cj.txt

index ebaf3aee0a01d35890cebbd302d7f13dab072adf..aee65ddb8f7c6e78f480ee5c39fb8d55fe0add5b 100644 (file)
--- a/icu4c/source/data/brkitr/rules/line_normal_cj.txt
+++ b/icu4c/source/data/brkitr/rules/line_normal_cj.txt
@@ -7,7 +7,7 @@
  #
  #         Line Breaking Rules
  #         Implement default line breaking as defined by
-#         Unicode Standard Annex #14 Revision 40 for Unicode 11.0
+#         Unicode Standard Annex #14 Revision 42 for Unicode 12.0
  #         http://www.unicode.org/reports/tr14/, with the following modification:
  #
  #         Boundaries between hyphens and following letters are suppressed when
diff --git a/icu4c/source/data/brkitr/rules/sent.txt b/icu4c/source/data/brkitr/rules/sent.txt

index 41fd3fcff906254b88f6c374c52ddcf3e70d9577..eb1224ea5ee6c1fcdcb47f8f6a401bb34a2f0be4 100644 (file)
--- a/icu4c/source/data/brkitr/rules/sent.txt
+++ b/icu4c/source/data/brkitr/rules/sent.txt
@@ -8,7 +8,7 @@
  #
  #   ICU Sentence Break Rules
  #      See Unicode Standard Annex #29.
-#      These rules are based on UAX #29 Revision 26 for Unicode Version 8.0
+#      These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
  #
  
  !!quoted_literals_only;
@@ -34,7 +34,7 @@ $Close     = [\p{Sentence_Break = Close}];
  #
  # Define extended forms of the character classes,
  #   incorporate trailing Extend or Format chars.
-#   Rules 4 and 5.  
+#   Rules 4 and 5.
  
  $SpEx       = $Sp      ($Extend | $Format)*;
  $LowerEx    = $Lower   ($Extend | $Format)*;
@@ -78,6 +78,6 @@ $ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower;
  #Rule 9, 10, 11
  ($STermEx | $ATermEx) $CloseEx* $SpEx* ($Sep | $CR | $LF)?;
  
-#Rule 12
+#Rule 998
  [[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* .;
  [[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep $LF $CR {eof}] | $CR $LF){100};
diff --git a/icu4c/source/data/brkitr/rules/sent_el.txt b/icu4c/source/data/brkitr/rules/sent_el.txt

index 079698251241c5215e2944441c3810dc69ca13e9..632887f74bbf9f6a38f851ad51e2ae2836d2c3cd 100644 (file)
--- a/icu4c/source/data/brkitr/rules/sent_el.txt
+++ b/icu4c/source/data/brkitr/rules/sent_el.txt
@@ -1,7 +1,6 @@
  # Copyright (C) 2016 and later: Unicode, Inc. and others.
  # License & terms of use: http://www.unicode.org/copyright.html
  #
-#
  #   Copyright (C) 2002-2015, International Business Machines Corporation and others.
  #       All Rights Reserved.
  #
@@ -9,7 +8,7 @@
  #
  #   ICU Sentence Break Rules
  #      See Unicode Standard Annex #29.
-#      These rules are based on UAX #29 Revision 26 for Unicode Version 8.0
+#      These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
  #
  
  !!quoted_literals_only;
@@ -35,7 +34,7 @@ $Close     = [\p{Sentence_Break = Close}];
  #
  # Define extended forms of the character classes,
  #   incorporate trailing Extend or Format chars.
-#   Rules 4 and 5.  
+#   Rules 4 and 5.
  
  $SpEx       = $Sp      ($Extend | $Format)*;
  $LowerEx    = $Lower   ($Extend | $Format)*;
@@ -79,6 +78,6 @@ $ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower;
  #Rule 9, 10, 11
  ($STermEx | $ATermEx) $CloseEx* $SpEx* ($Sep | $CR | $LF)?;
  
-#Rule 12
+#Rule 998
  [[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* .;
  [[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep $LF $CR {eof}] | $CR $LF){100};
diff --git a/icu4c/source/data/brkitr/rules/word.txt b/icu4c/source/data/brkitr/rules/word.txt

index 3027574d25d5602ea73e05001e2106055e7e8b76..d6fb5adc13387977e11927b323f91655cd37362e 100644 (file)
--- a/icu4c/source/data/brkitr/rules/word.txt
+++ b/icu4c/source/data/brkitr/rules/word.txt
@@ -8,9 +8,7 @@
  #
  # ICU Word Break Rules
  #      See Unicode Standard Annex #29.
-#      These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
-#      with additions for Emoji Sequences from https://goo.gl/cluFCn
-#      Plus additional characters introduces with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
+#      These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
  #
  # Note:  Updates to word.txt will usually need to be merged into
  #        word_POSIX.txt also.
@@ -58,7 +56,7 @@ $Hiragana           = [:Hiragana:];
  #   5.0 or later as the definition of Complex_Context was corrected to include all
  #   characters requiring dictionary break.
  
-$Control        = [\p{Grapheme_Cluster_Break = Control}]; 
+$Control        = [\p{Grapheme_Cluster_Break = Control}];
  $HangulSyllable = [\uac00-\ud7a3];
  $ComplexContext = [:LineBreak = Complex_Context:];
  $KanaKanji      = [$Han $Hiragana $Katakana];
@@ -70,7 +68,7 @@ $ALetterPlus  = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
  
  
  #
-#  Rules 4    Ignore Format and Extend characters, 
+#  Rules 4    Ignore Format and Extend characters,
  #             except when they appear at the beginning of a region of text.
  #
  # TODO: check if handling of katakana in dictionary makes rules incorrect/void
@@ -148,7 +146,7 @@ $NumericEx $NumericEx {100};
  
  $NumericEx ($ALetterEx | $Hebrew_LetterEx) {200};
  
-# rule 11 and 12 
+# rule 11 and 12
  
  $NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100};
  
@@ -180,7 +178,7 @@ $ExtendNumLetEx  $KatakanaEx     {400};    #  (13b)
  
  # special handling for CJK characters: chain for later dictionary segmentation
  $HangulSyllable $HangulSyllable {200};
-$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found 
+$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
  
  # Rule 999
  #     Match a single code point if no other rule applies.
diff --git a/icu4c/source/data/brkitr/rules/word_POSIX.txt b/icu4c/source/data/brkitr/rules/word_POSIX.txt

index bcf127a42aa7af7ded502e69f0d6127215305377..bf6013951d45b26e4e1ccf6f541fa3edfdb8bc7b 100644 (file)
--- a/icu4c/source/data/brkitr/rules/word_POSIX.txt
+++ b/icu4c/source/data/brkitr/rules/word_POSIX.txt
@@ -8,9 +8,7 @@
  #
  # ICU Word Break Rules, POSIX locale.
  #      See Unicode Standard Annex #29.
-#      These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
-#      with additions for Emoji Sequences from https://goo.gl/cluFCn
-#      Plus additional characters introduces with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
+#      These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
  #
  # Note:  Updates to word.txt will usually need to be merged into
  #        word_POSIX.txt also.
author	Andy Heninger <andy.heninger@gmail.com>
	Thu, 7 Feb 2019 02:01:04 +0000 (18:01 -0800)
committer	Andy Heninger <andy.heninger@gmail.com>
	Fri, 8 Feb 2019 20:53:58 +0000 (12:53 -0800)
icu4c/source/data/brkitr/rules/char.txt		patch | blob | history
icu4c/source/data/brkitr/rules/line.txt		patch | blob | history
icu4c/source/data/brkitr/rules/line_cj.txt		patch | blob | history
icu4c/source/data/brkitr/rules/line_loose.txt		patch | blob | history
icu4c/source/data/brkitr/rules/line_loose_cj.txt		patch | blob | history
icu4c/source/data/brkitr/rules/line_normal.txt		patch | blob | history
icu4c/source/data/brkitr/rules/line_normal_cj.txt		patch | blob | history
icu4c/source/data/brkitr/rules/sent.txt		patch | blob | history
icu4c/source/data/brkitr/rules/sent_el.txt		patch | blob | history
icu4c/source/data/brkitr/rules/word.txt		patch | blob | history
icu4c/source/data/brkitr/rules/word_POSIX.txt		patch | blob | history