From: Andy Heninger Date: Thu, 7 Feb 2019 02:01:04 +0000 (-0800) Subject: ICU-20401 rbbi break rules, update comments to match current UAX versions. X-Git-Tag: release-64-rc~107 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=b50f97a58a61e9a85c7d0ad522c68590ba1fedec;p=icu ICU-20401 rbbi break rules, update comments to match current UAX versions. --- diff --git a/icu4c/source/data/brkitr/rules/char.txt b/icu4c/source/data/brkitr/rules/char.txt index 21c6852470b..973207a6e45 100644 --- a/icu4c/source/data/brkitr/rules/char.txt +++ b/icu4c/source/data/brkitr/rules/char.txt @@ -6,11 +6,9 @@ # # file: char.txt # -# ICU Character Break Rules, also known as Grapheme Cluster Boundaries -# See Unicode Standard Annex #29. -# These rules are based on UAX #29 Revision 29 for Unicode Version 9.0 -# Plus revisions to rule GB 11 from http://unicode.org/cldr/trac/ticket/10088 -# Plus additional characters introduces with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html +# ICU Character Break Rules +# These rules are based on the Extended Grapheme Cluster rules from +# Unicode UAX #29 Revision 34 for Unicode Version 12.0 !!quoted_literals_only; @@ -20,9 +18,6 @@ $CR = [\p{Grapheme_Cluster_Break = CR}]; $LF = [\p{Grapheme_Cluster_Break = LF}]; $Control = [[\p{Grapheme_Cluster_Break = Control}]]; -# TODO: Enable Virama & LinkingConsonant definitions once rule builder allows empty sets. -#$Virama = [[\p{Grapheme_Cluster_Break = Virama}]]; -#$LinkingConsonant = [[\p{Grapheme_Cluster_Break = LinkingConsonant}]]; $Extend = [[\p{Grapheme_Cluster_Break = Extend}]]; $ZWJ = [\p{Grapheme_Cluster_Break = ZWJ}]; $Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}]; @@ -56,7 +51,7 @@ $L ($L | $V | $LV | $LVT); # GB 9 [^$Control $CR $LF] ($Extend | $ZWJ); -# GB 9a (only for extended grapheme clusters) +# GB 9a [^$Control $CR $LF] $SpacingMark; # GB 9b diff --git a/icu4c/source/data/brkitr/rules/line.txt b/icu4c/source/data/brkitr/rules/line.txt index cca19772dba..17f648afdec 100644 --- a/icu4c/source/data/brkitr/rules/line.txt +++ b/icu4c/source/data/brkitr/rules/line.txt @@ -7,7 +7,7 @@ # # Line Breaking Rules # Implement default line breaking as defined by -# Unicode Standard Annex #14 Revision 40 for Unicode 11.0 +# Unicode Standard Annex #14 Revision 42 for Unicode 12.0 # http://www.unicode.org/reports/tr14/, with the following modification: # # Boundaries between hyphens and following letters are suppressed when diff --git a/icu4c/source/data/brkitr/rules/line_cj.txt b/icu4c/source/data/brkitr/rules/line_cj.txt index b59059e04dd..e120bf45c74 100644 --- a/icu4c/source/data/brkitr/rules/line_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_cj.txt @@ -7,7 +7,7 @@ # # Line Breaking Rules # Implement default line breaking as defined by -# Unicode Standard Annex #14 Revision 40 for Unicode 11.0 +# Unicode Standard Annex #14 Revision 42 for Unicode 12.0 # http://www.unicode.org/reports/tr14/, with the following modification: # # Boundaries between hyphens and following letters are suppressed when diff --git a/icu4c/source/data/brkitr/rules/line_loose.txt b/icu4c/source/data/brkitr/rules/line_loose.txt index 1d46d625c67..bf36b205969 100644 --- a/icu4c/source/data/brkitr/rules/line_loose.txt +++ b/icu4c/source/data/brkitr/rules/line_loose.txt @@ -8,7 +8,7 @@ # # Line Breaking Rules # Implement default line breaking as defined by -# Unicode Standard Annex #14 Revision 40 for Unicode 11.0 +# Unicode Standard Annex #14 Revision 42 for Unicode 12.0 # http://www.unicode.org/reports/tr14/, with the following modification: # # Boundaries between hyphens and following letters are suppressed when diff --git a/icu4c/source/data/brkitr/rules/line_loose_cj.txt b/icu4c/source/data/brkitr/rules/line_loose_cj.txt index b70538f7c05..ec60fed90c2 100644 --- a/icu4c/source/data/brkitr/rules/line_loose_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_loose_cj.txt @@ -7,7 +7,7 @@ # # Line Breaking Rules # Implement default line breaking as defined by -# Unicode Standard Annex #14 Revision 40 for Unicode 11.0 +# Unicode Standard Annex #14 Revision 42 for Unicode 12.0 # http://www.unicode.org/reports/tr14/, with the following modification: # # Boundaries between hyphens and following letters are suppressed when diff --git a/icu4c/source/data/brkitr/rules/line_normal.txt b/icu4c/source/data/brkitr/rules/line_normal.txt index b753a4c0011..e280c32186a 100644 --- a/icu4c/source/data/brkitr/rules/line_normal.txt +++ b/icu4c/source/data/brkitr/rules/line_normal.txt @@ -7,7 +7,7 @@ # # Line Breaking Rules # Implement default line breaking as defined by -# Unicode Standard Annex #14 Revision 40 for Unicode 11.0 +# Unicode Standard Annex #14 Revision 42 for Unicode 12.0 # http://www.unicode.org/reports/tr14/, with the following modification: # # Boundaries between hyphens and following letters are suppressed when diff --git a/icu4c/source/data/brkitr/rules/line_normal_cj.txt b/icu4c/source/data/brkitr/rules/line_normal_cj.txt index ebaf3aee0a0..aee65ddb8f7 100644 --- a/icu4c/source/data/brkitr/rules/line_normal_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_normal_cj.txt @@ -7,7 +7,7 @@ # # Line Breaking Rules # Implement default line breaking as defined by -# Unicode Standard Annex #14 Revision 40 for Unicode 11.0 +# Unicode Standard Annex #14 Revision 42 for Unicode 12.0 # http://www.unicode.org/reports/tr14/, with the following modification: # # Boundaries between hyphens and following letters are suppressed when diff --git a/icu4c/source/data/brkitr/rules/sent.txt b/icu4c/source/data/brkitr/rules/sent.txt index 41fd3fcff90..eb1224ea5ee 100644 --- a/icu4c/source/data/brkitr/rules/sent.txt +++ b/icu4c/source/data/brkitr/rules/sent.txt @@ -8,7 +8,7 @@ # # ICU Sentence Break Rules # See Unicode Standard Annex #29. -# These rules are based on UAX #29 Revision 26 for Unicode Version 8.0 +# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0 # !!quoted_literals_only; @@ -34,7 +34,7 @@ $Close = [\p{Sentence_Break = Close}]; # # Define extended forms of the character classes, # incorporate trailing Extend or Format chars. -# Rules 4 and 5. +# Rules 4 and 5. $SpEx = $Sp ($Extend | $Format)*; $LowerEx = $Lower ($Extend | $Format)*; @@ -78,6 +78,6 @@ $ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower; #Rule 9, 10, 11 ($STermEx | $ATermEx) $CloseEx* $SpEx* ($Sep | $CR | $LF)?; -#Rule 12 +#Rule 998 [[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* .; [[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep $LF $CR {eof}] | $CR $LF){100}; diff --git a/icu4c/source/data/brkitr/rules/sent_el.txt b/icu4c/source/data/brkitr/rules/sent_el.txt index 07969825124..632887f74bb 100644 --- a/icu4c/source/data/brkitr/rules/sent_el.txt +++ b/icu4c/source/data/brkitr/rules/sent_el.txt @@ -1,7 +1,6 @@ # Copyright (C) 2016 and later: Unicode, Inc. and others. # License & terms of use: http://www.unicode.org/copyright.html # -# # Copyright (C) 2002-2015, International Business Machines Corporation and others. # All Rights Reserved. # @@ -9,7 +8,7 @@ # # ICU Sentence Break Rules # See Unicode Standard Annex #29. -# These rules are based on UAX #29 Revision 26 for Unicode Version 8.0 +# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0 # !!quoted_literals_only; @@ -35,7 +34,7 @@ $Close = [\p{Sentence_Break = Close}]; # # Define extended forms of the character classes, # incorporate trailing Extend or Format chars. -# Rules 4 and 5. +# Rules 4 and 5. $SpEx = $Sp ($Extend | $Format)*; $LowerEx = $Lower ($Extend | $Format)*; @@ -79,6 +78,6 @@ $ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower; #Rule 9, 10, 11 ($STermEx | $ATermEx) $CloseEx* $SpEx* ($Sep | $CR | $LF)?; -#Rule 12 +#Rule 998 [[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* .; [[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep $LF $CR {eof}] | $CR $LF){100}; diff --git a/icu4c/source/data/brkitr/rules/word.txt b/icu4c/source/data/brkitr/rules/word.txt index 3027574d25d..d6fb5adc133 100644 --- a/icu4c/source/data/brkitr/rules/word.txt +++ b/icu4c/source/data/brkitr/rules/word.txt @@ -8,9 +8,7 @@ # # ICU Word Break Rules # See Unicode Standard Annex #29. -# These rules are based on UAX #29 Revision 29 for Unicode Version 9.0 -# with additions for Emoji Sequences from https://goo.gl/cluFCn -# Plus additional characters introduces with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html +# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0 # # Note: Updates to word.txt will usually need to be merged into # word_POSIX.txt also. @@ -58,7 +56,7 @@ $Hiragana = [:Hiragana:]; # 5.0 or later as the definition of Complex_Context was corrected to include all # characters requiring dictionary break. -$Control = [\p{Grapheme_Cluster_Break = Control}]; +$Control = [\p{Grapheme_Cluster_Break = Control}]; $HangulSyllable = [\uac00-\ud7a3]; $ComplexContext = [:LineBreak = Complex_Context:]; $KanaKanji = [$Han $Hiragana $Katakana]; @@ -70,7 +68,7 @@ $ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; # -# Rules 4 Ignore Format and Extend characters, +# Rules 4 Ignore Format and Extend characters, # except when they appear at the beginning of a region of text. # # TODO: check if handling of katakana in dictionary makes rules incorrect/void @@ -148,7 +146,7 @@ $NumericEx $NumericEx {100}; $NumericEx ($ALetterEx | $Hebrew_LetterEx) {200}; -# rule 11 and 12 +# rule 11 and 12 $NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100}; @@ -180,7 +178,7 @@ $ExtendNumLetEx $KatakanaEx {400}; # (13b) # special handling for CJK characters: chain for later dictionary segmentation $HangulSyllable $HangulSyllable {200}; -$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found +$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found # Rule 999 # Match a single code point if no other rule applies. diff --git a/icu4c/source/data/brkitr/rules/word_POSIX.txt b/icu4c/source/data/brkitr/rules/word_POSIX.txt index bcf127a42aa..bf6013951d4 100644 --- a/icu4c/source/data/brkitr/rules/word_POSIX.txt +++ b/icu4c/source/data/brkitr/rules/word_POSIX.txt @@ -8,9 +8,7 @@ # # ICU Word Break Rules, POSIX locale. # See Unicode Standard Annex #29. -# These rules are based on UAX #29 Revision 29 for Unicode Version 9.0 -# with additions for Emoji Sequences from https://goo.gl/cluFCn -# Plus additional characters introduces with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html +# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0 # # Note: Updates to word.txt will usually need to be merged into # word_POSIX.txt also.