#
# file: char.txt
#
-# ICU Character Break Rules, also known as Grapheme Cluster Boundaries
-# See Unicode Standard Annex #29.
-# These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
-# Plus revisions to rule GB 11 from http://unicode.org/cldr/trac/ticket/10088
-# Plus additional characters introduces with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
+# ICU Character Break Rules
+# These rules are based on the Extended Grapheme Cluster rules from
+# Unicode UAX #29 Revision 34 for Unicode Version 12.0
!!quoted_literals_only;
$CR = [\p{Grapheme_Cluster_Break = CR}];
$LF = [\p{Grapheme_Cluster_Break = LF}];
$Control = [[\p{Grapheme_Cluster_Break = Control}]];
-# TODO: Enable Virama & LinkingConsonant definitions once rule builder allows empty sets.
-#$Virama = [[\p{Grapheme_Cluster_Break = Virama}]];
-#$LinkingConsonant = [[\p{Grapheme_Cluster_Break = LinkingConsonant}]];
$Extend = [[\p{Grapheme_Cluster_Break = Extend}]];
$ZWJ = [\p{Grapheme_Cluster_Break = ZWJ}];
$Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
# GB 9
[^$Control $CR $LF] ($Extend | $ZWJ);
-# GB 9a (only for extended grapheme clusters)
+# GB 9a
[^$Control $CR $LF] $SpacingMark;
# GB 9b
#
# Line Breaking Rules
# Implement default line breaking as defined by
-# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
+# Unicode Standard Annex #14 Revision 42 for Unicode 12.0
# http://www.unicode.org/reports/tr14/, with the following modification:
#
# Boundaries between hyphens and following letters are suppressed when
#
# Line Breaking Rules
# Implement default line breaking as defined by
-# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
+# Unicode Standard Annex #14 Revision 42 for Unicode 12.0
# http://www.unicode.org/reports/tr14/, with the following modification:
#
# Boundaries between hyphens and following letters are suppressed when
#
# Line Breaking Rules
# Implement default line breaking as defined by
-# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
+# Unicode Standard Annex #14 Revision 42 for Unicode 12.0
# http://www.unicode.org/reports/tr14/, with the following modification:
#
# Boundaries between hyphens and following letters are suppressed when
#
# Line Breaking Rules
# Implement default line breaking as defined by
-# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
+# Unicode Standard Annex #14 Revision 42 for Unicode 12.0
# http://www.unicode.org/reports/tr14/, with the following modification:
#
# Boundaries between hyphens and following letters are suppressed when
#
# Line Breaking Rules
# Implement default line breaking as defined by
-# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
+# Unicode Standard Annex #14 Revision 42 for Unicode 12.0
# http://www.unicode.org/reports/tr14/, with the following modification:
#
# Boundaries between hyphens and following letters are suppressed when
#
# Line Breaking Rules
# Implement default line breaking as defined by
-# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
+# Unicode Standard Annex #14 Revision 42 for Unicode 12.0
# http://www.unicode.org/reports/tr14/, with the following modification:
#
# Boundaries between hyphens and following letters are suppressed when
#
# ICU Sentence Break Rules
# See Unicode Standard Annex #29.
-# These rules are based on UAX #29 Revision 26 for Unicode Version 8.0
+# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
#
!!quoted_literals_only;
#
# Define extended forms of the character classes,
# incorporate trailing Extend or Format chars.
-# Rules 4 and 5.
+# Rules 4 and 5.
$SpEx = $Sp ($Extend | $Format)*;
$LowerEx = $Lower ($Extend | $Format)*;
#Rule 9, 10, 11
($STermEx | $ATermEx) $CloseEx* $SpEx* ($Sep | $CR | $LF)?;
-#Rule 12
+#Rule 998
[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* .;
[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep $LF $CR {eof}] | $CR $LF){100};
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
#
-#
# Copyright (C) 2002-2015, International Business Machines Corporation and others.
# All Rights Reserved.
#
#
# ICU Sentence Break Rules
# See Unicode Standard Annex #29.
-# These rules are based on UAX #29 Revision 26 for Unicode Version 8.0
+# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
#
!!quoted_literals_only;
#
# Define extended forms of the character classes,
# incorporate trailing Extend or Format chars.
-# Rules 4 and 5.
+# Rules 4 and 5.
$SpEx = $Sp ($Extend | $Format)*;
$LowerEx = $Lower ($Extend | $Format)*;
#Rule 9, 10, 11
($STermEx | $ATermEx) $CloseEx* $SpEx* ($Sep | $CR | $LF)?;
-#Rule 12
+#Rule 998
[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* .;
[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep $LF $CR {eof}] | $CR $LF){100};
#
# ICU Word Break Rules
# See Unicode Standard Annex #29.
-# These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
-# with additions for Emoji Sequences from https://goo.gl/cluFCn
-# Plus additional characters introduces with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
+# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
#
# Note: Updates to word.txt will usually need to be merged into
# word_POSIX.txt also.
# 5.0 or later as the definition of Complex_Context was corrected to include all
# characters requiring dictionary break.
-$Control = [\p{Grapheme_Cluster_Break = Control}];
+$Control = [\p{Grapheme_Cluster_Break = Control}];
$HangulSyllable = [\uac00-\ud7a3];
$ComplexContext = [:LineBreak = Complex_Context:];
$KanaKanji = [$Han $Hiragana $Katakana];
#
-# Rules 4 Ignore Format and Extend characters,
+# Rule 4: Ignore Format and Extend characters,
# except when they appear at the beginning of a region of text.
#
# TODO: check if handling of katakana in dictionary makes rules incorrect/void
$NumericEx ($ALetterEx | $Hebrew_LetterEx) {200};
-# rule 11 and 12
+# Rules 11 and 12
$NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100};
# special handling for CJK characters: chain for later dictionary segmentation
$HangulSyllable $HangulSyllable {200};
-$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
+$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
# Rule 999
# Match a single code point if no other rule applies.
#
# ICU Word Break Rules, POSIX locale.
# See Unicode Standard Annex #29.
-# These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
-# with additions for Emoji Sequences from https://goo.gl/cluFCn
-# Plus additional characters introduces with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
+# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
#
# Note: Updates to word.txt will usually need to be merged into
# word_POSIX.txt also.