#
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (C) 2002-2016, International Business Machines Corporation and others.
# All Rights Reserved.
#
#
# ICU Character Break Rules, also known as Grapheme Cluster Boundaries
# See Unicode Standard Annex #29.
-# These rules are based on UAX #29 Revision 28 (Draft 7) for Unicode Version 9.0
-#
+# These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
+# plus proposed updates for Emoji 4.0 from https://goo.gl/cluFCn
#
# Character Class Definitions.
# Emoji defintions
-$E_Base = [\p{Grapheme_Cluster_Break = EB}];
+$E_Base = [[\p{Grapheme_Cluster_Break = EB}] \U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
$E_Modifier = [\p{Grapheme_Cluster_Break = EM}];
-$GAZ = [\p{Grapheme_Cluster_Break = GAZ}];
+
+# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
+$Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
$E_Base_GAZ = [\p{Grapheme_Cluster_Break = EBG}];
+$EmojiNRK = [[\p{Emoji}] - [\p{Grapheme_Cluster_Break = Regional_Indicator}*\u00230-9©®™〰〽]];
## -------------------------------------------------
!!chain;
($E_Base | $E_Base_GAZ) $Extend* $E_Modifier;
# GB 11
-$ZWJ ($GAZ | $E_Base_GAZ);
+($Extended_Pict | $EmojiNRK) $ZWJ ($Extended_Pict | $EmojiNRK);
# GB 12-13. Keep pairs of regional indicators together
# Note that hard break '/' rule triggers only if there are three or more initial RIs,
$E_Modifier $Extend* ($E_Base | $E_Base_GAZ);
# GB 11 Don't break between ZWJ and Glue_After_ZWJ
-($GAZ | $E_Base_GAZ) $ZWJ;
+($Extended_Pict | $EmojiNRK) $ZWJ ($Extended_Pict | $EmojiNRK);
# GB 12-13. Going backwards, we must scan through any number of regional indicators as pairs.
#
!!safe_reverse;
$Regional_Indicator $Regional_Indicator;
-($Extend | $ZWJ)+ .;
+($Extend | $ZWJ | $EmojiNRK | $Extended_Pict)+ .;
## -------------------------------------------------
!!safe_forward;
$Regional_Indicator $Regional_Indicator;
-($Extend | $ZWJ)+ .;
+($Extend | $ZWJ | $EmojiNRK | $Extended_Pict)+ .;
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2002-2016 International Business Machines Corporation and
# others. All Rights Reserved.
#
#
# Line Breaking Rules
# Implement default line breaking as defined by
-# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
+# Unicode Standard Annex #14 Revision 37 for Unicode 9.0
# http://www.unicode.org/reports/tr14/
#
-# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
-# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
+# Includes extensions to the handling of emoji ZWJ sequences from
+# https://goo.gl/cluFCn
#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
# This is only because of a limitation of ICU break engine implementation,
# $CM = [:LineBreak = Combining_Mark:];
$CP = [:LineBreak = Close_Parenthesis:];
$CR = [:LineBreak = Carriage_Return:];
-$EB = [:LineBreak = EB:];
+$EB = [[:LineBreak = EB:]\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
$EM = [:LineBreak = EM:];
$EX = [:LineBreak = Exclamation:];
$GL = [:LineBreak = Glue:];
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];
+$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]];
+# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
+$Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
+
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
# By LB1, SA characters with general categor of Mn or Mc also resolve to CM.
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
-# LB 8a ZWJ x (ID | EB | EM) Emoji ZWJ sequences.
+# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences.
#
-$ZWJ ($ID | $EB | $EM);
+$ZWJ ($ID | $Extended_Pict | $EmojiNRK);
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
# LB 30a Do not break between regional indicators. Break after pairs of them.
# Tricky interaction with LB8a: ZWJ x ID
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
-$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
+$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}];
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
-$RI $CM* $RI $CM* $ZWJ ($ID | $EB | $EM);
+$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;
# Requires an engine enhancement.
# / $SP* $ZW
-# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3
-# The ZWJ will look like a CM to whatever precedes it.
+# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK)
#
-($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
+($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?;
# LB 9,10 Combining marks.
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2002-2016 International Business Machines Corporation and
# others. All Rights Reserved.
#
#
# Line Breaking Rules
# Implement default line breaking as defined by
-# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
+# Unicode Standard Annex #14 Revision 37 for Unicode 9.0
# http://www.unicode.org/reports/tr14/
# tailored as noted in 2nd paragraph below.
#
-# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
-# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
+# Includes extensions to the handling of emoji ZWJ sequences from
+# https://goo.gl/cluFCn
#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
# This is only because of a limitation of ICU break engine implementation,
# $CM = [:LineBreak = Combining_Mark:];
$CP = [:LineBreak = Close_Parenthesis:];
$CR = [:LineBreak = Carriage_Return:];
-$EB = [:LineBreak = EB:];
+$EB = [[:LineBreak = EB:]\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
$EM = [:LineBreak = EM:];
$EX = [:LineBreak = Exclamation:];
$GL = [:LineBreak = Glue:];
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];
+$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]];
+# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
+$Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
+
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
# By LB1, SA characters with general categor of Mn or Mc also resolve to CM.
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
-# LB 8a ZWJ x (ID | EB | EM) Emoji ZWJ sequences.
+# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences.
#
-$ZWJ ($ID | $EB | $EM);
+$ZWJ ($ID | $Extended_Pict | $EmojiNRK);
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
# LB 30a Do not break between regional indicators. Break after pairs of them.
# Tricky interaction with LB8a: ZWJ x ID
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
-$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
+$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}];
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
-$RI $CM* $RI $CM* $ZWJ ($ID | $EB | $EM);
+$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;
# Requires an engine enhancement.
# / $SP* $ZW
-# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3
-# The ZWJ will look like a CM to whatever precedes it.
+# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK)
#
-($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
+($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?;
# LB 9,10 Combining marks.
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2002-2016 International Business Machines Corporation and
# others. All Rights Reserved.
#
#
# Line Breaking Rules
# Implement default line breaking as defined by
-# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
+# Unicode Standard Annex #14 Revision 37 for Unicode 9.0
# http://www.unicode.org/reports/tr14/
#
-# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
-# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
+# Includes extensions to the handling of emoji ZWJ sequences from
+# https://goo.gl/cluFCn
#
# tailored as noted in 2nd paragraph below.
#
# $CM = [:LineBreak = Combining_Mark:];
$CP = [:LineBreak = Close_Parenthesis:];
$CR = [:LineBreak = Carriage_Return:];
-$EB = [:LineBreak = EB:];
+$EB = [[:LineBreak = EB:]\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
$EM = [:LineBreak = EM:];
$EX = [:LineBreak = Exclamation:];
$GL = [:LineBreak = Glue:];
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];
+$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]];
+# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
+$Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
+
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
# By LB1, SA characters with general categor of Mn or Mc also resolve to CM.
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
-# LB 8a ZWJ x (ID | EB | EM) Emoji ZWJ sequences.
+# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences.
#
-$ZWJ ($ID | $EB | $EM);
+$ZWJ ($ID | $Extended_Pict | $EmojiNRK);
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
# LB 30a Do not break between regional indicators. Break after pairs of them.
# Tricky interaction with LB8a: ZWJ x ID
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
-$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
+$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}];
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
-$RI $CM* $RI $CM* $ZWJ ($ID | $EB | $EM);
+$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;
# Requires an engine enhancement.
# / $SP* $ZW
-# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3
-# The ZWJ will look like a CM to whatever precedes it.
+# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK)
#
-($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
+($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?;
# LB 9,10 Combining marks.
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2002-2016 International Business Machines Corporation and
# others. All Rights Reserved.
#
#
# Line Breaking Rules
# Implement default line breaking as defined by
-# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
+# Unicode Standard Annex #14 Revision 37 for Unicode 9.0
# http://www.unicode.org/reports/tr14/
#
-# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
-# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
+# Includes extensions to the handling of emoji ZWJ sequences from
+# https://goo.gl/cluFCn
#
# tailored as noted in 2nd paragraph below.
#
# $CM = [:LineBreak = Combining_Mark:];
$CP = [:LineBreak = Close_Parenthesis:];
$CR = [:LineBreak = Carriage_Return:];
-$EB = [:LineBreak = EB:];
+$EB = [[:LineBreak = EB:]\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
$EM = [:LineBreak = EM:];
$EXX = [\uFF01 \uFF1F];
$EX = [[:LineBreak = Exclamation:] - $EXX];
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];
+$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]];
+# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
+$Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
+
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
# By LB1, SA characters with general categor of Mn or Mc also resolve to CM.
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
-# LB 8a ZWJ x (ID | EB | EM) Emoji ZWJ sequences.
+# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences.
#
-$ZWJ ($ID | $EB | $EM);
+$ZWJ ($ID | $Extended_Pict | $EmojiNRK);
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
# LB 30a Do not break between regional indicators. Break after pairs of them.
# Tricky interaction with LB8a: ZWJ x ID
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
-$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
+$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}];
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
-$RI $CM* $RI $CM* $ZWJ ($ID | $EB | $EM);
+$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;
# Requires an engine enhancement.
# / $SP* $ZW
-# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3
-# The ZWJ will look like a CM to whatever precedes it.
+# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK)
#
-($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
+($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?;
# LB 9,10 Combining marks.
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2002-2016 International Business Machines Corporation and
# others. All Rights Reserved.
#
#
# Line Breaking Rules
# Implement default line breaking as defined by
-# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
+# Unicode Standard Annex #14 Revision 37 for Unicode 9.0
# http://www.unicode.org/reports/tr14/
-# tailored as noted in 2nd paragraph below.
+# tailored as noted in 3rd paragraph below.
#
-# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
-# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
-#
-# tailored as noted in 2nd paragraph below.
+# Includes extensions to the handling of emoji ZWJ sequences from
+# https://goo.gl/cluFCn
#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
# This is only because of a limitation of ICU break engine implementation,
# $CM = [:LineBreak = Combining_Mark:];
$CP = [:LineBreak = Close_Parenthesis:];
$CR = [:LineBreak = Carriage_Return:];
-$EB = [:LineBreak = EB:];
+$EB = [[:LineBreak = EB:]\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
$EM = [:LineBreak = EM:];
$EX = [:LineBreak = Exclamation:];
$GL = [:LineBreak = Glue:];
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];
+$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]];
+# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
+$Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
+
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
# By LB1, SA characters with general categor of Mn or Mc also resolve to CM.
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
-# LB 8a ZWJ x (ID | EB | EM) Emoji ZWJ sequences.
+# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences.
#
-$ZWJ ($ID | $EB | $EM);
+$ZWJ ($ID | $Extended_Pict | $EmojiNRK);
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
# LB 30a Do not break between regional indicators. Break after pairs of them.
# Tricky interaction with LB8a: ZWJ x ID
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
-$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
+$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}];
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
-$RI $CM* $RI $CM* $ZWJ ($ID | $EB | $EM);
+$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;
# Requires an engine enhancement.
# / $SP* $ZW
-# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3
-# The ZWJ will look like a CM to whatever precedes it.
+# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK)
#
-($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
+($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?;
# LB 9,10 Combining marks.
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2002-2016 International Business Machines Corporation and
# others. All Rights Reserved.
#
#
# Line Breaking Rules
# Implement default line breaking as defined by
-# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
+# Unicode Standard Annex #14 Revision 37 for Unicode 9.0
# http://www.unicode.org/reports/tr14/
#
-# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
-# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
+# Includes extensions to the handling of emoji ZWJ sequences from
+# https://goo.gl/cluFCn
#
# tailored as noted in 2nd paragraph below.
#
# $CM = [:LineBreak = Combining_Mark:];
$CP = [:LineBreak = Close_Parenthesis:];
$CR = [:LineBreak = Carriage_Return:];
-$EB = [:LineBreak = EB:];
+$EB = [[:LineBreak = EB:]\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
$EM = [:LineBreak = EM:];
$EX = [:LineBreak = Exclamation:];
$GL = [:LineBreak = Glue:];
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];
+$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]];
+# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
+$Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
+
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
# By LB1, SA characters with general categor of Mn or Mc also resolve to CM.
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
-# LB 8a ZWJ x (ID | EB | EM) Emoji ZWJ sequences.
+# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences.
#
-$ZWJ ($ID | $EB | $EM);
+$ZWJ ($ID | $Extended_Pict | $EmojiNRK);
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
# LB 30a Do not break between regional indicators. Break after pairs of them.
# Tricky interaction with LB8a: ZWJ x ID
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
-$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
+$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}];
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
-$RI $CM* $RI $CM* $ZWJ ($ID | $EB | $EM);
+$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;
# Requires an engine enhancement.
# / $SP* $ZW
-# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3
-# The ZWJ will look like a CM to whatever precedes it.
+# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK)
#
-($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
+($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?;
# LB 9,10 Combining marks.
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2002-2016 International Business Machines Corporation and
# others. All Rights Reserved.
#
#
# Line Breaking Rules
# Implement default line breaking as defined by
-# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
+# Unicode Standard Annex #14 Revision 37 for Unicode 9.0
# http://www.unicode.org/reports/tr14/
#
-# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
-# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
+# Includes extensions to the handling of emoji ZWJ sequences from
+# https://goo.gl/cluFCn
#
# tailored as noted in 2nd paragraph below.
#
# $CM = [:LineBreak = Combining_Mark:];
$CP = [:LineBreak = Close_Parenthesis:];
$CR = [:LineBreak = Carriage_Return:];
-$EB = [:LineBreak = EB:];
+$EB = [[:LineBreak = EB:]\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
$EM = [:LineBreak = EM:];
$EX = [:LineBreak = Exclamation:];
$GL = [:LineBreak = Glue:];
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];
+$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]];
+# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
+$Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
+
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
# By LB1, SA characters with general categor of Mn or Mc also resolve to CM.
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
-# LB 8a ZWJ x (ID | EB | EM) Emoji ZWJ sequences.
+# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences.
#
-$ZWJ ($ID | $EB | $EM);
+$ZWJ ($ID | $Extended_Pict | $EmojiNRK);
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
# LB 30a Do not break between regional indicators. Break after pairs of them.
# Tricky interaction with LB8a: ZWJ x ID
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
-$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
+$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}];
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
-$RI $CM* $RI $CM* $ZWJ ($ID | $EB | $EM);
+$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;
# Requires an engine enhancement.
# / $SP* $ZW
-# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3
-# The ZWJ will look like a CM to whatever precedes it.
+# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK)
#
-($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
+($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?;
# LB 9,10 Combining marks.
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2002-2016 International Business Machines Corporation and
# others. All Rights Reserved.
#
#
# Line Breaking Rules
# Implement default line breaking as defined by
-# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
+# Unicode Standard Annex #14 Revision 37 for Unicode 9.0
# http://www.unicode.org/reports/tr14/
-# tailored as noted in 2nd paragraph below.
+# tailored as noted in 3rd paragraph below.
#
-# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
-# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
-#
-# tailored as noted in 2nd paragraph below.
+# Includes extensions to the handling of emoji ZWJ sequences from
+# https://goo.gl/cluFCn
#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
# This is only because of a limitation of ICU break engine implementation,
# $CM = [:LineBreak = Combining_Mark:];
$CP = [:LineBreak = Close_Parenthesis:];
$CR = [:LineBreak = Carriage_Return:];
-$EB = [:LineBreak = EB:];
+$EB = [[:LineBreak = EB:]\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
$EM = [:LineBreak = EM:];
$EX = [:LineBreak = Exclamation:];
$GL = [:LineBreak = Glue:];
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];
+$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]];
+# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
+$Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
+
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
# By LB1, SA characters with general categor of Mn or Mc also resolve to CM.
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
-# LB 8a ZWJ x (ID | EB | EM) Emoji ZWJ sequences.
+# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences.
#
-$ZWJ ($ID | $EB | $EM);
+$ZWJ ($ID | $Extended_Pict | $EmojiNRK);
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
# LB 30a Do not break between regional indicators. Break after pairs of them.
# Tricky interaction with LB8a: ZWJ x ID
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
-$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
+$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}];
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
-$RI $CM* $RI $CM* $ZWJ ($ID | $EB | $EM);
+$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;
# Requires an engine enhancement.
# / $SP* $ZW
-# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3
-# The ZWJ will look like a CM to whatever precedes it.
+# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK)
#
-($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
+($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?;
# LB 9,10 Combining marks.
#
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (C) 2002-2016, International Business Machines Corporation
# and others. All Rights Reserved.
#
#
# ICU Word Break Rules
# See Unicode Standard Annex #29.
-# These rules are based on UAX #29 Revision 28 (draft 7) for Unicode Version 9.0
+# These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
+# with additions for Emoji Sequences from https://goo.gl/cluFCn
#
# Note: Updates to word.txt will usually need to be merged into
# word_POSIX.txt also.
$MidNum = [\p{Word_Break = MidNum}];
$Numeric = [\p{Word_Break = Numeric}];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
-$E_Base = [\p{Word_Break = EB}];
+$E_Base = [\p{Word_Break = EB}\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
$E_Modifier = [\p{Word_Break = EM}];
-$GAZ = [\p{Word_Break = GAZ}];
+
+# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
+$Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
$EBG = [\p{Word_Break = EBG}];
+$EmojiNRK = [[\p{Emoji}] - [\p{Word_Break = Regional_Indicator}\u002a\u00230-9©®™〰〽]];
$Han = [:Han:];
$Hiragana = [:Hiragana:];
#
$CR $LF;
-# Rule 3c ZWJ x GAZ. Preceeds WB4, so no intervening Extend chars allowed.
+# Rule 3c ZWJ x (Extended_Pict | EmojiNRK). Precedes WB4, so no intervening Extend chars allowed.
#
-$ZWJ ($GAZ | $EBG);
+$ZWJ ($Extended_Pict | $EmojiNRK);
# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
$E_Base ($Extend | $Format | $ZWJ)*;
$E_Modifier ($Extend | $Format | $ZWJ)*;
-$GAZ ($Extend | $Format | $ZWJ)*;
+$Extended_Pict ($Extend | $Format | $ZWJ)*;
#
# rule 5
# rule 3
$LF $CR;
-# Rule 3c ZWJ x GAZ. Preceeds WB4, so no intervening Extend chars allowed.
+# Rule 3c ZWJ x (Extended_Pict | EmojiNRK). Precedes WB4, so no intervening Extend chars allowed.
#
-($GAZ | $EBG) $ZWJ;
+($Extended_Pict | $EmojiNRK) $ZWJ;
# rule 4
($Format | $Extend | $ZWJ)* [^$CR $LF $Newline]?;
^$BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
-($GAZ | $EBG) $ZWJ $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
+($Extended_Pict | $EmojiNRK) $ZWJ $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
-($GAZ | $EBG) $ZWJ $BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
+($Extended_Pict | $EmojiNRK) $ZWJ $BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
#
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (C) 2002-2016, International Business Machines Corporation
# and others. All Rights Reserved.
#
#
# ICU Word Break Rules, POSIX locale.
# See Unicode Standard Annex #29.
-# These rules are based on UAX #29 Revision 28 (draft 7) for Unicode Version 9.0
+# These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
+# with additions for Emoji Sequences from https://goo.gl/cluFCn
#
# Note: Updates to word.txt will usually need to be merged into
# word_POSIX.txt also.
$MidNum = [\p{Word_Break = MidNum} [.]];
$Numeric = [\p{Word_Break = Numeric}];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
-$E_Base = [\p{Word_Break = EB}];
+$E_Base = [\p{Word_Break = EB}\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
$E_Modifier = [\p{Word_Break = EM}];
-$GAZ = [\p{Word_Break = GAZ}];
+
+# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
+$Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
$EBG = [\p{Word_Break = EBG}];
+$EmojiNRK = [[\p{Emoji}] - [\p{Word_Break = Regional_Indicator}\u002a\u00230-9©®™〰〽]];
$Han = [:Han:];
$Hiragana = [:Hiragana:];
#
$CR $LF;
-# Rule 3c ZWJ x GAZ. Preceeds WB4, so no intervening Extend chars allowed.
+# Rule 3c ZWJ x (Extended_Pict | EmojiNRK). Precedes WB4, so no intervening Extend chars allowed.
#
-$ZWJ ($GAZ | $EBG);
+$ZWJ ($Extended_Pict | $EmojiNRK);
# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
$E_Base ($Extend | $Format | $ZWJ)*;
$E_Modifier ($Extend | $Format | $ZWJ)*;
-$GAZ ($Extend | $Format | $ZWJ)*;
+$Extended_Pict ($Extend | $Format | $ZWJ)*;
#
# rule 5
# rule 3
$LF $CR;
-# Rule 3c ZWJ x GAZ. Preceeds WB4, so no intervening Extend chars allowed.
+# Rule 3c ZWJ x (Extended_Pict | EmojiNRK). Precedes WB4, so no intervening Extend chars allowed.
#
-($GAZ | $EBG) $ZWJ;
+($Extended_Pict | $EmojiNRK) $ZWJ;
# rule 4
($Format | $Extend | $ZWJ)* [^$CR $LF $Newline]?;
^$BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
-($GAZ | $EBG) $ZWJ $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
+($Extended_Pict | $EmojiNRK) $ZWJ $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
-($GAZ | $EBG) $ZWJ $BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
+($Extended_Pict | $EmojiNRK) $ZWJ $BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
const uint8_t *builtRules;
if (U_FAILURE(status)) {
- errcheckln(status, "Can't open \"%s\" - %s", dataFile, u_errorName(status));
+ errcheckln(status, "%s:%d Can't open \"%s\" - %s", __FILE__, __LINE__, dataFile, u_errorName(status));
return;
}
builtSource = (const UChar *)(builtRules + ((RBBIDataHeader*)builtRules)->fRuleSource);
RuleBasedBreakIterator *brkItr = new RuleBasedBreakIterator(builtSource, parseError, status);
if (U_FAILURE(status)) {
- errln("createRuleBasedBreakIterator: ICU Error \"%s\" at line %d, column %d\n",
- u_errorName(status), parseError.line, parseError.offset);
+ errln("%s:%d createRuleBasedBreakIterator: ICU Error \"%s\" at line %d, column %d\n",
+ __FILE__, __LINE__, u_errorName(status), parseError.line, parseError.offset);
+ errln(UnicodeString(builtSource));
return;
};
rbbiRules = brkItr->getBinaryRules(length);
logln("Comparing \"%s\" len=%d", dataFile, length);
if (memcmp(builtRules, rbbiRules, (int32_t)length) != 0) {
- errln("Built rules and rebuilt rules are different %s", dataFile);
+ errln("%s:%d Built rules and rebuilt rules are different %s", __FILE__, __LINE__, dataFile);
return;
}
delete brkItr;
}
+//
+// Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
+//
+static const char *gExtended_Pict = "["
+ "\\U0001F774-\\U0001F77F\\u2700-\\u2701\\u2703-\\u2704\\u270E\\u2710-\\u2711\\u2765-\\u2767\\U0001F030-\\U0001F093"
+ "\\U0001F094-\\U0001F09F\\U0001F10D-\\U0001F10F\\U0001F12F\\U0001F16C-\\U0001F16F\\U0001F1AD-\\U0001F1E5"
+ "\\U0001F203-\\U0001F20F\\U0001F23C-\\U0001F23F\\U0001F249-\\U0001F24F\\U0001F252-\\U0001F2FF\\U0001F7D5-\\U0001F7FF"
+ "\\U0001F000-\\U0001F003\\U0001F005-\\U0001F02B\\U0001F02C-\\U0001F02F\\U0001F322-\\U0001F323\\U0001F394-\\U0001F395"
+ "\\U0001F398\\U0001F39C-\\U0001F39D\\U0001F3F1-\\U0001F3F2\\U0001F3F6\\U0001F4FE\\U0001F53E-\\U0001F548"
+ "\\U0001F54F\\U0001F568-\\U0001F56E\\U0001F571-\\U0001F572\\U0001F57B-\\U0001F586\\U0001F588-\\U0001F589"
+ "\\U0001F58E-\\U0001F58F\\U0001F591-\\U0001F594\\U0001F597-\\U0001F5A3\\U0001F5A6-\\U0001F5A7\\U0001F5A9-\\U0001F5B0"
+ "\\U0001F5B3-\\U0001F5BB\\U0001F5BD-\\U0001F5C1\\U0001F5C5-\\U0001F5D0\\U0001F5D4-\\U0001F5DB\\U0001F5DF-\\U0001F5E0"
+ "\\U0001F5E2\\U0001F5E4-\\U0001F5E7\\U0001F5E9-\\U0001F5EE\\U0001F5F0-\\U0001F5F2\\U0001F5F4-\\U0001F5F9"
+ "\\u2605\\u2607-\\u260D\\u260F-\\u2610\\u2612\\u2616-\\u2617\\u2619-\\u261C\\u261E-\\u261F\\u2621\\u2624-\\u2625"
+ "\\u2627-\\u2629\\u262B-\\u262D\\u2630-\\u2637\\u263B-\\u2647\\u2654-\\u265F\\u2661-\\u2662\\u2664\\u2667"
+ "\\u2669-\\u267A\\u267C-\\u267E\\u2680-\\u2691\\u2695\\u2698\\u269A\\u269D-\\u269F\\u26A2-\\u26A9\\u26AC-\\u26AF"
+ "\\u26B2-\\u26BC\\u26BF-\\u26C3\\u26C6-\\u26C7\\u26C9-\\u26CD\\u26D0\\u26D2\\u26D5-\\u26E8\\u26EB-\\u26EF"
+ "\\u26F6\\u26FB-\\u26FC\\u26FE-\\u26FF\\u2388\\U0001FA00-\\U0001FFFD\\U0001F0A0-\\U0001F0AE\\U0001F0B1-\\U0001F0BF"
+ "\\U0001F0C1-\\U0001F0CF\\U0001F0D1-\\U0001F0F5\\U0001F0AF-\\U0001F0B0\\U0001F0C0\\U0001F0D0\\U0001F0F6-\\U0001F0FF"
+ "\\U0001F80C-\\U0001F80F\\U0001F848-\\U0001F84F\\U0001F85A-\\U0001F85F\\U0001F888-\\U0001F88F\\U0001F8AE-\\U0001F8FF"
+ "\\U0001F900-\\U0001F90F\\U0001F91F\\U0001F928-\\U0001F92F\\U0001F931-\\U0001F932\\U0001F93F\\U0001F94C-\\U0001F94F"
+ "\\U0001F95F-\\U0001F97F\\U0001F992-\\U0001F9BF\\U0001F9C1-\\U0001F9FF\\U0001F6C6-\\U0001F6CA\\U0001F6E6-\\U0001F6E8"
+ "\\U0001F6EA\\U0001F6F1-\\U0001F6F2\\U0001F6D3-\\U0001F6DF\\U0001F6ED-\\U0001F6EF\\U0001F6F7-\\U0001F6FF"
+ "]";
+
//------------------------------------------------------------------------------------------
//
// class RBBICharMonkey Character (Grapheme Cluster) specific implementation
UnicodeSet *fHangulSet;
UnicodeSet *fEmojiBaseSet;
UnicodeSet *fEmojiModifierSet;
- UnicodeSet *fGAZSet;
- UnicodeSet *fEBGSet; // ***new
+ UnicodeSet *fExtendedPictSet;
+ UnicodeSet *fEBGSet;
+ UnicodeSet *fEmojiNRKSet;
UnicodeSet *fAnySet;
const UnicodeString *fText;
fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
fZWJSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
- fRegionalIndicatorSet =
+ fRegionalIndicatorSet =
new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
fHangulSet->addAll(*fLVSet);
fHangulSet->addAll(*fLVTSet);
- fEmojiBaseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EB}]"), status);
+ fEmojiBaseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"), status);
fEmojiModifierSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EM}]"), status);
- fGAZSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = GAZ}]"), status);
+ fExtendedPictSet = new UnicodeSet(UnicodeString(gExtended_Pict, -1, US_INV), status);
fEBGSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EBG}]"), status);
+ fEmojiNRKSet = new UnicodeSet(UNICODE_STRING_SIMPLE(
+ "[[\\p{Emoji}]-[\\p{Grapheme_Cluster_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
fAnySet = new UnicodeSet(0, 0x10ffff);
fSets = new UVector(status);
fSets->addElement(fEmojiBaseSet, status);
fSets->addElement(fEmojiModifierSet, status);
fSets->addElement(fZWJSet, status);
- fSets->addElement(fGAZSet, status);
+ fSets->addElement(fExtendedPictSet, status);
fSets->addElement(fEBGSet, status);
+ fSets->addElement(fEmojiNRKSet,status);
if (U_FAILURE(status)) {
deferredStatus = status;
}
continue;
}
- // Rule (GB11) ZWJ x (Glue_After_ZWJ | EBG)
- if (fZWJSet->contains(c1) && (fGAZSet->contains(c2) || fEBGSet->contains(c2))) {
+ // Rule (GB11) (Glue_After_ZWJ | Emoji) ZWJ x (Glue_After_ZWJ | Emoji)
+ if ((fExtendedPictSet->contains(c0) || fEmojiNRKSet->contains(c0)) && fZWJSet->contains(c1) &&
+ (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) {
continue;
}
delete fEmojiBaseSet;
delete fEmojiModifierSet;
delete fZWJSet;
- delete fGAZSet;
+ delete fExtendedPictSet;
delete fEBGSet;
+ delete fEmojiNRKSet;
}
//------------------------------------------------------------------------------------------
UnicodeSet *fEBGSet;
UnicodeSet *fEModifierSet;
UnicodeSet *fZWJSet;
- UnicodeSet *fGAZSet;
+ UnicodeSet *fExtendedPictSet;
+ UnicodeSet *fEmojiNRKSet;
const UnicodeString *fText;
};
fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status);
- fEBaseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = EB}]"), status);
+ fEBaseSet = new UnicodeSet(UNICODE_STRING_SIMPLE(
+ "[\\p{Word_Break = EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"), status);
fEBGSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = EBG}]"), status);
fEModifierSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = EM}]"), status);
fZWJSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ZWJ}]"), status);
- fGAZSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = GAZ}]"), status);
+ fExtendedPictSet = new UnicodeSet(UnicodeString(gExtended_Pict, -1, US_INV), status);
+ fEmojiNRKSet = new UnicodeSet(UNICODE_STRING_SIMPLE(
+ "[[\\p{Emoji}]-[\\p{Word_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
fDictionarySet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]"), status);
fDictionarySet->addAll(*fKatakanaSet);
fOtherSet->removeAll(*fEBGSet);
fOtherSet->removeAll(*fEModifierSet);
fOtherSet->removeAll(*fZWJSet);
- fOtherSet->removeAll(*fGAZSet);
-
+ fOtherSet->removeAll(*fExtendedPictSet);
+ fOtherSet->removeAll(*fEmojiNRKSet);
+
// Inhibit dictionary characters from being tested at all.
fOtherSet->removeAll(*fDictionarySet);
fSets->addElement(fEBGSet, status);
fSets->addElement(fEModifierSet, status);
fSets->addElement(fZWJSet, status);
- fSets->addElement(fGAZSet, status);
+ fSets->addElement(fExtendedPictSet, status);
+ fSets->addElement(fEmojiNRKSet, status);
if (U_FAILURE(status)) {
deferredStatus = status;
break;
};
- // Rule (3c) ZWJ x (Glue_after_ZWJ | EBG).
+ // Rule (3c) ZWJ x (Glue_after_ZWJ | EmojiNRK).
// Not ignoring extend chars, so peek into input text to
// get the potential ZWJ, the character immediately preceding c2.
// Sloppy UChar32 indexing: p2-1 may reference trail half
// but char32At will get the full code point.
- if (fZWJSet->contains(fText->char32At(p2-1)) && (fGAZSet->contains(c2) || fEBGSet->contains(c2))) {
+ if (fZWJSet->contains(fText->char32At(p2-1)) && (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) {
continue;
}
delete fEBGSet;
delete fEModifierSet;
delete fZWJSet;
- delete fGAZSet;
+ delete fExtendedPictSet;
+ delete fEmojiNRKSet;
}
UnicodeSet *fEB;
UnicodeSet *fEM;
UnicodeSet *fZJ;
+ UnicodeSet *fExtendedPict;
+ UnicodeSet *fEmojiNRK;
BreakIterator *fCharBI;
const UnicodeString *fText;
fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
- fEB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EB}]"), status);
+ fEB = new UnicodeSet(UNICODE_STRING_SIMPLE(
+ "[\\p{Line_break=EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"), status);
fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
fZJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
+ fEmojiNRK = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Emoji}]-[\\p{Line_break=RI}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
+ fExtendedPict = new UnicodeSet(UnicodeString(gExtended_Pict, -1, US_INV), status);
if (U_FAILURE(status)) {
deferredStatus = status;
fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS.
-
- fID->addAll(*fEB); // Emoji Base and Emoji Modifier behave as ID.
- fID->addAll(*fEM);
-
fCM->addAll(*fZJ); // ZWJ behaves as a CM.
fSets->addElement(fBK, status);
fSets->addElement(fEB, status);
fSets->addElement(fEM, status);
fSets->addElement(fZJ, status);
+ fSets->addElement(fExtendedPict, status);
+ fSets->addElement(fEmojiNRK, status);
+
const char *rules =
"((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
break;
}
- // LB 8a ZJ x ID
+ // LB 8a ZWJ x (ID | ExtendedPict | Emoji)
// The monkey test's way of ignoring combining characters doesn't work
// for this rule. ZJ is also a CM. Need to get the actual character
// preceding "thisChar", not ignoring combining marks, possibly ZJ.
{
int32_t prevIdx = fText->moveIndex32(pos, -1);
UChar32 prevC = fText->char32At(prevIdx);
- if (fZJ->contains(prevC) && fID->contains(thisChar)) {
+ if (fZJ->contains(prevC) && (fID->contains(thisChar) || fExtendedPict->contains(thisChar) || fEmojiNRK->contains(thisChar))) {
continue;
}
}
if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
(fEX->contains(prevChar) && fIN->contains(thisChar)) ||
(fHL->contains(prevChar) && fIN->contains(thisChar)) ||
- (fID->contains(prevChar) && fIN->contains(thisChar)) ||
+ ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) && fIN->contains(thisChar)) ||
(fIN->contains(prevChar) && fIN->contains(thisChar)) ||
(fNU->contains(prevChar) && fIN->contains(thisChar)) ) {
continue;
delete fEB;
delete fEM;
delete fZJ;
+ delete fExtendedPict;
+ delete fEmojiNRK;
delete fCharBI;
delete fNumberMatcher;
# For documentation, see http://www.unicode.org/reports/tr44/
#
# Default Grapheme Break Test
+# Hand-patched for Emoji ZWJ Proposal L2/16-208R2.
#
# Format:
# <string> (# <comment>)?
÷ 200D × 0308 ÷ 1F3FB ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] EMOJI MODIFIER FITZPATRICK TYPE-1-2 (E_Modifier) ÷ [0.3]
÷ 200D × 200D ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ) × [9.0] ZERO WIDTH JOINER (ZWJ) ÷ [0.3]
÷ 200D × 0308 × 200D ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ) × [9.0] COMBINING DIAERESIS (Extend) × [9.0] ZERO WIDTH JOINER (ZWJ) ÷ [0.3]
-÷ 200D Ã\97 2764 ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ) × [11.0] HEAVY BLACK HEART (Glue_After_Zwj) ÷ [0.3]
+÷ 200D ÷ 2764 ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ) × [11.0] HEAVY BLACK HEART (Glue_After_Zwj) ÷ [0.3]
÷ 200D × 0308 ÷ 2764 ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HEAVY BLACK HEART (Glue_After_Zwj) ÷ [0.3]
-÷ 200D Ã\97 1F466 ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ) × [11.0] BOY (EBG) ÷ [0.3]
+÷ 200D ÷ 1F466 ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ) × [11.0] BOY (EBG) ÷ [0.3]
÷ 200D × 0308 ÷ 1F466 ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] BOY (EBG) ÷ [0.3]
÷ 200D ÷ 0378 ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ) ÷ [999.0] <reserved-0378> (Other) ÷ [0.3]
÷ 200D × 0308 ÷ 0378 ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] <reserved-0378> (Other) ÷ [0.3]
÷ 0061 ÷ 0600 × 0062 ÷ # ÷ [0.2] LATIN SMALL LETTER A (Other) ÷ [999.0] ARABIC NUMBER SIGN (Prepend) × [9.2] LATIN SMALL LETTER B (Other) ÷ [0.3]
÷ 261D × 1F3FB ÷ 261D ÷ # ÷ [0.2] WHITE UP POINTING INDEX (E_Base) × [10.0] EMOJI MODIFIER FITZPATRICK TYPE-1-2 (E_Modifier) ÷ [999.0] WHITE UP POINTING INDEX (E_Base) ÷ [0.3]
÷ 1F466 × 1F3FB ÷ # ÷ [0.2] BOY (EBG) × [10.0] EMOJI MODIFIER FITZPATRICK TYPE-1-2 (E_Modifier) ÷ [0.3]
-÷ 200D Ã\97 1F466 × 1F3FB ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ) × [11.0] BOY (EBG) × [10.0] EMOJI MODIFIER FITZPATRICK TYPE-1-2 (E_Modifier) ÷ [0.3]
-÷ 200D Ã\97 2764 ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ) × [11.0] HEAVY BLACK HEART (Glue_After_Zwj) ÷ [0.3]
-÷ 200D Ã\97 1F466 ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ) × [11.0] BOY (EBG) ÷ [0.3]
+÷ 200D ÷ 1F466 × 1F3FB ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ) × [11.0] BOY (EBG) × [10.0] EMOJI MODIFIER FITZPATRICK TYPE-1-2 (E_Modifier) ÷ [0.3]
+÷ 200D ÷ 2764 ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ) × [11.0] HEAVY BLACK HEART (Glue_After_Zwj) ÷ [0.3]
+÷ 200D ÷ 1F466 ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ) × [11.0] BOY (EBG) ÷ [0.3]
÷ 1F466 ÷ 1F466 ÷ # ÷ [0.2] BOY (EBG) ÷ [999.0] BOY (EBG) ÷ [0.3]
#
# Lines: 822
# For documentation, see http://www.unicode.org/reports/tr44/
#
# Default Word Break Test
+# Hand-patched for Emoji ZWJ Proposal L2/16-208R2.
#
# Format:
# <string> (# <comment>)?
÷ 200D × 0308 ÷ 0022 ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [999.0] QUOTATION MARK (Double_Quote) ÷ [0.3]
÷ 200D ÷ 0027 ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ_FE) ÷ [999.0] APOSTROPHE (Single_Quote) ÷ [0.3]
÷ 200D × 0308 ÷ 0027 ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [999.0] APOSTROPHE (Single_Quote) ÷ [0.3]
-÷ 200D ÷ 261D ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ_FE) ÷ [999.0] WHITE UP POINTING INDEX (E_Base) ÷ [0.3]
+÷ 200D Ã\97 261D ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ_FE) ÷ [999.0] WHITE UP POINTING INDEX (E_Base) ÷ [0.3]
÷ 200D × 0308 ÷ 261D ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [999.0] WHITE UP POINTING INDEX (E_Base) ÷ [0.3]
-÷ 200D ÷ 1F3FB ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ_FE) ÷ [999.0] EMOJI MODIFIER FITZPATRICK TYPE-1-2 (E_Modifier) ÷ [0.3]
+÷ 200D Ã\97 1F3FB ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ_FE) ÷ [999.0] EMOJI MODIFIER FITZPATRICK TYPE-1-2 (E_Modifier) ÷ [0.3]
÷ 200D × 0308 ÷ 1F3FB ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [999.0] EMOJI MODIFIER FITZPATRICK TYPE-1-2 (E_Modifier) ÷ [0.3]
÷ 200D × 2764 ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ_FE) × [3.3] HEAVY BLACK HEART (Glue_After_Zwj) ÷ [0.3]
÷ 200D × 0308 ÷ 2764 ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [999.0] HEAVY BLACK HEART (Glue_After_Zwj) ÷ [0.3]
#
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
# file: grapheme.txt
# Emoji defintions
-E_Base = [\p{Grapheme_Cluster_Break = EB}];
+EmojiNRK = [[\p{Emoji}] - [Regional_Indicator\u002a\u00230-9©®™〰〽]];
+E_Base = [\p{Grapheme_Cluster_Break = EB}\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
E_Modifier = [\p{Grapheme_Cluster_Break = EM}];
-GAZ = [\p{Grapheme_Cluster_Break = GAZ}];
E_Base_GAZ = [\p{Grapheme_Cluster_Break = EBG}];
+# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
+Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
+
GB3: CR LF;
GB4: (Control | CR | LF) ÷;
GB8: (LVT | T) T;
GB10: (E_Base | E_Base_GAZ) Extend* E_Modifier;
+GB11: (Extended_Pict | EmojiNRK) ZWJ (Extended_Pict | EmojiNRK);
GB9: . (Extend | ZWJ);
GB9a: . SpacingMark;
GB9b: Prepend .;
-GB11: ZWJ (GAZ | E_Base_GAZ);
# Regional Indicators, split into pairs.
# Note that a pair of RIs that is not followed by a third RI will fall into
#
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
# file: line.txt
CM = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
-EB = [:LineBreak = EB:];
+EB = [[:LineBreak = EB:]\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
EM = [:LineBreak = EM:];
EX = [:LineBreak = Exclamation:];
GL = [:LineBreak = Glue:];
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
+EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
+# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
+Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
+
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
# UAX 14: ZW SP* ÷;
LB8: ZW ÷;
-# LB8a, from Emoji proposal L2/16-011R3
-# ZWJ x ID
-LB8a: ZWJ (ID | EB | EM);
+# LB8a
+# ZWJ x (ID | Extended_Pict | EmojiNRK)
+LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
# LB9: X CM -> X
# ZWJ acts like a CM to the left, combining with CB.
# ZWJ acts independently to the right, no break from ID by LB8a.
LB20: . CM* ÷ CB;
-LB20.1a: CB CM* ZWJ (ID | EB | EM);
+LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB20.1b: CB CM* ÷;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# LB31 keep pairs of RI together.
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
-LB30a.2: RI CM* RI CM* ZWJ (ID | EB | EM);
+LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB30a.3: RI CM* RI CM* ÷;
# LB30b Do not break between Emoji Base and Emoji Modifier
# LB31 Break Everywhere Else.
# Include combining marks
-LB31.1: . CM* ZWJ (ID | EB | EM);
+LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB31.2: . CM* ÷;
#
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
#
# file: line_loose.txt
CM = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
-EB = [:LineBreak = EB:];
+EB = [[:LineBreak = EB:]\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
EM = [:LineBreak = EM:];
EX = [:LineBreak = Exclamation:];
GL = [:LineBreak = Glue:];
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
+EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
+# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
+Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
+
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
# UAX 14: ZW SP* ÷;
LB8: ZW ÷;
-# LB8a, from Emoji proposal L2/16-011R3
-# ZWJ x ID
-LB8a: ZWJ (ID | EB | EM);
+# LB8a
+# ZWJ x (ID | Extended_Pict | EmojiNRK)
+LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
# LB9: X CM -> X
# ZWJ acts like a CM to the left, combining with CB.
# ZWJ acts independently to the right, no break from ID by LB8a.
LB20: . CM* ÷ CB;
-LB20.1a: CB CM* ZWJ (ID | EB | EM);
+LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB20.1b: CB CM* ÷;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# LB31 keep pairs of RI together.
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
-LB30a.2: RI CM* RI CM* ZWJ (ID | EB | EM);
+LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB30a.3: RI CM* RI CM* ÷;
# LB30b Do not break between Emoji Base and Emoji Modifier
# LB31 Break Everywhere Else.
# Include combining marks
-LB31.1: . CM* ZWJ (ID | EB | EM);
+LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB31.2: . CM* ÷;
#
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
#
# file: line_loose_cj.txt
CM = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
-EB = [:LineBreak = EB:];
+EB = [[:LineBreak = EB:]\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
EM = [:LineBreak = EM:];
EXX = [\uFF01 \uFF1F];
EX = [[:LineBreak = Exclamation:] - EXX];
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
+EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
+# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
+Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
+
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
# UAX 14: ZW SP* ÷;
LB8: ZW ÷;
-# LB8a, from Emoji proposal L2/16-011R3
-# ZWJ x ID
-LB8a: ZWJ (ID | EB | EM);
+# LB8a
+# ZWJ x (ID | Extended_Pict | EmojiNRK)
+LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
# LB9: X CM -> X
# ZWJ acts like a CM to the left, combining with CB.
# ZWJ acts independently to the right, no break from ID by LB8a.
LB20: . CM* ÷ CB;
-LB20.1a: CB CM* ZWJ (ID | EB | EM);
+LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB20.1b: CB CM* ÷;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# LB31 keep pairs of RI together.
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
-LB30a.2: RI CM* RI CM* ZWJ (ID | EB | EM);
+LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB30a.3: RI CM* RI CM* ÷;
# LB30b Do not break between Emoji Base and Emoji Modifier
# LB31 Break Everywhere Else.
# Include combining marks
-LB31.1: . CM* ZWJ (ID | EB | EM);
+LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB31.2: . CM* ÷;
#
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
#
# file: line_normal.txt
CM = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
-EB = [:LineBreak = EB:];
+EB = [[:LineBreak = EB:]\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
EM = [:LineBreak = EM:];
EX = [:LineBreak = Exclamation:];
GL = [:LineBreak = Glue:];
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
+EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
+# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
+Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
+
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
# UAX 14: ZW SP* ÷;
LB8: ZW ÷;
-# LB8a, from Emoji proposal L2/16-011R3
-# ZWJ x ID
-LB8a: ZWJ (ID | EB | EM);
+# LB8a
+# ZWJ x (ID | Extended_Pict | EmojiNRK)
+LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
# LB9: X CM -> X
# ZWJ acts like a CM to the left, combining with CB.
# ZWJ acts independently to the right, no break from ID by LB8a.
LB20: . CM* ÷ CB;
-LB20.1a: CB CM* ZWJ (ID | EB | EM);
+LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB20.1b: CB CM* ÷;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# LB31 keep pairs of RI together.
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
-LB30a.2: RI CM* RI CM* ZWJ (ID | EB | EM);
+LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB30a.3: RI CM* RI CM* ÷;
# LB30b Do not break between Emoji Base and Emoji Modifier
# LB31 Break Everywhere Else.
# Include combining marks
-LB31.1: . CM* ZWJ (ID | EB | EM);
+LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB31.2: . CM* ÷;
-# Copyright (c) 2016 International Business Machines Corporation and # others. All Rights Reserved.
+#
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+# Copyright (c) 2016 International Business Machines Corporation and others. All Rights Reserved.
#
# file: line_normal_cj.txt
#
CM = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
-EB = [:LineBreak = EB:];
+EB = [[:LineBreak = EB:]\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
EM = [:LineBreak = EM:];
EX = [:LineBreak = Exclamation:];
GL = [:LineBreak = Glue:];
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
+EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
+# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
+Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
+
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
# UAX 14: ZW SP* ÷;
LB8: ZW ÷;
-# LB8a, from Emoji proposal L2/16-011R3
-# ZWJ x ID
-LB8a: ZWJ (ID | EB | EM);
+# LB8a
+# ZWJ x (ID | Extended_Pict | EmojiNRK)
+LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
# LB9: X CM -> X
# ZWJ acts like a CM to the left, combining with CB.
# ZWJ acts independently to the right, no break from ID by LB8a.
LB20: . CM* ÷ CB;
-LB20.1a: CB CM* ZWJ (ID | EB | EM);
+LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB20.1b: CB CM* ÷;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# LB31 keep pairs of RI together.
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
-LB30a.2: RI CM* RI CM* ZWJ (ID | EB | EM);
+LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB30a.3: RI CM* RI CM* ÷;
# LB30b Do not break between Emoji Base and Emoji Modifier
# LB31 Break Everywhere Else.
# Include combining marks
-LB31.1: . CM* ZWJ (ID | EB | EM);
+LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB31.2: . CM* ÷;
#
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
# file: word.txt
MidNum = [\p{Word_Break = MidNum}];
Numeric = [\p{Word_Break = Numeric}];
ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
-E_Base = [\p{Word_Break = EB}];
+E_Base = [\p{Word_Break = EB}\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
E_Modifier = [\p{Word_Break = EM}];
-GAZ = [\p{Word_Break = GAZ}];
+EmojiNRK = [[\p{Emoji}] - [[Regional_Indicator]\u002a\u00230-9©®™〰〽]];
+# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
+Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
EBG = [\p{Word_Break = EBG}];
#define dicitionary, with the effect being that those characters don't appear in test data.
WB3a: (Newline | CR | LF) ÷;
WB3b: . ÷ (Newline | CR | LF); # actually redundant? No other rule combines.
# (but needed with UAX treat-as scheme.)
-WB3c: ZWJ (GAZ | EBG);
+WB3c: ZWJ (Extended_Pict | EmojiNRK);
WB5: AHLetter ExtFmt* AHLetter;
# WB rule 15 - 17, pairs of Regional Indicators stay unbroken.
# Interacts with WB3c.
-WB15: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ZWJ (GAZ | EBG);
+WB15: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ZWJ (Extended_Pict | EmojiNRK);
WB17: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ÷;
WB14: (E_Base | EBG) ExtFmt* E_Modifier;
# Rule WB 999 Any ÷ Any
-# Interacts with WB3c, do not break between ZWJ and (GAZ | EBG).
-WB999.1: . ExtFmt* ZWJ (GAZ | EBG);
+# Interacts with WB3c, do not break between ZWJ and (Extended_Pict | EBG).
+WB999.1: . ExtFmt* ZWJ (Extended_Pict | EmojiNRK);
WB999.2: . ExtFmt* ÷;
#
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
# file: word_POSIX.txt
MidNum = [\p{Word_Break = MidNum} [.]];
Numeric = [\p{Word_Break = Numeric}];
ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
-E_Base = [\p{Word_Break = EB}];
+E_Base = [\p{Word_Break = EB}\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
E_Modifier = [\p{Word_Break = EM}];
-GAZ = [\p{Word_Break = GAZ}];
+EmojiNRK = [[\p{Emoji}] - [[Regional_Indicator]\u002a\u00230-9©®™〰〽]];
+# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
+Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
EBG = [\p{Word_Break = EBG}];
#define dicitionary, with the effect being that those characters don't appear in test data.
WB3a: (Newline | CR | LF) ÷;
WB3b: . ÷ (Newline | CR | LF); # actually redundant? No other rule combines.
# (but needed with UAX treat-as scheme.)
-WB3c: ZWJ (GAZ | EBG);
+WB3c: ZWJ (Extended_Pict | EmojiNRK);
WB5: AHLetter ExtFmt* AHLetter;
# WB rule 15 - 17, pairs of Regional Indicators stay unbroken.
# Interacts with WB3c.
-WB15: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ZWJ (GAZ | EBG);
+WB15: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ZWJ (Extended_Pict | EmojiNRK);
WB17: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ÷;
WB14: (E_Base | EBG) ExtFmt* E_Modifier;
# Rule WB 999 Any ÷ Any
-# Interacts with WB3c, do not break between ZWJ and (GAZ | EBG).
-WB999.1: . ExtFmt* ZWJ (GAZ | EBG);
+# Interacts with WB3c, do not break between ZWJ and (Extended_Pict | EBG).
+WB999.1: . ExtFmt* ZWJ (Extended_Pict | EmojiNRK);
WB999.2: . ExtFmt* ÷;