ICU-8151 Simplify Finnish Line Break Tailoring, move to root. (#99)

author Andy Heninger <andy.heninger@gmail.com>

Thu, 6 Sep 2018 21:23:28 +0000 (14:23 -0700)

committer Shane Carr <shane@unicode.org>

Thu, 27 Sep 2018 21:27:39 +0000 (14:27 -0700)
author Andy Heninger <andy.heninger@gmail.com>
Thu, 6 Sep 2018 21:23:28 +0000 (14:23 -0700)
committer Shane Carr <shane@unicode.org>
Thu, 27 Sep 2018 21:27:39 +0000 (14:27 -0700)
diff --git a/icu4c/source/data/brkitr/brkfiles.mk b/icu4c/source/data/brkitr/brkfiles.mk

index a23c945bf549f39eb7b7be366fcea0ab4e5bcb49..af4ab0a2dd45340ad50e7458f457610e80e42507 100644 (file)
--- a/icu4c/source/data/brkitr/brkfiles.mk
+++ b/icu4c/source/data/brkitr/brkfiles.mk
@@ -39,13 +39,13 @@ BRK_DICT_SOURCE = burmesedict.txt cjdict.txt khmerdict.txt laodict.txt\
  
  
  # List of break iterator files (brk).
-BRK_SOURCE = char.txt line.txt line_fi.txt line_loose.txt\
- line_loose_cj.txt line_loose_fi.txt line_normal.txt line_normal_cj.txt line_normal_fi.txt\
+BRK_SOURCE = char.txt line.txt line_loose.txt\
+ line_loose_cj.txt line_normal.txt line_normal_cj.txt\
   sent.txt sent_el.txt title.txt word.txt word_POSIX.txt
  
  
  # Ordinary resources
  BRK_RES_SOURCE = de.txt el.txt en.txt en_US.txt\
- en_US_POSIX.txt es.txt fi.txt fr.txt it.txt\
+ en_US_POSIX.txt es.txt fr.txt it.txt\
   ja.txt pt.txt ru.txt zh.txt zh_Hant.txt
  
diff --git a/icu4c/source/data/brkitr/fi.txt b/icu4c/source/data/brkitr/fi.txt

deleted file mode 100644 (file)

index 3c07f15..0000000
--- a/icu4c/source/data/brkitr/fi.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-// © 2016 and later: Unicode, Inc. and others.
-// License & terms of use: http://www.unicode.org/copyright.html#License
-fi{
-    Version{"2.1.19.14"}
-    boundaries{
-        line:process(dependency){"line_fi.brk"}
-        line_loose:process(dependency){"line_loose_fi.brk"}
-        line_normal:process(dependency){"line_normal_fi.brk"}
-        line_strict:process(dependency){"line_fi.brk"}
-    }
-}
diff --git a/icu4c/source/data/brkitr/rules/line.txt b/icu4c/source/data/brkitr/rules/line.txt

index 9ad81e6fc7da4158d875cf1b074e70e824a5e675..cca19772dbaace3cd394d414e38a60704dafc057 100644 (file)
--- a/icu4c/source/data/brkitr/rules/line.txt
+++ b/icu4c/source/data/brkitr/rules/line.txt
@@ -8,11 +8,10 @@
  #         Line Breaking Rules
  #         Implement default line breaking as defined by
  #         Unicode Standard Annex #14 Revision 40 for Unicode 11.0
-#         http://www.unicode.org/reports/tr14/
+#         http://www.unicode.org/reports/tr14/, with the following modification:
  #
-#         TODO:  Rule LB 8 remains as it was in Unicode 5.2
-#         This is only because of a limitation of ICU break engine implementation,
-#         not because the older behavior is desirable.
+#         Boundaries between hyphens and following letters are suppressed when
+#         there is a boundary preceding the hyphen. See rule LB 20.09.
  #
  #         This corresponds to CSS line-break=strict (BCP47 -u-lb-strict).
  #         It sets characters of class CJ to behave like NS.
@@ -27,6 +26,7 @@
  $AI = [:LineBreak =  Ambiguous:];
  $AL = [:LineBreak =  Alphabetic:];
  $BA = [:LineBreak =  Break_After:];
+$HH = [\u2010];     # \u2010 is HYPHEN, default line break is BA.
  $BB = [:LineBreak =  Break_Before:];
  $BK = [:LineBreak =  Mandatory_Break:];
  $B2 = [:LineBreak =  Break_Both:];
@@ -229,17 +229,24 @@ $LB18NonBreaks $CM* $QU;
  #         QU  x
  $QU $CM* .;
  
-
  # LB 20
  #        <break>  $CB
  #        $CB   <break>
-
+#
  $LB20NonBreaks = [$LB18NonBreaks - $CB];
  
+# LB 20.09    Don't break between Hyphens and Letters when there is a break preceding the hyphen.
+#             Originally added as a Finnish tailoring, now promoted to default ICU behavior.
+#             Note: this is not default UAX-14 behaviour. See issue ICU-8151.
+#             The leading '^' keeps other rules from chaining into this one, so it only applies at a boundary.
+^($HY | $HH) $CM* $ALPlus;
+
  # LB 21        x   (BA | HY | NS)
  #           BB x
  #
  $LB20NonBreaks $CM* ($BA | $HY | $NS);
+
+
  ^$CM+ ($BA | $HY | $NS);
  
  $BB $CM* [^$CB];                                  #  $BB  x
diff --git a/icu4c/source/data/brkitr/rules/line_fi.txt b/icu4c/source/data/brkitr/rules/line_fi.txt

deleted file mode 100644 (file)

index 9c26945..0000000
--- a/icu4c/source/data/brkitr/rules/line_fi.txt
+++ /dev/null
@@ -1,339 +0,0 @@
-# Copyright (C) 2016 and later: Unicode, Inc. and others.
-# License & terms of use: http://www.unicode.org/copyright.html
-# Copyright (c) 2002-2016  International Business Machines Corporation and
-# others. All Rights Reserved.
-#
-#  file:  line_fi.txt
-#
-#         Line Breaking Rules
-#         Implement default line breaking as defined by
-#         Unicode Standard Annex #14 Revision 40 for Unicode 11.0
-#         http://www.unicode.org/reports/tr14/
-#         tailored as noted in 2nd paragraph below.
-#
-#         TODO:  Rule LB 8 remains as it was in Unicode 5.2
-#         This is only because of a limitation of ICU break engine implementation,
-#         not because the older behavior is desirable.
-#
-#         This tailors the line break behavior for Finnish, while otherwise behaving
-#         per UAX 14 which corresponds to CSS line-break=strict (BCP47 -u-lb-strict).
-#         It sets characters of class CJ to behave like NS.
-#
-#         This corresponds to CSS line-break=strict (BCP47 -u-lb-strict).
-#         It sets characters of class CJ to behave like NS.
-
-#
-#  Character Classes defined by TR 14.
-#
-
-!!chain;
-!!quoted_literals_only;
-
-$AI = [:LineBreak =  Ambiguous:];
-$AL = [:LineBreak =  Alphabetic:];
-$BA = [:LineBreak =  Break_After:];
-$HH = [\u2010];     # \u2010 is HYPHEN, default line break is BA.
-$BB = [:LineBreak =  Break_Before:];
-$BK = [:LineBreak =  Mandatory_Break:];
-$B2 = [:LineBreak =  Break_Both:];
-$CB = [:LineBreak =  Contingent_Break:];
-$CJ = [:LineBreak =  Conditional_Japanese_Starter:];
-$CL = [:LineBreak =  Close_Punctuation:];
-# $CM = [:LineBreak =  Combining_Mark:];
-$CP = [:LineBreak =  Close_Parenthesis:];
-$CR = [:LineBreak =  Carriage_Return:];
-$EB = [:LineBreak =  EB:];
-$EM = [:LineBreak =  EM:];
-$EX = [:LineBreak =  Exclamation:];
-$GL = [:LineBreak =  Glue:];
-$HL = [:LineBreak =  Hebrew_Letter:];
-$HY = [:LineBreak =  Hyphen:];
-$H2 = [:LineBreak =  H2:];
-$H3 = [:LineBreak =  H3:];
-$ID = [:LineBreak =  Ideographic:];
-$IN = [:LineBreak =  Inseperable:];
-$IS = [:LineBreak =  Infix_Numeric:];
-$JL = [:LineBreak =  JL:];
-$JV = [:LineBreak =  JV:];
-$JT = [:LineBreak =  JT:];
-$LF = [:LineBreak =  Line_Feed:];
-$NL = [:LineBreak =  Next_Line:];
-# NS includes CJ for CSS strict line breaking.
-$NS = [[:LineBreak =  Nonstarter:] $CJ];
-$NU = [:LineBreak =  Numeric:];
-$OP = [:LineBreak =  Open_Punctuation:];
-$PO = [:LineBreak =  Postfix_Numeric:];
-$PR = [:LineBreak =  Prefix_Numeric:];
-$QU = [:LineBreak =  Quotation:];
-$RI = [:LineBreak =  Regional_Indicator:];
-$SA = [:LineBreak =  Complex_Context:];
-$SG = [:LineBreak =  Surrogate:];
-$SP = [:LineBreak =  Space:];
-$SY = [:LineBreak =  Break_Symbols:];
-$WJ = [:LineBreak =  Word_Joiner:];
-$XX = [:LineBreak =  Unknown:];
-$ZW = [:LineBreak =  ZWSpace:];
-$ZWJ = [:LineBreak = ZWJ:];
-
-# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
-#         list it in the numerous rules that use CM.
-# By LB1, SA characters with general categor of Mn or Mc also resolve to CM.
-
-$CM = [[:LineBreak = Combining_Mark:] $ZWJ [$SA & [[:Mn:][:Mc:]]]];
-
-#   Dictionary character set, for triggering language-based break engines. Currently
-#   limited to LineBreak=Complex_Context (SA).
-
-$dictionary = [$SA];
-
-#
-#  Rule LB1.  By default, treat AI  (characters with ambiguous east Asian width),
-#                               SA  (Dictionary chars, excluding Mn and Mc)
-#                               SG  (Unpaired Surrogates)
-#                               XX  (Unknown, unassigned)
-#                         as $AL  (Alphabetic)
-#
-$ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]];
-
-
-## -------------------------------------------------
-
-#
-# CAN_CM  is the set of characters that may combine with CM combining chars.
-#         Note that Linebreak UAX 14's concept of a combining char and the rules
-#         for what they can combine with are _very_ different from the rest of Unicode.
-#
-#         Note that $CM itself is left out of this set.  If CM is needed as a base
-#         it must be listed separately in the rule.
-#
-$CAN_CM  = [^$SP $BK $CR $LF $NL $ZW $CM];       # Bases that can   take CMs
-$CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM];       # Bases that can't take CMs
-
-#
-# AL_FOLLOW  set of chars that can unconditionally follow an AL
-#            Needed in rules where stand-alone $CM s are treated as AL.
-#
-$AL_FOLLOW      = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HH $HY $NS $IN $NU $PR $PO $ALPlus];
-
-
-#
-#  Rule LB 4, 5    Mandatory (Hard) breaks.
-#
-$LB4Breaks    = [$BK $CR $LF $NL];
-$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
-$CR $LF {100};
-
-#
-#  LB 6    Do not break before hard line breaks.
-#
-$LB4NonBreaks?  $LB4Breaks {100};    # LB 5  do not break before hard breaks.
-$CAN_CM $CM*    $LB4Breaks {100};
-^$CM+           $LB4Breaks {100};
-
-# LB 7         x SP
-#              x ZW
-$LB4NonBreaks [$SP $ZW];
-$CAN_CM $CM*  [$SP $ZW];
-^$CM+         [$SP $ZW];
-
-#
-# LB 8         Break after zero width space
-#              ZW SP* ÷
-#
-$LB8Breaks    = [$LB4Breaks $ZW];
-$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
-$ZW $SP* / [^$SP $ZW $LB4Breaks];
-
-# LB 8a        ZWJ x            Do not break Emoji ZWJ sequences.
-#
-$ZWJ [^$CM];
-
-# LB 9     Combining marks.      X   $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
-#                                $CM not covered by the above needs to behave like $AL
-#                                See definition of $CAN_CM.
-
-$CAN_CM $CM+;                   #  Stick together any combining sequences that don't match other rules.
-^$CM+;
-
-#
-# LB 11  Do not break before or after WORD JOINER & related characters.
-#
-$CAN_CM $CM*  $WJ;
-$LB8NonBreaks $WJ;
-^$CM+         $WJ;
-
-$WJ $CM* .;
-
-#
-# LB 12  Do not break after NBSP and related characters.
-#         GL  x
-#
-$GL $CM* .;
-
-#
-# LB 12a  Do not break before NBSP and related characters ...
-#            [^SP BA HY] x GL
-#
-[[$LB8NonBreaks] - [$SP $BA $HH $HY]] $CM* $GL;
-^$CM+ $GL;
-
-
-
-#
-# LB 13   Don't break before ']' or '!' or ';' or '/', even after spaces.
-#
-$LB8NonBreaks $CL;
-$CAN_CM $CM*  $CL;
-^$CM+         $CL;              # by rule 10, stand-alone CM behaves as AL
-
-$LB8NonBreaks $CP;
-$CAN_CM $CM*  $CP;
-^$CM+         $CP;              # by rule 10, stand-alone CM behaves as AL
-
-$LB8NonBreaks $EX;
-$CAN_CM $CM*  $EX;
-^$CM+         $EX;              # by rule 10, stand-alone CM behaves as AL
-
-$LB8NonBreaks $IS;
-$CAN_CM $CM*  $IS;
-^$CM+         $IS;              # by rule 10, stand-alone CM behaves as AL
-
-$LB8NonBreaks $SY;
-$CAN_CM $CM*  $SY;
-^$CM+         $SY;              # by rule 10, stand-alone CM behaves as AL
-
-
-#
-# LB 14  Do not break after OP, even after spaces
-#
-$OP $CM* $SP* .;
-
-$OP $CM* $SP+ $CM+ $AL_FOLLOW?;    # by rule 10, stand-alone CM behaves as AL
-                                   # by rule 8, CM following a SP is stand-alone.
-
-# LB 15
-$QU $CM* $SP* $OP;
-
-# LB 16
-($CL | $CP) $CM* $SP* $NS;
-
-# LB 17
-$B2 $CM* $SP* $B2;
-
-#
-# LB 18  Break after spaces.
-#
-$LB18NonBreaks = [$LB8NonBreaks - [$SP]];
-$LB18Breaks    = [$LB8Breaks $SP];
-
-
-# LB 19
-#         x QU
-$LB18NonBreaks $CM* $QU;
-^$CM+               $QU;
-
-#         QU  x
-$QU $CM* .;
-
-
-# LB 20
-#        <break>  $CB
-#        $CB   <break>
-
-$LB20NonBreaks = [$LB18NonBreaks - $CB];
-
-# LB 20.09 added rule for Finnish tailoring
-# LB 21        x   (BA | HY | NS)
-#           BB x
-#
-$LB20NonBreaks $CM* ($BA | $HH | $HY | $NS) $CM* / $AL;
-$LB20NonBreaks $CM* ($BA | $HH | $HY | $NS);
-($HY | $HH) $AL;
-^$CM+ ($BA | $HY | $HH | $NS);
-
-$BB $CM* [^$CB];                                  #  $BB  x
-$BB $CM* $LB20NonBreaks;
-
-# LB 21a Don't break after Hebrew + Hyphen
-#   HL (HY | BA) x
-#
-$HL $CM* ($HY | $BA | $HH) $CM* [^$CB]?;
-
-# LB 21b (forward) Don't break between SY and HL
-# (break between HL and SY already disallowed by LB 13 above)
-$SY $CM* $HL;
-
-# LB 22
-($ALPlus | $HL) $CM* $IN;
-^$CM+    $IN;     #  by rule 10, any otherwise unattached CM behaves as AL
-$EX $CM*    $IN;
-($ID | $EB | $EM) $CM*  $IN;
-$IN $CM*    $IN;
-$NU $CM*    $IN;
-
-
-# $LB 23
-#
-($ALPlus | $HL) $CM* $NU;
-^$CM+  $NU;       # Rule 10, any otherwise unattached CM behaves as AL
-$NU $CM* ($ALPlus | $HL);
-
-# LB 23a
-#
-$PR $CM* ($ID | $EB | $EM);
-($ID | $EB | $EM) $CM*  $PO;
-
-
-#
-# LB 24
-#
-($PR | $PO) $CM* ($ALPlus | $HL);
-($ALPlus | $HL) $CM* ($PR | $PO);
-^$CM+ ($PR | $PO);       # Rule 10, any otherwise unattached CM behaves as AL
-
-#
-# LB 25   Numbers.
-#
-(($PR | $PO) $CM*)? (($OP | $HY) $CM*)? $NU ($CM* ($NU | $SY | $IS))*
-    ($CM* ($CL | $CP))? ($CM* ($PR | $PO))?;
-
-# LB 26  Do not break a Korean syllable
-#
-$JL $CM* ($JL | $JV | $H2 | $H3);
-($JV | $H2) $CM* ($JV | $JT);
-($JT | $H3) $CM* $JT;
-
-# LB 27  Treat korean Syllable Block the same as ID  (don't break it)
-($JL | $JV | $JT | $H2 | $H3) $CM* $IN;
-($JL | $JV | $JT | $H2 | $H3) $CM* $PO;
-$PR $CM* ($JL | $JV | $JT | $H2 | $H3);
-
-
-# LB 28   Do not break between alphabetics
-#
-($ALPlus | $HL) $CM* ($ALPlus | $HL);
-^$CM+ ($ALPlus | $HL);      # The $CM+ is from rule 10, an unattached CM is treated as AL
-
-# LB 29
-$IS $CM* ($ALPlus | $HL);
-
-# LB 30
-($ALPlus | $HL | $NU) $CM* $OP;
-^$CM+ $OP;         # The $CM+ is from rule 10, an unattached CM is treated as AL.
-$CP $CM* ($ALPlus | $HL | $NU);
-
-# LB 30a  Do not break between regional indicators. Break after pairs of them.
-#         Tricky interaction with LB8a: ZWJ x .   together with ZWJ acting like a CM.
-$RI $CM* $RI                 / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
-$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
-$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}];
-# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
-#       because of the chain-out behavior difference. The rule must chain out only from the [set characters],
-#       not from the preceding $RI or $CM, which it would be able to do if the set were optional.
-
-# LB 30b Do not break between an Emoji Base and an Emoji Modifier
-$EB $CM* $EM;
-
-# LB 31 Break everywhere else.
-#       Match a single code point if no other rule applies.
-.;
diff --git a/icu4c/source/data/brkitr/rules/line_loose.txt b/icu4c/source/data/brkitr/rules/line_loose.txt

index 2d72fdfa90742b69ab0aa8fd1791a32439aba820..1d46d625c6778e4600daeeb3178cdf9530844eae 100644 (file)
--- a/icu4c/source/data/brkitr/rules/line_loose.txt
+++ b/icu4c/source/data/brkitr/rules/line_loose.txt
@@ -9,13 +9,10 @@
  #         Line Breaking Rules
  #         Implement default line breaking as defined by
  #         Unicode Standard Annex #14 Revision 40 for Unicode 11.0
-#         http://www.unicode.org/reports/tr14/
+#         http://www.unicode.org/reports/tr14/, with the following modification:
  #
-#         tailored as noted in 2nd paragraph below.
-#
-#         TODO:  Rule LB 8 remains as it was in Unicode 5.2
-#         This is only because of a limitation of ICU break engine implementation,
-#         not because the older behavior is desirable.
+#         Boundaries between hyphens and following letters are suppressed when
+#         there is a boundary preceding the hyphen. See rule LB 20.09.
  #
  #         This tailors the line break behavior to correspond to CSS
  #         line-break=loose (BCP47 -u-lb-loose) as defined for languages other than
@@ -35,6 +32,7 @@
  $AI = [:LineBreak =  Ambiguous:];
  $AL = [:LineBreak =  Alphabetic:];
  $BA = [:LineBreak =  Break_After:];
+$HH = [\u2010];     # \u2010 is HYPHEN, default line break is BA.
  $BB = [:LineBreak =  Break_Before:];
  $BK = [:LineBreak =  Mandatory_Break:];
  $B2 = [:LineBreak =  Break_Both:];
@@ -240,18 +238,25 @@ $LB18NonBreaks $CM* $QU;
  #         QU  x
  $QU $CM* .;
  
-
  # LB 20
  #        <break>  $CB
  #        $CB   <break>
-
+#
  $LB20NonBreaks = [$LB18NonBreaks - $CB];
  
+# LB 20.09    Don't break between Hyphens and Letters when there is a break preceding the hyphen.
+#             Originally added as a Finnish tailoring, now promoted to default ICU behavior.
+#             Note: this is not default UAX-14 behaviour. See issue ICU-8151.
+#             The leading '^' keeps other rules from chaining into this one, so it only applies at a boundary.
+^($HY | $HH) $CM* $ALPlus;
+
  # LB 21        x   (BA | HY | NS)
  #           BB x
  #
  # DO allow breaks here before NSX, so don't include it
  $LB20NonBreaks $CM* ($BA | $HY | $NS);
+
+
  ^$CM+ ($BA | $HY | $NS);
  
  $BB $CM* [^$CB];                                  #  $BB  x
diff --git a/icu4c/source/data/brkitr/rules/line_loose_cj.txt b/icu4c/source/data/brkitr/rules/line_loose_cj.txt

index 024e68ebc77ce161575e5c7c4ea12aa2f237fc4e..b139da3240214e918c90ff74f2ba3f47f7c399b7 100644 (file)
--- a/icu4c/source/data/brkitr/rules/line_loose_cj.txt
+++ b/icu4c/source/data/brkitr/rules/line_loose_cj.txt
@@ -8,13 +8,10 @@
  #         Line Breaking Rules
  #         Implement default line breaking as defined by
  #         Unicode Standard Annex #14 Revision 40 for Unicode 11.0
-#         http://www.unicode.org/reports/tr14/
+#         http://www.unicode.org/reports/tr14/, with the following modification:
  #
-#         tailored as noted in 2nd paragraph below.
-#
-#         TODO:  Rule LB 8 remains as it was in Unicode 5.2
-#         This is only because of a limitation of ICU break engine implementation,
-#         not because the older behavior is desirable.
+#         Boundaries between hyphens and following letters are suppressed when
+#         there is a boundary preceding the hyphen. See rule LB 20.09.
  #
  #         This tailors the line break behavior to correspond to CSS
  #         line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
@@ -42,6 +39,7 @@ $AI = [:LineBreak =  Ambiguous:];
  $AL = [:LineBreak =  Alphabetic:];
  $BAX = [\u2010 \u2013];
  $BA = [[:LineBreak =  Break_After:] - $BAX];
+$HH = [\u2010];     # \u2010 is HYPHEN, default line break is BA.
  $BB = [:LineBreak =  Break_Before:];
  $BK = [:LineBreak =  Mandatory_Break:];
  $B2 = [:LineBreak =  Break_Both:];
@@ -250,18 +248,25 @@ $LB18NonBreaks $CM* $QU;
  #         QU  x
  $QU $CM* .;
  
-
  # LB 20
  #        <break>  $CB
  #        $CB   <break>
-
+#
  $LB20NonBreaks = [$LB18NonBreaks - $CB];
  
+# LB 20.09    Don't break between Hyphens and Letters when there is a break preceding the hyphen.
+#             Originally added as a Finnish tailoring, now promoted to default ICU behavior.
+#             Note: this is not default UAX-14 behaviour. See issue ICU-8151.
+#             The leading '^' keeps other rules from chaining into this one, so it only applies at a boundary.
+^($HY | $HH) $CM* $ALPlus;
+
  # LB 21        x   (BA | HY | NS)
  #           BB x
  #
  # DO allow breaks here before $BAX and $NSX, so don't include them
  $LB20NonBreaks $CM* ($BA | $HY | $NS);
+
+
  ^$CM+ ($BA | $HY | $NS);
  
  $BB $CM* [^$CB];                                  #  $BB  x
diff --git a/icu4c/source/data/brkitr/rules/line_loose_fi.txt b/icu4c/source/data/brkitr/rules/line_loose_fi.txt

deleted file mode 100644 (file)

index 0c34b00..0000000
--- a/icu4c/source/data/brkitr/rules/line_loose_fi.txt
+++ /dev/null
@@ -1,341 +0,0 @@
-# Copyright (C) 2016 and later: Unicode, Inc. and others.
-# License & terms of use: http://www.unicode.org/copyright.html
-# Copyright (c) 2002-2016  International Business Machines Corporation and
-# others. All Rights Reserved.
-#
-#  file:  line_loose_fi.txt
-#
-#         Line Breaking Rules
-#         Implement default line breaking as defined by
-#         Unicode Standard Annex #14 Revision 40 for Unicode 11.0
-#         http://www.unicode.org/reports/tr14/
-#         tailored as noted in 3rd paragraph below.
-#
-#         TODO:  Rule LB 8 remains as it was in Unicode 5.2
-#         This is only because of a limitation of ICU break engine implementation,
-#         not because the older behavior is desirable.
-#
-#         This tailors the line break behavior both for Finnish and to correpond to CSS
-#         line-break=loose (BCP47 -u-lb-loose) as defined for languages other than
-#         Chinese & Japanese.
-#         It sets characters of class CJ to behave like ID.
-#         In addition, it allows breaks before 3005, 303B, 309D, 309E, 30FD, 30FE (all NS).
-#
-#  Character Classes defined by TR 14.
-#
-
-!!chain;
-!!quoted_literals_only;
-
-$AI = [:LineBreak =  Ambiguous:];
-$AL = [:LineBreak =  Alphabetic:];
-$BA = [:LineBreak =  Break_After:];
-$HH = [\u2010];     # \u2010 is HYPHEN, default line break is BA.
-$BB = [:LineBreak =  Break_Before:];
-$BK = [:LineBreak =  Mandatory_Break:];
-$B2 = [:LineBreak =  Break_Both:];
-$CB = [:LineBreak =  Contingent_Break:];
-$CJ = [:LineBreak =  Conditional_Japanese_Starter:];
-$CL = [:LineBreak =  Close_Punctuation:];
-# $CM = [:LineBreak =  Combining_Mark:];
-$CP = [:LineBreak =  Close_Parenthesis:];
-$CR = [:LineBreak =  Carriage_Return:];
-$EB = [:LineBreak =  EB:];
-$EM = [:LineBreak =  EM:];
-$EX = [:LineBreak =  Exclamation:];
-$GL = [:LineBreak =  Glue:];
-$HL = [:LineBreak =  Hebrew_Letter:];
-$HY = [:LineBreak =  Hyphen:];
-$H2 = [:LineBreak =  H2:];
-$H3 = [:LineBreak =  H3:];
-# CSS Loose tailoring: CJ resolves to ID
-$ID = [[:LineBreak =  Ideographic:] $CJ];
-$IN = [:LineBreak =  Inseperable:];
-$IS = [:LineBreak =  Infix_Numeric:];
-$JL = [:LineBreak =  JL:];
-$JV = [:LineBreak =  JV:];
-$JT = [:LineBreak =  JT:];
-$LF = [:LineBreak =  Line_Feed:];
-$NL = [:LineBreak =  Next_Line:];
-$NSX = [\u3005 \u303B \u309D \u309E \u30FD \u30FE];
-$NS = [[:LineBreak =  Nonstarter:] - $NSX];
-$NU = [:LineBreak =  Numeric:];
-$OP = [:LineBreak =  Open_Punctuation:];
-$PO = [:LineBreak =  Postfix_Numeric:];
-$PR = [:LineBreak =  Prefix_Numeric:];
-$QU = [:LineBreak =  Quotation:];
-$RI = [:LineBreak =  Regional_Indicator:];
-$SA = [:LineBreak =  Complex_Context:];
-$SG = [:LineBreak =  Surrogate:];
-$SP = [:LineBreak =  Space:];
-$SY = [:LineBreak =  Break_Symbols:];
-$WJ = [:LineBreak =  Word_Joiner:];
-$XX = [:LineBreak =  Unknown:];
-$ZW = [:LineBreak =  ZWSpace:];
-$ZWJ = [:LineBreak = ZWJ:];
-
-# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
-#         list it in the numerous rules that use CM.
-# By LB1, SA characters with general categor of Mn or Mc also resolve to CM.
-
-$CM = [[:LineBreak = Combining_Mark:] $ZWJ [$SA & [[:Mn:][:Mc:]]]];
-
-#   Dictionary character set, for triggering language-based break engines. Currently
-#   limited to LineBreak=Complex_Context (SA).
-
-$dictionary = [$SA];
-
-#
-#  Rule LB1.  By default, treat AI  (characters with ambiguous east Asian width),
-#                               SA  (Dictionary chars, excluding Mn and Mc)
-#                               SG  (Unpaired Surrogates)
-#                               XX  (Unknown, unassigned)
-#                         as $AL  (Alphabetic)
-#
-$ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]];
-
-
-## -------------------------------------------------
-
-#
-# CAN_CM  is the set of characters that may combine with CM combining chars.
-#         Note that Linebreak UAX 14's concept of a combining char and the rules
-#         for what they can combine with are _very_ different from the rest of Unicode.
-#
-#         Note that $CM itself is left out of this set.  If CM is needed as a base
-#         it must be listed separately in the rule.
-#
-$CAN_CM  = [^$SP $BK $CR $LF $NL $ZW $CM];       # Bases that can   take CMs
-$CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM];       # Bases that can't take CMs
-
-#
-# AL_FOLLOW  set of chars that can unconditionally follow an AL
-#            Needed in rules where stand-alone $CM s are treated as AL.
-#
-$AL_FOLLOW      = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HH $HY $NS $IN $NU $PR $PO $ALPlus];
-
-
-#
-#  Rule LB 4, 5    Mandatory (Hard) breaks.
-#
-$LB4Breaks    = [$BK $CR $LF $NL];
-$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
-$CR $LF {100};
-
-#
-#  LB 6    Do not break before hard line breaks.
-#
-$LB4NonBreaks?  $LB4Breaks {100};    # LB 5  do not break before hard breaks.
-$CAN_CM $CM*    $LB4Breaks {100};
-^$CM+           $LB4Breaks {100};
-
-# LB 7         x SP
-#              x ZW
-$LB4NonBreaks [$SP $ZW];
-$CAN_CM $CM*  [$SP $ZW];
-^$CM+         [$SP $ZW];
-
-#
-# LB 8         Break after zero width space
-#              ZW SP* ÷
-#
-$LB8Breaks    = [$LB4Breaks $ZW];
-$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
-$ZW $SP* / [^$SP $ZW $LB4Breaks];
-
-# LB 8a        ZWJ x            Do not break Emoji ZWJ sequences.
-#
-$ZWJ [^$CM];
-
-# LB 9     Combining marks.      X   $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
-#                                $CM not covered by the above needs to behave like $AL
-#                                See definition of $CAN_CM.
-
-$CAN_CM $CM+;                   #  Stick together any combining sequences that don't match other rules.
-^$CM+;
-
-#
-# LB 11  Do not break before or after WORD JOINER & related characters.
-#
-$CAN_CM $CM*  $WJ;
-$LB8NonBreaks $WJ;
-^$CM+         $WJ;
-
-$WJ $CM* .;
-
-#
-# LB 12  Do not break after NBSP and related characters.
-#         GL  x
-#
-$GL $CM* .;
-
-#
-# LB 12a  Do not break before NBSP and related characters ...
-#            [^SP BA HY] x GL
-#
-[[$LB8NonBreaks] - [$SP $BA $HH $HY]] $CM* $GL;
-^$CM+ $GL;
-
-
-
-#
-# LB 13   Don't break before ']' or '!' or ';' or '/', even after spaces.
-#
-$LB8NonBreaks $CL;
-$CAN_CM $CM*  $CL;
-^$CM+         $CL;              # by rule 10, stand-alone CM behaves as AL
-
-$LB8NonBreaks $CP;
-$CAN_CM $CM*  $CP;
-^$CM+         $CP;              # by rule 10, stand-alone CM behaves as AL
-
-$LB8NonBreaks $EX;
-$CAN_CM $CM*  $EX;
-^$CM+         $EX;              # by rule 10, stand-alone CM behaves as AL
-
-$LB8NonBreaks $IS;
-$CAN_CM $CM*  $IS;
-^$CM+         $IS;              # by rule 10, stand-alone CM behaves as AL
-
-$LB8NonBreaks $SY;
-$CAN_CM $CM*  $SY;
-^$CM+         $SY;              # by rule 10, stand-alone CM behaves as AL
-
-
-#
-# LB 14  Do not break after OP, even after spaces
-#
-$OP $CM* $SP* .;
-
-$OP $CM* $SP+ $CM+ $AL_FOLLOW?;    # by rule 10, stand-alone CM behaves as AL
-                                   # by rule 8, CM following a SP is stand-alone.
-
-# LB 15
-$QU $CM* $SP* $OP;
-
-# LB 16
-# Do not break between closing punctuation and $NS, even with intervening spaces
-# But DO allow a break between closing punctuation and $NSX, don't include it here
-($CL | $CP) $CM* $SP* $NS;
-
-# LB 17
-$B2 $CM* $SP* $B2;
-
-#
-# LB 18  Break after spaces.
-#
-$LB18NonBreaks = [$LB8NonBreaks - [$SP]];
-$LB18Breaks    = [$LB8Breaks $SP];
-
-
-# LB 19
-#         x QU
-$LB18NonBreaks $CM* $QU;
-^$CM+               $QU;
-
-#         QU  x
-$QU $CM* .;
-
-
-# LB 20
-#        <break>  $CB
-#        $CB   <break>
-
-$LB20NonBreaks = [$LB18NonBreaks - $CB];
-
-# LB 20.09 added rule for Finnish tailoring
-# LB 21        x   (BA | HY | NS)
-#           BB x
-#
-# DO allow breaks here before NSX, so don't include it
-$LB20NonBreaks $CM* ($BA | $HH | $HY | $NS) $CM* / $AL;
-$LB20NonBreaks $CM* ($BA | $HH | $HY | $NS);
-($HY | $HH) $AL;
-^$CM+ ($BA | $HY | $HH | $NS);
-
-$BB $CM* [^$CB];                                  #  $BB  x
-$BB $CM* $LB20NonBreaks;
-
-# LB 21a Don't break after Hebrew + Hyphen
-#   HL (HY | BA) x
-#
-$HL $CM* ($HY | $BA | $HH) $CM* [^$CB]?;
-
-# LB 21b (forward) Don't break between SY and HL
-# (break between HL and SY already disallowed by LB 13 above)
-$SY $CM* $HL;
-
-# LB 22
-($ALPlus | $HL) $CM* $IN;
-^$CM+    $IN;     #  by rule 10, any otherwise unattached CM behaves as AL
-$EX $CM*    $IN;
-($ID | $EB | $EM) $CM*  $IN;
-# $IN $CM*    $IN;  # delete this rule for CSS loose
-$NU $CM*    $IN;
-
-
-# $LB 23
-#
-($ALPlus | $HL) $CM* $NU;
-^$CM+  $NU;       # Rule 10, any otherwise unattached CM behaves as AL
-$NU $CM* ($ALPlus | $HL);
-
-# LB 23a
-#
-$PR $CM* ($ID | $EB | $EM);
-($ID | $EB | $EM) $CM*  $PO;
-
-
-#
-# LB 24
-#
-($PR | $PO) $CM* ($ALPlus | $HL);
-($ALPlus | $HL) $CM* ($PR | $PO);
-^$CM+ ($PR | $PO);       # Rule 10, any otherwise unattached CM behaves as AL
-
-#
-# LB 25   Numbers.
-#
-(($PR | $PO) $CM*)? (($OP | $HY) $CM*)? $NU ($CM* ($NU | $SY | $IS))*
-    ($CM* ($CL | $CP))? ($CM* ($PR | $PO))?;
-
-# LB 26  Do not break a Korean syllable
-#
-$JL $CM* ($JL | $JV | $H2 | $H3);
-($JV | $H2) $CM* ($JV | $JT);
-($JT | $H3) $CM* $JT;
-
-# LB 27  Treat korean Syllable Block the same as ID  (don't break it)
-($JL | $JV | $JT | $H2 | $H3) $CM* $IN;
-($JL | $JV | $JT | $H2 | $H3) $CM* $PO;
-$PR $CM* ($JL | $JV | $JT | $H2 | $H3);
-
-
-# LB 28   Do not break between alphabetics
-#
-($ALPlus | $HL) $CM* ($ALPlus | $HL);
-^$CM+ ($ALPlus | $HL);      # The $CM+ is from rule 10, an unattached CM is treated as AL
-
-# LB 29
-$IS $CM* ($ALPlus | $HL);
-
-# LB 30
-($ALPlus | $HL | $NU) $CM* $OP;
-^$CM+ $OP;         # The $CM+ is from rule 10, an unattached CM is treated as AL.
-$CP $CM* ($ALPlus | $HL | $NU);
-
-# LB 30a  Do not break between regional indicators. Break after pairs of them.
-#         Tricky interaction with LB8a: ZWJ x .   together with ZWJ acting like a CM.
-$RI $CM* $RI                 / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
-$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
-$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}];
-# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
-#       because of the chain-out behavior difference. The rule must chain out only from the [set characters],
-#       not from the preceding $RI or $CM, which it would be able to do if the set were optional.
-
-# LB 30b Do not break between an Emoji Base and an Emoji Modifier
-$EB $CM* $EM;
-
-# LB 31 Break everywhere else.
-#       Match a single code point if no other rule applies.
-.;
diff --git a/icu4c/source/data/brkitr/rules/line_normal.txt b/icu4c/source/data/brkitr/rules/line_normal.txt

index b2472177e497430f2ce4634e7b0b8eff81ff720f..b753a4c00119da273f53d33399b675f4ba8874dc 100644 (file)
--- a/icu4c/source/data/brkitr/rules/line_normal.txt
+++ b/icu4c/source/data/brkitr/rules/line_normal.txt
@@ -8,13 +8,10 @@
  #         Line Breaking Rules
  #         Implement default line breaking as defined by
  #         Unicode Standard Annex #14 Revision 40 for Unicode 11.0
-#         http://www.unicode.org/reports/tr14/
+#         http://www.unicode.org/reports/tr14/, with the following modification:
  #
-#         tailored as noted in 2nd paragraph below.
-#
-#         TODO:  Rule LB 8 remains as it was in Unicode 5.2
-#         This is only because of a limitation of ICU break engine implementation,
-#         not because the older behavior is desirable.
+#         Boundaries between hyphens and following letters are suppressed when
+#         there is a boundary preceding the hyphen. See rule LB 20.09.
  #
  #         This tailors the line break behavior to correspond to CSS
  #         line-break=normal (BCP47 -u-lb-normal) as defined for languages other than
@@ -31,6 +28,7 @@
  $AI = [:LineBreak =  Ambiguous:];
  $AL = [:LineBreak =  Alphabetic:];
  $BA = [:LineBreak =  Break_After:];
+$HH = [\u2010];     # \u2010 is HYPHEN, default line break is BA.
  $BB = [:LineBreak =  Break_Before:];
  $BK = [:LineBreak =  Mandatory_Break:];
  $B2 = [:LineBreak =  Break_Both:];
@@ -233,17 +231,24 @@ $LB18NonBreaks $CM* $QU;
  #         QU  x
  $QU $CM* .;
  
-
  # LB 20
  #        <break>  $CB
  #        $CB   <break>
-
+#
  $LB20NonBreaks = [$LB18NonBreaks - $CB];
  
+# LB 20.09    Don't break between Hyphens and Letters when there is a break preceding the hyphen.
+#             Originally added as a Finnish tailoring, now promoted to default ICU behavior.
+#             Note: this is not default UAX-14 behavior. See issue ICU-8151.
+#
+^($HY | $HH) $CM* $ALPlus;
+
  # LB 21        x   (BA | HY | NS)
  #           BB x
  #
  $LB20NonBreaks $CM* ($BA | $HY | $NS);
+
+
  ^$CM+ ($BA | $HY | $NS);
  
  $BB $CM* [^$CB];                                  #  $BB  x
diff --git a/icu4c/source/data/brkitr/rules/line_normal_cj.txt b/icu4c/source/data/brkitr/rules/line_normal_cj.txt

index b4fcf029e727fc51f531dc9bf16ad57e13a9b2bd..4eb8929c98216691e50dea138cce6cb15461bc63 100644 (file)
--- a/icu4c/source/data/brkitr/rules/line_normal_cj.txt
+++ b/icu4c/source/data/brkitr/rules/line_normal_cj.txt
@@ -8,13 +8,10 @@
  #         Line Breaking Rules
  #         Implement default line breaking as defined by
  #         Unicode Standard Annex #14 Revision 40 for Unicode 11.0
-#         http://www.unicode.org/reports/tr14/
+#         http://www.unicode.org/reports/tr14/, with the following modification:
  #
-#         tailored as noted in 2nd paragraph below.
-#
-#         TODO:  Rule LB 8 remains as it was in Unicode 5.2
-#         This is only because of a limitation of ICU break engine implementation,
-#         not because the older behavior is desirable.
+#         Boundaries between hyphens and following letters are suppressed when
+#         there is a boundary preceding the hyphen. See rule LB 20.09.
  #
  #         This tailors the line break behavior to correspond to CSS
  #         line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
@@ -33,6 +30,7 @@ $AI = [:LineBreak =  Ambiguous:];
  $AL = [:LineBreak =  Alphabetic:];
  $BAX = [\u2010 \u2013];
  $BA = [[:LineBreak =  Break_After:] - $BAX];
+$HH = [\u2010];     # \u2010 is HYPHEN, default line break is BA.
  $BB = [:LineBreak =  Break_Before:];
  $BK = [:LineBreak =  Mandatory_Break:];
  $B2 = [:LineBreak =  Break_Both:];
@@ -238,18 +236,25 @@ $LB18NonBreaks $CM* $QU;
  #         QU  x
  $QU $CM* .;
  
-
  # LB 20
  #        <break>  $CB
  #        $CB   <break>
-
+#
  $LB20NonBreaks = [$LB18NonBreaks - $CB];
  
+# LB 20.09    Don't break between Hyphens and Letters when there is a break preceding the hyphen.
+#             Originally added as a Finnish tailoring, now promoted to default ICU behavior.
+#             Note: this is not default UAX-14 behavior. See issue ICU-8151.
+#
+^($HY | $HH) $CM* $ALPlus;
+
  # LB 21        x   (BA | HY | NS)
  #           BB x
  #
-# DO allow breaks here before $BAXcm and $NSXcm, so don't include them
+# DO allow breaks here before $BAX and $NSX, so don't include them
  $LB20NonBreaks $CM* ($BA | $HY | $NS);
+
+
  ^$CM+ ($BA | $HY | $NS);
  
  $BB $CM* [^$CB];                                  #  $BB  x
diff --git a/icu4c/source/data/brkitr/rules/line_normal_fi.txt b/icu4c/source/data/brkitr/rules/line_normal_fi.txt

deleted file mode 100644 (file)

index a3eccf2..0000000
--- a/icu4c/source/data/brkitr/rules/line_normal_fi.txt
+++ /dev/null
@@ -1,337 +0,0 @@
-# Copyright (C) 2016 and later: Unicode, Inc. and others.
-# License & terms of use: http://www.unicode.org/copyright.html
-# Copyright (c) 2002-2016  International Business Machines Corporation and
-# others. All Rights Reserved.
-#
-#  file:  line_normal_fi.txt
-#
-#         Line Breaking Rules
-#         Implement default line breaking as defined by
-#         Unicode Standard Annex #14 Revision 40 for Unicode 11.0
-#         http://www.unicode.org/reports/tr14/
-#         tailored as noted in 3rd paragraph below.
-#
-#         TODO:  Rule LB 8 remains as it was in Unicode 5.2
-#         This is only because of a limitation of ICU break engine implementation,
-#         not because the older behavior is desirable.
-#
-#         This tailors the line break behavior for Finnish, and to correspond to CSS
-#         line-break=normal (BCP47 -u-lb-normal) as defined for languages other than
-#         Chinese & Japanese.
-#         It sets characters of class CJ to behave like ID.
-
-#
-#  Character Classes defined by TR 14.
-#
-
-!!chain;
-!!quoted_literals_only;
-
-$AI = [:LineBreak =  Ambiguous:];
-$AL = [:LineBreak =  Alphabetic:];
-$BA = [:LineBreak =  Break_After:];
-$HH = [\u2010];     # \u2010 is HYPHEN, default line break is BA.
-$BB = [:LineBreak =  Break_Before:];
-$BK = [:LineBreak =  Mandatory_Break:];
-$B2 = [:LineBreak =  Break_Both:];
-$CB = [:LineBreak =  Contingent_Break:];
-$CJ = [:LineBreak =  Conditional_Japanese_Starter:];
-$CL = [:LineBreak =  Close_Punctuation:];
-# $CM = [:LineBreak =  Combining_Mark:];
-$CP = [:LineBreak =  Close_Parenthesis:];
-$CR = [:LineBreak =  Carriage_Return:];
-$EB = [:LineBreak =  EB:];
-$EM = [:LineBreak =  EM:];
-$EX = [:LineBreak =  Exclamation:];
-$GL = [:LineBreak =  Glue:];
-$HL = [:LineBreak =  Hebrew_Letter:];
-$HY = [:LineBreak =  Hyphen:];
-$H2 = [:LineBreak =  H2:];
-$H3 = [:LineBreak =  H3:];
-# CSS Normal tailoring: CJ resolves to ID
-$ID = [[:LineBreak =  Ideographic:] $CJ];
-$IN = [:LineBreak =  Inseperable:];
-$IS = [:LineBreak =  Infix_Numeric:];
-$JL = [:LineBreak =  JL:];
-$JV = [:LineBreak =  JV:];
-$JT = [:LineBreak =  JT:];
-$LF = [:LineBreak =  Line_Feed:];
-$NL = [:LineBreak =  Next_Line:];
-$NS = [:LineBreak =  Nonstarter:];
-$NU = [:LineBreak =  Numeric:];
-$OP = [:LineBreak =  Open_Punctuation:];
-$PO = [:LineBreak =  Postfix_Numeric:];
-$PR = [:LineBreak =  Prefix_Numeric:];
-$QU = [:LineBreak =  Quotation:];
-$RI = [:LineBreak =  Regional_Indicator:];
-$SA = [:LineBreak =  Complex_Context:];
-$SG = [:LineBreak =  Surrogate:];
-$SP = [:LineBreak =  Space:];
-$SY = [:LineBreak =  Break_Symbols:];
-$WJ = [:LineBreak =  Word_Joiner:];
-$XX = [:LineBreak =  Unknown:];
-$ZW = [:LineBreak =  ZWSpace:];
-$ZWJ = [:LineBreak = ZWJ:];
-
-# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
-#         list it in the numerous rules that use CM.
-# By LB1, SA characters with general categor of Mn or Mc also resolve to CM.
-
-$CM = [[:LineBreak = Combining_Mark:] $ZWJ [$SA & [[:Mn:][:Mc:]]]];
-
-#   Dictionary character set, for triggering language-based break engines. Currently
-#   limited to LineBreak=Complex_Context (SA).
-
-$dictionary = [$SA];
-
-#
-#  Rule LB1.  By default, treat AI  (characters with ambiguous east Asian width),
-#                               SA  (Dictionary chars, excluding Mn and Mc)
-#                               SG  (Unpaired Surrogates)
-#                               XX  (Unknown, unassigned)
-#                         as $AL  (Alphabetic)
-#
-$ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]];
-
-
-## -------------------------------------------------
-
-#
-# CAN_CM  is the set of characters that may combine with CM combining chars.
-#         Note that Linebreak UAX 14's concept of a combining char and the rules
-#         for what they can combine with are _very_ different from the rest of Unicode.
-#
-#         Note that $CM itself is left out of this set.  If CM is needed as a base
-#         it must be listed separately in the rule.
-#
-$CAN_CM  = [^$SP $BK $CR $LF $NL $ZW $CM];       # Bases that can   take CMs
-$CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM];       # Bases that can't take CMs
-
-#
-# AL_FOLLOW  set of chars that can unconditionally follow an AL
-#            Needed in rules where stand-alone $CM s are treated as AL.
-#
-$AL_FOLLOW      = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HH $HY $NS $IN $NU $PR $PO $ALPlus];
-
-
-#
-#  Rule LB 4, 5    Mandatory (Hard) breaks.
-#
-$LB4Breaks    = [$BK $CR $LF $NL];
-$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
-$CR $LF {100};
-
-#
-#  LB 6    Do not break before hard line breaks.
-#
-$LB4NonBreaks?  $LB4Breaks {100};    # LB 5  do not break before hard breaks.
-$CAN_CM $CM*    $LB4Breaks {100};
-^$CM+           $LB4Breaks {100};
-
-# LB 7         x SP
-#              x ZW
-$LB4NonBreaks [$SP $ZW];
-$CAN_CM $CM*  [$SP $ZW];
-^$CM+         [$SP $ZW];
-
-#
-# LB 8         Break after zero width space
-#              ZW SP* ÷
-#
-$LB8Breaks    = [$LB4Breaks $ZW];
-$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
-$ZW $SP* / [^$SP $ZW $LB4Breaks];
-
-# LB 8a        ZWJ x            Do not break Emoji ZWJ sequences.
-#
-$ZWJ [^$CM];
-
-# LB 9     Combining marks.      X   $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
-#                                $CM not covered by the above needs to behave like $AL
-#                                See definition of $CAN_CM.
-
-$CAN_CM $CM+;                   #  Stick together any combining sequences that don't match other rules.
-^$CM+;
-
-#
-# LB 11  Do not break before or after WORD JOINER & related characters.
-#
-$CAN_CM $CM*  $WJ;
-$LB8NonBreaks $WJ;
-^$CM+         $WJ;
-
-$WJ $CM* .;
-
-#
-# LB 12  Do not break after NBSP and related characters.
-#         GL  x
-#
-$GL $CM* .;
-
-#
-# LB 12a  Do not break before NBSP and related characters ...
-#            [^SP BA HY] x GL
-#
-[[$LB8NonBreaks] - [$SP $BA $HH $HY]] $CM* $GL;
-^$CM+ $GL;
-
-
-
-#
-# LB 13   Don't break before ']' or '!' or ';' or '/', even after spaces.
-#
-$LB8NonBreaks $CL;
-$CAN_CM $CM*  $CL;
-^$CM+         $CL;              # by rule 10, stand-alone CM behaves as AL
-
-$LB8NonBreaks $CP;
-$CAN_CM $CM*  $CP;
-^$CM+         $CP;              # by rule 10, stand-alone CM behaves as AL
-
-$LB8NonBreaks $EX;
-$CAN_CM $CM*  $EX;
-^$CM+         $EX;              # by rule 10, stand-alone CM behaves as AL
-
-$LB8NonBreaks $IS;
-$CAN_CM $CM*  $IS;
-^$CM+         $IS;              # by rule 10, stand-alone CM behaves as AL
-
-$LB8NonBreaks $SY;
-$CAN_CM $CM*  $SY;
-^$CM+         $SY;              # by rule 10, stand-alone CM behaves as AL
-
-
-#
-# LB 14  Do not break after OP, even after spaces
-#
-$OP $CM* $SP* .;
-
-$OP $CM* $SP+ $CM+ $AL_FOLLOW?;    # by rule 10, stand-alone CM behaves as AL
-                                   # by rule 8, CM following a SP is stand-alone.
-
-# LB 15
-$QU $CM* $SP* $OP;
-
-# LB 16
-($CL | $CP) $CM* $SP* $NS;
-
-# LB 17
-$B2 $CM* $SP* $B2;
-
-#
-# LB 18  Break after spaces.
-#
-$LB18NonBreaks = [$LB8NonBreaks - [$SP]];
-$LB18Breaks    = [$LB8Breaks $SP];
-
-
-# LB 19
-#         x QU
-$LB18NonBreaks $CM* $QU;
-^$CM+               $QU;
-
-#         QU  x
-$QU $CM* .;
-
-
-# LB 20
-#        <break>  $CB
-#        $CB   <break>
-
-$LB20NonBreaks = [$LB18NonBreaks - $CB];
-
-# LB 20.09 added rule for Finnish tailoring
-# LB 21        x   (BA | HY | NS)
-#           BB x
-#
-$LB20NonBreaks $CM* ($BA | $HH | $HY | $NS) $CM* / $AL;
-$LB20NonBreaks $CM* ($BA | $HH | $HY | $NS);
-($HY | $HH) $AL;
-^$CM+ ($BA | $HY | $HH | $NS);
-
-$BB $CM* [^$CB];                                  #  $BB  x
-$BB $CM* $LB20NonBreaks;
-
-# LB 21a Don't break after Hebrew + Hyphen
-#   HL (HY | BA) x
-#
-$HL $CM* ($HY | $BA | $HH) $CM* [^$CB]?;
-
-# LB 21b (forward) Don't break between SY and HL
-# (break between HL and SY already disallowed by LB 13 above)
-$SY $CM* $HL;
-
-# LB 22
-($ALPlus | $HL) $CM* $IN;
-^$CM+    $IN;     #  by rule 10, any otherwise unattached CM behaves as AL
-$EX $CM*    $IN;
-($ID | $EB | $EM) $CM*  $IN;
-$IN $CM*    $IN;
-$NU $CM*    $IN;
-
-
-# $LB 23
-#
-($ALPlus | $HL) $CM* $NU;
-^$CM+  $NU;       # Rule 10, any otherwise unattached CM behaves as AL
-$NU $CM* ($ALPlus | $HL);
-
-# LB 23a
-#
-$PR $CM* ($ID | $EB | $EM);
-($ID | $EB | $EM) $CM*  $PO;
-
-
-#
-# LB 24
-#
-($PR | $PO) $CM* ($ALPlus | $HL);
-($ALPlus | $HL) $CM* ($PR | $PO);
-^$CM+ ($PR | $PO);       # Rule 10, any otherwise unattached CM behaves as AL
-
-#
-# LB 25   Numbers.
-#
-(($PR | $PO) $CM*)? (($OP | $HY) $CM*)? $NU ($CM* ($NU | $SY | $IS))*
-    ($CM* ($CL | $CP))? ($CM* ($PR | $PO))?;
-
-# LB 26  Do not break a Korean syllable
-#
-$JL $CM* ($JL | $JV | $H2 | $H3);
-($JV | $H2) $CM* ($JV | $JT);
-($JT | $H3) $CM* $JT;
-
-# LB 27  Treat korean Syllable Block the same as ID  (don't break it)
-($JL | $JV | $JT | $H2 | $H3) $CM* $IN;
-($JL | $JV | $JT | $H2 | $H3) $CM* $PO;
-$PR $CM* ($JL | $JV | $JT | $H2 | $H3);
-
-
-# LB 28   Do not break between alphabetics
-#
-($ALPlus | $HL) $CM* ($ALPlus | $HL);
-^$CM+ ($ALPlus | $HL);      # The $CM+ is from rule 10, an unattached CM is treated as AL
-
-# LB 29
-$IS $CM* ($ALPlus | $HL);
-
-# LB 30
-($ALPlus | $HL | $NU) $CM* $OP;
-^$CM+ $OP;         # The $CM+ is from rule 10, an unattached CM is treated as AL.
-$CP $CM* ($ALPlus | $HL | $NU);
-
-# LB 30a  Do not break between regional indicators. Break after pairs of them.
-#         Tricky interaction with LB8a: ZWJ x .   together with ZWJ acting like a CM.
-$RI $CM* $RI                 / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
-$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
-$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}];
-# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
-#       because of the chain-out behavior difference. The rule must chain out only from the [set characters],
-#       not from the preceding $RI or $CM, which it would be able to do if the set were optional.
-
-# LB 30b Do not break between an Emoji Base and an Emoji Modifier
-$EB $CM* $EM;
-
-# LB 31 Break everywhere else.
-#       Match a single code point if no other rule applies.
-.;
diff --git a/icu4c/source/test/cintltst/cbiapts.c b/icu4c/source/test/cintltst/cbiapts.c

index fa78413be498f7967acfc4d0efc8de5248670668..92e6c1af4fdf7ba2ad29121125fa0e1dc6b4d4fd 100644 (file)
--- a/icu4c/source/test/cintltst/cbiapts.c
+++ b/icu4c/source/test/cintltst/cbiapts.c
@@ -784,15 +784,18 @@ static const int32_t heTestOffs_heFwd[] = {  1,  5,  7,  9, 12, 14,     19,
  /*static const int32_t heTestOffs_enRev[] = { 22, 19, 17, 14, 12,  9,  7,  5,  1,  0 };*/
  static const int32_t heTestOffs_heRev[] = {     19,     14, 12,  9,  7,  5,  1,  0 };
  
-/* Finnish line break tailoring, for cldrbug 3029 */
+/* Finnish line break tailoring, for cldrbug 3029.
+ * As of ICU 63, Finnish tailoring moved to root, Finnish and English should be the same. */
  static const UChar fiTest[] = { /* 00 */ 0x0020, 0x002D, 0x0031, 0x0032, 0x0020,
                                  /* 05 */ 0x0061, 0x002D, 0x006B, 0x0020,
                                  /* 09 */ 0x0061, 0x0300, 0x2010, 0x006B, 0x0020,
                                  /* 14 */ 0x0061, 0x0020, 0x002D, 0x006B, 0x0020,
                                  /* 19 */ 0x0061, 0x0300, 0x0020, 0x2010, 0x006B, 0x0020, 0 };
-static const int32_t fiTestOffs_enFwd[] =  {  1,  5,  7,  9, 12, 14, 16, 17, 19, 22, 23, 25 };
+/*static const int32_t fiTestOffs_enFwd[] =  {  1,  5,  7,  9, 12, 14, 16, 17, 19, 22, 23, 25 };*/
+static const int32_t fiTestOffs_enFwd[] =  {  1,  5,  7,  9, 12, 14, 16,     19, 22,     25 };
  static const int32_t fiTestOffs_fiFwd[] =  {  1,  5,  7,  9, 12, 14, 16,     19, 22,     25 };
-static const int32_t fiTestOffs_enRev[] =  { 23, 22, 19, 17, 16, 14, 12,  9,  7,  5,  1,  0 };
+/*static const int32_t fiTestOffs_enRev[] =  { 23, 22, 19, 17, 16, 14, 12,  9,  7,  5,  1,  0 };*/
+static const int32_t fiTestOffs_enRev[] =  {     22, 19,     16, 14, 12,  9,  7,  5,  1,  0 };
  static const int32_t fiTestOffs_fiRev[] =  {     22, 19,     16, 14, 12,  9,  7,  5,  1,  0 };
  
  /* Khmer dictionary-based work break, for ICU ticket #8329 */
diff --git a/icu4c/source/test/intltest/rbbimonkeytest.cpp b/icu4c/source/test/intltest/rbbimonkeytest.cpp

index f4fb7016dc306e2abd6ecc29cfd97e3f5562582e..2a45ae6fad21751cddc72151b11e72fe92aef202 100644 (file)
--- a/icu4c/source/test/intltest/rbbimonkeytest.cpp
+++ b/icu4c/source/test/intltest/rbbimonkeytest.cpp
@@ -184,6 +184,14 @@ void BreakRules::addRule(const UnicodeString &name, const UnicodeString &definit
      }
      fSetRefsMatcher->appendTail(thisRule->fExpandedRule);
  
+    // If rule begins with a '^' rule chaining is disallowed.
+    // Strip off the '^' from the rule expression, and set the flag.
+    if (thisRule->fExpandedRule.charAt(0) == u'^') {
+        thisRule->fInitialMatchOnly = true;
+        thisRule->fExpandedRule.remove(0, 1);
+        thisRule->fExpandedRule.trim();
+    }
+
      // Replace the divide sign (\u00f7) with a regular expression named capture.
      // When running the rules, a match that includes this group means we found a break position.
  
@@ -442,6 +450,8 @@ void MonkeyTestData::set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode
                                               // ICU always reports a break there.
                                               // The reference rules do not have a means to do so.
      int32_t strIdx = 0;
+    bool    initialMatch = true;             // True at start of text, and immediately after each boundary,
+                                             // for control over rule chaining.
      while (strIdx < fString.length()) {
          BreakRule *matchingRule = NULL;
          UBool      hasBreak = FALSE;
@@ -451,6 +461,10 @@ void MonkeyTestData::set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode
          int32_t breakGroup = 0;
          for (ruleNum=0; ruleNum<rules->fBreakRules.size(); ruleNum++) {
              BreakRule *rule = static_cast<BreakRule *>(rules->fBreakRules.elementAt(ruleNum));
+            if (rule->fInitialMatchOnly && !initialMatch) {
+                // Skip checking this '^' rule. (No rule chaining)
+                continue;
+            }
              rule->fRuleMatcher->reset();
              if (rule->fRuleMatcher->lookingAt(strIdx, status)) {
                  // A candidate rule match, check further to see if we take it or continue to check other rules.
@@ -512,10 +526,12 @@ void MonkeyTestData::set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode
              // which may differ from end of the match. The matching rule may have included
              // context following the boundary that needs to be looked at again.
              strIdx = matchingRule->fRuleMatcher->end(breakGroup, status);
+            initialMatch = true;
          } else {
              // Original rule didn't specify a break.
              // Continue applying rules starting on the last code point of this match.
              strIdx = fString.moveIndex32(matchEnd, -1);
+            initialMatch = false;
              if (strIdx == matchStart) {
                  // Match was only one code point, no progress if we continue.
                  // Shouldn't get here, case is filtered out at top of loop.
diff --git a/icu4c/source/test/intltest/rbbimonkeytest.h b/icu4c/source/test/intltest/rbbimonkeytest.h

index 2ddc2bd47ae67190172b5952e900ddda40b4b922..54d23fcceae78045a11a8f883fcbcf0960a985bf 100644 (file)
--- a/icu4c/source/test/intltest/rbbimonkeytest.h
+++ b/icu4c/source/test/intltest/rbbimonkeytest.h
@@ -102,6 +102,7 @@ class BreakRule: public UObject {
      UnicodeString    fRule;                            // Rule expression, excluding the name, as written in user source.
      UnicodeString    fExpandedRule;                    // Rule expression after expanding the set definitions.
      LocalPointer<RegexMatcher>  fRuleMatcher;          // Regular expression that matches the rule.
+    bool             fInitialMatchOnly = false;        // True if rule begins with '^', meaning no chaining.
  };
  
  
diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp

index acf6a57779cfd8d5ec5bf9e46de6cd0a02a1665c..048e2bb8ed51cd8b0393a1106aa1adeb3c6f795b 100644 (file)
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@@ -1284,7 +1284,8 @@ void RBBITest::TestUnicodeFiles() {
  
  // Check for test cases from the Unicode test data files that are known to fail
  // and should be skipped as known issues because ICU does not fully implement
-// the Unicode specifications.
+// the Unicode specifications, or because ICU includes tailorings that differ from
+// the Unicode standard.
  //
  // Test cases are identified by the test data sequence, which tends to be more stable
  // across Unicode versions than the test file line numbers.
@@ -1297,7 +1298,18 @@ UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *
          const char *fFileName;
          const UChar *fString;
      } badTestCases[] = {
-        {"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"}    // Fake example, for illustration.
+        {"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"},    // Fake example, for illustration.
+        // Issue 8151, move the Finnish tailoring of the line break of hyphens to root.
+        // This probably ultimately wants to be resolved by updating UAX-14, but in the meantime
+        // ICU is out of sync with Unicode.
+        {"8151",  "LineBreakTest.txt", u"-#"},
+        {"8151",  "LineBreakTest.txt", u"\u002d\u0308\u0023"},
+        {"8151",  "LineBreakTest.txt", u"\u002d\u00a7"},
+        {"8151",  "LineBreakTest.txt", u"\u002d\u0308\u00a7"},
+        {"8151",  "LineBreakTest.txt", u"\u002d\U00050005"},
+        {"8151",  "LineBreakTest.txt", u"\u002d\u0308\U00050005"},
+        {"8151",  "LineBreakTest.txt", u"\u002d\u0e01"},
+        {"8151",  "LineBreakTest.txt", u"\u002d\u0308\u0e01"},
      };
  
      for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
@@ -2516,6 +2528,7 @@ private:
      UnicodeSet  *fB2;
      UnicodeSet  *fBA;
      UnicodeSet  *fBB;
+    UnicodeSet  *fHH;
      UnicodeSet  *fHY;
      UnicodeSet  *fH2;
      UnicodeSet  *fH3;
@@ -2580,6 +2593,7 @@ RBBILineMonkey::RBBILineMonkey() :
      fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
      fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
      fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
+    fHH    = new UnicodeSet();
      fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
      fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
      fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
@@ -2620,7 +2634,9 @@ RBBILineMonkey::RBBILineMonkey() :
      fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
  
      fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
-    fCM->addAll(*fZWJ);     // ZWJ behaves as a CM.
+    fCM->addAll(*fZWJ);    // ZWJ behaves as a CM.
+
+    fHH->add(u'\u2010');   // Hyphen, '‐'
  
      fSets->addElement(fBK, status);
      fSets->addElement(fCR, status);
@@ -3024,6 +3040,15 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
              break;
          }
  
+        // LB 20.09  Don't break between Hyphens and letters if a break precedes the hyphen.
+        //           Formerly this was a Finnish tailoring.
+        //           Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
+        //    ^($HY | $HH) $AL;
+        if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar)) &&
+                prevPosX2 == -1) {
+            continue;
+        }
+
          // LB 21
          if (fBA->contains(thisChar) ||
              fHY->contains(thisChar) ||
@@ -3195,6 +3220,7 @@ RBBILineMonkey::~RBBILineMonkey() {
      delete fB2;
      delete fBA;
      delete fBB;
+    delete fHH;
      delete fHY;
      delete fH2;
      delete fH3;
diff --git a/icu4c/source/test/testdata/break_rules/line.txt b/icu4c/source/test/testdata/break_rules/line.txt

index d0f9abe88a1edb32dd75a04f279b0ba698d710a8..3e0324bf93dbc7790daefd5cae049ad460d08f95 100644 (file)
--- a/icu4c/source/test/testdata/break_rules/line.txt
+++ b/icu4c/source/test/testdata/break_rules/line.txt
@@ -19,6 +19,7 @@ locale = en;
  AI = [:LineBreak =  Ambiguous:];
  AL = [:LineBreak =  Alphabetic:];
  BA = [:LineBreak =  Break_After:];
+HH = [\u2010];      # \u2010 is HYPHEN, default line break is BA.
  BB = [:LineBreak =  Break_Before:];
  BK = [:LineBreak =  Mandatory_Break:];
  B2 = [:LineBreak =  Break_Both:];
@@ -144,6 +145,9 @@ LB20.2:      . CM* ÷ CB;
  LB20.3:      CB CM* ZWJ [^CM];
  LB20.4:      CB CM* ÷;
  
+# LB 20.09    Don't break between Hyphens and Letters when there is a break preceding the hyphen.
+LB20.09:     ^(HY | HH) CM* AL;
+
  # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
  #       not picking up the continuing match after the BA from 21a.
  LB21a:       HL CM* (HY | BA) CM* [^CM CB];
diff --git a/icu4c/source/test/testdata/break_rules/line_loose.txt b/icu4c/source/test/testdata/break_rules/line_loose.txt

index 2384fa296c87c8857b1ffd7812efcd3f8fc11795..8395192365fc9df17f07515b8fc6ce1bb2a13bc7 100644 (file)
--- a/icu4c/source/test/testdata/break_rules/line_loose.txt
+++ b/icu4c/source/test/testdata/break_rules/line_loose.txt
@@ -26,6 +26,7 @@ locale = en@lb=loose;
  AI = [:LineBreak =  Ambiguous:];
  AL = [:LineBreak =  Alphabetic:];
  BA = [:LineBreak =  Break_After:];
+HH = [\u2010];      # \u2010 is HYPHEN, default line break is BA.
  BB = [:LineBreak =  Break_Before:];
  BK = [:LineBreak =  Mandatory_Break:];
  B2 = [:LineBreak =  Break_Both:];
@@ -125,7 +126,7 @@ LB12:        GL CM* [^CM];
  
  LB12a:       [^SP BA HY] CM* GL;
  
-# LB 13 ICU Tailoring, matches tailoring exmaple 8 from UAX 14.
+# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14.
  #
  #   LB13.1   [^SP] CM* [CL CP EX IS SY]    # original UAX 14 rule.
  #   LB13.2   SP    CM* [CL CP EX IS SY]
@@ -152,6 +153,9 @@ LB20.2:      . CM* ÷ CB;
  LB20.3:      CB CM* ZWJ [^CM];
  LB20.4:      CB CM* ÷;
  
+# LB 20.09    Don't break between Hyphens and Letters when there is a break preceding the hyphen.
+LB20.09:     ^(HY | HH) CM* AL;
+
  # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
  #       not picking up the continuing match after the BA from 21a.
  LB21a:       HL CM* (HY | BA) CM* [^CM CB];
@@ -176,7 +180,7 @@ LB23a.2:     (ID | EB | EM) CM* PO;
  LB24.2:      (PR | PO) CM* (AL | HL);
  LB24.3:      (AL | HL | CM) CM* (PR | PO);
  
-# Numbers. Equivalent to Tailoring example 8 from UAx 14.
+# Numbers. Equivalent to Tailoring example 8 from UAX 14.
  LB25:        ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?;
  
  LB26.1:      JL CM* (JL | JV | H2 | H3);
diff --git a/icu4c/source/test/testdata/break_rules/line_loose_cj.txt b/icu4c/source/test/testdata/break_rules/line_loose_cj.txt

index 8b92561dbd8fa6b49a0286eb643b038ecc3860d7..d674327102badde51dd4c74fc87fddf0f2a63e55 100644 (file)
--- a/icu4c/source/test/testdata/break_rules/line_loose_cj.txt
+++ b/icu4c/source/test/testdata/break_rules/line_loose_cj.txt
@@ -37,9 +37,10 @@ locale = ja@lb=loose;
  
  
  AI = [:LineBreak =  Ambiguous:];
-AL = [[:LineBreak =  Alphabetic:]];
+AL = [:LineBreak =  Alphabetic:];
  BAX = [\u2010 \u2013];
  BA = [[:LineBreak =  Break_After:] - BAX];
+HH = [\u2010];      # \u2010 is HYPHEN, default line break is BA.
  BB = [:LineBreak =  Break_Before:];
  BK = [:LineBreak =  Mandatory_Break:];
  B2 = [:LineBreak =  Break_Both:];
@@ -169,6 +170,9 @@ LB20.2:      . CM* ÷ CB;
  LB20.3:      CB CM* ZWJ [^CM];
  LB20.4:      CB CM* ÷;
  
+# LB 20.09    Don't break between Hyphens and Letters when there is a break preceding the hyphen.
+LB20.09:     ^(HY | HH) CM* AL;
+
  # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
  #       not picking up the continuing match after the BA from 21a.
  # LB 21a Don't break after Hebrew + Hyphen
diff --git a/icu4c/source/test/testdata/break_rules/line_normal.txt b/icu4c/source/test/testdata/break_rules/line_normal.txt

index 65804d83f9bf787a2d561e8f670929c7f40dc584..7f5b91c42abf09dad39b52842398a2cf171a7220 100644 (file)
--- a/icu4c/source/test/testdata/break_rules/line_normal.txt
+++ b/icu4c/source/test/testdata/break_rules/line_normal.txt
@@ -33,6 +33,7 @@ locale = en@lb=normal;
  AI = [:LineBreak =  Ambiguous:];
  AL = [:LineBreak =  Alphabetic:];
  BA = [:LineBreak =  Break_After:];
+HH = [\u2010];      # \u2010 is HYPHEN, default line break is BA.
  BB = [:LineBreak =  Break_Before:];
  BK = [:LineBreak =  Mandatory_Break:];
  B2 = [:LineBreak =  Break_Both:];
@@ -158,6 +159,9 @@ LB20.2:      . CM* ÷ CB;
  LB20.3:      CB CM* ZWJ [^CM];
  LB20.4:      CB CM* ÷;
  
+# LB 20.09    Don't break between Hyphens and Letters when there is a break preceding the hyphen.
+LB20.09:     ^(HY | HH) CM* AL;
+
  # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
  #       not picking up the continuing match after the BA from 21a.
  LB21a:       HL CM* (HY | BA) CM* [^CM CB];
diff --git a/icu4c/source/test/testdata/break_rules/line_normal_cj.txt b/icu4c/source/test/testdata/break_rules/line_normal_cj.txt

index b50219282b26c9a131ef577f07e40c455deb1fcd..cf90751715ce5a34421643e868f3f7a6d3ee7cad 100644 (file)
--- a/icu4c/source/test/testdata/break_rules/line_normal_cj.txt
+++ b/icu4c/source/test/testdata/break_rules/line_normal_cj.txt
@@ -34,6 +34,7 @@ AI = [:LineBreak =  Ambiguous:];
  AL = [:LineBreak =  Alphabetic:];
  BAX = [\u2010 \u2013];
  BA = [[:LineBreak =  Break_After:] - BAX];
+HH = [\u2010];      # \u2010 is HYPHEN, default line break is BA.
  BB = [:LineBreak =  Break_Before:];
  BK = [:LineBreak =  Mandatory_Break:];
  B2 = [:LineBreak =  Break_Both:];
@@ -163,6 +164,9 @@ LB20.2:      . CM* ÷ CB;
  LB20.3:      CB CM* ZWJ [^CM];
  LB20.4:      CB CM* ÷;
  
+# LB 20.09    Don't break between Hyphens and Letters when there is a break preceding the hyphen.
+LB20.09:     ^(HY | HH) CM* AL;
+
  # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
  #       not picking up the continuing match after the BA from 21a.
  # TODO: For CJ tailorings (with BAX) does this rule want to include BAX? If so,
diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt

index c830de15545b6a7269184a2a4002a10e3fad76a9..63ba172233d857844b2a2c833369670d907466f8 100644 (file)
--- a/icu4c/source/test/testdata/rbbitst.txt
+++ b/icu4c/source/test/testdata/rbbitst.txt
@@ -1086,23 +1086,21 @@ Bangkok)•</data>
  # Finnish line breaking
  #
  # These rules deal with hyphens when there is a space on the leading side.
-# There should be a break opportunity between the space and the hyphen, and not after the hyphen.
+# When followed by a letter, there should be a break opportunity between
+# the space and the hyphen, and not after the hyphen.
  # See CLDR ticket 3029.
  # See ICU ticket 8151
+# As of ICU 63, the Finnish tailoring behavior is moved to root.
  
  <locale root>
  <line>
-<data>•abc •- •def    •abc •-•def    •abc- •def   •abc-•def•</data>   # With ASCII hyphen
-<data>•abc •‐ •def    •abc •‐•def    •abc‐ •def   •abc‐•def•</data>   # With Unicode u2010 hyphen
+<data>•abc •- •def    •abc •-def    •abc- •def   •abc-•def•</data>   # With ASCII hyphen
+<data>•abc •‐ •def    •abc •‐def    •abc‐ •def   •abc‐•def•</data>   # With Unicode u2010 hyphen
  
  <locale fi>
  <line>
-# TODO: problems with Finnish line break rules cause these two lines to fail.
-#<data>•abc •- •def    •abc •-def    •abc- •def   •abc-•def•</data>   # With ASCII hyphen
-#<data>•abc •‐ •def    •abc •‐def    •abc‐ •def   •abc‐•def•</data>   # With Unicode u2010 hyphen
-
-<data>•abc •- •def    •abc •-def    •abc- •def   •</data>   # With ASCII hyphen
-<data>•abc •‐ •def    •abc •‐def    •abc‐ •def   •</data>   # With Unicode u2010 hyphen
+<data>•abc •- •def    •abc •-def    •abc- •def   •abc-•def•</data>   # With ASCII hyphen
+<data>•abc •‐ •def    •abc •‐def    •abc‐ •def   •abc‐•def•</data>   # With Unicode u2010 hyphen
  
  # Test for #10176 (in fi)
  <line>
diff --git a/icu4j/main/shared/data/icudata.jar b/icu4j/main/shared/data/icudata.jar

index 7ae8a0ffc2b3e438ee46929403df36f68d90d25c..198adf2fc9e59473456dc854e6714c75c54f2c2e 100644 (file)
--- a/icu4j/main/shared/data/icudata.jar
+++ b/icu4j/main/shared/data/icudata.jar
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
-oid sha256:956147318ffa776ff18c71ab09c5ae63e336e14e240128c8602abf07ef7d7d3f
-size 12510547
+oid sha256:36d0ec0c543d1dccafcc6985a7c18285b255afb98bc2bdb16a867a22600bfddb
+size 12487287
diff --git a/icu4j/main/shared/data/icutzdata.jar b/icu4j/main/shared/data/icutzdata.jar

index 67e57e3ad385b5d02a0a1bd6980961bdaf743093..afa08eb95e7779da06fe3c8b50508938cd1eb59e 100755 (executable)
--- a/icu4j/main/shared/data/icutzdata.jar
+++ b/icu4j/main/shared/data/icutzdata.jar
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
-oid sha256:55923dda88f8bf3affc2cf6d774a92a49e5fbc4be5583769bfe90fc7f319d2b1
+oid sha256:469f76e391dced8e9ae4a9543513dddd6d4d2026ad6cbc0ab79d9553da803e6a
  size 92857
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBIMonkeyTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBIMonkeyTest.java

index 3637b688500f106ccd5b5b2b3a08c61d4ba5a309..88af30de7005dce222509f8aab0fe17d66908be1 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBIMonkeyTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBIMonkeyTest.java
@@ -60,10 +60,11 @@ public class RBBIMonkeyTest extends TestFmwk {
      //                    is compiled to a regular expression.
  
      static class BreakRule {
-        String    fName;                   // Name of the rule.
-        String    fRule;                   // Rule expression, excluding the name, as written in user source.
-        String    fExpandedRule;           // Rule expression after expanding the set definitions.
-        Matcher   fRuleMatcher;            // Regular expression that matches the rule.
+        String    fName;                      // Name of the rule.
+        String    fRule;                      // Rule expression, excluding the name, as written in user source.
+        String    fExpandedRule;              // Rule expression after expanding the set definitions.
+        Matcher   fRuleMatcher;               // Regular expression that matches the rule.
+        boolean   fInitialMatchOnly = false;  // True if rule begins with '^', meaning no chaining.
      };
  
  
@@ -220,6 +221,14 @@ public class RBBIMonkeyTest extends TestFmwk {
              }
              fPropertyMatcher.appendTail(expandedRule);
  
+            // If the rule begins with a '^', rule chaining is disallowed.
+            // Strip off the '^' from the rule expression, and set the flag.
+            if (expandedRule.charAt(0) == '^') {
+                thisRule.fInitialMatchOnly = true;
+                expandedRule.deleteCharAt(0);
+                expandedRule = new StringBuffer(expandedRule.toString().trim());
+            }
+
              //   Replace any [^negated sets] with equivalent flattened sets generated by
              //   ICU UnicodeSet. [^ ...] in Java Regex character classes does not apply
              //   to any nested classes. Variable substitution in rules produces
@@ -549,6 +558,9 @@ public class RBBIMonkeyTest extends TestFmwk {
                                               // ICU always reports a break there.
                                               // The reference rules do not have a means to do so.
              int strIdx = 0;
+            boolean initialMatch = true;     // True at start of text, and immediately after each boundary,
+            //                               // for control over rule chaining.
+
              while (strIdx < fString.length()) {
                  BreakRule matchingRule = null;
                  boolean hasBreak = false;
@@ -557,6 +569,10 @@ public class RBBIMonkeyTest extends TestFmwk {
                  int matchEnd = 0;
                  for (ruleNum=0; ruleNum<rules.fBreakRules.size(); ruleNum++) {
                      BreakRule rule = rules.fBreakRules.get(ruleNum);
+                    if (rule.fInitialMatchOnly && !initialMatch) {
+                        // Skip checking this '^' rule. (No rule chaining)
+                        continue;
+                    }
                      rule.fRuleMatcher.reset(fString.substring(strIdx));
                      if (rule.fRuleMatcher.lookingAt()) {
                          // A candidate rule match, check further to see if we take it or continue to check other rules.
@@ -607,6 +623,7 @@ public class RBBIMonkeyTest extends TestFmwk {
                      // which may differ from end of the match. The matching rule may have included
                      // context following the boundary that needs to be looked at again.
                      strIdx = breakPos;
+                    initialMatch = true;
                  } else {
                      // Original rule didn't specify a break.
                      // Continue applying rules starting on the last code point of this match.
@@ -618,6 +635,7 @@ public class RBBIMonkeyTest extends TestFmwk {
                                  rules.fMonkeyImpl.fRuleFileName, matchingRule.fName));
                      }
                      strIdx = updatedStrIdx;
+                    initialMatch = false;
                  }
              }
          };
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java

index fa72431dd075eb7b391be61292fd8ea7bb208895..7b5803264c8d991d6496e1c907c55edca913f8c9 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java
@@ -617,6 +617,7 @@ public class RBBITestMonkey extends TestFmwk {
          UnicodeSet  fB2;
          UnicodeSet  fBA;
          UnicodeSet  fBB;
+        UnicodeSet  fHH;
          UnicodeSet  fHY;
          UnicodeSet  fCB;
          UnicodeSet  fCL;
@@ -659,6 +660,7 @@ public class RBBITestMonkey extends TestFmwk {
  
          class XUnicodeSet extends UnicodeSet {
              XUnicodeSet(String pattern) { super(pattern); }
+            XUnicodeSet() { super(); }
              @Override
              public boolean contains(int codePoint) {
                  return codePoint < UnicodeSet.MIN_VALUE || codePoint > UnicodeSet.MAX_VALUE ?
@@ -684,6 +686,7 @@ public class RBBITestMonkey extends TestFmwk {
              fB2    = new XUnicodeSet("[\\p{Line_break=B2}]");
              fBA    = new XUnicodeSet("[\\p{Line_break=BA}]");
              fBB    = new XUnicodeSet("[\\p{Line_break=BB}]");
+            fHH    = new XUnicodeSet();
              fHY    = new XUnicodeSet("[\\p{Line_break=HY}]");
              fCB    = new XUnicodeSet("[\\p{Line_break=CB}]");
              fCL    = new XUnicodeSet("[\\p{Line_break=CL}]");
@@ -728,6 +731,8 @@ public class RBBITestMonkey extends TestFmwk {
              fNS.addAll(fCJ);     // Default behavior for CJ is identical to NS.
              fCM.addAll(fZWJ);    // ZWJ behaves as a CM.
  
+            fHH.add('\u2010');   // Hyphen, '‐'
+
              fSets.add(fBK);
              fSets.add(fCR);
              fSets.add(fLF);
@@ -786,12 +791,14 @@ public class RBBITestMonkey extends TestFmwk {
  
              int    prevPos;   //  Index of the char preceding a potential break position
              int    prevChar;  //  Character at above position.  Note that prevChar
-            //   and thisChar may not be adjacent because combining
-            //   characters between them will be ignored.
-            int    prevCharX2; //  Character before prevChar, more contex for LB 21a
+            //                //  and thisChar may not be adjacent because combining
+            //                //  characters between them will be ignored.
+
+            int    prevPosX2;
+            int    prevCharX2; //  Character before prevChar, more context for LB 21a
  
              int    nextPos;   //  Index of the next character following pos.
-            //     Usually skips over combining marks.
+            //                //  Usually skips over combining marks.
              int    tPos;      //  temp value.
              int    matchVals[]  = null;       // Number  Expression Match Results
  
@@ -804,8 +811,8 @@ public class RBBITestMonkey extends TestFmwk {
              // Initial values for loop.  Loop will run the first time without finding breaks,
              //                           while the invalid values shift out and the "this" and
              //                           "prev" positions are filled in with good values.
-            pos      = prevPos   = -1;    // Invalid value, serves as flag for initial loop iteration.
-            thisChar = prevChar  = prevCharX2 = 0;
+            pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
+            thisChar = prevChar  = prevCharX2 =  0;
              nextPos  = startPos;
  
  
@@ -816,6 +823,7 @@ public class RBBITestMonkey extends TestFmwk {
              //  "prevPos" can be arbitrarily far before "pos".
              for (;;) {
                  // Advance to the next position to be tested.
+                prevPosX2  = prevPos;
                  prevCharX2 = prevChar;
                  prevPos   = pos;
                  prevChar  = thisChar;
@@ -1066,6 +1074,15 @@ public class RBBITestMonkey extends TestFmwk {
                      break;
                  }
  
+                // LB 20.09  Don't break between Hyphens and letters if a break precedes the hyphen.
+                //           Formerly this was a Finnish tailoring.
+                //           Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
+                //    ^($HY | $HH) $AL;
+                if (fAL.contains(thisChar) && (fHY.contains(prevChar) || fHH.contains(prevChar)) &&
+                        prevPosX2 == -1) {
+                    continue;
+                }
+
                  // LB 21
                  if (fBA.contains(thisChar) ||
                          fHY.contains(thisChar) ||
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line.txt

index d0f9abe88a1edb32dd75a04f279b0ba698d710a8..3e0324bf93dbc7790daefd5cae049ad460d08f95 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line.txt
@@ -19,6 +19,7 @@ locale = en;
  AI = [:LineBreak =  Ambiguous:];
  AL = [:LineBreak =  Alphabetic:];
  BA = [:LineBreak =  Break_After:];
+HH = [\u2010];      # \u2010 is HYPHEN, default line break is BA.
  BB = [:LineBreak =  Break_Before:];
  BK = [:LineBreak =  Mandatory_Break:];
  B2 = [:LineBreak =  Break_Both:];
@@ -144,6 +145,9 @@ LB20.2:      . CM* ÷ CB;
  LB20.3:      CB CM* ZWJ [^CM];
  LB20.4:      CB CM* ÷;
  
+# LB 20.09    Don't break between Hyphens and Letters when there is a break preceding the hyphen.
+LB20.09:     ^(HY | HH) CM* AL;
+
  # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
  #       not picking up the continuing match after the BA from 21a.
  LB21a:       HL CM* (HY | BA) CM* [^CM CB];
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt

index 2384fa296c87c8857b1ffd7812efcd3f8fc11795..8395192365fc9df17f07515b8fc6ce1bb2a13bc7 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt
@@ -26,6 +26,7 @@ locale = en@lb=loose;
  AI = [:LineBreak =  Ambiguous:];
  AL = [:LineBreak =  Alphabetic:];
  BA = [:LineBreak =  Break_After:];
+HH = [\u2010];      # \u2010 is HYPHEN, default line break is BA.
  BB = [:LineBreak =  Break_Before:];
  BK = [:LineBreak =  Mandatory_Break:];
  B2 = [:LineBreak =  Break_Both:];
@@ -125,7 +126,7 @@ LB12:        GL CM* [^CM];
  
  LB12a:       [^SP BA HY] CM* GL;
  
-# LB 13 ICU Tailoring, matches tailoring exmaple 8 from UAX 14.
+# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14.
  #
  #   LB13.1   [^SP] CM* [CL CP EX IS SY]    # original UAX 14 rule.
  #   LB13.2   SP    CM* [CL CP EX IS SY]
@@ -152,6 +153,9 @@ LB20.2:      . CM* ÷ CB;
  LB20.3:      CB CM* ZWJ [^CM];
  LB20.4:      CB CM* ÷;
  
+# LB 20.09    Don't break between Hyphens and Letters when there is a break preceding the hyphen.
+LB20.09:     ^(HY | HH) CM* AL;
+
  # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
  #       not picking up the continuing match after the BA from 21a.
  LB21a:       HL CM* (HY | BA) CM* [^CM CB];
@@ -176,7 +180,7 @@ LB23a.2:     (ID | EB | EM) CM* PO;
  LB24.2:      (PR | PO) CM* (AL | HL);
  LB24.3:      (AL | HL | CM) CM* (PR | PO);
  
-# Numbers. Equivalent to Tailoring example 8 from UAx 14.
+# Numbers. Equivalent to Tailoring example 8 from UAX 14.
  LB25:        ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?;
  
  LB26.1:      JL CM* (JL | JV | H2 | H3);
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt

index 8b92561dbd8fa6b49a0286eb643b038ecc3860d7..d674327102badde51dd4c74fc87fddf0f2a63e55 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt
@@ -37,9 +37,10 @@ locale = ja@lb=loose;
  
  
  AI = [:LineBreak =  Ambiguous:];
-AL = [[:LineBreak =  Alphabetic:]];
+AL = [:LineBreak =  Alphabetic:];
  BAX = [\u2010 \u2013];
  BA = [[:LineBreak =  Break_After:] - BAX];
+HH = [\u2010];      # \u2010 is HYPHEN, default line break is BA.
  BB = [:LineBreak =  Break_Before:];
  BK = [:LineBreak =  Mandatory_Break:];
  B2 = [:LineBreak =  Break_Both:];
@@ -169,6 +170,9 @@ LB20.2:      . CM* ÷ CB;
  LB20.3:      CB CM* ZWJ [^CM];
  LB20.4:      CB CM* ÷;
  
+# LB 20.09    Don't break between Hyphens and Letters when there is a break preceding the hyphen.
+LB20.09:     ^(HY | HH) CM* AL;
+
  # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
  #       not picking up the continuing match after the BA from 21a.
  # LB 21a Don't break after Hebrew + Hyphen
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt

index 65804d83f9bf787a2d561e8f670929c7f40dc584..7f5b91c42abf09dad39b52842398a2cf171a7220 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt
@@ -33,6 +33,7 @@ locale = en@lb=normal;
  AI = [:LineBreak =  Ambiguous:];
  AL = [:LineBreak =  Alphabetic:];
  BA = [:LineBreak =  Break_After:];
+HH = [\u2010];      # \u2010 is HYPHEN, default line break is BA.
  BB = [:LineBreak =  Break_Before:];
  BK = [:LineBreak =  Mandatory_Break:];
  B2 = [:LineBreak =  Break_Both:];
@@ -158,6 +159,9 @@ LB20.2:      . CM* ÷ CB;
  LB20.3:      CB CM* ZWJ [^CM];
  LB20.4:      CB CM* ÷;
  
+# LB 20.09    Don't break between Hyphens and Letters when there is a break preceding the hyphen.
+LB20.09:     ^(HY | HH) CM* AL;
+
  # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
  #       not picking up the continuing match after the BA from 21a.
  LB21a:       HL CM* (HY | BA) CM* [^CM CB];
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt

index b50219282b26c9a131ef577f07e40c455deb1fcd..cf90751715ce5a34421643e868f3f7a6d3ee7cad 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt
@@ -34,6 +34,7 @@ AI = [:LineBreak =  Ambiguous:];
  AL = [:LineBreak =  Alphabetic:];
  BAX = [\u2010 \u2013];
  BA = [[:LineBreak =  Break_After:] - BAX];
+HH = [\u2010];      # \u2010 is HYPHEN, default line break is BA.
  BB = [:LineBreak =  Break_Before:];
  BK = [:LineBreak =  Mandatory_Break:];
  B2 = [:LineBreak =  Break_Both:];
@@ -163,6 +164,9 @@ LB20.2:      . CM* ÷ CB;
  LB20.3:      CB CM* ZWJ [^CM];
  LB20.4:      CB CM* ÷;
  
+# LB 20.09    Don't break between Hyphens and Letters when there is a break preceding the hyphen.
+LB20.09:     ^(HY | HH) CM* AL;
+
  # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
  #       not picking up the continuing match after the BA from 21a.
  # TODO: For CJ tailorings (with BAX) does this rule want to include BAX? If so,
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt

index c830de15545b6a7269184a2a4002a10e3fad76a9..63ba172233d857844b2a2c833369670d907466f8 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt
@@ -1086,23 +1086,21 @@ Bangkok)•</data>
  # Finnish line breaking
  #
  # These rules deal with hyphens when there is a space on the leading side.
-# There should be a break opportunity between the space and the hyphen, and not after the hyphen.
+# When followed by a letter, there should be a break opportunity between
+# the space and the hyphen, and not after the hyphen.
  # See CLDR ticket 3029.
  # See ICU ticket 8151
+# As of ICU 63, the Finnish tailoring behavior is moved to root.
  
  <locale root>
  <line>
-<data>•abc •- •def    •abc •-•def    •abc- •def   •abc-•def•</data>   # With ASCII hyphen
-<data>•abc •‐ •def    •abc •‐•def    •abc‐ •def   •abc‐•def•</data>   # With Unicode u2010 hyphen
+<data>•abc •- •def    •abc •-def    •abc- •def   •abc-•def•</data>   # With ASCII hyphen
+<data>•abc •‐ •def    •abc •‐def    •abc‐ •def   •abc‐•def•</data>   # With Unicode u2010 hyphen
  
  <locale fi>
  <line>
-# TODO: problems with Finnish line break rules cause these two lines to fail.
-#<data>•abc •- •def    •abc •-def    •abc- •def   •abc-•def•</data>   # With ASCII hyphen
-#<data>•abc •‐ •def    •abc •‐def    •abc‐ •def   •abc‐•def•</data>   # With Unicode u2010 hyphen
-
-<data>•abc •- •def    •abc •-def    •abc- •def   •</data>   # With ASCII hyphen
-<data>•abc •‐ •def    •abc •‐def    •abc‐ •def   •</data>   # With Unicode u2010 hyphen
+<data>•abc •- •def    •abc •-def    •abc- •def   •abc-•def•</data>   # With ASCII hyphen
+<data>•abc •‐ •def    •abc •‐def    •abc‐ •def   •abc‐•def•</data>   # With Unicode u2010 hyphen
  
  # Test for #10176 (in fi)
  <line>
author	Andy Heninger <andy.heninger@gmail.com>
	Thu, 6 Sep 2018 21:23:28 +0000 (14:23 -0700)
committer	Shane Carr <shane@unicode.org>
	Thu, 27 Sep 2018 21:27:39 +0000 (14:27 -0700)
icu4c/source/data/brkitr/brkfiles.mk		patch \| blob \| history
icu4c/source/data/brkitr/fi.txt	[deleted file]	patch \| blob \| history
icu4c/source/data/brkitr/rules/line.txt		patch \| blob \| history
icu4c/source/data/brkitr/rules/line_fi.txt	[deleted file]	patch \| blob \| history
icu4c/source/data/brkitr/rules/line_loose.txt		patch \| blob \| history
icu4c/source/data/brkitr/rules/line_loose_cj.txt		patch \| blob \| history
icu4c/source/data/brkitr/rules/line_loose_fi.txt	[deleted file]	patch \| blob \| history
icu4c/source/data/brkitr/rules/line_normal.txt		patch \| blob \| history
icu4c/source/data/brkitr/rules/line_normal_cj.txt		patch \| blob \| history
icu4c/source/data/brkitr/rules/line_normal_fi.txt	[deleted file]	patch \| blob \| history
icu4c/source/test/cintltst/cbiapts.c		patch \| blob \| history
icu4c/source/test/intltest/rbbimonkeytest.cpp		patch \| blob \| history
icu4c/source/test/intltest/rbbimonkeytest.h		patch \| blob \| history
icu4c/source/test/intltest/rbbitst.cpp		patch \| blob \| history
icu4c/source/test/testdata/break_rules/line.txt		patch \| blob \| history
icu4c/source/test/testdata/break_rules/line_loose.txt		patch \| blob \| history
icu4c/source/test/testdata/break_rules/line_loose_cj.txt		patch \| blob \| history
icu4c/source/test/testdata/break_rules/line_normal.txt		patch \| blob \| history
icu4c/source/test/testdata/break_rules/line_normal_cj.txt		patch \| blob \| history
icu4c/source/test/testdata/rbbitst.txt		patch \| blob \| history
icu4j/main/shared/data/icudata.jar		patch \| blob \| history
icu4j/main/shared/data/icutzdata.jar		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBIMonkeyTest.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line.txt		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt		patch \| blob \| history