From: Andy Heninger Date: Thu, 6 Sep 2018 21:23:28 +0000 (-0700) Subject: ICU-8151 Simplify Finnish Line Break Tailoring, move to root. (#99) X-Git-Tag: release-63-rc~87 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=740b24118feb07c97d9ac026a3328bd2c25b0afc;p=icu ICU-8151 Simplify Finnish Line Break Tailoring, move to root. (#99) --- diff --git a/icu4c/source/data/brkitr/brkfiles.mk b/icu4c/source/data/brkitr/brkfiles.mk index a23c945bf54..af4ab0a2dd4 100644 --- a/icu4c/source/data/brkitr/brkfiles.mk +++ b/icu4c/source/data/brkitr/brkfiles.mk @@ -39,13 +39,13 @@ BRK_DICT_SOURCE = burmesedict.txt cjdict.txt khmerdict.txt laodict.txt\ # List of break iterator files (brk). -BRK_SOURCE = char.txt line.txt line_fi.txt line_loose.txt\ - line_loose_cj.txt line_loose_fi.txt line_normal.txt line_normal_cj.txt line_normal_fi.txt\ +BRK_SOURCE = char.txt line.txt line_loose.txt\ + line_loose_cj.txt line_normal.txt line_normal_cj.txt\ sent.txt sent_el.txt title.txt word.txt word_POSIX.txt # Ordinary resources BRK_RES_SOURCE = de.txt el.txt en.txt en_US.txt\ - en_US_POSIX.txt es.txt fi.txt fr.txt it.txt\ + en_US_POSIX.txt es.txt fr.txt it.txt\ ja.txt pt.txt ru.txt zh.txt zh_Hant.txt diff --git a/icu4c/source/data/brkitr/fi.txt b/icu4c/source/data/brkitr/fi.txt deleted file mode 100644 index 3c07f152221..00000000000 --- a/icu4c/source/data/brkitr/fi.txt +++ /dev/null @@ -1,11 +0,0 @@ -// © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html#License -fi{ - Version{"2.1.19.14"} - boundaries{ - line:process(dependency){"line_fi.brk"} - line_loose:process(dependency){"line_loose_fi.brk"} - line_normal:process(dependency){"line_normal_fi.brk"} - line_strict:process(dependency){"line_fi.brk"} - } -} diff --git a/icu4c/source/data/brkitr/rules/line.txt b/icu4c/source/data/brkitr/rules/line.txt index 9ad81e6fc7d..cca19772dba 100644 --- a/icu4c/source/data/brkitr/rules/line.txt +++ b/icu4c/source/data/brkitr/rules/line.txt @@ -8,11 +8,10 @@ # Line Breaking Rules # Implement default line breaking as defined by # Unicode Standard Annex #14 Revision 40 for Unicode 11.0 -# http://www.unicode.org/reports/tr14/ +# http://www.unicode.org/reports/tr14/, with the following modification: # -# TODO: Rule LB 8 remains as it was in Unicode 5.2 -# This is only because of a limitation of ICU break engine implementation, -# not because the older behavior is desirable. +# Boundaries between hyphens and following letters are suppressed when +# there is a boundary preceding the hyphen. See rule 20.9 # # This corresponds to CSS line-break=strict (BCP47 -u-lb-strict). # It sets characters of class CJ to behave like NS. @@ -27,6 +26,7 @@ $AI = [:LineBreak = Ambiguous:]; $AL = [:LineBreak = Alphabetic:]; $BA = [:LineBreak = Break_After:]; +$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. $BB = [:LineBreak = Break_Before:]; $BK = [:LineBreak = Mandatory_Break:]; $B2 = [:LineBreak = Break_Both:]; @@ -229,17 +229,24 @@ $LB18NonBreaks $CM* $QU; # QU x $QU $CM* .; - # LB 20 # $CB # $CB - +# $LB20NonBreaks = [$LB18NonBreaks - $CB]; +# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen. +# Originally added as a Finnish tailoring, now promoted to default ICU behavior. +# Note: this is not default UAX-14 behaviour. See issue ICU-8151. +# +^($HY | $HH) $CM* $ALPlus; + # LB 21 x (BA | HY | NS) # BB x # $LB20NonBreaks $CM* ($BA | $HY | $NS); + + ^$CM+ ($BA | $HY | $NS); $BB $CM* [^$CB]; # $BB x diff --git a/icu4c/source/data/brkitr/rules/line_fi.txt b/icu4c/source/data/brkitr/rules/line_fi.txt deleted file mode 100644 index 9c26945e580..00000000000 --- a/icu4c/source/data/brkitr/rules/line_fi.txt +++ /dev/null @@ -1,339 +0,0 @@ -# Copyright (C) 2016 and later: Unicode, Inc. and others. -# License & terms of use: http://www.unicode.org/copyright.html -# Copyright (c) 2002-2016 International Business Machines Corporation and -# others. All Rights Reserved. -# -# file: line_fi.txt -# -# Line Breaking Rules -# Implement default line breaking as defined by -# Unicode Standard Annex #14 Revision 40 for Unicode 11.0 -# http://www.unicode.org/reports/tr14/ -# tailored as noted in 2nd paragraph below. -# -# TODO: Rule LB 8 remains as it was in Unicode 5.2 -# This is only because of a limitation of ICU break engine implementation, -# not because the older behavior is desirable. -# -# This tailors the line break behavior for Finnish, while otherwise behaving -# per UAX 14 which corresponds to CSS line-break=strict (BCP47 -u-lb-strict). -# It sets characters of class CJ to behave like NS. -# -# This corresponds to CSS line-break=strict (BCP47 -u-lb-strict). -# It sets characters of class CJ to behave like NS. - -# -# Character Classes defined by TR 14. -# - -!!chain; -!!quoted_literals_only; - -$AI = [:LineBreak = Ambiguous:]; -$AL = [:LineBreak = Alphabetic:]; -$BA = [:LineBreak = Break_After:]; -$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. -$BB = [:LineBreak = Break_Before:]; -$BK = [:LineBreak = Mandatory_Break:]; -$B2 = [:LineBreak = Break_Both:]; -$CB = [:LineBreak = Contingent_Break:]; -$CJ = [:LineBreak = Conditional_Japanese_Starter:]; -$CL = [:LineBreak = Close_Punctuation:]; -# $CM = [:LineBreak = Combining_Mark:]; -$CP = [:LineBreak = Close_Parenthesis:]; -$CR = [:LineBreak = Carriage_Return:]; -$EB = [:LineBreak = EB:]; -$EM = [:LineBreak = EM:]; -$EX = [:LineBreak = Exclamation:]; -$GL = [:LineBreak = Glue:]; -$HL = [:LineBreak = Hebrew_Letter:]; -$HY = [:LineBreak = Hyphen:]; -$H2 = [:LineBreak = H2:]; -$H3 = [:LineBreak = H3:]; -$ID = [:LineBreak = Ideographic:]; -$IN = [:LineBreak = Inseperable:]; -$IS = [:LineBreak = Infix_Numeric:]; -$JL = [:LineBreak = JL:]; -$JV = [:LineBreak = JV:]; -$JT = [:LineBreak = JT:]; -$LF = [:LineBreak = Line_Feed:]; -$NL = [:LineBreak = Next_Line:]; -# NS includes CJ for CSS strict line breaking. -$NS = [[:LineBreak = Nonstarter:] $CJ]; -$NU = [:LineBreak = Numeric:]; -$OP = [:LineBreak = Open_Punctuation:]; -$PO = [:LineBreak = Postfix_Numeric:]; -$PR = [:LineBreak = Prefix_Numeric:]; -$QU = [:LineBreak = Quotation:]; -$RI = [:LineBreak = Regional_Indicator:]; -$SA = [:LineBreak = Complex_Context:]; -$SG = [:LineBreak = Surrogate:]; -$SP = [:LineBreak = Space:]; -$SY = [:LineBreak = Break_Symbols:]; -$WJ = [:LineBreak = Word_Joiner:]; -$XX = [:LineBreak = Unknown:]; -$ZW = [:LineBreak = ZWSpace:]; -$ZWJ = [:LineBreak = ZWJ:]; - -# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly -# list it in the numerous rules that use CM. -# By LB1, SA characters with general categor of Mn or Mc also resolve to CM. - -$CM = [[:LineBreak = Combining_Mark:] $ZWJ [$SA & [[:Mn:][:Mc:]]]]; - -# Dictionary character set, for triggering language-based break engines. Currently -# limited to LineBreak=Complex_Context (SA). - -$dictionary = [$SA]; - -# -# Rule LB1. By default, treat AI (characters with ambiguous east Asian width), -# SA (Dictionary chars, excluding Mn and Mc) -# SG (Unpaired Surrogates) -# XX (Unknown, unassigned) -# as $AL (Alphabetic) -# -$ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]]; - - -## ------------------------------------------------- - -# -# CAN_CM is the set of characters that may combine with CM combining chars. -# Note that Linebreak UAX 14's concept of a combining char and the rules -# for what they can combine with are _very_ different from the rest of Unicode. -# -# Note that $CM itself is left out of this set. If CM is needed as a base -# it must be listed separately in the rule. -# -$CAN_CM = [^$SP $BK $CR $LF $NL $ZW $CM]; # Bases that can take CMs -$CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs - -# -# AL_FOLLOW set of chars that can unconditionally follow an AL -# Needed in rules where stand-alone $CM s are treated as AL. -# -$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HH $HY $NS $IN $NU $PR $PO $ALPlus]; - - -# -# Rule LB 4, 5 Mandatory (Hard) breaks. -# -$LB4Breaks = [$BK $CR $LF $NL]; -$LB4NonBreaks = [^$BK $CR $LF $NL $CM]; -$CR $LF {100}; - -# -# LB 6 Do not break before hard line breaks. -# -$LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks. -$CAN_CM $CM* $LB4Breaks {100}; -^$CM+ $LB4Breaks {100}; - -# LB 7 x SP -# x ZW -$LB4NonBreaks [$SP $ZW]; -$CAN_CM $CM* [$SP $ZW]; -^$CM+ [$SP $ZW]; - -# -# LB 8 Break after zero width space -# ZW SP* ÷ -# -$LB8Breaks = [$LB4Breaks $ZW]; -$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]]; -$ZW $SP* / [^$SP $ZW $LB4Breaks]; - -# LB 8a ZWJ x Do not break Emoji ZWJ sequences. -# -$ZWJ [^$CM]; - -# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL -# $CM not covered by the above needs to behave like $AL -# See definition of $CAN_CM. - -$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules. -^$CM+; - -# -# LB 11 Do not break before or after WORD JOINER & related characters. -# -$CAN_CM $CM* $WJ; -$LB8NonBreaks $WJ; -^$CM+ $WJ; - -$WJ $CM* .; - -# -# LB 12 Do not break after NBSP and related characters. -# GL x -# -$GL $CM* .; - -# -# LB 12a Do not break before NBSP and related characters ... -# [^SP BA HY] x GL -# -[[$LB8NonBreaks] - [$SP $BA $HH $HY]] $CM* $GL; -^$CM+ $GL; - - - -# -# LB 13 Don't break before ']' or '!' or ';' or '/', even after spaces. -# -$LB8NonBreaks $CL; -$CAN_CM $CM* $CL; -^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL - -$LB8NonBreaks $CP; -$CAN_CM $CM* $CP; -^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL - -$LB8NonBreaks $EX; -$CAN_CM $CM* $EX; -^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL - -$LB8NonBreaks $IS; -$CAN_CM $CM* $IS; -^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL - -$LB8NonBreaks $SY; -$CAN_CM $CM* $SY; -^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL - - -# -# LB 14 Do not break after OP, even after spaces -# -$OP $CM* $SP* .; - -$OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL - # by rule 8, CM following a SP is stand-alone. - -# LB 15 -$QU $CM* $SP* $OP; - -# LB 16 -($CL | $CP) $CM* $SP* $NS; - -# LB 17 -$B2 $CM* $SP* $B2; - -# -# LB 18 Break after spaces. -# -$LB18NonBreaks = [$LB8NonBreaks - [$SP]]; -$LB18Breaks = [$LB8Breaks $SP]; - - -# LB 19 -# x QU -$LB18NonBreaks $CM* $QU; -^$CM+ $QU; - -# QU x -$QU $CM* .; - - -# LB 20 -# $CB -# $CB - -$LB20NonBreaks = [$LB18NonBreaks - $CB]; - -# LB 20.09 added rule for Finnish tailoring -# LB 21 x (BA | HY | NS) -# BB x -# -$LB20NonBreaks $CM* ($BA | $HH | $HY | $NS) $CM* / $AL; -$LB20NonBreaks $CM* ($BA | $HH | $HY | $NS); -($HY | $HH) $AL; -^$CM+ ($BA | $HY | $HH | $NS); - -$BB $CM* [^$CB]; # $BB x -$BB $CM* $LB20NonBreaks; - -# LB 21a Don't break after Hebrew + Hyphen -# HL (HY | BA) x -# -$HL $CM* ($HY | $BA | $HH) $CM* [^$CB]?; - -# LB 21b (forward) Don't break between SY and HL -# (break between HL and SY already disallowed by LB 13 above) -$SY $CM* $HL; - -# LB 22 -($ALPlus | $HL) $CM* $IN; -^$CM+ $IN; # by rule 10, any otherwise unattached CM behaves as AL -$EX $CM* $IN; -($ID | $EB | $EM) $CM* $IN; -$IN $CM* $IN; -$NU $CM* $IN; - - -# $LB 23 -# -($ALPlus | $HL) $CM* $NU; -^$CM+ $NU; # Rule 10, any otherwise unattached CM behaves as AL -$NU $CM* ($ALPlus | $HL); - -# LB 23a -# -$PR $CM* ($ID | $EB | $EM); -($ID | $EB | $EM) $CM* $PO; - - -# -# LB 24 -# -($PR | $PO) $CM* ($ALPlus | $HL); -($ALPlus | $HL) $CM* ($PR | $PO); -^$CM+ ($PR | $PO); # Rule 10, any otherwise unattached CM behaves as AL - -# -# LB 25 Numbers. -# -(($PR | $PO) $CM*)? (($OP | $HY) $CM*)? $NU ($CM* ($NU | $SY | $IS))* - ($CM* ($CL | $CP))? ($CM* ($PR | $PO))?; - -# LB 26 Do not break a Korean syllable -# -$JL $CM* ($JL | $JV | $H2 | $H3); -($JV | $H2) $CM* ($JV | $JT); -($JT | $H3) $CM* $JT; - -# LB 27 Treat korean Syllable Block the same as ID (don't break it) -($JL | $JV | $JT | $H2 | $H3) $CM* $IN; -($JL | $JV | $JT | $H2 | $H3) $CM* $PO; -$PR $CM* ($JL | $JV | $JT | $H2 | $H3); - - -# LB 28 Do not break between alphabetics -# -($ALPlus | $HL) $CM* ($ALPlus | $HL); -^$CM+ ($ALPlus | $HL); # The $CM+ is from rule 10, an unattached CM is treated as AL - -# LB 29 -$IS $CM* ($ALPlus | $HL); - -# LB 30 -($ALPlus | $HL | $NU) $CM* $OP; -^$CM+ $OP; # The $CM+ is from rule 10, an unattached CM is treated as AL. -$CP $CM* ($ALPlus | $HL | $NU); - -# LB 30a Do not break between regional indicators. Break after pairs of them. -# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM. -$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]]; -$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]]; -$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}]; -# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?' -# because of the chain-out behavior difference. The rule must chain out only from the [set characters], -# not from the preceding $RI or $CM, which it would be able to do if the set were optional. - -# LB 30b Do not break between an Emoji Base and an Emoji Modifier -$EB $CM* $EM; - -# LB 31 Break everywhere else. -# Match a single code point if no other rule applies. -.; diff --git a/icu4c/source/data/brkitr/rules/line_loose.txt b/icu4c/source/data/brkitr/rules/line_loose.txt index 2d72fdfa907..1d46d625c67 100644 --- a/icu4c/source/data/brkitr/rules/line_loose.txt +++ b/icu4c/source/data/brkitr/rules/line_loose.txt @@ -9,13 +9,10 @@ # Line Breaking Rules # Implement default line breaking as defined by # Unicode Standard Annex #14 Revision 40 for Unicode 11.0 -# http://www.unicode.org/reports/tr14/ +# http://www.unicode.org/reports/tr14/, with the following modification: # -# tailored as noted in 2nd paragraph below. -# -# TODO: Rule LB 8 remains as it was in Unicode 5.2 -# This is only because of a limitation of ICU break engine implementation, -# not because the older behavior is desirable. +# Boundaries between hyphens and following letters are suppressed when +# there is a boundary preceding the hyphen. See rule 20.9 # # This tailors the line break behavior to correspond to CSS # line-break=loose (BCP47 -u-lb-loose) as defined for languages other than @@ -35,6 +32,7 @@ $AI = [:LineBreak = Ambiguous:]; $AL = [:LineBreak = Alphabetic:]; $BA = [:LineBreak = Break_After:]; +$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. $BB = [:LineBreak = Break_Before:]; $BK = [:LineBreak = Mandatory_Break:]; $B2 = [:LineBreak = Break_Both:]; @@ -240,18 +238,25 @@ $LB18NonBreaks $CM* $QU; # QU x $QU $CM* .; - # LB 20 # $CB # $CB - +# $LB20NonBreaks = [$LB18NonBreaks - $CB]; +# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen. +# Originally added as a Finnish tailoring, now promoted to default ICU behavior. +# Note: this is not default UAX-14 behaviour. See issue ICU-8151. +# +^($HY | $HH) $CM* $ALPlus; + # LB 21 x (BA | HY | NS) # BB x # # DO allow breaks here before NSX, so don't include it $LB20NonBreaks $CM* ($BA | $HY | $NS); + + ^$CM+ ($BA | $HY | $NS); $BB $CM* [^$CB]; # $BB x diff --git a/icu4c/source/data/brkitr/rules/line_loose_cj.txt b/icu4c/source/data/brkitr/rules/line_loose_cj.txt index 024e68ebc77..b139da32402 100644 --- a/icu4c/source/data/brkitr/rules/line_loose_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_loose_cj.txt @@ -8,13 +8,10 @@ # Line Breaking Rules # Implement default line breaking as defined by # Unicode Standard Annex #14 Revision 40 for Unicode 11.0 -# http://www.unicode.org/reports/tr14/ +# http://www.unicode.org/reports/tr14/, with the following modification: # -# tailored as noted in 2nd paragraph below. -# -# TODO: Rule LB 8 remains as it was in Unicode 5.2 -# This is only because of a limitation of ICU break engine implementation, -# not because the older behavior is desirable. +# Boundaries between hyphens and following letters are suppressed when +# there is a boundary preceding the hyphen. See rule 20.9 # # This tailors the line break behavior to correspond to CSS # line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese. @@ -42,6 +39,7 @@ $AI = [:LineBreak = Ambiguous:]; $AL = [:LineBreak = Alphabetic:]; $BAX = [\u2010 \u2013]; $BA = [[:LineBreak = Break_After:] - $BAX]; +$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. $BB = [:LineBreak = Break_Before:]; $BK = [:LineBreak = Mandatory_Break:]; $B2 = [:LineBreak = Break_Both:]; @@ -250,18 +248,25 @@ $LB18NonBreaks $CM* $QU; # QU x $QU $CM* .; - # LB 20 # $CB # $CB - +# $LB20NonBreaks = [$LB18NonBreaks - $CB]; +# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen. +# Originally added as a Finnish tailoring, now promoted to default ICU behavior. +# Note: this is not default UAX-14 behaviour. See issue ICU-8151. +# +^($HY | $HH) $CM* $ALPlus; + # LB 21 x (BA | HY | NS) # BB x # # DO allow breaks here before $BAX and $NSX, so don't include them $LB20NonBreaks $CM* ($BA | $HY | $NS); + + ^$CM+ ($BA | $HY | $NS); $BB $CM* [^$CB]; # $BB x diff --git a/icu4c/source/data/brkitr/rules/line_loose_fi.txt b/icu4c/source/data/brkitr/rules/line_loose_fi.txt deleted file mode 100644 index 0c34b00cf38..00000000000 --- a/icu4c/source/data/brkitr/rules/line_loose_fi.txt +++ /dev/null @@ -1,341 +0,0 @@ -# Copyright (C) 2016 and later: Unicode, Inc. and others. -# License & terms of use: http://www.unicode.org/copyright.html -# Copyright (c) 2002-2016 International Business Machines Corporation and -# others. All Rights Reserved. -# -# file: line_loose_fi.txt -# -# Line Breaking Rules -# Implement default line breaking as defined by -# Unicode Standard Annex #14 Revision 40 for Unicode 11.0 -# http://www.unicode.org/reports/tr14/ -# tailored as noted in 3rd paragraph below. -# -# TODO: Rule LB 8 remains as it was in Unicode 5.2 -# This is only because of a limitation of ICU break engine implementation, -# not because the older behavior is desirable. -# -# This tailors the line break behavior both for Finnish and to correpond to CSS -# line-break=loose (BCP47 -u-lb-loose) as defined for languages other than -# Chinese & Japanese. -# It sets characters of class CJ to behave like ID. -# In addition, it allows breaks before 3005, 303B, 309D, 309E, 30FD, 30FE (all NS). -# -# Character Classes defined by TR 14. -# - -!!chain; -!!quoted_literals_only; - -$AI = [:LineBreak = Ambiguous:]; -$AL = [:LineBreak = Alphabetic:]; -$BA = [:LineBreak = Break_After:]; -$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. -$BB = [:LineBreak = Break_Before:]; -$BK = [:LineBreak = Mandatory_Break:]; -$B2 = [:LineBreak = Break_Both:]; -$CB = [:LineBreak = Contingent_Break:]; -$CJ = [:LineBreak = Conditional_Japanese_Starter:]; -$CL = [:LineBreak = Close_Punctuation:]; -# $CM = [:LineBreak = Combining_Mark:]; -$CP = [:LineBreak = Close_Parenthesis:]; -$CR = [:LineBreak = Carriage_Return:]; -$EB = [:LineBreak = EB:]; -$EM = [:LineBreak = EM:]; -$EX = [:LineBreak = Exclamation:]; -$GL = [:LineBreak = Glue:]; -$HL = [:LineBreak = Hebrew_Letter:]; -$HY = [:LineBreak = Hyphen:]; -$H2 = [:LineBreak = H2:]; -$H3 = [:LineBreak = H3:]; -# CSS Loose tailoring: CJ resolves to ID -$ID = [[:LineBreak = Ideographic:] $CJ]; -$IN = [:LineBreak = Inseperable:]; -$IS = [:LineBreak = Infix_Numeric:]; -$JL = [:LineBreak = JL:]; -$JV = [:LineBreak = JV:]; -$JT = [:LineBreak = JT:]; -$LF = [:LineBreak = Line_Feed:]; -$NL = [:LineBreak = Next_Line:]; -$NSX = [\u3005 \u303B \u309D \u309E \u30FD \u30FE]; -$NS = [[:LineBreak = Nonstarter:] - $NSX]; -$NU = [:LineBreak = Numeric:]; -$OP = [:LineBreak = Open_Punctuation:]; -$PO = [:LineBreak = Postfix_Numeric:]; -$PR = [:LineBreak = Prefix_Numeric:]; -$QU = [:LineBreak = Quotation:]; -$RI = [:LineBreak = Regional_Indicator:]; -$SA = [:LineBreak = Complex_Context:]; -$SG = [:LineBreak = Surrogate:]; -$SP = [:LineBreak = Space:]; -$SY = [:LineBreak = Break_Symbols:]; -$WJ = [:LineBreak = Word_Joiner:]; -$XX = [:LineBreak = Unknown:]; -$ZW = [:LineBreak = ZWSpace:]; -$ZWJ = [:LineBreak = ZWJ:]; - -# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly -# list it in the numerous rules that use CM. -# By LB1, SA characters with general categor of Mn or Mc also resolve to CM. - -$CM = [[:LineBreak = Combining_Mark:] $ZWJ [$SA & [[:Mn:][:Mc:]]]]; - -# Dictionary character set, for triggering language-based break engines. Currently -# limited to LineBreak=Complex_Context (SA). - -$dictionary = [$SA]; - -# -# Rule LB1. By default, treat AI (characters with ambiguous east Asian width), -# SA (Dictionary chars, excluding Mn and Mc) -# SG (Unpaired Surrogates) -# XX (Unknown, unassigned) -# as $AL (Alphabetic) -# -$ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]]; - - -## ------------------------------------------------- - -# -# CAN_CM is the set of characters that may combine with CM combining chars. -# Note that Linebreak UAX 14's concept of a combining char and the rules -# for what they can combine with are _very_ different from the rest of Unicode. -# -# Note that $CM itself is left out of this set. If CM is needed as a base -# it must be listed separately in the rule. -# -$CAN_CM = [^$SP $BK $CR $LF $NL $ZW $CM]; # Bases that can take CMs -$CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs - -# -# AL_FOLLOW set of chars that can unconditionally follow an AL -# Needed in rules where stand-alone $CM s are treated as AL. -# -$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HH $HY $NS $IN $NU $PR $PO $ALPlus]; - - -# -# Rule LB 4, 5 Mandatory (Hard) breaks. -# -$LB4Breaks = [$BK $CR $LF $NL]; -$LB4NonBreaks = [^$BK $CR $LF $NL $CM]; -$CR $LF {100}; - -# -# LB 6 Do not break before hard line breaks. -# -$LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks. -$CAN_CM $CM* $LB4Breaks {100}; -^$CM+ $LB4Breaks {100}; - -# LB 7 x SP -# x ZW -$LB4NonBreaks [$SP $ZW]; -$CAN_CM $CM* [$SP $ZW]; -^$CM+ [$SP $ZW]; - -# -# LB 8 Break after zero width space -# ZW SP* ÷ -# -$LB8Breaks = [$LB4Breaks $ZW]; -$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]]; -$ZW $SP* / [^$SP $ZW $LB4Breaks]; - -# LB 8a ZWJ x Do not break Emoji ZWJ sequences. -# -$ZWJ [^$CM]; - -# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL -# $CM not covered by the above needs to behave like $AL -# See definition of $CAN_CM. - -$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules. -^$CM+; - -# -# LB 11 Do not break before or after WORD JOINER & related characters. -# -$CAN_CM $CM* $WJ; -$LB8NonBreaks $WJ; -^$CM+ $WJ; - -$WJ $CM* .; - -# -# LB 12 Do not break after NBSP and related characters. -# GL x -# -$GL $CM* .; - -# -# LB 12a Do not break before NBSP and related characters ... -# [^SP BA HY] x GL -# -[[$LB8NonBreaks] - [$SP $BA $HH $HY]] $CM* $GL; -^$CM+ $GL; - - - -# -# LB 13 Don't break before ']' or '!' or ';' or '/', even after spaces. -# -$LB8NonBreaks $CL; -$CAN_CM $CM* $CL; -^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL - -$LB8NonBreaks $CP; -$CAN_CM $CM* $CP; -^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL - -$LB8NonBreaks $EX; -$CAN_CM $CM* $EX; -^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL - -$LB8NonBreaks $IS; -$CAN_CM $CM* $IS; -^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL - -$LB8NonBreaks $SY; -$CAN_CM $CM* $SY; -^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL - - -# -# LB 14 Do not break after OP, even after spaces -# -$OP $CM* $SP* .; - -$OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL - # by rule 8, CM following a SP is stand-alone. - -# LB 15 -$QU $CM* $SP* $OP; - -# LB 16 -# Do not break between closing punctuation and $NS, even with intervening spaces -# But DO allow a break between closing punctuation and $NSX, don't include it here -($CL | $CP) $CM* $SP* $NS; - -# LB 17 -$B2 $CM* $SP* $B2; - -# -# LB 18 Break after spaces. -# -$LB18NonBreaks = [$LB8NonBreaks - [$SP]]; -$LB18Breaks = [$LB8Breaks $SP]; - - -# LB 19 -# x QU -$LB18NonBreaks $CM* $QU; -^$CM+ $QU; - -# QU x -$QU $CM* .; - - -# LB 20 -# $CB -# $CB - -$LB20NonBreaks = [$LB18NonBreaks - $CB]; - -# LB 20.09 added rule for Finnish tailoring -# LB 21 x (BA | HY | NS) -# BB x -# -# DO allow breaks here before NSX, so don't include it -$LB20NonBreaks $CM* ($BA | $HH | $HY | $NS) $CM* / $AL; -$LB20NonBreaks $CM* ($BA | $HH | $HY | $NS); -($HY | $HH) $AL; -^$CM+ ($BA | $HY | $HH | $NS); - -$BB $CM* [^$CB]; # $BB x -$BB $CM* $LB20NonBreaks; - -# LB 21a Don't break after Hebrew + Hyphen -# HL (HY | BA) x -# -$HL $CM* ($HY | $BA | $HH) $CM* [^$CB]?; - -# LB 21b (forward) Don't break between SY and HL -# (break between HL and SY already disallowed by LB 13 above) -$SY $CM* $HL; - -# LB 22 -($ALPlus | $HL) $CM* $IN; -^$CM+ $IN; # by rule 10, any otherwise unattached CM behaves as AL -$EX $CM* $IN; -($ID | $EB | $EM) $CM* $IN; -# $IN $CM* $IN; # delete this rule for CSS loose -$NU $CM* $IN; - - -# $LB 23 -# -($ALPlus | $HL) $CM* $NU; -^$CM+ $NU; # Rule 10, any otherwise unattached CM behaves as AL -$NU $CM* ($ALPlus | $HL); - -# LB 23a -# -$PR $CM* ($ID | $EB | $EM); -($ID | $EB | $EM) $CM* $PO; - - -# -# LB 24 -# -($PR | $PO) $CM* ($ALPlus | $HL); -($ALPlus | $HL) $CM* ($PR | $PO); -^$CM+ ($PR | $PO); # Rule 10, any otherwise unattached CM behaves as AL - -# -# LB 25 Numbers. -# -(($PR | $PO) $CM*)? (($OP | $HY) $CM*)? $NU ($CM* ($NU | $SY | $IS))* - ($CM* ($CL | $CP))? ($CM* ($PR | $PO))?; - -# LB 26 Do not break a Korean syllable -# -$JL $CM* ($JL | $JV | $H2 | $H3); -($JV | $H2) $CM* ($JV | $JT); -($JT | $H3) $CM* $JT; - -# LB 27 Treat korean Syllable Block the same as ID (don't break it) -($JL | $JV | $JT | $H2 | $H3) $CM* $IN; -($JL | $JV | $JT | $H2 | $H3) $CM* $PO; -$PR $CM* ($JL | $JV | $JT | $H2 | $H3); - - -# LB 28 Do not break between alphabetics -# -($ALPlus | $HL) $CM* ($ALPlus | $HL); -^$CM+ ($ALPlus | $HL); # The $CM+ is from rule 10, an unattached CM is treated as AL - -# LB 29 -$IS $CM* ($ALPlus | $HL); - -# LB 30 -($ALPlus | $HL | $NU) $CM* $OP; -^$CM+ $OP; # The $CM+ is from rule 10, an unattached CM is treated as AL. -$CP $CM* ($ALPlus | $HL | $NU); - -# LB 30a Do not break between regional indicators. Break after pairs of them. -# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM. -$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]]; -$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]]; -$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}]; -# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?' -# because of the chain-out behavior difference. The rule must chain out only from the [set characters], -# not from the preceding $RI or $CM, which it would be able to do if the set were optional. - -# LB 30b Do not break between an Emoji Base and an Emoji Modifier -$EB $CM* $EM; - -# LB 31 Break everywhere else. -# Match a single code point if no other rule applies. -.; diff --git a/icu4c/source/data/brkitr/rules/line_normal.txt b/icu4c/source/data/brkitr/rules/line_normal.txt index b2472177e49..b753a4c0011 100644 --- a/icu4c/source/data/brkitr/rules/line_normal.txt +++ b/icu4c/source/data/brkitr/rules/line_normal.txt @@ -8,13 +8,10 @@ # Line Breaking Rules # Implement default line breaking as defined by # Unicode Standard Annex #14 Revision 40 for Unicode 11.0 -# http://www.unicode.org/reports/tr14/ +# http://www.unicode.org/reports/tr14/, with the following modification: # -# tailored as noted in 2nd paragraph below. -# -# TODO: Rule LB 8 remains as it was in Unicode 5.2 -# This is only because of a limitation of ICU break engine implementation, -# not because the older behavior is desirable. +# Boundaries between hyphens and following letters are suppressed when +# there is a boundary preceding the hyphen. See rule 20.9 # # This tailors the line break behavior to correspond to CSS # line-break=normal (BCP47 -u-lb-normal) as defined for languages other than @@ -31,6 +28,7 @@ $AI = [:LineBreak = Ambiguous:]; $AL = [:LineBreak = Alphabetic:]; $BA = [:LineBreak = Break_After:]; +$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. $BB = [:LineBreak = Break_Before:]; $BK = [:LineBreak = Mandatory_Break:]; $B2 = [:LineBreak = Break_Both:]; @@ -233,17 +231,24 @@ $LB18NonBreaks $CM* $QU; # QU x $QU $CM* .; - # LB 20 # $CB # $CB - +# $LB20NonBreaks = [$LB18NonBreaks - $CB]; +# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen. +# Originally added as a Finnish tailoring, now promoted to default ICU behavior. +# Note: this is not default UAX-14 behaviour. See issue ICU-8151. +# +^($HY | $HH) $CM* $ALPlus; + # LB 21 x (BA | HY | NS) # BB x # $LB20NonBreaks $CM* ($BA | $HY | $NS); + + ^$CM+ ($BA | $HY | $NS); $BB $CM* [^$CB]; # $BB x diff --git a/icu4c/source/data/brkitr/rules/line_normal_cj.txt b/icu4c/source/data/brkitr/rules/line_normal_cj.txt index b4fcf029e72..4eb8929c982 100644 --- a/icu4c/source/data/brkitr/rules/line_normal_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_normal_cj.txt @@ -8,13 +8,10 @@ # Line Breaking Rules # Implement default line breaking as defined by # Unicode Standard Annex #14 Revision 40 for Unicode 11.0 -# http://www.unicode.org/reports/tr14/ +# http://www.unicode.org/reports/tr14/, with the following modification: # -# tailored as noted in 2nd paragraph below. -# -# TODO: Rule LB 8 remains as it was in Unicode 5.2 -# This is only because of a limitation of ICU break engine implementation, -# not because the older behavior is desirable. +# Boundaries between hyphens and following letters are suppressed when +# there is a boundary preceding the hyphen. See rule 20.9 # # This tailors the line break behavior to correspond to CSS # line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese. @@ -33,6 +30,7 @@ $AI = [:LineBreak = Ambiguous:]; $AL = [:LineBreak = Alphabetic:]; $BAX = [\u2010 \u2013]; $BA = [[:LineBreak = Break_After:] - $BAX]; +$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. $BB = [:LineBreak = Break_Before:]; $BK = [:LineBreak = Mandatory_Break:]; $B2 = [:LineBreak = Break_Both:]; @@ -238,18 +236,25 @@ $LB18NonBreaks $CM* $QU; # QU x $QU $CM* .; - # LB 20 # $CB # $CB - +# $LB20NonBreaks = [$LB18NonBreaks - $CB]; +# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen. +# Originally added as a Finnish tailoring, now promoted to default ICU behavior. +# Note: this is not default UAX-14 behaviour. See issue ICU-8151. +# +^($HY | $HH) $CM* $ALPlus; + # LB 21 x (BA | HY | NS) # BB x # -# DO allow breaks here before $BAXcm and $NSXcm, so don't include them +# DO allow breaks here before $BAX and $NSX, so don't include them $LB20NonBreaks $CM* ($BA | $HY | $NS); + + ^$CM+ ($BA | $HY | $NS); $BB $CM* [^$CB]; # $BB x diff --git a/icu4c/source/data/brkitr/rules/line_normal_fi.txt b/icu4c/source/data/brkitr/rules/line_normal_fi.txt deleted file mode 100644 index a3eccf2c5b6..00000000000 --- a/icu4c/source/data/brkitr/rules/line_normal_fi.txt +++ /dev/null @@ -1,337 +0,0 @@ -# Copyright (C) 2016 and later: Unicode, Inc. and others. -# License & terms of use: http://www.unicode.org/copyright.html -# Copyright (c) 2002-2016 International Business Machines Corporation and -# others. All Rights Reserved. -# -# file: line_normal_fi.txt -# -# Line Breaking Rules -# Implement default line breaking as defined by -# Unicode Standard Annex #14 Revision 40 for Unicode 11.0 -# http://www.unicode.org/reports/tr14/ -# tailored as noted in 3rd paragraph below. -# -# TODO: Rule LB 8 remains as it was in Unicode 5.2 -# This is only because of a limitation of ICU break engine implementation, -# not because the older behavior is desirable. -# -# This tailors the line break behavior for Finnish, and to correspond to CSS -# line-break=normal (BCP47 -u-lb-normal) as defined for languages other than -# Chinese & Japanese. -# It sets characters of class CJ to behave like ID. - -# -# Character Classes defined by TR 14. -# - -!!chain; -!!quoted_literals_only; - -$AI = [:LineBreak = Ambiguous:]; -$AL = [:LineBreak = Alphabetic:]; -$BA = [:LineBreak = Break_After:]; -$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. -$BB = [:LineBreak = Break_Before:]; -$BK = [:LineBreak = Mandatory_Break:]; -$B2 = [:LineBreak = Break_Both:]; -$CB = [:LineBreak = Contingent_Break:]; -$CJ = [:LineBreak = Conditional_Japanese_Starter:]; -$CL = [:LineBreak = Close_Punctuation:]; -# $CM = [:LineBreak = Combining_Mark:]; -$CP = [:LineBreak = Close_Parenthesis:]; -$CR = [:LineBreak = Carriage_Return:]; -$EB = [:LineBreak = EB:]; -$EM = [:LineBreak = EM:]; -$EX = [:LineBreak = Exclamation:]; -$GL = [:LineBreak = Glue:]; -$HL = [:LineBreak = Hebrew_Letter:]; -$HY = [:LineBreak = Hyphen:]; -$H2 = [:LineBreak = H2:]; -$H3 = [:LineBreak = H3:]; -# CSS Normal tailoring: CJ resolves to ID -$ID = [[:LineBreak = Ideographic:] $CJ]; -$IN = [:LineBreak = Inseperable:]; -$IS = [:LineBreak = Infix_Numeric:]; -$JL = [:LineBreak = JL:]; -$JV = [:LineBreak = JV:]; -$JT = [:LineBreak = JT:]; -$LF = [:LineBreak = Line_Feed:]; -$NL = [:LineBreak = Next_Line:]; -$NS = [:LineBreak = Nonstarter:]; -$NU = [:LineBreak = Numeric:]; -$OP = [:LineBreak = Open_Punctuation:]; -$PO = [:LineBreak = Postfix_Numeric:]; -$PR = [:LineBreak = Prefix_Numeric:]; -$QU = [:LineBreak = Quotation:]; -$RI = [:LineBreak = Regional_Indicator:]; -$SA = [:LineBreak = Complex_Context:]; -$SG = [:LineBreak = Surrogate:]; -$SP = [:LineBreak = Space:]; -$SY = [:LineBreak = Break_Symbols:]; -$WJ = [:LineBreak = Word_Joiner:]; -$XX = [:LineBreak = Unknown:]; -$ZW = [:LineBreak = ZWSpace:]; -$ZWJ = [:LineBreak = ZWJ:]; - -# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly -# list it in the numerous rules that use CM. -# By LB1, SA characters with general categor of Mn or Mc also resolve to CM. - -$CM = [[:LineBreak = Combining_Mark:] $ZWJ [$SA & [[:Mn:][:Mc:]]]]; - -# Dictionary character set, for triggering language-based break engines. Currently -# limited to LineBreak=Complex_Context (SA). - -$dictionary = [$SA]; - -# -# Rule LB1. By default, treat AI (characters with ambiguous east Asian width), -# SA (Dictionary chars, excluding Mn and Mc) -# SG (Unpaired Surrogates) -# XX (Unknown, unassigned) -# as $AL (Alphabetic) -# -$ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]]; - - -## ------------------------------------------------- - -# -# CAN_CM is the set of characters that may combine with CM combining chars. -# Note that Linebreak UAX 14's concept of a combining char and the rules -# for what they can combine with are _very_ different from the rest of Unicode. -# -# Note that $CM itself is left out of this set. If CM is needed as a base -# it must be listed separately in the rule. -# -$CAN_CM = [^$SP $BK $CR $LF $NL $ZW $CM]; # Bases that can take CMs -$CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs - -# -# AL_FOLLOW set of chars that can unconditionally follow an AL -# Needed in rules where stand-alone $CM s are treated as AL. -# -$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HH $HY $NS $IN $NU $PR $PO $ALPlus]; - - -# -# Rule LB 4, 5 Mandatory (Hard) breaks. -# -$LB4Breaks = [$BK $CR $LF $NL]; -$LB4NonBreaks = [^$BK $CR $LF $NL $CM]; -$CR $LF {100}; - -# -# LB 6 Do not break before hard line breaks. -# -$LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks. -$CAN_CM $CM* $LB4Breaks {100}; -^$CM+ $LB4Breaks {100}; - -# LB 7 x SP -# x ZW -$LB4NonBreaks [$SP $ZW]; -$CAN_CM $CM* [$SP $ZW]; -^$CM+ [$SP $ZW]; - -# -# LB 8 Break after zero width space -# ZW SP* ÷ -# -$LB8Breaks = [$LB4Breaks $ZW]; -$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]]; -$ZW $SP* / [^$SP $ZW $LB4Breaks]; - -# LB 8a ZWJ x Do not break Emoji ZWJ sequences. -# -$ZWJ [^$CM]; - -# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL -# $CM not covered by the above needs to behave like $AL -# See definition of $CAN_CM. - -$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules. -^$CM+; - -# -# LB 11 Do not break before or after WORD JOINER & related characters. -# -$CAN_CM $CM* $WJ; -$LB8NonBreaks $WJ; -^$CM+ $WJ; - -$WJ $CM* .; - -# -# LB 12 Do not break after NBSP and related characters. -# GL x -# -$GL $CM* .; - -# -# LB 12a Do not break before NBSP and related characters ... -# [^SP BA HY] x GL -# -[[$LB8NonBreaks] - [$SP $BA $HH $HY]] $CM* $GL; -^$CM+ $GL; - - - -# -# LB 13 Don't break before ']' or '!' or ';' or '/', even after spaces. -# -$LB8NonBreaks $CL; -$CAN_CM $CM* $CL; -^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL - -$LB8NonBreaks $CP; -$CAN_CM $CM* $CP; -^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL - -$LB8NonBreaks $EX; -$CAN_CM $CM* $EX; -^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL - -$LB8NonBreaks $IS; -$CAN_CM $CM* $IS; -^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL - -$LB8NonBreaks $SY; -$CAN_CM $CM* $SY; -^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL - - -# -# LB 14 Do not break after OP, even after spaces -# -$OP $CM* $SP* .; - -$OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL - # by rule 8, CM following a SP is stand-alone. - -# LB 15 -$QU $CM* $SP* $OP; - -# LB 16 -($CL | $CP) $CM* $SP* $NS; - -# LB 17 -$B2 $CM* $SP* $B2; - -# -# LB 18 Break after spaces. -# -$LB18NonBreaks = [$LB8NonBreaks - [$SP]]; -$LB18Breaks = [$LB8Breaks $SP]; - - -# LB 19 -# x QU -$LB18NonBreaks $CM* $QU; -^$CM+ $QU; - -# QU x -$QU $CM* .; - - -# LB 20 -# $CB -# $CB - -$LB20NonBreaks = [$LB18NonBreaks - $CB]; - -# LB 20.09 added rule for Finnish tailoring -# LB 21 x (BA | HY | NS) -# BB x -# -$LB20NonBreaks $CM* ($BA | $HH | $HY | $NS) $CM* / $AL; -$LB20NonBreaks $CM* ($BA | $HH | $HY | $NS); -($HY | $HH) $AL; -^$CM+ ($BA | $HY | $HH | $NS); - -$BB $CM* [^$CB]; # $BB x -$BB $CM* $LB20NonBreaks; - -# LB 21a Don't break after Hebrew + Hyphen -# HL (HY | BA) x -# -$HL $CM* ($HY | $BA | $HH) $CM* [^$CB]?; - -# LB 21b (forward) Don't break between SY and HL -# (break between HL and SY already disallowed by LB 13 above) -$SY $CM* $HL; - -# LB 22 -($ALPlus | $HL) $CM* $IN; -^$CM+ $IN; # by rule 10, any otherwise unattached CM behaves as AL -$EX $CM* $IN; -($ID | $EB | $EM) $CM* $IN; -$IN $CM* $IN; -$NU $CM* $IN; - - -# $LB 23 -# -($ALPlus | $HL) $CM* $NU; -^$CM+ $NU; # Rule 10, any otherwise unattached CM behaves as AL -$NU $CM* ($ALPlus | $HL); - -# LB 23a -# -$PR $CM* ($ID | $EB | $EM); -($ID | $EB | $EM) $CM* $PO; - - -# -# LB 24 -# -($PR | $PO) $CM* ($ALPlus | $HL); -($ALPlus | $HL) $CM* ($PR | $PO); -^$CM+ ($PR | $PO); # Rule 10, any otherwise unattached CM behaves as AL - -# -# LB 25 Numbers. -# -(($PR | $PO) $CM*)? (($OP | $HY) $CM*)? $NU ($CM* ($NU | $SY | $IS))* - ($CM* ($CL | $CP))? ($CM* ($PR | $PO))?; - -# LB 26 Do not break a Korean syllable -# -$JL $CM* ($JL | $JV | $H2 | $H3); -($JV | $H2) $CM* ($JV | $JT); -($JT | $H3) $CM* $JT; - -# LB 27 Treat korean Syllable Block the same as ID (don't break it) -($JL | $JV | $JT | $H2 | $H3) $CM* $IN; -($JL | $JV | $JT | $H2 | $H3) $CM* $PO; -$PR $CM* ($JL | $JV | $JT | $H2 | $H3); - - -# LB 28 Do not break between alphabetics -# -($ALPlus | $HL) $CM* ($ALPlus | $HL); -^$CM+ ($ALPlus | $HL); # The $CM+ is from rule 10, an unattached CM is treated as AL - -# LB 29 -$IS $CM* ($ALPlus | $HL); - -# LB 30 -($ALPlus | $HL | $NU) $CM* $OP; -^$CM+ $OP; # The $CM+ is from rule 10, an unattached CM is treated as AL. -$CP $CM* ($ALPlus | $HL | $NU); - -# LB 30a Do not break between regional indicators. Break after pairs of them. -# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM. -$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]]; -$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]]; -$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}]; -# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?' -# because of the chain-out behavior difference. The rule must chain out only from the [set characters], -# not from the preceding $RI or $CM, which it would be able to do if the set were optional. - -# LB 30b Do not break between an Emoji Base and an Emoji Modifier -$EB $CM* $EM; - -# LB 31 Break everywhere else. -# Match a single code point if no other rule applies. -.; diff --git a/icu4c/source/test/cintltst/cbiapts.c b/icu4c/source/test/cintltst/cbiapts.c index fa78413be49..92e6c1af4fd 100644 --- a/icu4c/source/test/cintltst/cbiapts.c +++ b/icu4c/source/test/cintltst/cbiapts.c @@ -784,15 +784,18 @@ static const int32_t heTestOffs_heFwd[] = { 1, 5, 7, 9, 12, 14, 19, /*static const int32_t heTestOffs_enRev[] = { 22, 19, 17, 14, 12, 9, 7, 5, 1, 0 };*/ static const int32_t heTestOffs_heRev[] = { 19, 14, 12, 9, 7, 5, 1, 0 }; -/* Finnish line break tailoring, for cldrbug 3029 */ +/* Finnish line break tailoring, for cldrbug 3029. + * As of ICU 63, Finnish tailoring moved to root, Finnish and English should be the same. */ static const UChar fiTest[] = { /* 00 */ 0x0020, 0x002D, 0x0031, 0x0032, 0x0020, /* 05 */ 0x0061, 0x002D, 0x006B, 0x0020, /* 09 */ 0x0061, 0x0300, 0x2010, 0x006B, 0x0020, /* 14 */ 0x0061, 0x0020, 0x002D, 0x006B, 0x0020, /* 19 */ 0x0061, 0x0300, 0x0020, 0x2010, 0x006B, 0x0020, 0 }; -static const int32_t fiTestOffs_enFwd[] = { 1, 5, 7, 9, 12, 14, 16, 17, 19, 22, 23, 25 }; +//static const int32_t fiTestOffs_enFwd[] = { 1, 5, 7, 9, 12, 14, 16, 17, 19, 22, 23, 25 }; +static const int32_t fiTestOffs_enFwd[] = { 1, 5, 7, 9, 12, 14, 16, 19, 22, 25 }; static const int32_t fiTestOffs_fiFwd[] = { 1, 5, 7, 9, 12, 14, 16, 19, 22, 25 }; -static const int32_t fiTestOffs_enRev[] = { 23, 22, 19, 17, 16, 14, 12, 9, 7, 5, 1, 0 }; +//static const int32_t fiTestOffs_enRev[] = { 23, 22, 19, 17, 16, 14, 12, 9, 7, 5, 1, 0 }; +static const int32_t fiTestOffs_enRev[] = { 22, 19, 16, 14, 12, 9, 7, 5, 1, 0 }; static const int32_t fiTestOffs_fiRev[] = { 22, 19, 16, 14, 12, 9, 7, 5, 1, 0 }; /* Khmer dictionary-based work break, for ICU ticket #8329 */ diff --git a/icu4c/source/test/intltest/rbbimonkeytest.cpp b/icu4c/source/test/intltest/rbbimonkeytest.cpp index f4fb7016dc3..2a45ae6fad2 100644 --- a/icu4c/source/test/intltest/rbbimonkeytest.cpp +++ b/icu4c/source/test/intltest/rbbimonkeytest.cpp @@ -184,6 +184,14 @@ void BreakRules::addRule(const UnicodeString &name, const UnicodeString &definit } fSetRefsMatcher->appendTail(thisRule->fExpandedRule); + // If rule begins with a '^' rule chaining is disallowed. + // Strip off the '^' from the rule expression, and set the flag. + if (thisRule->fExpandedRule.charAt(0) == u'^') { + thisRule->fInitialMatchOnly = true; + thisRule->fExpandedRule.remove(0, 1); + thisRule->fExpandedRule.trim(); + } + // Replace the divide sign (\u00f7) with a regular expression named capture. // When running the rules, a match that includes this group means we found a break position. @@ -442,6 +450,8 @@ void MonkeyTestData::set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode // ICU always reports a break there. // The reference rules do not have a means to do so. int32_t strIdx = 0; + bool initialMatch = true; // True at start of text, and immediately after each boundary, + // for control over rule chaining. while (strIdx < fString.length()) { BreakRule *matchingRule = NULL; UBool hasBreak = FALSE; @@ -451,6 +461,10 @@ void MonkeyTestData::set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode int32_t breakGroup = 0; for (ruleNum=0; ruleNumfBreakRules.size(); ruleNum++) { BreakRule *rule = static_cast(rules->fBreakRules.elementAt(ruleNum)); + if (rule->fInitialMatchOnly && !initialMatch) { + // Skip checking this '^' rule. (No rule chaining) + continue; + } rule->fRuleMatcher->reset(); if (rule->fRuleMatcher->lookingAt(strIdx, status)) { // A candidate rule match, check further to see if we take it or continue to check other rules. @@ -512,10 +526,12 @@ void MonkeyTestData::set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode // which may differ from end of the match. The matching rule may have included // context following the boundary that needs to be looked at again. strIdx = matchingRule->fRuleMatcher->end(breakGroup, status); + initialMatch = true; } else { // Original rule didn't specify a break. // Continue applying rules starting on the last code point of this match. strIdx = fString.moveIndex32(matchEnd, -1); + initialMatch = false; if (strIdx == matchStart) { // Match was only one code point, no progress if we continue. // Shouldn't get here, case is filtered out at top of loop. diff --git a/icu4c/source/test/intltest/rbbimonkeytest.h b/icu4c/source/test/intltest/rbbimonkeytest.h index 2ddc2bd47ae..54d23fcceae 100644 --- a/icu4c/source/test/intltest/rbbimonkeytest.h +++ b/icu4c/source/test/intltest/rbbimonkeytest.h @@ -102,6 +102,7 @@ class BreakRule: public UObject { UnicodeString fRule; // Rule expression, excluding the name, as written in user source. UnicodeString fExpandedRule; // Rule expression after expanding the set definitions. LocalPointer fRuleMatcher; // Regular expression that matches the rule. + bool fInitialMatchOnly = false; // True if rule begins with '^', meaning no chaining. }; diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index acf6a57779c..048e2bb8ed5 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -1284,7 +1284,8 @@ void RBBITest::TestUnicodeFiles() { // Check for test cases from the Unicode test data files that are known to fail // and should be skipped as known issues because ICU does not fully implement -// the Unicode specifications. +// the Unicode specifications, or because ICU includes tailorings that differ from +// the Unicode standard. // // Test cases are identified by the test data sequence, which tends to be more stable // across Unicode versions than the test file line numbers. @@ -1297,7 +1298,18 @@ UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char * const char *fFileName; const UChar *fString; } badTestCases[] = { - {"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"} // Fake example, for illustration. + {"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"}, // Fake example, for illustration. + // Issue 8151, move the Finnish tailoring of the line break of hyphens to root. + // This probably ultimately wants to be resolved by updating UAX-14, but in the mean time + // ICU is out of sync with Unicode. + {"8151", "LineBreakTest.txt", u"-#"}, + {"8151", "LineBreakTest.txt", u"\u002d\u0308\u0023"}, + {"8151", "LineBreakTest.txt", u"\u002d\u00a7"}, + {"8151", "LineBreakTest.txt", u"\u002d\u0308\u00a7"}, + {"8151", "LineBreakTest.txt", u"\u002d\U00050005"}, + {"8151", "LineBreakTest.txt", u"\u002d\u0308\U00050005"}, + {"8151", "LineBreakTest.txt", u"\u002d\u0e01"}, + {"8151", "LineBreakTest.txt", u"\u002d\u0308\u0e01"}, }; for (int n=0; naddAll(*fSG); // Default behavior for SG is identical to AL. fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS. - fCM->addAll(*fZWJ); // ZWJ behaves as a CM. + fCM->addAll(*fZWJ); // ZWJ behaves as a CM. + + fHH->add(u'\u2010'); // Hyphen, '‐' fSets->addElement(fBK, status); fSets->addElement(fCR, status); @@ -3024,6 +3040,15 @@ int32_t RBBILineMonkey::next(int32_t startPos) { break; } + // LB 20.09 Don't break between Hyphens and letters if a break precedes the hyphen. + // Formerly this was a Finnish tailoring. + // Moved to root in ICU 63. This is an ICU customization, not in UAX-14. + // ^($HY | $HH) $AL; + if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar)) && + prevPosX2 == -1) { + continue; + } + // LB 21 if (fBA->contains(thisChar) || fHY->contains(thisChar) || @@ -3195,6 +3220,7 @@ RBBILineMonkey::~RBBILineMonkey() { delete fB2; delete fBA; delete fBB; + delete fHH; delete fHY; delete fH2; delete fH3; diff --git a/icu4c/source/test/testdata/break_rules/line.txt b/icu4c/source/test/testdata/break_rules/line.txt index d0f9abe88a1..3e0324bf93d 100644 --- a/icu4c/source/test/testdata/break_rules/line.txt +++ b/icu4c/source/test/testdata/break_rules/line.txt @@ -19,6 +19,7 @@ locale = en; AI = [:LineBreak = Ambiguous:]; AL = [:LineBreak = Alphabetic:]; BA = [:LineBreak = Break_After:]; +HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. BB = [:LineBreak = Break_Before:]; BK = [:LineBreak = Mandatory_Break:]; B2 = [:LineBreak = Break_Both:]; @@ -144,6 +145,9 @@ LB20.2: . CM* ÷ CB; LB20.3: CB CM* ZWJ [^CM]; LB20.4: CB CM* ÷; +# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen. +LB20.09: ^(HY | HH) CM* AL; + # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then # not picking up the continuing match after the BA from 21a. LB21a: HL CM* (HY | BA) CM* [^CM CB]; diff --git a/icu4c/source/test/testdata/break_rules/line_loose.txt b/icu4c/source/test/testdata/break_rules/line_loose.txt index 2384fa296c8..8395192365f 100644 --- a/icu4c/source/test/testdata/break_rules/line_loose.txt +++ b/icu4c/source/test/testdata/break_rules/line_loose.txt @@ -26,6 +26,7 @@ locale = en@lb=loose; AI = [:LineBreak = Ambiguous:]; AL = [:LineBreak = Alphabetic:]; BA = [:LineBreak = Break_After:]; +HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. BB = [:LineBreak = Break_Before:]; BK = [:LineBreak = Mandatory_Break:]; B2 = [:LineBreak = Break_Both:]; @@ -125,7 +126,7 @@ LB12: GL CM* [^CM]; LB12a: [^SP BA HY] CM* GL; -# LB 13 ICU Tailoring, matches tailoring exmaple 8 from UAX 14. +# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14. # # LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule. # LB13.2 SP CM* [CL CP EX IS SY] @@ -152,6 +153,9 @@ LB20.2: . CM* ÷ CB; LB20.3: CB CM* ZWJ [^CM]; LB20.4: CB CM* ÷; +# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen. +LB20.09: ^(HY | HH) CM* AL; + # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then # not picking up the continuing match after the BA from 21a. LB21a: HL CM* (HY | BA) CM* [^CM CB]; @@ -176,7 +180,7 @@ LB23a.2: (ID | EB | EM) CM* PO; LB24.2: (PR | PO) CM* (AL | HL); LB24.3: (AL | HL | CM) CM* (PR | PO); -# Numbers. Equivalent to Tailoring example 8 from UAx 14. +# Numbers. Equivalent to Tailoring example 8 from UAX 14. LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?; LB26.1: JL CM* (JL | JV | H2 | H3); diff --git a/icu4c/source/test/testdata/break_rules/line_loose_cj.txt b/icu4c/source/test/testdata/break_rules/line_loose_cj.txt index 8b92561dbd8..d674327102b 100644 --- a/icu4c/source/test/testdata/break_rules/line_loose_cj.txt +++ b/icu4c/source/test/testdata/break_rules/line_loose_cj.txt @@ -37,9 +37,10 @@ locale = ja@lb=loose; AI = [:LineBreak = Ambiguous:]; -AL = [[:LineBreak = Alphabetic:]]; +AL = [:LineBreak = Alphabetic:]; BAX = [\u2010 \u2013]; BA = [[:LineBreak = Break_After:] - BAX]; +HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. BB = [:LineBreak = Break_Before:]; BK = [:LineBreak = Mandatory_Break:]; B2 = [:LineBreak = Break_Both:]; @@ -169,6 +170,9 @@ LB20.2: . CM* ÷ CB; LB20.3: CB CM* ZWJ [^CM]; LB20.4: CB CM* ÷; +# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen. +LB20.09: ^(HY | HH) CM* AL; + # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then # not picking up the continuing match after the BA from 21a. # LB 21a Don't break after Hebrew + Hyphen diff --git a/icu4c/source/test/testdata/break_rules/line_normal.txt b/icu4c/source/test/testdata/break_rules/line_normal.txt index 65804d83f9b..7f5b91c42ab 100644 --- a/icu4c/source/test/testdata/break_rules/line_normal.txt +++ b/icu4c/source/test/testdata/break_rules/line_normal.txt @@ -33,6 +33,7 @@ locale = en@lb=normal; AI = [:LineBreak = Ambiguous:]; AL = [:LineBreak = Alphabetic:]; BA = [:LineBreak = Break_After:]; +HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. BB = [:LineBreak = Break_Before:]; BK = [:LineBreak = Mandatory_Break:]; B2 = [:LineBreak = Break_Both:]; @@ -158,6 +159,9 @@ LB20.2: . CM* ÷ CB; LB20.3: CB CM* ZWJ [^CM]; LB20.4: CB CM* ÷; +# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen. +LB20.09: ^(HY | HH) CM* AL; + # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then # not picking up the continuing match after the BA from 21a. LB21a: HL CM* (HY | BA) CM* [^CM CB]; diff --git a/icu4c/source/test/testdata/break_rules/line_normal_cj.txt b/icu4c/source/test/testdata/break_rules/line_normal_cj.txt index b50219282b2..cf90751715c 100644 --- a/icu4c/source/test/testdata/break_rules/line_normal_cj.txt +++ b/icu4c/source/test/testdata/break_rules/line_normal_cj.txt @@ -34,6 +34,7 @@ AI = [:LineBreak = Ambiguous:]; AL = [:LineBreak = Alphabetic:]; BAX = [\u2010 \u2013]; BA = [[:LineBreak = Break_After:] - BAX]; +HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. BB = [:LineBreak = Break_Before:]; BK = [:LineBreak = Mandatory_Break:]; B2 = [:LineBreak = Break_Both:]; @@ -163,6 +164,9 @@ LB20.2: . CM* ÷ CB; LB20.3: CB CM* ZWJ [^CM]; LB20.4: CB CM* ÷; +# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen. +LB20.09: ^(HY | HH) CM* AL; + # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then # not picking up the continuing match after the BA from 21a. # TODO: For CJ tailorings (with BAX) does this rule want to include BAX? If so, diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt index c830de15545..63ba172233d 100644 --- a/icu4c/source/test/testdata/rbbitst.txt +++ b/icu4c/source/test/testdata/rbbitst.txt @@ -1086,23 +1086,21 @@ Bangkok)• # Finnish line breaking # # These rules deal with hyphens when there is a space on the leading side. -# There should be a break opportunity between the space and the hyphen, and not after the hyphen. +# When followed by a letter, there should be a break opportunity between +# the space and the hyphen, and not after the hyphen. # See CLDR ticket 3029. # See ICU ticket 8151 +# As of ICU 63, the Finnish tailoring behavior is moved to root. -•abc •- •def •abc •-•def •abc- •def •abc-•def• # With ASCII hyphen -•abc •‐ •def •abc •‐•def •abc‐ •def •abc‐•def• # With Unicode u2010 hyphen +•abc •- •def •abc •-def •abc- •def •abc-•def• # With ASCII hyphen +•abc •‐ •def •abc •‐def •abc‐ •def •abc‐•def• # With Unicode u2010 hyphen -# TODO: problems with Finnish line break rules cause these two lines to fail. -#•abc •- •def •abc •-def •abc- •def •abc-•def• # With ASCII hyphen -#•abc •‐ •def •abc •‐def •abc‐ •def •abc‐•def• # With Unicode u2010 hyphen - -•abc •- •def •abc •-def •abc- •def • # With ASCII hyphen -•abc •‐ •def •abc •‐def •abc‐ •def • # With Unicode u2010 hyphen +•abc •- •def •abc •-def •abc- •def •abc-•def• # With ASCII hyphen +•abc •‐ •def •abc •‐def •abc‐ •def •abc‐•def• # With Unicode u2010 hyphen # Test for #10176 (in fi) diff --git a/icu4j/main/shared/data/icudata.jar b/icu4j/main/shared/data/icudata.jar index 7ae8a0ffc2b..198adf2fc9e 100644 --- a/icu4j/main/shared/data/icudata.jar +++ b/icu4j/main/shared/data/icudata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:956147318ffa776ff18c71ab09c5ae63e336e14e240128c8602abf07ef7d7d3f -size 12510547 +oid sha256:36d0ec0c543d1dccafcc6985a7c18285b255afb98bc2bdb16a867a22600bfddb +size 12487287 diff --git a/icu4j/main/shared/data/icutzdata.jar b/icu4j/main/shared/data/icutzdata.jar index 67e57e3ad38..afa08eb95e7 100755 --- a/icu4j/main/shared/data/icutzdata.jar +++ b/icu4j/main/shared/data/icutzdata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:55923dda88f8bf3affc2cf6d774a92a49e5fbc4be5583769bfe90fc7f319d2b1 +oid sha256:469f76e391dced8e9ae4a9543513dddd6d4d2026ad6cbc0ab79d9553da803e6a size 92857 diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBIMonkeyTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBIMonkeyTest.java index 3637b688500..88af30de700 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBIMonkeyTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBIMonkeyTest.java @@ -60,10 +60,11 @@ public class RBBIMonkeyTest extends TestFmwk { // is compiled to a regular expression. static class BreakRule { - String fName; // Name of the rule. - String fRule; // Rule expression, excluding the name, as written in user source. - String fExpandedRule; // Rule expression after expanding the set definitions. - Matcher fRuleMatcher; // Regular expression that matches the rule. + String fName; // Name of the rule. + String fRule; // Rule expression, excluding the name, as written in user source. + String fExpandedRule; // Rule expression after expanding the set definitions. + Matcher fRuleMatcher; // Regular expression that matches the rule. + boolean fInitialMatchOnly = false; // True if rule begins with '^', meaning no chaining. }; @@ -220,6 +221,14 @@ public class RBBIMonkeyTest extends TestFmwk { } fPropertyMatcher.appendTail(expandedRule); + // If rule begins with a '^' rule chaining is disallowed. + // Strip off the '^' from the rule expression, and set the flag. + if (expandedRule.charAt(0) == '^') { + thisRule.fInitialMatchOnly = true; + expandedRule.deleteCharAt(0); + expandedRule = new StringBuffer(expandedRule.toString().trim()); + } + // Replace any [^negated sets] with equivalent flattened sets generated by // ICU UnicodeSet. [^ ...] in Java Regex character classes does not apply // to any nested classes. Variable substitution in rules produces @@ -549,6 +558,9 @@ public class RBBIMonkeyTest extends TestFmwk { // ICU always reports a break there. // The reference rules do not have a means to do so. int strIdx = 0; + boolean initialMatch = true; // True at start of text, and immediately after each boundary, + // // for control over rule chaining. + while (strIdx < fString.length()) { BreakRule matchingRule = null; boolean hasBreak = false; @@ -557,6 +569,10 @@ public class RBBIMonkeyTest extends TestFmwk { int matchEnd = 0; for (ruleNum=0; ruleNum UnicodeSet.MAX_VALUE ? @@ -684,6 +686,7 @@ public class RBBITestMonkey extends TestFmwk { fB2 = new XUnicodeSet("[\\p{Line_break=B2}]"); fBA = new XUnicodeSet("[\\p{Line_break=BA}]"); fBB = new XUnicodeSet("[\\p{Line_break=BB}]"); + fHH = new XUnicodeSet(); fHY = new XUnicodeSet("[\\p{Line_break=HY}]"); fCB = new XUnicodeSet("[\\p{Line_break=CB}]"); fCL = new XUnicodeSet("[\\p{Line_break=CL}]"); @@ -728,6 +731,8 @@ public class RBBITestMonkey extends TestFmwk { fNS.addAll(fCJ); // Default behavior for CJ is identical to NS. fCM.addAll(fZWJ); // ZWJ behaves as a CM. + fHH.add('\u2010'); // Hyphen, '‐' + fSets.add(fBK); fSets.add(fCR); fSets.add(fLF); @@ -786,12 +791,14 @@ public class RBBITestMonkey extends TestFmwk { int prevPos; // Index of the char preceding a potential break position int prevChar; // Character at above position. Note that prevChar - // and thisChar may not be adjacent because combining - // characters between them will be ignored. - int prevCharX2; // Character before prevChar, more contex for LB 21a + // // and thisChar may not be adjacent because combining + // // characters between them will be ignored. + + int prevPosX2; + int prevCharX2; // Character before prevChar, more context for LB 21a int nextPos; // Index of the next character following pos. - // Usually skips over combining marks. + // // Usually skips over combining marks. int tPos; // temp value. int matchVals[] = null; // Number Expression Match Results @@ -804,8 +811,8 @@ public class RBBITestMonkey extends TestFmwk { // Initial values for loop. Loop will run the first time without finding breaks, // while the invalid values shift out and the "this" and // "prev" positions are filled in with good values. - pos = prevPos = -1; // Invalid value, serves as flag for initial loop iteration. - thisChar = prevChar = prevCharX2 = 0; + pos = prevPos = prevPosX2 = -1; // Invalid value, serves as flag for initial loop iteration. + thisChar = prevChar = prevCharX2 = 0; nextPos = startPos; @@ -816,6 +823,7 @@ public class RBBITestMonkey extends TestFmwk { // "prevPos" can be arbitrarily far before "pos". for (;;) { // Advance to the next position to be tested. + prevPosX2 = prevPos; prevCharX2 = prevChar; prevPos = pos; prevChar = thisChar; @@ -1066,6 +1074,15 @@ public class RBBITestMonkey extends TestFmwk { break; } + // LB 20.09 Don't break between Hyphens and letters if a break precedes the hyphen. + // Formerly this was a Finnish tailoring. + // Moved to root in ICU 63. This is an ICU customization, not in UAX-14. + // ^($HY | $HH) $AL; + if (fAL.contains(thisChar) && (fHY.contains(prevChar) || fHH.contains(prevChar)) && + prevPosX2 == -1) { + continue; + } + // LB 21 if (fBA.contains(thisChar) || fHY.contains(thisChar) || diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line.txt index d0f9abe88a1..3e0324bf93d 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line.txt @@ -19,6 +19,7 @@ locale = en; AI = [:LineBreak = Ambiguous:]; AL = [:LineBreak = Alphabetic:]; BA = [:LineBreak = Break_After:]; +HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. BB = [:LineBreak = Break_Before:]; BK = [:LineBreak = Mandatory_Break:]; B2 = [:LineBreak = Break_Both:]; @@ -144,6 +145,9 @@ LB20.2: . CM* ÷ CB; LB20.3: CB CM* ZWJ [^CM]; LB20.4: CB CM* ÷; +# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen. +LB20.09: ^(HY | HH) CM* AL; + # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then # not picking up the continuing match after the BA from 21a. LB21a: HL CM* (HY | BA) CM* [^CM CB]; diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt index 2384fa296c8..8395192365f 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt @@ -26,6 +26,7 @@ locale = en@lb=loose; AI = [:LineBreak = Ambiguous:]; AL = [:LineBreak = Alphabetic:]; BA = [:LineBreak = Break_After:]; +HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. BB = [:LineBreak = Break_Before:]; BK = [:LineBreak = Mandatory_Break:]; B2 = [:LineBreak = Break_Both:]; @@ -125,7 +126,7 @@ LB12: GL CM* [^CM]; LB12a: [^SP BA HY] CM* GL; -# LB 13 ICU Tailoring, matches tailoring exmaple 8 from UAX 14. +# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14. # # LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule. # LB13.2 SP CM* [CL CP EX IS SY] @@ -152,6 +153,9 @@ LB20.2: . CM* ÷ CB; LB20.3: CB CM* ZWJ [^CM]; LB20.4: CB CM* ÷; +# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen. +LB20.09: ^(HY | HH) CM* AL; + # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then # not picking up the continuing match after the BA from 21a. LB21a: HL CM* (HY | BA) CM* [^CM CB]; @@ -176,7 +180,7 @@ LB23a.2: (ID | EB | EM) CM* PO; LB24.2: (PR | PO) CM* (AL | HL); LB24.3: (AL | HL | CM) CM* (PR | PO); -# Numbers. Equivalent to Tailoring example 8 from UAx 14. +# Numbers. Equivalent to Tailoring example 8 from UAX 14. LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?; LB26.1: JL CM* (JL | JV | H2 | H3); diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt index 8b92561dbd8..d674327102b 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt @@ -37,9 +37,10 @@ locale = ja@lb=loose; AI = [:LineBreak = Ambiguous:]; -AL = [[:LineBreak = Alphabetic:]]; +AL = [:LineBreak = Alphabetic:]; BAX = [\u2010 \u2013]; BA = [[:LineBreak = Break_After:] - BAX]; +HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. BB = [:LineBreak = Break_Before:]; BK = [:LineBreak = Mandatory_Break:]; B2 = [:LineBreak = Break_Both:]; @@ -169,6 +170,9 @@ LB20.2: . CM* ÷ CB; LB20.3: CB CM* ZWJ [^CM]; LB20.4: CB CM* ÷; +# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen. +LB20.09: ^(HY | HH) CM* AL; + # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then # not picking up the continuing match after the BA from 21a. # LB 21a Don't break after Hebrew + Hyphen diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt index 65804d83f9b..7f5b91c42ab 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt @@ -33,6 +33,7 @@ locale = en@lb=normal; AI = [:LineBreak = Ambiguous:]; AL = [:LineBreak = Alphabetic:]; BA = [:LineBreak = Break_After:]; +HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. BB = [:LineBreak = Break_Before:]; BK = [:LineBreak = Mandatory_Break:]; B2 = [:LineBreak = Break_Both:]; @@ -158,6 +159,9 @@ LB20.2: . CM* ÷ CB; LB20.3: CB CM* ZWJ [^CM]; LB20.4: CB CM* ÷; +# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen. +LB20.09: ^(HY | HH) CM* AL; + # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then # not picking up the continuing match after the BA from 21a. LB21a: HL CM* (HY | BA) CM* [^CM CB]; diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt index b50219282b2..cf90751715c 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt @@ -34,6 +34,7 @@ AI = [:LineBreak = Ambiguous:]; AL = [:LineBreak = Alphabetic:]; BAX = [\u2010 \u2013]; BA = [[:LineBreak = Break_After:] - BAX]; +HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. BB = [:LineBreak = Break_Before:]; BK = [:LineBreak = Mandatory_Break:]; B2 = [:LineBreak = Break_Both:]; @@ -163,6 +164,9 @@ LB20.2: . CM* ÷ CB; LB20.3: CB CM* ZWJ [^CM]; LB20.4: CB CM* ÷; +# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen. +LB20.09: ^(HY | HH) CM* AL; + # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then # not picking up the continuing match after the BA from 21a. # TODO: For CJ tailorings (with BAX) does this rule want to include BAX? If so, diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt index c830de15545..63ba172233d 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt @@ -1086,23 +1086,21 @@ Bangkok)• # Finnish line breaking # # These rules deal with hyphens when there is a space on the leading side. -# There should be a break opportunity between the space and the hyphen, and not after the hyphen. +# When followed by a letter, there should be a break opportunity between +# the space and the hyphen, and not after the hyphen. # See CLDR ticket 3029. # See ICU ticket 8151 +# As of ICU 63, the Finnish tailoring behavior is moved to root. -•abc •- •def •abc •-•def •abc- •def •abc-•def• # With ASCII hyphen -•abc •‐ •def •abc •‐•def •abc‐ •def •abc‐•def• # With Unicode u2010 hyphen +•abc •- •def •abc •-def •abc- •def •abc-•def• # With ASCII hyphen +•abc •‐ •def •abc •‐def •abc‐ •def •abc‐•def• # With Unicode u2010 hyphen -# TODO: problems with Finnish line break rules cause these two lines to fail. -#•abc •- •def •abc •-def •abc- •def •abc-•def• # With ASCII hyphen -#•abc •‐ •def •abc •‐def •abc‐ •def •abc‐•def• # With Unicode u2010 hyphen - -•abc •- •def •abc •-def •abc- •def • # With ASCII hyphen -•abc •‐ •def •abc •‐def •abc‐ •def • # With Unicode u2010 hyphen +•abc •- •def •abc •-def •abc- •def •abc-•def• # With ASCII hyphen +•abc •‐ •def •abc •‐def •abc‐ •def •abc‐•def• # With Unicode u2010 hyphen # Test for #10176 (in fi)