From: Andy Heninger Date: Thu, 24 Feb 2022 00:48:24 +0000 (-0800) Subject: ICU-21592 Linebreak loose cj rules cleanup X-Git-Tag: release-71-rc~43 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=f783a84d2f86781b9dd6bf38cd99e05604fb89ce;p=icu ICU-21592 Linebreak loose cj rules cleanup This is a followup to PR #1991, Update cj normal/loose linebreak per CSS The original change to the line_loose_cj rules involved splitting hyphens out of the BA (Break After) class, allowing a break when they follow an ID. This change simplifies the rules for doing that. It also fixes a problem with the original change that had altered the behavior of BAX hyphens that followed Regional Indicators or Unattached Combining Marks. --- diff --git a/icu4c/source/data/brkitr/rules/line_loose_cj.txt b/icu4c/source/data/brkitr/rules/line_loose_cj.txt index e921a94c290..0be1e7cb0c8 100644 --- a/icu4c/source/data/brkitr/rules/line_loose_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_loose_cj.txt @@ -136,7 +136,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs # AL_FOLLOW set of chars that can unconditionally follow an AL # Needed in rules where stand-alone $CM s are treated as AL. # -$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $POX $ALPlus]; +$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $BAX $HY $NS $IN $NU $PR $PO $POX $ALPlus]; # @@ -301,7 +301,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB]; $ID $CM* ($BA | $HY | $NS); -^$CM+ ($BA | $HY | $NS); +^$CM+ ($BA | $BAX | $HY | $NS); $BB $CM* [^$CB]; # $BB x $BB $CM* $LB20NonBreaks; @@ -377,9 +377,9 @@ $CP30 $CM* ($ALPlus | $HL | $NU); # LB 30a Do not break between regional indicators. Break after pairs of them. # Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM. 
-$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]]; -$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]]; -$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $ZWJ {eof}]; +$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $BAX $HY $NS $IN $CM]]; +$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $BAX $HY $NS $IN $CM]]; +$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $BAX $HY $NS $IN $ZWJ {eof}]; # note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?' # because of the chain-out behavior difference. The rule must chain out only from the [set characters], # not from the preceding $RI or $CM, which it would be able to do if the set were optional. diff --git a/icu4c/source/data/brkitr/rules/line_loose_phrase_cj.txt b/icu4c/source/data/brkitr/rules/line_loose_phrase_cj.txt index 43d116a98c9..8b37d5acb36 100644 --- a/icu4c/source/data/brkitr/rules/line_loose_phrase_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_loose_phrase_cj.txt @@ -149,7 +149,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs # AL_FOLLOW set of chars that can unconditionally follow an AL # Needed in rules where stand-alone $CM s are treated as AL. 
# -$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $POX $ALPlus]; +$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $BAX $HY $NS $IN $NU $PR $PO $POX $ALPlus]; # @@ -314,7 +314,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB]; $ID $CM* ($BA | $HY | $NS); -^$CM+ ($BA | $HY | $NS); +^$CM+ ($BA | $BAX | $HY | $NS); $BB $CM* [^$CB]; # $BB x $BB $CM* $LB20NonBreaks; @@ -390,9 +390,9 @@ $CP30 $CM* ($ALPlus | $HL | $NU); # LB 30a Do not break between regional indicators. Break after pairs of them. # Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM. -$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]]; -$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]]; -$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $ZWJ {eof}]; +$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $BAX $HY $NS $IN $CM]]; +$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $BAX $HY $NS $IN $CM]]; +$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $BAX $HY $NS $IN $ZWJ {eof}]; # note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?' # because of the chain-out behavior difference. The rule must chain out only from the [set characters], # not from the preceding $RI or $CM, which it would be able to do if the set were optional. 
diff --git a/icu4c/source/test/testdata/break_rules/line_loose_cj.txt b/icu4c/source/test/testdata/break_rules/line_loose_cj.txt index 7d1a02570b5..9d7b2cc12c6 100644 --- a/icu4c/source/test/testdata/break_rules/line_loose_cj.txt +++ b/icu4c/source/test/testdata/break_rules/line_loose_cj.txt @@ -201,10 +201,9 @@ LB20.09: ^(HY | HH) CM* AL; LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?; -LB21.1: [^BK CR LF NL CM ZW SP CB ID] CM* [BA BAX HY NS]; +LB21.1: [^ID] CM* [BA BAX HY NS]; LB21.2: ID CM* [BA HY NS]; -LB21.3: CM+ [BA HY NS]; -LB21.4: BB CM* [^CM CB]; +LB21.3: BB CM* [^CM CB]; LB21b: SY CM* HL; @@ -239,9 +238,9 @@ LB30.1: (AL | CM | HL | NU) CM* OP30; LB30.2: CP30 CM* (AL | HL | NU); # LB30a keep pairs of RI together. -LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM]; -LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM]; -LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS IN ZWJ]?; +LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA BAX HY NS IN CM]; +LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA BAX HY NS IN CM]; +LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA BAX HY NS IN ZWJ]?; # LB30b Do not break between Emoji Base (or potential emoji) and Emoji Modifier LB30b.1: EB CM* EM; diff --git a/icu4j/main/shared/data/icudata.jar b/icu4j/main/shared/data/icudata.jar index c23c3ad9655..4f22b6b1a57 100644 --- a/icu4j/main/shared/data/icudata.jar +++ b/icu4j/main/shared/data/icudata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:35c38cf97fab980cad7969d53b4fab0c3b69fbcd5dce66bceb2dc2b0834161af -size 13873077 +oid sha256:8635d4830fc22e3f61ba2949cc9100c41b994bfb99de1d77b218e670487fc3da +size 13873033 diff --git a/icu4j/main/shared/data/icutzdata.jar b/icu4j/main/shared/data/icutzdata.jar index 16a1b92ef89..6ae6ff8d6b5 100644 --- a/icu4j/main/shared/data/icutzdata.jar +++ 
b/icu4j/main/shared/data/icutzdata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:68681037898386a47e5dd2bd9765485e012ce23f2cab83c0f698b9e6ee302c08 -size 96439 +oid sha256:a7f7990c8f99336d5fa2f8b3a42814fe2c64d5908f7b74cbba7130969de2705c +size 96435 diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt index 7d1a02570b5..9d7b2cc12c6 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt @@ -201,10 +201,9 @@ LB20.09: ^(HY | HH) CM* AL; LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?; -LB21.1: [^BK CR LF NL CM ZW SP CB ID] CM* [BA BAX HY NS]; +LB21.1: [^ID] CM* [BA BAX HY NS]; LB21.2: ID CM* [BA HY NS]; -LB21.3: CM+ [BA HY NS]; -LB21.4: BB CM* [^CM CB]; +LB21.3: BB CM* [^CM CB]; LB21b: SY CM* HL; @@ -239,9 +238,9 @@ LB30.1: (AL | CM | HL | NU) CM* OP30; LB30.2: CP30 CM* (AL | HL | NU); # LB30a keep pairs of RI together. -LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM]; -LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM]; -LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS IN ZWJ]?; +LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA BAX HY NS IN CM]; +LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA BAX HY NS IN CM]; +LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA BAX HY NS IN ZWJ]?; # LB30b Do not break between Emoji Base (or potential emoji) and Emoji Modifier LB30b.1: EB CM* EM;