From e220fe9dd65ca58ae2a3bbb38ded46fba01d38fb Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Tue, 26 Sep 2017 20:16:03 +0000 Subject: [PATCH] ICU-9954 rbbi forward iteration performance improvement. Ensure rules always make some progress; keep engine out of the fail-safe fall back path for rules that do not advance. X-SVN-Rev: 40461 --- icu4c/source/data/brkitr/rules/char.txt | 3 +++ icu4c/source/data/brkitr/rules/line.txt | 3 +++ icu4c/source/data/brkitr/rules/line_fi.txt | 4 ++++ icu4c/source/data/brkitr/rules/line_loose.txt | 3 +++ icu4c/source/data/brkitr/rules/line_loose_cj.txt | 4 ++++ icu4c/source/data/brkitr/rules/line_loose_fi.txt | 3 +++ icu4c/source/data/brkitr/rules/line_normal.txt | 4 ++++ icu4c/source/data/brkitr/rules/line_normal_cj.txt | 4 ++++ icu4c/source/data/brkitr/rules/line_normal_fi.txt | 4 ++++ icu4c/source/data/brkitr/rules/word.txt | 4 ++++ icu4c/source/data/brkitr/rules/word_POSIX.txt | 4 ++++ 11 files changed, 40 insertions(+) diff --git a/icu4c/source/data/brkitr/rules/char.txt b/icu4c/source/data/brkitr/rules/char.txt index ad070638aa3..1bfdef637a9 100644 --- a/icu4c/source/data/brkitr/rules/char.txt +++ b/icu4c/source/data/brkitr/rules/char.txt @@ -78,6 +78,9 @@ $Prepend [^$Control $CR $LF]; ^$Prepend* $Regional_Indicator $Regional_Indicator / $Regional_Indicator; ^$Prepend* $Regional_Indicator $Regional_Indicator; +# GB 999 Match a single code point if no other rule applies. +.; + ## ------------------------------------------------- !!safe_reverse; diff --git a/icu4c/source/data/brkitr/rules/line.txt b/icu4c/source/data/brkitr/rules/line.txt index 902ca8bfea6..756b19d74f6 100644 --- a/icu4c/source/data/brkitr/rules/line.txt +++ b/icu4c/source/data/brkitr/rules/line.txt @@ -335,6 +335,9 @@ $RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK); # LB 30b Do not break between an Emoji Base and an Emoji Modifier $EB $CM* $EM; +# LB 31 Break everywhere else. +# Match a single code point if no other rule applies. +.; ## ------------------------------------------------- diff --git a/icu4c/source/data/brkitr/rules/line_fi.txt b/icu4c/source/data/brkitr/rules/line_fi.txt index 73cc9edbd01..5d0f666b830 100644 --- a/icu4c/source/data/brkitr/rules/line_fi.txt +++ b/icu4c/source/data/brkitr/rules/line_fi.txt @@ -344,6 +344,10 @@ $RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK); # LB 30b Do not break between an Emoji Base and an Emoji Modifier $EB $CM* $EM; +# LB 31 Break everywhere else. +# Match a single code point if no other rule applies. +.; + ## ------------------------------------------------- !!safe_reverse; diff --git a/icu4c/source/data/brkitr/rules/line_loose.txt b/icu4c/source/data/brkitr/rules/line_loose.txt index 0e39e9cc3ff..ba18f1cd2c4 100644 --- a/icu4c/source/data/brkitr/rules/line_loose.txt +++ b/icu4c/source/data/brkitr/rules/line_loose.txt @@ -347,6 +347,9 @@ $RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK); # LB 30b Do not break between an Emoji Base and an Emoji Modifier $EB $CM* $EM; +# LB 31 Break everywhere else. +# Match a single code point if no other rule applies. +.; ## ------------------------------------------------- diff --git a/icu4c/source/data/brkitr/rules/line_loose_cj.txt b/icu4c/source/data/brkitr/rules/line_loose_cj.txt index 4b452de1f5b..48791c4d5eb 100644 --- a/icu4c/source/data/brkitr/rules/line_loose_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_loose_cj.txt @@ -361,6 +361,10 @@ $RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK); # LB 30b Do not break between an Emoji Base and an Emoji Modifier $EB $CM* $EM; +# LB 31 Break everywhere else. +# Match a single code point if no other rule applies. +.; + ## ------------------------------------------------- !!safe_reverse; diff --git a/icu4c/source/data/brkitr/rules/line_loose_fi.txt b/icu4c/source/data/brkitr/rules/line_loose_fi.txt index f68057bfebf..cc09db7969d 100644 --- a/icu4c/source/data/brkitr/rules/line_loose_fi.txt +++ b/icu4c/source/data/brkitr/rules/line_loose_fi.txt @@ -346,6 +346,9 @@ $RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK); # LB 30b Do not break between an Emoji Base and an Emoji Modifier $EB $CM* $EM; +# LB 31 Break everywhere else. +# Match a single code point if no other rule applies. +.; ## ------------------------------------------------- diff --git a/icu4c/source/data/brkitr/rules/line_normal.txt b/icu4c/source/data/brkitr/rules/line_normal.txt index 594cb165475..bc417cf567f 100644 --- a/icu4c/source/data/brkitr/rules/line_normal.txt +++ b/icu4c/source/data/brkitr/rules/line_normal.txt @@ -339,6 +339,10 @@ $RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK); # LB 30b Do not break between an Emoji Base and an Emoji Modifier $EB $CM* $EM; +# LB 31 Break everywhere else. +# Match a single code point if no other rule applies. +.; + ## ------------------------------------------------- !!safe_reverse; diff --git a/icu4c/source/data/brkitr/rules/line_normal_cj.txt b/icu4c/source/data/brkitr/rules/line_normal_cj.txt index 59c706d2562..5dbcd85ad40 100644 --- a/icu4c/source/data/brkitr/rules/line_normal_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_normal_cj.txt @@ -345,6 +345,10 @@ $RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK); # LB 30b Do not break between an Emoji Base and an Emoji Modifier $EB $CM* $EM; +# LB 31 Break everywhere else. +# Match a single code point if no other rule applies. +.; + ## ------------------------------------------------- !!safe_reverse; diff --git a/icu4c/source/data/brkitr/rules/line_normal_fi.txt b/icu4c/source/data/brkitr/rules/line_normal_fi.txt index b6e10b90be4..5cd5553605f 100644 --- a/icu4c/source/data/brkitr/rules/line_normal_fi.txt +++ b/icu4c/source/data/brkitr/rules/line_normal_fi.txt @@ -342,6 +342,10 @@ $RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK); # LB 30b Do not break between an Emoji Base and an Emoji Modifier $EB $CM* $EM; +# LB 31 Break everywhere else. +# Match a single code point if no other rule applies. +.; + ## ------------------------------------------------- !!safe_reverse; diff --git a/icu4c/source/data/brkitr/rules/word.txt b/icu4c/source/data/brkitr/rules/word.txt index 7c4e9a39a64..617205debc2 100644 --- a/icu4c/source/data/brkitr/rules/word.txt +++ b/icu4c/source/data/brkitr/rules/word.txt @@ -194,6 +194,10 @@ $ExtendNumLetEx $KatakanaEx {400}; # (13b) $HangulSyllable $HangulSyllable {200}; $KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found +# Rule 999 +# Match a single code point if no other rule applies. +.; + ## ------------------------------------------------- diff --git a/icu4c/source/data/brkitr/rules/word_POSIX.txt b/icu4c/source/data/brkitr/rules/word_POSIX.txt index ec46da62915..5ea6a05ce4b 100644 --- a/icu4c/source/data/brkitr/rules/word_POSIX.txt +++ b/icu4c/source/data/brkitr/rules/word_POSIX.txt @@ -194,6 +194,10 @@ $ExtendNumLetEx $KatakanaEx {400}; # (13b) $HangulSyllable $HangulSyllable {200}; $KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found +# Rule 999 +# Match a single code point if no other rule applies. +.; + ## ------------------------------------------------- -- 2.40.0