From: Frank Tang Date: Wed, 1 Dec 2021 23:39:35 +0000 (+0000) Subject: ICU-21847 Move UnicodeSet to stack in constructor X-Git-Tag: cldr/2022-02-08~20 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=704415402a68be8bbc143e5c78c3102b0d40ba90;p=icu ICU-21847 Move UnicodeSet to stack in constructor See #1941 --- diff --git a/icu4c/source/common/dictbe.cpp b/icu4c/source/common/dictbe.cpp index 4d158e3226d..36a35c411a9 100644 --- a/icu4c/source/common/dictbe.cpp +++ b/icu4c/source/common/dictbe.cpp @@ -199,13 +199,13 @@ ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode { UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE); UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Thai"); - fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status); + UnicodeSet thaiWordSet(UnicodeString(u"[[:Thai:]&[:LineBreak=SA:]]"), status); if (U_SUCCESS(status)) { - setCharacters(fThaiWordSet); + setCharacters(thaiWordSet); } - fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status); + fMarkSet.applyPattern(UnicodeString(u"[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status); fMarkSet.add(0x0020); - fEndWordSet = fThaiWordSet; + fEndWordSet = thaiWordSet; fEndWordSet.remove(0x0E31); // MAI HAN-AKAT fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK @@ -441,13 +441,13 @@ LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &s { UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE); UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Laoo"); - fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status); + UnicodeSet laoWordSet(UnicodeString(u"[[:Laoo:]&[:LineBreak=SA:]]"), status); if (U_SUCCESS(status)) { - setCharacters(fLaoWordSet); + setCharacters(laoWordSet); } - fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status); + fMarkSet.applyPattern(UnicodeString(u"[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status); fMarkSet.add(0x0020); - fEndWordSet = fLaoWordSet; + fEndWordSet = laoWordSet; fEndWordSet.remove(0x0EC0, 0x0EC4); // prefix vowels fBeginWordSet.add(0x0E81, 0x0EAE); // basic consonants (including holes for corresponding Thai characters) fBeginWordSet.add(0x0EDC, 0x0EDD); // digraph consonants (no Thai equivalent) @@ -637,14 +637,13 @@ BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErro { UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE); UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Mymr"); - fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status); + fBeginWordSet.add(0x1000, 0x102A); // basic consonants and independent vowels + fEndWordSet.applyPattern(UnicodeString(u"[[:Mymr:]&[:LineBreak=SA:]]"), status); + fMarkSet.applyPattern(UnicodeString(u"[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status); + fMarkSet.add(0x0020); if (U_SUCCESS(status)) { - setCharacters(fBurmeseWordSet); + setCharacters(fEndWordSet); } - fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status); - fMarkSet.add(0x0020); - fEndWordSet = fBurmeseWordSet; - fBeginWordSet.add(0x1000, 0x102A); // basic consonants and independent vowels // Compact for caching. fMarkSet.compact(); @@ -830,13 +829,13 @@ KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCod { UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE); UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Khmr"); - fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status); + UnicodeSet khmerWordSet(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]]"), status); if (U_SUCCESS(status)) { - setCharacters(fKhmerWordSet); + setCharacters(khmerWordSet); } - fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status); + fMarkSet.applyPattern(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status); fMarkSet.add(0x0020); - fEndWordSet = fKhmerWordSet; + fEndWordSet = khmerWordSet; fBeginWordSet.add(0x1780, 0x17B3); //fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels //fEndWordSet.remove(0x17A5, 0x17A9); // Khmer independent vowels that can't end a word @@ -1050,24 +1049,19 @@ CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType : DictionaryBreakEngine(), fDictionary(adoptDictionary) { UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE); UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Hani"); - // Korean dictionary only includes Hangul syllables - fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status); - fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status); - fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status); - fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status); nfkcNorm2 = Normalizer2::getNFKCInstance(status); + // Korean dictionary only includes Hangul syllables + fHangulWordSet.applyPattern(UnicodeString(u"[\\uac00-\\ud7a3]"), status); + fHangulWordSet.compact(); - if (U_SUCCESS(status)) { - // handle Korean and Japanese/Chinese using different dictionaries - if (type == kKorean) { + // handle Korean and Japanese/Chinese using different dictionaries + if (type == kKorean) { + if (U_SUCCESS(status)) { setCharacters(fHangulWordSet); - } else { //Chinese and Japanese - UnicodeSet cjSet; - cjSet.addAll(fHanWordSet); - cjSet.addAll(fKatakanaWordSet); - cjSet.addAll(fHiraganaWordSet); - cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK - cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK + } + } else { //Chinese and Japanese + UnicodeSet cjSet(UnicodeString(u"[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]"), status); + if (U_SUCCESS(status)) { setCharacters(cjSet); } } diff --git a/icu4c/source/common/dictbe.h b/icu4c/source/common/dictbe.h index 4e70ed38171..da8a8a13a22 100644 --- a/icu4c/source/common/dictbe.h +++ b/icu4c/source/common/dictbe.h @@ -127,7 +127,6 @@ class ThaiBreakEngine : public DictionaryBreakEngine { * @internal */ - UnicodeSet fThaiWordSet; UnicodeSet fEndWordSet; UnicodeSet fBeginWordSet; UnicodeSet fSuffixSet; @@ -186,7 +185,6 @@ class LaoBreakEngine : public DictionaryBreakEngine { * @internal */ - UnicodeSet fLaoWordSet; UnicodeSet fEndWordSet; UnicodeSet fBeginWordSet; UnicodeSet fMarkSet; @@ -244,7 +242,6 @@ class BurmeseBreakEngine : public DictionaryBreakEngine { * @internal */ - UnicodeSet fBurmeseWordSet; UnicodeSet fEndWordSet; UnicodeSet fBeginWordSet; UnicodeSet fMarkSet; @@ -302,7 +299,6 @@ class KhmerBreakEngine : public DictionaryBreakEngine { * @internal */ - UnicodeSet fKhmerWordSet; UnicodeSet fEndWordSet; UnicodeSet fBeginWordSet; UnicodeSet fMarkSet; @@ -366,9 +362,6 @@ class CjkBreakEngine : public DictionaryBreakEngine { * @internal */ UnicodeSet fHangulWordSet; - UnicodeSet fHanWordSet; - UnicodeSet fKatakanaWordSet; - UnicodeSet fHiraganaWordSet; DictionaryMatcher *fDictionary; const Normalizer2 *nfkcNorm2; diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/BurmeseBreakEngine.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/BurmeseBreakEngine.java index 138324451ed..e9f0299c765 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/BurmeseBreakEngine.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/BurmeseBreakEngine.java @@ -31,24 +31,16 @@ public class BurmeseBreakEngine extends DictionaryBreakEngine { private static final byte BURMESE_MIN_WORD = 2; private DictionaryMatcher fDictionary; - private static UnicodeSet fBurmeseWordSet; - private static UnicodeSet fEndWordSet; - private static UnicodeSet fBeginWordSet; - private static UnicodeSet fMarkSet; + private UnicodeSet fEndWordSet; + private UnicodeSet fBeginWordSet; + private UnicodeSet fMarkSet; - static { + public BurmeseBreakEngine() throws IOException { // Initialize UnicodeSets - fBurmeseWordSet = new UnicodeSet(); - fMarkSet = new UnicodeSet(); - fBeginWordSet = new UnicodeSet(); - - fBurmeseWordSet.applyPattern("[[:Mymr:]&[:LineBreak=SA:]]"); - fBurmeseWordSet.compact(); - - fMarkSet.applyPattern("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"); + fBeginWordSet = new UnicodeSet(0x1000, 0x102A); // basic consonants and independent vowels + fEndWordSet = new UnicodeSet("[[:Mymr:]&[:LineBreak=SA:]]"); + fMarkSet = new UnicodeSet("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"); fMarkSet.add(0x0020); - fEndWordSet = new UnicodeSet(fBurmeseWordSet); - fBeginWordSet.add(0x1000, 0x102A); // basic consonants and independent vowels // Compact for caching fMarkSet.compact(); @@ -56,14 +48,11 @@ public class BurmeseBreakEngine extends DictionaryBreakEngine { fBeginWordSet.compact(); // Freeze the static UnicodeSet - fBurmeseWordSet.freeze(); fMarkSet.freeze(); fEndWordSet.freeze(); fBeginWordSet.freeze(); - } - public BurmeseBreakEngine() throws IOException { - setCharacters(fBurmeseWordSet); + setCharacters(fEndWordSet); // Initialize dictionary fDictionary = DictionaryData.loadDictionaryFor("Mymr"); } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/CjkBreakEngine.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/CjkBreakEngine.java index b7ac028f2ea..a14c745e509 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/CjkBreakEngine.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/CjkBreakEngine.java @@ -20,36 +20,18 @@ import com.ibm.icu.text.Normalizer; import com.ibm.icu.text.UnicodeSet; public class CjkBreakEngine extends DictionaryBreakEngine { - private static final UnicodeSet fHangulWordSet = new UnicodeSet(); - private static final UnicodeSet fHanWordSet = new UnicodeSet(); - private static final UnicodeSet fKatakanaWordSet = new UnicodeSet(); - private static final UnicodeSet fHiraganaWordSet = new UnicodeSet(); - static { - fHangulWordSet.applyPattern("[\\uac00-\\ud7a3]"); - fHanWordSet.applyPattern("[:Han:]"); - fKatakanaWordSet.applyPattern("[[:Katakana:]\\uff9e\\uff9f]"); - fHiraganaWordSet.applyPattern("[:Hiragana:]"); - - // freeze them all - fHangulWordSet.freeze(); - fHanWordSet.freeze(); - fKatakanaWordSet.freeze(); - fHiraganaWordSet.freeze(); - } - + private UnicodeSet fHangulWordSet; private DictionaryMatcher fDictionary = null; public CjkBreakEngine(boolean korean) throws IOException { + fHangulWordSet = new UnicodeSet("[\\uac00-\\ud7a3]"); + fHangulWordSet.freeze(); + fDictionary = DictionaryData.loadDictionaryFor("Hira"); if (korean) { setCharacters(fHangulWordSet); } else { //Chinese and Japanese - UnicodeSet cjSet = new UnicodeSet(); - cjSet.addAll(fHanWordSet); - cjSet.addAll(fKatakanaWordSet); - cjSet.addAll(fHiraganaWordSet); - cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK - cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK + UnicodeSet cjSet = new UnicodeSet("[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]"); setCharacters(cjSet); } } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/KhmerBreakEngine.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/KhmerBreakEngine.java index 35915aea2e5..02401f8ed67 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/KhmerBreakEngine.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/KhmerBreakEngine.java @@ -33,24 +33,20 @@ public class KhmerBreakEngine extends DictionaryBreakEngine { private DictionaryMatcher fDictionary; - private static UnicodeSet fKhmerWordSet; - private static UnicodeSet fEndWordSet; - private static UnicodeSet fBeginWordSet; - private static UnicodeSet fMarkSet; + private UnicodeSet fEndWordSet; + private UnicodeSet fBeginWordSet; + private UnicodeSet fMarkSet; - static { + public KhmerBreakEngine() throws IOException { // Initialize UnicodeSets - fKhmerWordSet = new UnicodeSet(); - fMarkSet = new UnicodeSet(); - fBeginWordSet = new UnicodeSet(); + UnicodeSet khmerWordSet = new UnicodeSet("[[:Khmer:]&[:LineBreak=SA:]]"); + fMarkSet = new UnicodeSet("[[:Khmer:]&[:LineBreak=SA:]&[:M:]]"); + fMarkSet.add(0x0020); + fBeginWordSet = new UnicodeSet(0x1780, 0x17B3); - fKhmerWordSet.applyPattern("[[:Khmer:]&[:LineBreak=SA:]]"); - fKhmerWordSet.compact(); + khmerWordSet.compact(); - fMarkSet.applyPattern("[[:Khmer:]&[:LineBreak=SA:]&[:M:]]"); - fMarkSet.add(0x0020); - fEndWordSet = new UnicodeSet(fKhmerWordSet); - fBeginWordSet.add(0x1780, 0x17B3); + fEndWordSet = new UnicodeSet(khmerWordSet); fEndWordSet.remove(0x17D2); // KHMER SIGN COENG that combines some following characters // Compact for caching @@ -59,14 +55,12 @@ public class KhmerBreakEngine extends DictionaryBreakEngine { fBeginWordSet.compact(); // Freeze the static UnicodeSet - fKhmerWordSet.freeze(); + khmerWordSet.freeze(); fMarkSet.freeze(); fEndWordSet.freeze(); fBeginWordSet.freeze(); - } - public KhmerBreakEngine() throws IOException { - setCharacters(fKhmerWordSet); + setCharacters(khmerWordSet); // Initialize dictionary fDictionary = DictionaryData.loadDictionaryFor("Khmr"); } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/LaoBreakEngine.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/LaoBreakEngine.java index ce09b58ee5e..95a8ef3762e 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/LaoBreakEngine.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/LaoBreakEngine.java @@ -30,27 +30,24 @@ public class LaoBreakEngine extends DictionaryBreakEngine { private static final byte LAO_MIN_WORD = 2; private DictionaryMatcher fDictionary; - private static UnicodeSet fLaoWordSet; - private static UnicodeSet fEndWordSet; - private static UnicodeSet fBeginWordSet; - private static UnicodeSet fMarkSet; + private UnicodeSet fEndWordSet; + private UnicodeSet fBeginWordSet; + private UnicodeSet fMarkSet; - static { + public LaoBreakEngine() throws IOException { // Initialize UnicodeSets - fLaoWordSet = new UnicodeSet(); - fMarkSet = new UnicodeSet(); - fBeginWordSet = new UnicodeSet(); + UnicodeSet laoWordSet = new UnicodeSet("[[:Laoo:]&[:LineBreak=SA:]]"); + fMarkSet = new UnicodeSet("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"); + fMarkSet.add(0x0020); + fBeginWordSet = new UnicodeSet( + 0x0E81, 0x0EAE, // basic consonants (including holes for corresponding Thai characters) + 0x0EC0, 0x0EC4, // prefix vowels + 0x0EDC, 0x0EDD); // digraph consonants (no Thai equivalent) - fLaoWordSet.applyPattern("[[:Laoo:]&[:LineBreak=SA:]]"); - fLaoWordSet.compact(); + laoWordSet.compact(); - fMarkSet.applyPattern("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"); - fMarkSet.add(0x0020); - fEndWordSet = new UnicodeSet(fLaoWordSet); + fEndWordSet = new UnicodeSet(laoWordSet); fEndWordSet.remove(0x0EC0, 0x0EC4); // prefix vowels - fBeginWordSet.add(0x0E81, 0x0EAE); // basic consonants (including holes for corresponding Thai characters) - fBeginWordSet.add(0x0EDC, 0x0EDD); // digraph consonants (no Thai equivalent) - fBeginWordSet.add(0x0EC0, 0x0EC4); // prefix vowels // Compact for caching fMarkSet.compact(); @@ -58,14 +55,12 @@ public class LaoBreakEngine extends DictionaryBreakEngine { fBeginWordSet.compact(); // Freeze the static UnicodeSet - fLaoWordSet.freeze(); + laoWordSet.freeze(); fMarkSet.freeze(); fEndWordSet.freeze(); fBeginWordSet.freeze(); - } - public LaoBreakEngine() throws IOException { - setCharacters(fLaoWordSet); + setCharacters(laoWordSet); // Initialize dictionary fDictionary = DictionaryData.loadDictionaryFor("Laoo"); } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/ThaiBreakEngine.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/ThaiBreakEngine.java index 1322a8a51b2..71ba5096e5a 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/ThaiBreakEngine.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/ThaiBreakEngine.java @@ -36,31 +36,27 @@ public class ThaiBreakEngine extends DictionaryBreakEngine { private static final byte THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2; private DictionaryMatcher fDictionary; - private static UnicodeSet fThaiWordSet; - private static UnicodeSet fEndWordSet; - private static UnicodeSet fBeginWordSet; - private static UnicodeSet fSuffixSet; - private static UnicodeSet fMarkSet; + private UnicodeSet fEndWordSet; + private UnicodeSet fBeginWordSet; + private UnicodeSet fSuffixSet; + private UnicodeSet fMarkSet; - static { + public ThaiBreakEngine() throws IOException { // Initialize UnicodeSets - fThaiWordSet = new UnicodeSet(); - fMarkSet = new UnicodeSet(); - fBeginWordSet = new UnicodeSet(); + UnicodeSet thaiWordSet = new UnicodeSet("[[:Thai:]&[:LineBreak=SA:]]"); + fMarkSet = new UnicodeSet("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"); + fMarkSet.add(0x0020); + fBeginWordSet = new UnicodeSet(0x0E01, 0x0E2E, //KO KAI through HO NOKHUK + 0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI fSuffixSet = new UnicodeSet(); + fSuffixSet.add(THAI_PAIYANNOI); + fSuffixSet.add(THAI_MAIYAMOK); - fThaiWordSet.applyPattern("[[:Thai:]&[:LineBreak=SA:]]"); - fThaiWordSet.compact(); + thaiWordSet.compact(); - fMarkSet.applyPattern("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"); - fMarkSet.add(0x0020); - fEndWordSet = new UnicodeSet(fThaiWordSet); + fEndWordSet = new UnicodeSet(thaiWordSet); fEndWordSet.remove(0x0E31); // MAI HAN-AKAT fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI - fBeginWordSet.add(0x0E01, 0x0E2E); //KO KAI through HO NOKHUK - fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI - fSuffixSet.add(THAI_PAIYANNOI); - fSuffixSet.add(THAI_MAIYAMOK); // Compact for caching fMarkSet.compact(); @@ -69,15 +65,13 @@ public class ThaiBreakEngine extends DictionaryBreakEngine { fSuffixSet.compact(); // Freeze the static UnicodeSet - fThaiWordSet.freeze(); + thaiWordSet.freeze(); fMarkSet.freeze(); fEndWordSet.freeze(); fBeginWordSet.freeze(); fSuffixSet.freeze(); - } - public ThaiBreakEngine() throws IOException { - setCharacters(fThaiWordSet); + setCharacters(thaiWordSet); // Initialize dictionary fDictionary = DictionaryData.loadDictionaryFor("Thai"); }