From b6b7b045e9cef2c942efd267bb89c5a545017f0c Mon Sep 17 00:00:00 2001 From: Shuhei Iitsuka Date: Fri, 29 Jul 2022 12:08:01 +0800 Subject: [PATCH] ICU-22100 Incorporate BudouX into ICU (C++) --- .github/adaboost.json | 14 + .github/workflows/icu_ci.yml | 11 + icu4c/source/common/BUILD.bazel | 1 + icu4c/source/common/common.vcxproj | 2 + icu4c/source/common/common.vcxproj.filters | 6 + icu4c/source/common/common_uwp.vcxproj | 2 + icu4c/source/common/dictbe.cpp | 25 +- icu4c/source/common/dictbe.h | 4 + icu4c/source/common/mlbe.cpp | 452 +++++++++ icu4c/source/common/mlbe.h | 152 +++ icu4c/source/common/sources.txt | 1 + icu4c/source/common/unicode/uconfig.h | 10 + icu4c/source/data/BUILDRULES.py | 29 +- icu4c/source/data/brkitr/adaboost/jaml.txt | 940 ++++++++++++++++++ .../python/icutools/databuilder/filtration.py | 4 +- icu4c/source/test/depstest/dependencies.txt | 2 +- icu4c/source/test/intltest/rbbitst.cpp | 20 + icu4c/source/test/testdata/rbbitst.txt | 24 +- 18 files changed, 1690 insertions(+), 9 deletions(-) create mode 100644 .github/adaboost.json create mode 100644 icu4c/source/common/mlbe.cpp create mode 100644 icu4c/source/common/mlbe.h create mode 100644 icu4c/source/data/brkitr/adaboost/jaml.txt diff --git a/.github/adaboost.json b/.github/adaboost.json new file mode 100644 index 00000000000..639fd6a99da --- /dev/null +++ b/.github/adaboost.json @@ -0,0 +1,14 @@ +// © 2022 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +// Generated using tools/cldr/cldr-to-icu/build-icu-data.xml +// +// Include Japanese adaboost model. 
+{ + "featureFilters": { + "brkitr_adaboost": { + "includelist": [ + "jaml" + ] + } + } +} diff --git a/.github/workflows/icu_ci.yml b/.github/workflows/icu_ci.yml index 90bce1ed542..1293e5edbb0 100644 --- a/.github/workflows/icu_ci.yml +++ b/.github/workflows/icu_ci.yml @@ -334,6 +334,17 @@ jobs: make clean; make -j2 check + # Test adaboost + adaboost-test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - run: | + cd icu4c/source; + ICU_DATA_FILTER_FILE=../../.github/adaboost.json CPPFLAGS=-DUCONFIG_USE_ML_PHRASE_BREAKING=1 ./runConfigureICU --enable-debug --disable-release Linux -disable-layoutex; + make clean; + make -j2 check + # Build and run testmap testmap: runs-on: ubuntu-latest diff --git a/icu4c/source/common/BUILD.bazel b/icu4c/source/common/BUILD.bazel index e385d3b243f..47d3d24bf51 100644 --- a/icu4c/source/common/BUILD.bazel +++ b/icu4c/source/common/BUILD.bazel @@ -342,6 +342,7 @@ cc_library( "dictionarydata.cpp", "filteredbrk.cpp", "lstmbe.cpp", + "mlbe.cpp", "rbbi.cpp", "rbbi_cache.cpp", "rbbidata.cpp", diff --git a/icu4c/source/common/common.vcxproj b/icu4c/source/common/common.vcxproj index e35e1b0cff7..2b4cc05357a 100644 --- a/icu4c/source/common/common.vcxproj +++ b/icu4c/source/common/common.vcxproj @@ -88,6 +88,7 @@ + @@ -282,6 +283,7 @@ + diff --git a/icu4c/source/common/common.vcxproj.filters b/icu4c/source/common/common.vcxproj.filters index 38bc0c1b869..28a5d903429 100644 --- a/icu4c/source/common/common.vcxproj.filters +++ b/icu4c/source/common/common.vcxproj.filters @@ -76,6 +76,9 @@ break iteration + + break iteration + break iteration @@ -660,6 +663,9 @@ break iteration + + break iteration + break iteration diff --git a/icu4c/source/common/common_uwp.vcxproj b/icu4c/source/common/common_uwp.vcxproj index fc165629f29..5df0d57a7de 100644 --- a/icu4c/source/common/common_uwp.vcxproj +++ b/icu4c/source/common/common_uwp.vcxproj @@ -222,6 +222,7 @@ + @@ -417,6 +418,7 @@ + diff --git a/icu4c/source/common/dictbe.cpp 
b/icu4c/source/common/dictbe.cpp index 9b5434d995a..0e420c67c5d 100644 --- a/icu4c/source/common/dictbe.cpp +++ b/icu4c/source/common/dictbe.cpp @@ -1054,9 +1054,10 @@ foundBest: */ static const uint32_t kuint32max = 0xFFFFFFFF; CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status) -: DictionaryBreakEngine(), fDictionary(adoptDictionary) { +: DictionaryBreakEngine(), fDictionary(adoptDictionary), isCj(false) { UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE); UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Hani"); + fMlBreakEngine = nullptr; nfkcNorm2 = Normalizer2::getNFKCInstance(status); // Korean dictionary only includes Hangul syllables fHangulWordSet.applyPattern(UnicodeString(u"[\\uac00-\\ud7a3]"), status); @@ -1073,11 +1074,20 @@ CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType if (U_SUCCESS(status)) { setCharacters(fHangulWordSet); } - } else { //Chinese and Japanese + } else { // Chinese and Japanese UnicodeSet cjSet(UnicodeString(u"[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]"), status); + isCj = true; if (U_SUCCESS(status)) { setCharacters(cjSet); +#if UCONFIG_USE_ML_PHRASE_BREAKING + fMlBreakEngine = new MlBreakEngine(fDigitOrOpenPunctuationOrAlphabetSet, + fClosePunctuationSet, status); + if (fMlBreakEngine == nullptr) { + status = U_MEMORY_ALLOCATION_ERROR; + } +#else initJapanesePhraseParameter(status); +#endif } } UTRACE_EXIT_STATUS(status); @@ -1085,6 +1095,7 @@ CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType CjkBreakEngine::~CjkBreakEngine(){ delete fDictionary; + delete fMlBreakEngine; } // The katakanaCost values below are based on the length frequencies of all @@ -1251,7 +1262,15 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText, } } } - + +#if UCONFIG_USE_ML_PHRASE_BREAKING + // PhraseBreaking is supported in ja and ko; MlBreakEngine only supports ja. 
+ if (isPhraseBreaking && isCj) { + return fMlBreakEngine->divideUpRange(inText, rangeStart, rangeEnd, foundBreaks, inString, + inputMap, status); + } +#endif + // bestSnlp[i] is the snlp of the best segmentation of the first i // code points in the range to be matched. UVector32 bestSnlp(numCodePts + 1, status); diff --git a/icu4c/source/common/dictbe.h b/icu4c/source/common/dictbe.h index ca1a3c28b7b..a2c761bdc3a 100644 --- a/icu4c/source/common/dictbe.h +++ b/icu4c/source/common/dictbe.h @@ -16,11 +16,13 @@ #include "brkeng.h" #include "hash.h" +#include "mlbe.h" #include "uvectr32.h" U_NAMESPACE_BEGIN class DictionaryMatcher; +class MlBreakEngine; class Normalizer2; /******************************************************************* @@ -374,6 +376,8 @@ class CjkBreakEngine : public DictionaryBreakEngine { DictionaryMatcher *fDictionary; const Normalizer2 *nfkcNorm2; + MlBreakEngine *fMlBreakEngine; + bool isCj; private: // Load Japanese extensions. diff --git a/icu4c/source/common/mlbe.cpp b/icu4c/source/common/mlbe.cpp new file mode 100644 index 00000000000..3ccf470e5b1 --- /dev/null +++ b/icu4c/source/common/mlbe.cpp @@ -0,0 +1,452 @@ +// © 2022 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_BREAK_ITERATION
+
+#include "cmemory.h"
+#include "mlbe.h"
+#include "uassert.h"
+#include "ubrkimpl.h"
+#include "unicode/resbund.h"
+#include "unicode/udata.h"
+#include "unicode/utf16.h"
+#include "uresimp.h"
+#include "util.h"
+#include "uvectr32.h"
+
+U_NAMESPACE_BEGIN
+
+// Initialise every member so that a default-constructed Element never exposes
+// indeterminate values through its getters.
+Element::Element() : character(0), ublock{}, length(0) {}
+
+// Stores the character together with a copy of its block string (at most three code units,
+// followed by a terminating NUL) in the fixed-size ublock buffer.
+void Element::setCharAndUblock(UChar32 ch, const UnicodeString &idx) {
+    character = ch;
+    U_ASSERT(idx.length() <= 3);
+    // ublock has room for three code units plus the terminator. Clamp the copy so that a
+    // longer string cannot overrun the buffer when assertions are disabled.
+    const int32_t copyLength = idx.length() <= 3 ? idx.length() : 3;
+    length = static_cast<uint16_t>(copyLength);
+    idx.extract(0, copyLength, ublock);
+    ublock[copyLength] = u'\0';
+}
+
+UChar32 Element::getCharacter() const {
+    return character;
+}
+
+// Returns a pointer to the internal block buffer written by setCharAndUblock().
+char16_t* Element::getUblock() const {
+    return const_cast<char16_t*>(ublock);
+}
+
+uint16_t Element::getLength() const {
+    return length;
+}
+
+// Copies the two character sets and loads the model; loading is skipped when status
+// already reports a failure (for example from the Hashtable constructor).
+MlBreakEngine::MlBreakEngine(const UnicodeSet &digitOrOpenPunctuationOrAlphabetSet,
+                             const UnicodeSet &closePunctuationSet, UErrorCode &status)
+    : fDigitOrOpenPunctuationOrAlphabetSet(digitOrOpenPunctuationOrAlphabetSet),
+      fClosePunctuationSet(closePunctuationSet),
+      fModel(status),
+      fNegativeSum(0) {
+    if (U_FAILURE(status)) {
+        return;
+    }
+    loadMLModel(status);
+}
+
+MlBreakEngine::~MlBreakEngine() {}
+
+namespace {
+    // Sentinel used both as the "no character" value and as the one-unit "no block" string.
+    const char16_t INVALID = u'|';
+    // evaluateBreakpoint() emits at most 6 UW + 3 BW + 4 TW + 6 UB + 3 BB + 4 TB features.
+    const int32_t MAX_FEATURE = 26;
+    // Longest feature: 4-unit prefix + three 3-unit block codes + terminating NUL.
+    const int32_t MAX_FEATURE_LENGTH = 14;
+
+    // An element is invalid when its block string is exactly the one-unit sentinel.
+    bool isValid(const Element& element) {
+        return element.getLength() != 1 || element.getUblock()[0] != INVALID;
+    }
+
+    // Writes str followed by the first `length` code points of arr into feature.
+    void concatChar(const char16_t *str, const UChar32 *arr, int32_t length, char16_t *feature, UErrorCode &status) {
+        if (U_FAILURE(status)) {
+            return;
+        }
+        UnicodeString result(str);
+        for (int i = 0; i < length; i++) {
+            result.append(arr[i]);
+        }
+        U_ASSERT(result.length() < MAX_FEATURE_LENGTH);
+        result.extract(feature, MAX_FEATURE_LENGTH, status);  // NUL-terminates
+    }
+
+    // Writes str into feature.
+    void writeString(const UnicodeString &str, char16_t *feature, UErrorCode &status) {
+        U_ASSERT(str.length() < MAX_FEATURE_LENGTH);
+        str.extract(feature, MAX_FEATURE_LENGTH, status);  // NUL-terminates
+    }
+}
+
+// Runs the model over inString with a six-element sliding window, collects the code point
+// indexes judged to be breakpoints, maps them back to native indexes of inText and pushes
+// them onto foundBreaks. Returns the number of breaks pushed.
+int32_t MlBreakEngine::divideUpRange(UText *inText, int32_t rangeStart, int32_t rangeEnd,
+                                     UVector32 &foundBreaks, const UnicodeString &inString,
+                                     const LocalPointer<UVector32> &inputMap,
+                                     UErrorCode &status) const {
+    if (U_FAILURE(status)) {
+        return 0;
+    }
+    if (rangeStart >= rangeEnd) {
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+
+    // Number of code points in inString; computed once because countChar32() scans the string.
+    const int32_t length = inString.countChar32();
+
+    UVector32 boundary(length + 1, status);
+    if (U_FAILURE(status)) {
+        return 0;
+    }
+    int32_t numBreaks = 0;
+    UChar32 ch;
+    UnicodeString index;
+    // The ML model groups six char to evaluate if the 4th char is a breakpoint.
+    // Like a sliding window, the elementList removes the first char and appends the new char from
+    // inString in each iteration so that its size always remains at six.
+    Element elementList[6];
+
+    // codeUts is the code unit offset of the next character to feed into the window.
+    int32_t codeUts = initElementList(inString, elementList, status);
+
+    // Add a break for the start.
+    boundary.addElement(0, status);
+    numBreaks++;
+    if (U_FAILURE(status)) return 0;
+
+    for (int32_t i = 1; i < length && U_SUCCESS(status); i++) {
+        evaluateBreakpoint(elementList, i, numBreaks, boundary, status);
+        if (i + 1 >= length) break;
+        // Remove the first element and append a new element
+        uprv_memmove(elementList, elementList + 1, 5 * sizeof(Element));
+        // codeUts only ever advances by whole code points, so "code units remain" is the same
+        // condition as "fewer than length code points consumed", without rescanning the string.
+        ch = codeUts < inString.length() ? inString.char32At(codeUts) : INVALID;
+        index = (ch != INVALID) ? getUnicodeBlock(ch, status) : UnicodeString(INVALID);
+        elementList[5].setCharAndUblock(ch, index);
+        if (ch != INVALID) {
+            codeUts += U16_LENGTH(ch);
+        }
+    }
+    if (U_FAILURE(status)) return 0;
+
+    // Add a break for the end if there is not one there already.
+    if (boundary.lastElementi() != length) {
+        boundary.addElement(length, status);
+        numBreaks++;
+    }
+
+    int32_t prevCPPos = -1;
+    int32_t prevUTextPos = -1;
+    int32_t correctedNumBreaks = 0;
+    for (int32_t i = 0; i < numBreaks; i++) {
+        int32_t cpPos = boundary.elementAti(i);
+        int32_t utextPos = inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
+        U_ASSERT(cpPos > prevCPPos);
+        U_ASSERT(utextPos >= prevUTextPos);
+
+        if (utextPos > prevUTextPos) {
+            if (utextPos != rangeStart ||
+                (utextPos > 0 &&
+                 fClosePunctuationSet.contains(utext_char32At(inText, utextPos - 1)))) {
+                foundBreaks.push(utextPos, status);
+                correctedNumBreaks++;
+            }
+        } else {
+            // Normalization expanded the input text, the dictionary found a boundary
+            // within the expansion, giving two boundaries with the same index in the
+            // original text. Ignore the second. See ticket #12918.
+            // NOTE(review): numBreaks is also this loop's bound, so each decrement leaves one
+            // trailing entry of `boundary` unvisited - confirm that this is intended.
+            --numBreaks;
+        }
+        prevCPPos = cpPos;
+        prevUTextPos = utextPos;
+    }
+    (void)prevCPPos;  // suppress compiler warnings about unused variable
+
+    UChar32 nextChar = utext_char32At(inText, rangeEnd);
+    if (!foundBreaks.isEmpty() && foundBreaks.peeki() == rangeEnd) {
+        // In phrase breaking, there has to be a breakpoint between Cj character and
+        // the number/open punctuation.
+        // E.g. る文字「そうだ、京都」->る▁文字▁「そうだ、▁京都」-> breakpoint between 字 and「
+        // E.g. 乗車率90%程度だろうか -> 乗車▁率▁90%▁程度だろうか -> breakpoint between 率 and 9
+        // E.g. しかもロゴがUnicode! -> しかも▁ロゴが▁Unicode!-> breakpoint between が and U
+        if (!fDigitOrOpenPunctuationOrAlphabetSet.contains(nextChar)) {
+            foundBreaks.popi();
+            correctedNumBreaks--;
+        }
+    }
+
+    return correctedNumBreaks;
+}
+
+// Builds the feature strings for the current window, sums the model scores of the features
+// found in fModel and records `index` in `boundary` when the total score is positive.
+//   UWn/BWn/TWn: one, two or three consecutive characters of the window.
+//   UBn/BBn/TBn: one, two or three consecutive block codes of the window.
+// A feature is skipped when any element it would use is the INVALID sentinel.
+void MlBreakEngine::evaluateBreakpoint(Element* elementList, int32_t index, int32_t &numBreaks,
+                                       UVector32 &boundary, UErrorCode &status) const {
+    char16_t featureList[MAX_FEATURE][MAX_FEATURE_LENGTH];
+    if (U_FAILURE(status)) {
+        return;
+    }
+
+    UChar32 arr[4] = {-1, -1, -1, -1};
+    int32_t length = 0, listLength = 0;
+
+    const UChar32 w1 = elementList[0].getCharacter();
+    const UChar32 w2 = elementList[1].getCharacter();
+    const UChar32 w3 = elementList[2].getCharacter();
+    const UChar32 w4 = elementList[3].getCharacter();
+    const UChar32 w5 = elementList[4].getCharacter();
+    const UChar32 w6 = elementList[5].getCharacter();
+
+    // Single-character features.
+    length = 1;
+    if (w1 != INVALID) {
+        arr[0] = w1;
+        concatChar(u"UW1:", arr, length, featureList[listLength++], status);
+    }
+    if (w2 != INVALID) {
+        arr[0] = w2;
+        concatChar(u"UW2:", arr, length, featureList[listLength++], status);
+    }
+    if (w3 != INVALID) {
+        arr[0] = w3;
+        concatChar(u"UW3:", arr, length, featureList[listLength++], status);
+    }
+    if (w4 != INVALID) {
+        arr[0] = w4;
+        concatChar(u"UW4:", arr, length, featureList[listLength++], status);
+    }
+    if (w5 != INVALID) {
+        arr[0] = w5;
+        concatChar(u"UW5:", arr, length, featureList[listLength++], status);
+    }
+    if (w6 != INVALID) {
+        arr[0] = w6;
+        concatChar(u"UW6:", arr, length, featureList[listLength++], status);
+    }
+    // Two-character features.
+    length = 2;
+    if (w2 != INVALID && w3 != INVALID) {
+        arr[0] = w2;
+        arr[1] = w3;
+        concatChar(u"BW1:", arr, length, featureList[listLength++], status);
+    }
+    if (w3 != INVALID && w4 != INVALID) {
+        arr[0] = w3;
+        arr[1] = w4;
+        concatChar(u"BW2:", arr, length, featureList[listLength++], status);
+    }
+    if (w4 != INVALID && w5 != INVALID) {
+        arr[0] = w4;
+        arr[1] = w5;
+        concatChar(u"BW3:", arr, length, featureList[listLength++], status);
+    }
+    // Three-character features.
+    length = 3;
+    if (w1 != INVALID && w2 != INVALID && w3 != INVALID) {
+        arr[0] = w1;
+        arr[1] = w2;
+        arr[2] = w3;
+        concatChar(u"TW1:", arr, length, featureList[listLength++], status);
+    }
+    if (w2 != INVALID && w3 != INVALID && w4 != INVALID) {
+        arr[0] = w2;
+        arr[1] = w3;
+        arr[2] = w4;
+        concatChar(u"TW2:", arr, length, featureList[listLength++], status);
+    }
+    if (w3 != INVALID && w4 != INVALID && w5 != INVALID) {
+        arr[0] = w3;
+        arr[1] = w4;
+        arr[2] = w5;
+        concatChar(u"TW3:", arr, length, featureList[listLength++], status);
+    }
+    if (w4 != INVALID && w5 != INVALID && w6 != INVALID) {
+        arr[0] = w4;
+        arr[1] = w5;
+        arr[2] = w6;
+        concatChar(u"TW4:", arr, length, featureList[listLength++], status);
+    }
+    // Single-block features.
+    if (isValid(elementList[0])) {
+        writeString(UnicodeString(u"UB1:").append(elementList[0].getUblock(), 0,
+                                                  elementList[0].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[1])) {
+        writeString(UnicodeString(u"UB2:").append(elementList[1].getUblock(), 0,
+                                                  elementList[1].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[2])) {
+        writeString(UnicodeString(u"UB3:").append(elementList[2].getUblock(), 0,
+                                                  elementList[2].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[3])) {
+        writeString(UnicodeString(u"UB4:").append(elementList[3].getUblock(), 0,
+                                                  elementList[3].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[4])) {
+        writeString(UnicodeString(u"UB5:").append(elementList[4].getUblock(), 0,
+                                                  elementList[4].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[5])) {
+        writeString(UnicodeString(u"UB6:").append(elementList[5].getUblock(), 0,
+                                                  elementList[5].getLength()),
+                    featureList[listLength++], status);
+    }
+    // Two-block features.
+    if (isValid(elementList[1]) && isValid(elementList[2])) {
+        writeString(UnicodeString(u"BB1:")
+                        .append(elementList[1].getUblock(), 0, elementList[1].getLength())
+                        .append(elementList[2].getUblock(), 0, elementList[2].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[2]) && isValid(elementList[3])) {
+        writeString(UnicodeString(u"BB2:")
+                        .append(elementList[2].getUblock(), 0, elementList[2].getLength())
+                        .append(elementList[3].getUblock(), 0, elementList[3].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[3]) && isValid(elementList[4])) {
+        writeString(UnicodeString(u"BB3:")
+                        .append(elementList[3].getUblock(), 0, elementList[3].getLength())
+                        .append(elementList[4].getUblock(), 0, elementList[4].getLength()),
+                    featureList[listLength++], status);
+    }
+    // Three-block features.
+    if (isValid(elementList[0]) && isValid(elementList[1]) && isValid(elementList[2])) {
+        writeString(UnicodeString(u"TB1:")
+                        .append(elementList[0].getUblock(), 0, elementList[0].getLength())
+                        .append(elementList[1].getUblock(), 0, elementList[1].getLength())
+                        .append(elementList[2].getUblock(), 0, elementList[2].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[1]) && isValid(elementList[2]) && isValid(elementList[3])) {
+        writeString(UnicodeString(u"TB2:")
+                        .append(elementList[1].getUblock(), 0, elementList[1].getLength())
+                        .append(elementList[2].getUblock(), 0, elementList[2].getLength())
+                        .append(elementList[3].getUblock(), 0, elementList[3].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[2]) && isValid(elementList[3]) && isValid(elementList[4])) {
+        writeString(UnicodeString(u"TB3:")
+                        .append(elementList[2].getUblock(), 0, elementList[2].getLength())
+                        .append(elementList[3].getUblock(), 0, elementList[3].getLength())
+                        .append(elementList[4].getUblock(), 0, elementList[4].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[3]) && isValid(elementList[4]) && isValid(elementList[5])) {
+        writeString(UnicodeString(u"TB4:")
+                        .append(elementList[3].getUblock(), 0, elementList[3].getLength())
+                        .append(elementList[4].getUblock(), 0, elementList[4].getLength())
+                        .append(elementList[5].getUblock(), 0, elementList[5].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (U_FAILURE(status)) {
+        return;
+    }
+    // Start from minus the sum of all model values and add twice the value of each feature
+    // that is present in the model.
+    int32_t score = fNegativeSum;
+    for (int32_t j = 0; j < listLength; j++) {
+        UnicodeString key(featureList[j]);
+        if (fModel.containsKey(key)) {
+            score += (2 * fModel.geti(key));
+        }
+    }
+    if (score > 0) {
+        boundary.addElement(index, status);
+        numBreaks++;
+    }
+}
+
+// Fills the six-element window for the first evaluation: slots 0 and 1 are padding, slots
+// 2..5 take up to the first four characters of inString, and any slot without a character
+// is marked INVALID. Returns the number of code units consumed from inString.
+int32_t MlBreakEngine::initElementList(const UnicodeString &inString, Element* elementList,
+                                       UErrorCode &status) const {
+    if (U_FAILURE(status)) {
+        return 0;
+    }
+    int32_t index = 0;
+    int32_t length = inString.countChar32();
+    UChar32 w1, w2, w3, w4, w5, w6;
+    w1 = w2 = w3 = w4 = w5 = w6 = INVALID;
+    if (length > 0) {
+        w3 = inString.char32At(0);
+        index += U16_LENGTH(w3);
+    }
+    if (length > 1) {
+        w4 = inString.char32At(index);
+        index += U16_LENGTH(w4);
+    }
+    if (length > 2) {
+        w5 = inString.char32At(index);
+        index += U16_LENGTH(w5);
+    }
+    if (length > 3) {
+        w6 = inString.char32At(index);
+        index += U16_LENGTH(w6);
+    }
+
+    // A slot that holds the INVALID sentinel gets the INVALID block string, exactly as the
+    // sliding loop in divideUpRange() does. Looking up the block of the sentinel character
+    // itself would yield a real block code and make isValid() accept a padding slot.
+    const UnicodeString invalidBlock(INVALID);
+    const UnicodeString b1(invalidBlock);
+    const UnicodeString b2(invalidBlock);
+    const UnicodeString b3(w3 != INVALID ? getUnicodeBlock(w3, status) : invalidBlock);
+    const UnicodeString b4(w4 != INVALID ? getUnicodeBlock(w4, status) : invalidBlock);
+    const UnicodeString b5(w5 != INVALID ? getUnicodeBlock(w5, status) : invalidBlock);
+    const UnicodeString b6(w6 != INVALID ? getUnicodeBlock(w6, status) : invalidBlock);
+
+    elementList[0].setCharAndUblock(w1, b1);
+    elementList[1].setCharAndUblock(w2, b2);
+    elementList[2].setCharAndUblock(w3, b3);
+    elementList[3].setCharAndUblock(w4, b4);
+    elementList[4].setCharAndUblock(w5, b5);
+    elementList[5].setCharAndUblock(w6, b6);
+
+    return index;
+}
+
+// Returns the UBlockCode of ch as a decimal string zero-padded to at least three digits,
+// or the INVALID string when status is a failure or ch has no block.
+UnicodeString MlBreakEngine::getUnicodeBlock(UChar32 ch, UErrorCode &status) const {
+    if (U_FAILURE(status)) {
+        return UnicodeString(INVALID);
+    }
+
+    UBlockCode block = ublock_getCode(ch);
+    if (block == UBLOCK_NO_BLOCK || block == UBLOCK_INVALID_CODE) {
+        return UnicodeString(INVALID);
+    } else {
+        UnicodeString empty;
+        // Same as sprintf("%03d", block)
+        return ICU_Utility::appendNumber(empty, static_cast<int32_t>(block), 10, 3);
+    }
+}
+
+// Reads the "jaml" resource bundle and fills fModel and fNegativeSum from it.
+void MlBreakEngine::loadMLModel(UErrorCode &error) {
+    // BudouX's model consists of pairs of the feature and its score.
+    // As integrating it into jaml.txt, modelKeys denotes the ML feature; modelValues means the
+    // corresponding feature's score.
+
+    if (U_FAILURE(error)) return;
+
+    int32_t keySize = 0;
+    int32_t valueSize = 0;
+    int32_t stringLength = 0;
+    UnicodeString key;
+    StackUResourceBundle stackTempBundle;
+    ResourceDataValue modelKey;
+
+    // rbp keeps ownership of the bundle and closes it when this function returns; rb is only
+    // a borrowed pointer. (orphan() would release ownership and the bundle would never be
+    // closed.)
+    LocalUResourceBundlePointer rbp(ures_openDirect(U_ICUDATA_BRKITR, "jaml", &error));
+    if (U_FAILURE(error)) return;
+    UResourceBundle* rb = rbp.getAlias();
+
+    // get modelValues
+    LocalUResourceBundlePointer modelValue(ures_getByKey(rb, "modelValues", nullptr, &error));
+    const int32_t* value = ures_getIntVector(modelValue.getAlias(), &valueSize, &error);
+    if (U_FAILURE(error)) return;
+
+    // get modelKeys
+    ures_getValueWithFallback(rb, "modelKeys", stackTempBundle.getAlias(), modelKey, error);
+    ResourceArray stringArray = modelKey.getArray(error);
+    keySize = stringArray.getSize();
+    if (U_FAILURE(error)) return;
+
+    // Every key needs a score. Reject the data instead of reading past the end of the score
+    // vector when the two tables do not line up.
+    if (keySize > valueSize) {
+        error = U_INVALID_FORMAT_ERROR;
+        return;
+    }
+
+    for (int32_t idx = 0; idx < keySize; idx++) {
+        stringArray.getValue(idx, modelKey);
+        key = UnicodeString(modelKey.getString(stringLength, error));
+        if (U_SUCCESS(error)) {
+            fNegativeSum -= value[idx];
+            fModel.puti(key, value[idx], error);
+        }
+    }
+}
+
+U_NAMESPACE_END
+
+#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
diff --git a/icu4c/source/common/mlbe.h b/icu4c/source/common/mlbe.h new file mode 100644 index 00000000000..8943fa3414f --- /dev/null +++ b/icu4c/source/common/mlbe.h @@ -0,0 +1,152 @@ +// © 2022 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#ifndef MLBREAKENGINE_H
+#define MLBREAKENGINE_H
+
+#include "hash.h"
+#include "unicode/uniset.h"
+#include "unicode/utext.h"
+#include "uvectr32.h"
+
+U_NAMESPACE_BEGIN
+
+#if !UCONFIG_NO_BREAK_ITERATION
+
+/**
+ * A class used to encapsulate a character and its unicode block index
+ */
+class Element : public UMemory {
+   public:
+    /**
+     * Default constructor.
+     */
+    Element();
+
+    /**
+     * Set the character and its unicode block.
+     *
+     * @param ch A unicode character.
+     * @param ublock The unicode block of the character, as a string of at most three code
+     *               units (the internal buffer holds four, including the terminator).
+     */
+    void setCharAndUblock(UChar32 ch, const UnicodeString& ublock);
+
+    /**
+     * Get the unicode character.
+     *
+     * @return The unicode character.
+     */
+    UChar32 getCharacter() const;
+
+    /**
+     * Get the unicode character's unicode block.
+     *
+     * @return A pointer to the internal buffer holding the unicode block string.
+     */
+    char16_t* getUblock() const;
+
+    /**
+     * Get the length of the unicode block.
+     *
+     * @return The unicode block length, in code units.
+     */
+    uint16_t getLength() const;
+
+   private:
+    UChar32 character;
+    char16_t ublock[4];
+    uint16_t length;
+};
+
+/**
+ * A machine learning break engine for the phrase breaking in Japanese.
+ */
+class MlBreakEngine : public UMemory {
+   public:
+    /**
+     * Constructor.
+     *
+     * @param digitOrOpenPunctuationOrAlphabetSet An UnicodeSet with the digit, open punctuation and
+     * alphabet.
+     * @param closePunctuationSet An UnicodeSet with close punctuation.
+     * @param status Information on any errors encountered.
+     */
+    MlBreakEngine(const UnicodeSet &digitOrOpenPunctuationOrAlphabetSet,
+                  const UnicodeSet &closePunctuationSet, UErrorCode &status);
+
+    /**
+     * Virtual destructor.
+     */
+    virtual ~MlBreakEngine();
+
+   public:
+    /**
+     * Divide up a range of characters handled by this break engine.
+     *
+     * @param inText A UText representing the text
+     * @param rangeStart The start of the range of the characters
+     * @param rangeEnd The end of the range of the characters
+     * @param foundBreaks Output vector onto which the break positions are pushed
+     * @param inString The normalized string of text ranging from rangeStart to rangeEnd
+     * @param inputMap The vector storing the native index of inText
+     * @param status Information on any errors encountered.
+     * @return The number of breaks found
+     */
+    int32_t divideUpRange(UText *inText, int32_t rangeStart, int32_t rangeEnd,
+                          UVector32 &foundBreaks, const UnicodeString &inString,
+                          const LocalPointer<UVector32> &inputMap, UErrorCode &status) const;
+
+   private:
+    /**
+     * Load the machine learning's model file.
+     *
+     * @param error Information on any errors encountered.
+     */
+    void loadMLModel(UErrorCode &error);
+
+    /**
+     * Get the character's unicode block code defined in UBlockCode.
+     *
+     * @param ch A character.
+     * @param status Information on any errors encountered.
+     * @return The unicode block code which is 3 digits with '0' added in the beginning if the code
+     * is less than 3 digits.
+     *
+     */
+    UnicodeString getUnicodeBlock(UChar32 ch, UErrorCode &status) const;
+
+    /**
+     * Initialize the element list from the input string.
+     *
+     * @param inString A input string to be segmented.
+     * @param elementList A list of six elements: two leading placeholders followed by up to
+     *                    the first four characters of inString and their unicode block codes.
+     * @param status Information on any errors encountered.
+     * @return The number of code units of inString consumed to fill the list.
+     */
+    int32_t initElementList(const UnicodeString &inString, Element* elementList,
+                            UErrorCode &status) const;
+
+    /**
+     * Evaluate whether the index is a potential breakpoint.
+     *
+     * @param elementList A list including 6 elements for the breakpoint evaluation.
+     * @param index The breakpoint index to be evaluated.
+     * @param numBreaks The accumulated number of breakpoints.
+     * @param boundary A vector including the index of the breakpoint.
+     * @param status Information on any errors encountered.
+     */
+    void evaluateBreakpoint(Element* elementList, int32_t index, int32_t &numBreaks,
+                            UVector32 &boundary, UErrorCode &status) const;
+
+    UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;
+    UnicodeSet fClosePunctuationSet;
+    Hashtable fModel;
+    int32_t fNegativeSum;
+};
+
+#endif
+
+U_NAMESPACE_END
+
+/* MLBREAKENGINE_H */
+#endif
diff --git a/icu4c/source/common/sources.txt b/icu4c/source/common/sources.txt index e5c39dd2ce3..90171fe9bd4 100644 --- a/icu4c/source/common/sources.txt +++ b/icu4c/source/common/sources.txt @@ -43,6 +43,7 @@ locutil.cpp lsr.cpp lstmbe.cpp messagepattern.cpp +mlbe.cpp normalizer2.cpp normalizer2impl.cpp normlzr.cpp diff --git a/icu4c/source/common/unicode/uconfig.h b/icu4c/source/common/unicode/uconfig.h index bbc232d1ed8..3818ca02ef8 100644 --- a/icu4c/source/common/unicode/uconfig.h +++ b/icu4c/source/common/unicode/uconfig.h @@ -323,6 +323,16 @@ # define UCONFIG_NO_NORMALIZATION 0 #endif +/** + * \def UCONFIG_USE_ML_PHRASE_BREAKING + * This switch turns on BudouX ML phrase-based line breaking, rather than using the dictionary.
+ * + * @internal + */ +#ifndef UCONFIG_USE_ML_PHRASE_BREAKING +# define UCONFIG_USE_ML_PHRASE_BREAKING 0 +#endif + #if UCONFIG_NO_NORMALIZATION /* common library */ /* ICU 50 CJK dictionary BreakIterator uses normalization */ diff --git a/icu4c/source/data/BUILDRULES.py b/icu4c/source/data/BUILDRULES.py index 899cba25b48..2608cb0227b 100644 --- a/icu4c/source/data/BUILDRULES.py +++ b/icu4c/source/data/BUILDRULES.py @@ -27,6 +27,7 @@ def generate(config, io, common_vars): requests += generate_conversion_mappings(config, io, common_vars) requests += generate_brkitr_brk(config, io, common_vars) requests += generate_brkitr_lstm(config, io, common_vars) + requests += generate_brkitr_adaboost(config, io, common_vars) requests += generate_stringprep(config, io, common_vars) requests += generate_brkitr_dictionaries(config, io, common_vars) requests += generate_normalization(config, io, common_vars) @@ -184,7 +185,7 @@ def generate_brkitr_brk(config, io, common_vars): category = "brkitr_rules", dep_targets = [DepTarget("cnvalias"), - DepTarget("ulayout"), DepTarget("uemoji"), DepTarget("lstm_res")], + DepTarget("ulayout"), DepTarget("uemoji"), DepTarget("lstm_res"), DepTarget("adaboost_res")], input_files = input_files, output_files = output_files, tool = IcuTool("genbrk"), @@ -506,6 +507,32 @@ def generate_brkitr_lstm(config, io, common_vars): ) ] +def generate_brkitr_adaboost(config, io, common_vars): + input_files = [InFile(filename) for filename in io.glob("brkitr/adaboost/*.txt")] + input_basenames = [v.filename[16:] for v in input_files] + output_files = [ + OutFile("brkitr/%s.res" % v[:-4]) + for v in input_basenames + ] + return [ + RepeatedOrSingleExecutionRequest( + name = "adaboost_res", + category = "brkitr_adaboost", + dep_targets = [], + input_files = input_files, + output_files = output_files, + tool = IcuTool("genrb"), + args = "-s {IN_DIR}/brkitr/adaboost -d {OUT_DIR}/brkitr -i {OUT_DIR} " + "-k " + "{INPUT_BASENAME}", + format_with = { + }, + repeat_with 
= { + "INPUT_BASENAME": utils.SpaceSeparatedList(input_basenames) + } + ) + ] + def generate_tree( config, io, diff --git a/icu4c/source/data/brkitr/adaboost/jaml.txt b/icu4c/source/data/brkitr/adaboost/jaml.txt new file mode 100644 index 00000000000..0500ff73fbf --- /dev/null +++ b/icu4c/source/data/brkitr/adaboost/jaml.txt @@ -0,0 +1,940 @@ +// © 2022 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +jaml { + modelKeys { + "BB2:062071", + "UB3:061", + "UB3:071", + "TB2:062062062", + "TB4:062062062", + "UB3:063", + "UB4:071", + "BB3:062062", + "UB4:062", + "BB1:062071", + "BB1:062061", + "UB4:061", + "TB1:071071062", + "TB3:062063063", + "UB2:061", + "TB1:062071062", + "TB3:062062062", + "BB2:063063", + "UW3:は", + "UW3:に", + "TB3:062071062", + "UW3:が", + "UW4:こ", + "UB5:061", + "UW3:と", + "TB4:063063063", + "UW4:て", + "TB2:062062061", + "UW3:。", + "UW4:お", + "UW3:の", + "BB3:071071", + "BB3:062071", + "UW3:お", + "UW3:し", + "UW4:、", + "UW4:の", + "UW3:を", + "UW4:。", + "UW3:、", + "UW5:で", + "UW4:あ", + "BB2:062062", + "UW4:っ", + "UW5:っ", + "UW3:も", + "UW5:う", + "UW3:「", + "UW5:な", + "UW4:そ", + "UW4:る", + "UW3:っ", + "UW4:「", + "UW4:い", + "BB2:087087", + "UB4:087", + "UW5:に", + "BW3:もの", + "UW5:し", + "UW6:う", + "BW2:とい", + "UW4:に", + "UW3:る", + "TB2:071062071", + "UW4:で", + "UW5:が", + "BB1:071071", + "UW5:は", + "UW4:は", + "UW4:れ", + "UW5:き", + "BB2:071062", + "BB2:071071", + "UW3:・", + "BB2:071087", + "BB2:061062", + "TB1:062061062", + "UW3:れ", + "BB2:087062", + "TB2:087087087", + "UW4:ら", + "TB1:071071071", + "UB2:071", + "TB1:062062087", + "UW5:す", + "UW5:ん", + "UW3:で", + "UW4:が", + "UW3:こ", + "TB4:071062062", + "UW3:ら", + "UW6:に", + "UW6:。", + "UW3:た", + "TB1:061071071", + "UW5:く", + "UB1:063", + "UW1:そ", + "UW3:う", + "BW3:とい", + "BW3:とこ", + "UW3:ま", + "BW3:こと", + "UW2:っ", + "UW5:・", + "TB3:062062061", + "UW3:き", + "UW4:ん", + "UB3:062", + "UW3:く", + "UW3:」", + "UW5:あ", + "BB2:062087", + "BW3:いう", + "UW5:れ", + 
"UW2:一", + "UW3:,", + "UW1:に", + "UW2:と", + "TB2:071071062", + "TB2:071071071", + "UW5:を", + "UW4:り", + "BW1:から", + "UW3:ち", + "BW3:いい", + "UW2:は", + "UW6:た", + "TB1:063063062", + "UW4:1", + "UW4:や", + "UW2:ん", + "UW3:]", + "UW4:ほ", + "TB3:062087087", + "BW2:であ", + "UW4:だ", + "BB3:071062", + "TB1:087087087", + "BW3:・・", + "BW3:とき", + "UW4:を", + "UW3:て", + "UW4:か", + "UW2:そ", + "TB4:071071062", + "TB2:062061071", + "UW2:を", + "UW4:ご", + "UW2:で", + "TB3:071071071", + "BB1:087087", + "UW2:し", + "UW4:出", + "UW2:ま", + "UW4:,", + "UW5:と", + "UW4:ど", + "BW3:して", + "UW1:で", + "BB2:061071", + "BW3:ため", + "BW2:とし", + "BW2:ない", + "BW2:てい", + "UW3:間", + "UW3:!", + "UW5:ー", + "UW4:す", + "UW4:!", + "BW1:とが", + "UW5:の", + "TB4:062062071", + "TB2:061071071", + "UW6:・", + "UW3:.", + "UW2:て", + "UW3:笑", + "UW2:こ", + "UW5:も", + "BW3:よう", + "UW3:人", + "UW2:の", + "UW3:か", + "UW3:日", + "UW1:い", + "BW2:とこ", + "UW4:私", + "UW3:…", + "UW2:に", + "UW3:今", + "BB3:087062", + "UB3:055", + "UW4:(", + "BB1:087071", + "UW1:な", + "BB3:063063", + "UW5:来", + "UW3:?", + "TW3:ている", + "UW4:」", + "UW4:前", + "BW1:いう", + "UW4:つ", + "UW3:)", + "BW1:では", + "UW2:る", + "UW5:そ", + "UW4:ー", + "TW2:気に入", + "UW4:笑", + "UW4:ひ", + "TB4:087087087", + "UW4:け", + "UW2:も", + "BW3:ちょ", + "BW3:出来", + "TB2:062071062", + "UW4:『", + "UW3:[", + "UW4:2", + "UW5:つ", + "TB1:061071062", + "UW3:1", + "BW3:から", + "UB5:071", + "UW4:ま", + "UW3:ば", + "UW3:り", + "BW3:その", + "UW3:ご", + "UW4:わ", + "BW2:てお", + "TB2:071062062", + "BW1:ない", + "UW2:よ", + "UB2:087", + "UW6:の", + "UW2:毎", + "UW2:結", + "TW4:の京都", + "UW3:さ", + "UW2:最", + "BW2:です", + "UW2:」", + "UW5:え", + "UW3:だ", + "TW4:ところ", + "UW4:.", + "UB1:062", + "UW6:て", + "UW1:が", + "BW2:、と", + "UW3:0", + "UW3:ん", + "UW3:中", + "UW4:よ", + "BW3:この", + "UW2:が", + "UW3:み", + "TW2:ではな", + "UW6:と", + "UW4:[", + "TW3:、ある", + "BW3:ころ", + "UW4:?", + "UW6:、", + "UW4:電", + "BB1:062040", + "UW3:後", + "UW5:い", + "UW2:、", + "UW5:て", + "BB2:062040", + "UW3:真", + "UW3:そ", + "UW5:さ", + "UB5:087", + 
"TW3:という", + "UW3:分", + "UB6:071", + "BW3:なっ", + "UW4:ろ", + "BB2:061061", + "TW3:ところ", + "UB1:071", + "UW1:、", + "BW1:とか", + "UW3:な", + "UW6:り", + "UW4:間", + "UW3:べ", + "UW5:べ", + "TB4:062071062", + "UW4:]", + "BW2:には", + "UW5:々", + "BW1:。・", + "BW1:その", + "UW1:す", + "UW4:)", + "UW6:っ", + "TB3:063063063", + "TB3:062071071", + "UB5:063", + "BW1:かも", + "UW6:る", + "TB4:062063063", + "UW3:ど", + "TW3:である", + "TW4:くらい", + "BW1:最近", + "BW1:しい", + "BW1:とも", + "BW2:と同", + "TW1:という", + "UW2:さ", + "BW2:帯電", + "TB1:071062062", + "BW3:そし", + "UW2:。", + "UW5:か", + "UW5:こ", + "BW3:ない", + "BW1:んな", + "BW2:でき", + "UW4:3", + "UW3:け", + "TW4:ことが", + "BW1:こと", + "UB3:087", + "UW3:電", + "UW3:よ", + "BW1:たと", + "UW5:ま", + "UW5:た", + "UW5:ち", + "UW2:け", + "UW5:だ", + "UW3:度", + "BW1:たい", + "UW4:使", + "UW2:き", + "TW4:かなり", + "UB6:063", + "BB1:062062", + "UW4:込", + "TW3:と言っ", + "UW6:だ", + "UW5:り", + "UW5:よ", + "BW3:どう", + "UW4:…", + "UW3:や", + "BW1:かし", + "BW3:かっ", + "UW4:今", + "UW3:『", + "UW4:思", + "UB2:063", + "UW4:く", + "UW3:京", + "UW6:ー", + "UW1:ん", + "BW1:うな", + "TB2:062061061", + "UW1:と", + "TB4:062063062", + "TB2:061062062", + "BW1:この", + "BW2:ので", + "UW4:み", + "UW5:わ", + "UW6:や", + "BW1:れて", + "UW2:や", + "UW6:こ", + "UW4:な", + "UW5:め", + "BW1:もう", + "TB4:071062071", + "BW1:より", + "UW4:合", + "UW6:け", + "BW1:少し", + "BW2:でし", + "UW4:と", + "TB1:063063063", + "UW3:ー", + "BW2:くな", + "UW2:く", + "UW2:我", + "BW2:いも", + "BW3:わか", + "TB2:071063071", + "UW4:も", + "UW1:あ", + "UW4:最", + "BW1:るの", + "UW2:全", + "UW6:0", + "UW4:放", + "UW4:京", + "BW3:かけ", + "UW2:少", + "BW3:もう", + "UW2:多", + "UW2:う", + "TB1:062062040", + "UW1:を", + "UW3:光", + "BW1:!!", + "UW2:ャ", + "BW3:すぐ", + "UW4:帯", + "UW6:し", + "BW3:でも", + "BW2:、そ", + "TB3:071087087", + "TB2:063062071", + "UW3:わ", + "UB4:063", + "TB4:071071071", + "UW5:都", + "UW5:ず", + "UW2:バ", + "UW2:京", + "UW3:ゃ", + "BW1:い、", + "BW3:よく", + "BW1:たら", + "BW2:のよ", + "UW2:思", + "BW1:うに", + "BW1:の間", + "UW6:ん", + "UW6:ず", + "BW1:った", + "TW3:ること", + "BW3:とて", + 
"TW1:ような", + "UW6:ぱ", + "TB3:063071062", + "TW4:って、", + "TW4:なんて", + "TW2:その後", + "UW6:ら", + "TW4:ことに", + "UW3:>", + "TW3:てしま", + "UW3:い", + "TB4:071062061", + "UW2:ひ", + "UW6:め", + "UW6:で", + "BW3:なる", + "UW5:ご", + "BW2:りし", + "UW6:電", + "UW1:は", + "BW1:いも", + "BW3:すご", + "UW4:通", + "BW3:おり", + "BW3:かか", + "BW1:思い", + } + modelValues:intvector { + 1800, + 271, + -857, + -417, + 285, + -583, + 388, + 828, + -853, + -820, + 502, + -708, + 358, + 1341, + -586, + -451, + 257, + -1876, + 2052, + 1698, + -458, + 2048, + 1182, + -551, + 980, + 773, + -1453, + -152, + 3201, + 2865, + 1203, + 144, + -369, + -2539, + -613, + -3574, + -1111, + 3110, + -3022, + 2039, + -1091, + 1241, + -560, + -1412, + 625, + 1350, + 297, + -2404, + -595, + 1007, + -1829, + -1662, + 3213, + 270, + -911, + 178, + -727, + 2716, + -484, + -344, + 929, + -1236, + 760, + -299, + -419, + -728, + 122, + -704, + -605, + -1507, + 545, + -68, + -320, + 1498, + 953, + -323, + -575, + -673, + 520, + -450, + -1767, + -247, + 56, + 231, + -764, + 536, + 794, + -703, + -566, + 51, + 390, + 52, + -182, + 466, + 133, + 354, + 107, + 492, + 488, + -1194, + 1145, + -847, + 812, + 151, + -517, + -314, + -553, + -783, + -117, + 736, + -88, + -598, + 569, + 606, + 287, + 744, + 1739, + -217, + -219, + -144, + 234, + -649, + -757, + 834, + -819, + 869, + -275, + -267, + 154, + 653, + 594, + 255, + 1018, + 1124, + 284, + -1624, + -372, + 440, + -184, + -1936, + 1318, + -1124, + 453, + -92, + -343, + 175, + 182, + -886, + 930, + -223, + -57, + -113, + 103, + -200, + 510, + -2099, + -498, + 385, + 80, + -156, + 360, + 1289, + 771, + -1114, + -399, + 870, + 1230, + 79, + 472, + -1596, + -1092, + -572, + 55, + -151, + -124, + 1316, + -248, + 1280, + -125, + -284, + -1023, + 862, + 84, + 417, + 568, + -88, + -528, + 910, + 674, + -212, + 894, + -121, + 1108, + 762, + 260, + -197, + 91, + -53, + 1117, + -645, + -868, + -611, + 220, + 422, + 1431, + -532, + -157, + -476, + -846, + -1309, + -1614, + 1225, + 302, + -738, + 
-260, + 892, + -778, + -193, + 1221, + -779, + 489, + 420, + -85, + -525, + -830, + 26, + 270, + 439, + -120, + 1263, + -795, + 291, + -1310, + -23, + 347, + 312, + -107, + -114, + 701, + 830, + 1309, + -451, + 260, + -1080, + 536, + 188, + -60, + 643, + -1184, + 31, + -194, + -51, + -514, + -442, + -120, + 649, + 410, + 882, + -75, + -341, + -718, + -128, + 340, + -1245, + -164, + -1052, + 70, + -256, + 279, + 786, + 40, + -177, + 97, + -411, + 222, + -89, + -277, + -146, + 414, + 483, + 21, + -339, + -406, + -360, + -450, + -14, + -36, + 513, + 252, + 54, + -501, + -478, + 450, + -36, + -644, + -392, + 714, + 643, + -341, + 91, + -1018, + 34, + -177, + 123, + 80, + -695, + -44, + -357, + 253, + -389, + 613, + 515, + 418, + -396, + -553, + 193, + 298, + -334, + -57, + -315, + -77, + 33, + 88, + 137, + 280, + -448, + 196, + -136, + -295, + -329, + -92, + -360, + -132, + -288, + -45, + -43, + 174, + 75, + -60, + 330, + 360, + 217, + 130, + 473, + -41, + -23, + -340, + -530, + -69, + -71, + -115, + 297, + -240, + 229, + 507, + -348, + 171, + -320, + 239, + 16, + -195, + -277, + -41, + 69, + 280, + -264, + 30, + 249, + -97, + -163, + -221, + 96, + 83, + 82, + -218, + -93, + -53, + 40, + 28, + 285, + 27, + 283, + -211, + -92, + 214, + -225, + -54, + 53, + 105, + -198, + -53, + -277, + 198, + 184, + -264, + -106, + 14, + 185, + -155, + 185, + 106, + -119, + 53, + 208, + 92, + 262, + 106, + -52, + 105, + -25, + -79, + 104, + 141, + 129, + -114, + 26, + 64, + -113, + 26, + 77, + -64, + 13, + 13, + 26, + 89, + 115, + -49, + 89, + -114, + 51, + 64, + -64, + -51, + -38, + 89, + 13, + -64, + 13, + -48, + 76, + 63, + 62, + 13, + 112, + -76, + -50, + -13, + -49, + 63, + -50, + 13, + 13, + -50, + 24, + -12, + 24, + 12, + 24, + 12, + -12, + -24, + 12, + -12, + -12, + 12, + -12, + } +} \ No newline at end of file diff --git a/icu4c/source/python/icutools/databuilder/filtration.py b/icu4c/source/python/icutools/databuilder/filtration.py index 27d08b0a772..e9339a08955 100644 --- 
a/icu4c/source/python/icutools/databuilder/filtration.py +++ b/icu4c/source/python/icutools/databuilder/filtration.py @@ -273,8 +273,8 @@ def _preprocess_file_filters(requests, config, io): default_filter_json = "exclude" if config.strategy == "additive" else "include" for category in all_categories: filter_json = default_filter_json - # Special default for category "brkitr_lstm" as "exclude" for now. - if "brkitr_lstm" == category: + # Special default for category "brkitr_lstm" and "brkitr_adaboost" as "exclude" for now. + if "brkitr_lstm" == category or "brkitr_adaboost" == category: filter_json = "exclude" # Figure out the correct filter to create for now. if "featureFilters" in json_data and category in json_data["featureFilters"]: diff --git a/icu4c/source/test/depstest/dependencies.txt b/icu4c/source/test/depstest/dependencies.txt index 9676ed4856d..7460caa7cdd 100644 --- a/icu4c/source/test/depstest/dependencies.txt +++ b/icu4c/source/test/depstest/dependencies.txt @@ -211,7 +211,7 @@ group: breakiterator brkiter.o brkeng.o ubrk.o rbbi.o rbbinode.o rbbiscan.o rbbisetb.o rbbistbl.o rbbitblb.o rbbidata.o rbbirb.o rbbi_cache.o - dictionarydata.o dictbe.o lstmbe.o + dictionarydata.o dictbe.o lstmbe.o mlbe.o # BreakIterator::makeInstance() factory implementation makes for circular dependency # between BreakIterator base and FilteredBreakIteratorBuilder. 
filteredbrk.o diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index 17c05fb0d44..7afdb9ab828 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -42,6 +42,7 @@ #include "charstr.h" #include "cmemory.h" #include "cstr.h" +#include "cstring.h" #include "intltest.h" #include "lstmbe.h" #include "rbbitst.h" @@ -835,9 +836,28 @@ void RBBITest::TestExtended() { delete tp.bi; tp.bi = BreakIterator::createLineInstance(locale, status); skipTest = false; +#if UCONFIG_USE_ML_PHRASE_BREAKING + if(uprv_strcmp(locale.getName(), "ja@lw=phrase") == 0) { + // skip test cases of JP's phrase breaking when ML is enabled. + skipTest = true; + } +#endif charIdx += 5; break; } + if (testString.compare(charIdx-1, 8, u"") == 0) { + delete tp.bi; + tp.bi = BreakIterator::createLineInstance(locale, status); + skipTest = false; +#if !UCONFIG_USE_ML_PHRASE_BREAKING + if(uprv_strcmp(locale.getName(), "ja@lw=phrase") == 0) { + // skip test cases of JP's phrase breaking when ML is disabled. 
+ skipTest = true; + } +#endif + charIdx += 7; + break; + } if (testString.compare(charIdx-1, 6, u"") == 0) { delete tp.bi; tp.bi = BreakIterator::createSentenceInstance(locale, status); diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt index 72bd15803d6..40c6745dd06 100644 --- a/icu4c/source/test/testdata/rbbitst.txt +++ b/icu4c/source/test/testdata/rbbitst.txt @@ -1913,6 +1913,26 @@ Bangkok)• •\U0001F469\u200D\U0001F680•\U0001F469\U0001F3FD\u200D\U0001F680\u0020• +#phrase breaking test cases for the ML solution + +#9月に東京から友達が遊びに来た -> 9月に•東京から•友達が•遊びに•来た• +•\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f• +#る文字「そうだ、京都」-> る•文字•「そうだ、•京都」• +•\u308b•\u6587\u5b57•\u300c\u305d\u3046\u3060\u3001•\u4eac\u90fd\u300d• +#Kana supplement: 𛁈(U+1B048) -> \uD82C\uDC48, 𛀸(U+1B038) -> \uD82C\uDC38, 𛀙(U+1B019)-> \uD82C\uDC19 +#𛁈る𛀸(しるこ)、あ𛀙よろし(あかよろし) -> 𛁈る𛀸•(しるこ)、•あ𛀙よろし•(あかよろし) +•\U0001B048\u308B\U0001B038•\uFF08\u3057\u308B\u3053\uFF09\u3001•\u3042\U0001B019\u3088\u308D\u3057•\uFF08\u3042\u304B\u3088\u308D\u3057\uFF09• +#中国の携帯は約500元から5000元です -> 中国の▁携帯は▁約▁500元から▁5000元です +•\u4E2D\u56FD\u306E•\u643A\u5E2F\u306F•\u7D04•\uFF15\uFF10\uFF10\u5143\u304B\u3089•\uFF15\uFF10\uFF10\uFF10\u5143\u3067\u3059• +#しかもロゴがUnicode!! -> しかも▁ロゴが▁Unicode!! 
+•\u3057\u304B\u3082•\u30ED\u30B4\u304C•\uFF35\uFF4E\uFF49\uFF43\uFF4F\uFF44\uFF45\uFF01\uFF01• +#バッテリーを長持ちさせ、充電を最適化します -> バッテリーを▁長持ちさせ、▁充電を▁最適化します +•\u30D0\u30C3\u30C6\u30EA\u30FC\u3092•\u9577\u6301\u3061\u3055\u305B\u3001•\u5145\u96FB\u3092•\u6700\u9069\u5316\u3057\u307E\u3059• +#データのコピー、スマートフォンでのお支払いなど -> データの▁コピー、▁スマートフォンでの▁お支払いなど +•\u30C7\u30FC\u30BF\u306E•\u30B3\u30D4\u30FC\u3001•\u30B9\u30DE\u30FC\u30C8\u30D5\u30A9\u30F3\u3067\u306E•\u304A\u652F\u6255\u3044\u306A\u3069• + + +#phrase breaking test cases for the dictionary based solution #[京都観光]時雨殿に行った。-> [京都•観光]•時雨•殿に•行った。• •\uff3b\u4eac\u90fd•\u89b3\u5149\uff3d•\u6642\u96e8•\u6bbf\u306b•\u884c\u3063\u305f\u3002• @@ -2005,8 +2025,8 @@ Bangkok)• #大韓民國은 民主共和國이다 #•大韓民國은 •民主•共和國이다• # All the tests for ja@lw=phrase should also work in Korean. -#[京都観光]時雨殿に行った。-> [京都•観光]•時雨•殿に•行った。• -•\uff3b\u4eac\u90fd•\u89b3\u5149\uff3d•\u6642\u96e8•\u6bbf\u306b•\u884c\u3063\u305f\u3002• +#る文字「そうだ、京都」-> る•文字•「そうだ、•京都」• +•\u308b•\u6587\u5b57•\u300c\u305d\u3046\u3060\u3001•\u4eac\u90fd\u300d• #9月に東京から友達が遊びに来た -> 9月に•東京から•友達が•遊びに•来た• •\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f• -- 2.40.0