From b6b7b045e9cef2c942efd267bb89c5a545017f0c Mon Sep 17 00:00:00 2001
From: Shuhei Iitsuka <tushuhei@google.com>
Date: Fri, 29 Jul 2022 12:08:01 +0800
Subject: [PATCH] ICU-22100 Incorporate BudouX into ICU (C++)

---
 .github/adaboost.json                         |  14 +
 .github/workflows/icu_ci.yml                  |  11 +
 icu4c/source/common/BUILD.bazel               |   1 +
 icu4c/source/common/common.vcxproj            |   2 +
 icu4c/source/common/common.vcxproj.filters    |   6 +
 icu4c/source/common/common_uwp.vcxproj        |   2 +
 icu4c/source/common/dictbe.cpp                |  25 +-
 icu4c/source/common/dictbe.h                  |   4 +
 icu4c/source/common/mlbe.cpp                  | 452 +++++++++
 icu4c/source/common/mlbe.h                    | 152 +++
 icu4c/source/common/sources.txt               |   1 +
 icu4c/source/common/unicode/uconfig.h         |  10 +
 icu4c/source/data/BUILDRULES.py               |  29 +-
 icu4c/source/data/brkitr/adaboost/jaml.txt    | 940 ++++++++++++++++++
 .../python/icutools/databuilder/filtration.py |   4 +-
 icu4c/source/test/depstest/dependencies.txt   |   2 +-
 icu4c/source/test/intltest/rbbitst.cpp        |  20 +
 icu4c/source/test/testdata/rbbitst.txt        |  24 +-
 18 files changed, 1690 insertions(+), 9 deletions(-)
 create mode 100644 .github/adaboost.json
 create mode 100644 icu4c/source/common/mlbe.cpp
 create mode 100644 icu4c/source/common/mlbe.h
 create mode 100644 icu4c/source/data/brkitr/adaboost/jaml.txt
diff --git a/.github/adaboost.json b/.github/adaboost.json
new file mode 100644
index 00000000000..639fd6a99da
--- /dev/null
+++ b/.github/adaboost.json
@@ -0,0 +1,14 @@
+// © 2022 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+// Generated using tools/cldr/cldr-to-icu/build-icu-data.xml
+//
+// Include Japanese adaboost model.
+{
+  "featureFilters": {
+    "brkitr_adaboost": {
+      "includelist": [
+	    "jaml"
+      ]
+    }
+  }
+}
diff --git a/.github/workflows/icu_ci.yml b/.github/workflows/icu_ci.yml
index 90bce1ed542..1293e5edbb0 100644
--- a/.github/workflows/icu_ci.yml
+++ b/.github/workflows/icu_ci.yml
@@ -334,6 +334,17 @@ jobs:
           make clean;
           make -j2 check
 
+  # Test adaboost
+  adaboost-test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - run: |
+          cd icu4c/source;
+          ICU_DATA_FILTER_FILE=../../.github/adaboost.json CPPFLAGS=-DUCONFIG_USE_ML_PHRASE_BREAKING=1 ./runConfigureICU --enable-debug --disable-release Linux -disable-layoutex;
+          make clean;
+          make -j2 check
+
   # Build and run testmap
   testmap:
     runs-on: ubuntu-latest
diff --git a/icu4c/source/common/BUILD.bazel b/icu4c/source/common/BUILD.bazel
index e385d3b243f..47d3d24bf51 100644
--- a/icu4c/source/common/BUILD.bazel
+++ b/icu4c/source/common/BUILD.bazel
@@ -342,6 +342,7 @@ cc_library(
         "dictionarydata.cpp",
         "filteredbrk.cpp",
         "lstmbe.cpp",
+        "mlbe.cpp",
         "rbbi.cpp",
         "rbbi_cache.cpp",
         "rbbidata.cpp",
diff --git a/icu4c/source/common/common.vcxproj b/icu4c/source/common/common.vcxproj
index e35e1b0cff7..2b4cc05357a 100644
--- a/icu4c/source/common/common.vcxproj
+++ b/icu4c/source/common/common.vcxproj
@@ -88,6 +88,7 @@
     <ClCompile Include="brkiter.cpp" />
     <ClCompile Include="dictbe.cpp" />
     <ClCompile Include="lstmbe.cpp" />
+    <ClCompile Include="mlbe.cpp" />
     <ClCompile Include="pluralmap.cpp" />
     <ClCompile Include="rbbi.cpp" />
     <ClCompile Include="rbbidata.cpp" />
@@ -282,6 +283,7 @@
     <ClInclude Include="brkeng.h" />
     <ClInclude Include="dictbe.h" />
     <ClInclude Include="lstmbe.h" />
+    <ClInclude Include="mlbe.h" />
     <ClInclude Include="rbbidata.h" />
     <ClInclude Include="rbbinode.h" />
     <ClInclude Include="rbbirb.h" />
diff --git a/icu4c/source/common/common.vcxproj.filters b/icu4c/source/common/common.vcxproj.filters
index 38bc0c1b869..28a5d903429 100644
--- a/icu4c/source/common/common.vcxproj.filters
+++ b/icu4c/source/common/common.vcxproj.filters
@@ -76,6 +76,9 @@
     <ClCompile Include="lstmbe.cpp">
       <Filter>break iteration</Filter>
     </ClCompile>
+    <ClCompile Include="mlbe.cpp">
+      <Filter>break iteration</Filter>
+    </ClCompile>
     <ClCompile Include="rbbi.cpp">
       <Filter>break iteration</Filter>
     </ClCompile>
@@ -660,6 +663,9 @@
     <ClInclude Include="lstmbe.h">
       <Filter>break iteration</Filter>
     </ClInclude>
+    <ClInclude Include="mlbe.h">
+      <Filter>break iteration</Filter>
+    </ClInclude>
     <ClInclude Include="rbbidata.h">
       <Filter>break iteration</Filter>
     </ClInclude>
diff --git a/icu4c/source/common/common_uwp.vcxproj b/icu4c/source/common/common_uwp.vcxproj
index fc165629f29..5df0d57a7de 100644
--- a/icu4c/source/common/common_uwp.vcxproj
+++ b/icu4c/source/common/common_uwp.vcxproj
@@ -222,6 +222,7 @@
     <ClCompile Include="brkiter.cpp" />
     <ClCompile Include="dictbe.cpp" />
     <ClCompile Include="lstmbe.cpp" />
+    <ClCompile Include="mlbe.cpp" />
     <ClCompile Include="pluralmap.cpp" />
     <ClCompile Include="rbbi.cpp" />
     <ClCompile Include="rbbidata.cpp" />
@@ -417,6 +418,7 @@
     <ClInclude Include="brkeng.h" />
     <ClInclude Include="dictbe.h" />
     <ClInclude Include="lstmbe.h" />
+    <ClInclude Include="mlbe.h" />
     <ClInclude Include="rbbidata.h" />
     <ClInclude Include="rbbinode.h" />
     <ClInclude Include="rbbirb.h" />
diff --git a/icu4c/source/common/dictbe.cpp b/icu4c/source/common/dictbe.cpp
index 9b5434d995a..0e420c67c5d 100644
--- a/icu4c/source/common/dictbe.cpp
+++ b/icu4c/source/common/dictbe.cpp
@@ -1054,9 +1054,10 @@ foundBest:
  */
 static const uint32_t kuint32max = 0xFFFFFFFF;
 CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
-: DictionaryBreakEngine(), fDictionary(adoptDictionary) {
+: DictionaryBreakEngine(), fDictionary(adoptDictionary), isCj(false) {
     UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
     UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Hani");
+    fMlBreakEngine = nullptr;
     nfkcNorm2 = Normalizer2::getNFKCInstance(status);
     // Korean dictionary only includes Hangul syllables
     fHangulWordSet.applyPattern(UnicodeString(u"[\\uac00-\\ud7a3]"), status);
@@ -1073,11 +1074,20 @@ CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType
         if (U_SUCCESS(status)) {
             setCharacters(fHangulWordSet);
         }
-    } else { //Chinese and Japanese
+    } else { // Chinese and Japanese
         UnicodeSet cjSet(UnicodeString(u"[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]"), status);
+        isCj = true;
         if (U_SUCCESS(status)) {
             setCharacters(cjSet);
+#if UCONFIG_USE_ML_PHRASE_BREAKING
+            fMlBreakEngine = new MlBreakEngine(fDigitOrOpenPunctuationOrAlphabetSet,
+                                               fClosePunctuationSet, status);
+            if (fMlBreakEngine == nullptr) {
+                status = U_MEMORY_ALLOCATION_ERROR;
+            }
+#else
             initJapanesePhraseParameter(status);
+#endif
         }
     }
     UTRACE_EXIT_STATUS(status);
@@ -1085,6 +1095,7 @@ CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType
 
 CjkBreakEngine::~CjkBreakEngine(){
     delete fDictionary;
+    delete fMlBreakEngine;
 }
 
 // The katakanaCost values below are based on the length frequencies of all
@@ -1251,7 +1262,15 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
             }
         }
     }
-                
+
+#if UCONFIG_USE_ML_PHRASE_BREAKING
+    // PhraseBreaking is supported in ja and ko; MlBreakEngine only supports ja.
+    if (isPhraseBreaking && isCj) {
+        return fMlBreakEngine->divideUpRange(inText, rangeStart, rangeEnd, foundBreaks, inString,
+                                             inputMap, status);
+    }
+#endif
+
     // bestSnlp[i] is the snlp of the best segmentation of the first i
     // code points in the range to be matched.
     UVector32 bestSnlp(numCodePts + 1, status);
diff --git a/icu4c/source/common/dictbe.h b/icu4c/source/common/dictbe.h
index ca1a3c28b7b..a2c761bdc3a 100644
--- a/icu4c/source/common/dictbe.h
+++ b/icu4c/source/common/dictbe.h
@@ -16,11 +16,13 @@
 
 #include "brkeng.h"
 #include "hash.h"
+#include "mlbe.h"
 #include "uvectr32.h"
 
 U_NAMESPACE_BEGIN
 
 class DictionaryMatcher;
+class MlBreakEngine;
 class Normalizer2;
 
 /*******************************************************************
@@ -374,6 +376,8 @@ class CjkBreakEngine : public DictionaryBreakEngine {
 
   DictionaryMatcher        *fDictionary;
   const Normalizer2        *nfkcNorm2;
+  MlBreakEngine            *fMlBreakEngine;
+  bool                      isCj;
 
  private:
   // Load Japanese extensions.
diff --git a/icu4c/source/common/mlbe.cpp b/icu4c/source/common/mlbe.cpp
new file mode 100644
index 00000000000..3ccf470e5b1
--- /dev/null
+++ b/icu4c/source/common/mlbe.cpp
@@ -0,0 +1,452 @@
+// © 2022 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_BREAK_ITERATION
+
+#include "cmemory.h"
+#include "mlbe.h"
+#include "uassert.h"
+#include "ubrkimpl.h"
+#include "unicode/resbund.h"
+#include "unicode/udata.h"
+#include "unicode/utf16.h"
+#include "uresimp.h"
+#include "util.h"
+#include "uvectr32.h"
+
+U_NAMESPACE_BEGIN
+
+Element::Element() : length(0) {}
+
+void Element::setCharAndUblock(UChar32 ch, const UnicodeString &idx) {
+    U_ASSERT(idx.length() <= 3);  // ublock holds at most three code units plus the terminator
+    length = idx.length();
+    idx.extract(0, length, ublock);  // copy `length` code units starting at offset 0
+    ublock[length] = u'\0';
+    character = ch;
+}
+
+UChar32 Element::getCharacter() const {
+    return character;  // the value most recently stored by setCharAndUblock()
+}
+
+char16_t* Element::getUblock() const {
+    return const_cast<char16_t*>(ublock);  // non-const view of the internal block buffer
+}
+
+uint16_t Element::getLength() const {
+    return length;  // number of code units in ublock, excluding the terminating NUL
+}
+
+MlBreakEngine::MlBreakEngine(const UnicodeSet &digitOrOpenPunctuationOrAlphabetSet,
+                                 const UnicodeSet &closePunctuationSet, UErrorCode &status)
+    : fDigitOrOpenPunctuationOrAlphabetSet(digitOrOpenPunctuationOrAlphabetSet),
+      fClosePunctuationSet(closePunctuationSet),
+      fModel(status),
+      fNegativeSum(0) {
+    // Read the model only if construction so far (including fModel) succeeded.
+    if (U_SUCCESS(status)) {
+        loadMLModel(status);
+    }
+}
+
+MlBreakEngine::~MlBreakEngine() {}  // empty body: no explicit cleanup is performed here
+
+namespace {
+    constexpr char16_t INVALID = u'|';  // sentinel for "no character / no block"
+    constexpr int32_t MAX_FEATURE = 26;  // rows in the per-breakpoint feature list
+    constexpr int32_t MAX_FEATURE_LENGTH = 14;  // code units reserved for each feature string
+
+    // True unless the element's block string is exactly the one-unit INVALID sentinel.
+    bool isValid(const Element& element) {
+        return !(element.getLength() == 1 && element.getUblock()[0] == INVALID);
+    }
+
+    // Writes str followed by the first `length` code points of arr into feature.
+    void concatChar(const char16_t *str, const UChar32 *arr, int32_t length, char16_t *feature, UErrorCode &status) {
+        if (U_SUCCESS(status)) {
+            UnicodeString text(str);
+            for (int32_t pos = 0; pos < length; ++pos) text.append(arr[pos]);
+            U_ASSERT(text.length() < MAX_FEATURE_LENGTH);
+            text.extract(feature, MAX_FEATURE_LENGTH, status);  // NUL-terminates
+        }
+    }
+
+    // Copies str into feature, a buffer of MAX_FEATURE_LENGTH code units.
+    void writeString(const UnicodeString &str, char16_t *feature, UErrorCode &status) {
+        U_ASSERT(str.length() < MAX_FEATURE_LENGTH);
+        str.extract(feature, MAX_FEATURE_LENGTH, status);  // NUL-terminates
+    }
+}
+
+// Finds phrase breaks in inString with the ML model and pushes them, mapped to native indices
+// of inText, onto foundBreaks in ascending order. Returns the number of breaks added to
+// foundBreaks (0 on the early error exits); rangeStart >= rangeEnd is an illegal argument.
+int32_t MlBreakEngine::divideUpRange(UText *inText, int32_t rangeStart, int32_t rangeEnd,
+                                       UVector32 &foundBreaks, const UnicodeString &inString,
+                                       const LocalPointer<UVector32> &inputMap,
+                                       UErrorCode &status) const {
+    if (U_FAILURE(status)) return 0;
+    if (rangeStart >= rangeEnd) {
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+
+    const int32_t length = inString.countChar32();  // counted once; each call scans inString
+    UVector32 boundary(length + 1, status);
+    if (U_FAILURE(status)) return 0;
+    int32_t numBreaks = 0;
+    UChar32 ch;
+    UnicodeString index;
+    // The ML model groups six char to evaluate if the 4th char is a breakpoint.
+    // Like a sliding window, the elementList removes the first char and appends the new char from
+    // inString in each iteration so that its size always remains at six.
+    Element elementList[6];
+
+    int32_t codeUts = initElementList(inString, elementList, status);  // next UTF-16 offset
+
+    // Add a break for the start.
+    boundary.addElement(0, status);
+    numBreaks++;
+    if (U_FAILURE(status)) return 0;
+
+    for (int32_t i = 1; i < length && U_SUCCESS(status); i++) {
+        evaluateBreakpoint(elementList, i, numBreaks, boundary, status);
+        if (i + 1 >= length) break;
+        // Remove the first element and append a new element
+        uprv_memmove(elementList, elementList + 1, 5 * sizeof(Element));
+        // codeUts sits on a code point boundary, so this test needs no rescan of inString.
+        ch = codeUts < inString.length() ? inString.char32At(codeUts) : INVALID;
+        index = (ch != INVALID) ? getUnicodeBlock(ch, status) : UnicodeString(INVALID);
+        elementList[5].setCharAndUblock(ch, index);
+        if (ch != INVALID) {
+            codeUts += U16_LENGTH(ch);
+        }
+    }
+    if (U_FAILURE(status)) return 0;
+
+    // Add a break for the end if there is not one there already.
+    if (boundary.lastElementi() != length) {
+        boundary.addElement(length, status);
+        numBreaks++;
+    }
+
+    int32_t prevCPPos = -1;
+    int32_t prevUTextPos = -1;
+    int32_t correctedNumBreaks = 0;
+    for (int32_t i = 0; i < numBreaks; i++) {
+        int32_t cpPos = boundary.elementAti(i);
+        int32_t utextPos = inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
+        U_ASSERT(cpPos > prevCPPos);
+        U_ASSERT(utextPos >= prevUTextPos);
+
+        if (utextPos > prevUTextPos) {
+            // The break at rangeStart is only reported when it follows close punctuation.
+            if (utextPos != rangeStart ||
+                (utextPos > 0 &&
+                 fClosePunctuationSet.contains(utext_char32At(inText, utextPos - 1)))) {
+                foundBreaks.push(utextPos, status);
+                correctedNumBreaks++;
+            }
+        }
+        // Otherwise normalization expanded the input text and a boundary was found within
+        // the expansion, giving two boundaries with the same index in the original text.
+        // Ignore the second (see ticket #12918). numBreaks is the bound of this ascending
+        // loop, so it must not be decremented here or trailing boundaries would be skipped.
+        prevCPPos = cpPos;
+        prevUTextPos = utextPos;
+    }
+    (void)prevCPPos;  // suppress compiler warnings about unused variable
+
+    UChar32 nextChar = utext_char32At(inText, rangeEnd);
+    if (!foundBreaks.isEmpty() && foundBreaks.peeki() == rangeEnd) {
+        // In phrase breaking, there has to be a breakpoint between Cj character and
+        // the number/open punctuation.
+        // E.g. る文字「そうだ、京都」->る▁文字▁「そうだ、▁京都」-> breakpoint between 字 and「
+        // E.g. 乗車率９０％程度だろうか -> 乗車▁率▁９０％▁程度だろうか -> breakpoint between 率 and ９
+        // E.g. しかもロゴがＵｎｉｃｏｄｅ！ -> しかも▁ロゴが▁Ｕｎｉｃｏｄｅ！-> breakpoint between が and Ｕ
+        if (!fDigitOrOpenPunctuationOrAlphabetSet.contains(nextChar)) {
+            foundBreaks.popi();
+            correctedNumBreaks--;
+        }
+    }
+
+    return correctedNumBreaks;
+}
+
+void MlBreakEngine::evaluateBreakpoint(Element* elementList, int32_t index, int32_t &numBreaks,
+                                         UVector32 &boundary, UErrorCode &status) const {  // adds index to boundary when the score is positive
+    char16_t featureList[MAX_FEATURE][MAX_FEATURE_LENGTH];  // feature strings built for this position
+    if (U_FAILURE(status)) {
+        return;
+    }
+
+    UChar32 arr[4] = {-1, -1, -1, -1};  // scratch code points handed to concatChar
+    int32_t length = 0, listLength = 0;  // length: code points per n-gram; listLength: features written
+
+    const UChar32 w1 = elementList[0].getCharacter();
+    const UChar32 w2 = elementList[1].getCharacter();
+    const UChar32 w3 = elementList[2].getCharacter();
+    const UChar32 w4 = elementList[3].getCharacter();
+    const UChar32 w5 = elementList[4].getCharacter();
+    const UChar32 w6 = elementList[5].getCharacter();
+
+    length = 1;  // UW1..UW6: one feature per window character that is not INVALID
+    if (w1 != INVALID) {
+        arr[0] = w1;
+        concatChar(u"UW1:", arr, length, featureList[listLength++], status);
+    }
+    if (w2 != INVALID) {
+        arr[0] = w2;
+        concatChar(u"UW2:", arr, length, featureList[listLength++], status);
+    }
+    if (w3 != INVALID) {
+        arr[0] = w3;
+        concatChar(u"UW3:", arr, length, featureList[listLength++], status);
+    }
+    if (w4 != INVALID) {
+        arr[0] = w4;
+        concatChar(u"UW4:", arr, length, featureList[listLength++], status);
+    }
+    if (w5 != INVALID) {
+        arr[0] = w5;
+        concatChar(u"UW5:", arr, length, featureList[listLength++], status);
+    }
+    if (w6 != INVALID) {
+        arr[0] = w6;
+        concatChar(u"UW6:", arr, length, featureList[listLength++], status);
+    }
+    length = 2;  // BW1..BW3: adjacent character pairs w2w3, w3w4, w4w5
+    if (w2 != INVALID && w3 != INVALID) {
+        arr[0] = w2;
+        arr[1] = w3;
+        concatChar(u"BW1:", arr, length, featureList[listLength++], status);
+    }
+    if (w3 != INVALID && w4 != INVALID) {
+        arr[0] = w3;
+        arr[1] = w4;
+        concatChar(u"BW2:", arr, length, featureList[listLength++], status);
+    }
+    if (w4 != INVALID && w5 != INVALID) {
+        arr[0] = w4;
+        arr[1] = w5;
+        concatChar(u"BW3:", arr, length, featureList[listLength++], status);
+    }
+    length = 3;  // TW1..TW4: adjacent character triples starting at w1, w2, w3, w4
+    if (w1 != INVALID && w2 != INVALID && w3 != INVALID) {
+        arr[0] = w1;
+        arr[1] = w2;
+        arr[2] = w3;
+        concatChar(u"TW1:", arr, length, featureList[listLength++], status);
+    }
+    if (w2 != INVALID && w3 != INVALID && w4 != INVALID) {
+        arr[0] = w2;
+        arr[1] = w3;
+        arr[2] = w4;
+        concatChar(u"TW2:", arr, length, featureList[listLength++], status);
+    }
+    if (w3 != INVALID && w4 != INVALID && w5 != INVALID) {
+        arr[0] = w3;
+        arr[1] = w4;
+        arr[2] = w5;
+        concatChar(u"TW3:", arr, length, featureList[listLength++], status);
+    }
+    if (w4 != INVALID && w5 != INVALID && w6 != INVALID) {
+        arr[0] = w4;
+        arr[1] = w5;
+        arr[2] = w6;
+        concatChar(u"TW4:", arr, length, featureList[listLength++], status);
+    }
+    if (isValid(elementList[0])) {  // UB1..UB6: block string of each element that isValid() accepts
+        writeString(UnicodeString(u"UB1:").append(elementList[0].getUblock(), 0,
+                                                  elementList[0].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[1])) {
+        writeString(UnicodeString(u"UB2:").append(elementList[1].getUblock(), 0,
+                                                  elementList[1].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[2])) {
+        writeString(UnicodeString(u"UB3:").append(elementList[2].getUblock(), 0,
+                                                  elementList[2].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[3])) {
+        writeString(UnicodeString(u"UB4:").append(elementList[3].getUblock(), 0,
+                                                  elementList[3].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[4])) {
+        writeString(UnicodeString(u"UB5:").append(elementList[4].getUblock(), 0,
+                                                  elementList[4].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[5])) {
+        writeString(UnicodeString(u"UB6:").append(elementList[5].getUblock(), 0,
+                                                  elementList[5].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[1]) && isValid(elementList[2])) {  // BB1..BB3: block strings of adjacent pairs
+        writeString(UnicodeString(u"BB1:")
+                        .append(elementList[1].getUblock(), 0, elementList[1].getLength())
+                        .append(elementList[2].getUblock(), 0, elementList[2].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[2]) && isValid(elementList[3])) {
+        writeString(UnicodeString(u"BB2:")
+                        .append(elementList[2].getUblock(), 0, elementList[2].getLength())
+                        .append(elementList[3].getUblock(), 0, elementList[3].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[3]) && isValid(elementList[4])) {
+        writeString(UnicodeString(u"BB3:")
+                        .append(elementList[3].getUblock(), 0, elementList[3].getLength())
+                        .append(elementList[4].getUblock(), 0, elementList[4].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[0]) && isValid(elementList[1]) && isValid(elementList[2])) {  // TB1..TB4: block strings of adjacent triples
+        writeString(UnicodeString(u"TB1:")
+                        .append(elementList[0].getUblock(), 0, elementList[0].getLength())
+                        .append(elementList[1].getUblock(), 0, elementList[1].getLength())
+                        .append(elementList[2].getUblock(), 0, elementList[2].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[1]) && isValid(elementList[2]) && isValid(elementList[3])) {
+        writeString(UnicodeString(u"TB2:")
+                        .append(elementList[1].getUblock(), 0, elementList[1].getLength())
+                        .append(elementList[2].getUblock(), 0, elementList[2].getLength())
+                        .append(elementList[3].getUblock(), 0, elementList[3].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[2]) && isValid(elementList[3]) && isValid(elementList[4])) {
+        writeString(UnicodeString(u"TB3:")
+                        .append(elementList[2].getUblock(), 0, elementList[2].getLength())
+                        .append(elementList[3].getUblock(), 0, elementList[3].getLength())
+                        .append(elementList[4].getUblock(), 0, elementList[4].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[3]) && isValid(elementList[4]) && isValid(elementList[5])) {
+        writeString(UnicodeString(u"TB4:")
+                        .append(elementList[3].getUblock(), 0, elementList[3].getLength())
+                        .append(elementList[4].getUblock(), 0, elementList[4].getLength())
+                        .append(elementList[5].getUblock(), 0, elementList[5].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (U_FAILURE(status)) {
+        return;
+    }
+    int32_t score = fNegativeSum;  // start from fNegativeSum, then add twice each matched weight
+    for (int32_t j = 0; j < listLength; j++) {
+        UnicodeString key(featureList[j]);
+        if (fModel.containsKey(key)) {  // features absent from the model contribute nothing
+            score += (2 * fModel.geti(key));
+        }
+    }
+    if (score > 0) {  // positive score: record index as a boundary
+        boundary.addElement(index, status);
+        numBreaks++;
+    }
+}
+
+// Builds the initial six-slot sliding window. Slots 0 and 1 stand for the positions before
+// the start of the text and are always INVALID; slots 2..5 receive the first (up to) four
+// code points of inString. Returns the number of UTF-16 code units consumed, which is the
+// offset of the next code point to append, or 0 if status already indicates a failure.
+int32_t MlBreakEngine::initElementList(const UnicodeString &inString, Element* elementList,
+                                         UErrorCode &status) const {
+    if (U_FAILURE(status)) {
+        return 0;
+    }
+
+    const int32_t kWindowSize = 6;  // elements in the sliding window
+    const int32_t kLeadingPadding = 2;  // window slots that precede the first character
+    const int32_t length = inString.countChar32();
+    int32_t index = 0;
+    UChar32 window[kWindowSize];
+    for (int32_t slot = 0; slot < kWindowSize; slot++) {
+        window[slot] = INVALID;
+    }
+
+    // Slots 2..5 receive the leading code points of inString, as far as they exist.
+    for (int32_t slot = kLeadingPadding; slot < kWindowSize; slot++) {
+        if (slot - kLeadingPadding >= length) {
+            break;  // inString is shorter than the window; remaining slots stay INVALID
+        }
+        window[slot] = inString.char32At(index);
+        index += U16_LENGTH(window[slot]);
+    }
+
+    for (int32_t slot = 0; slot < kWindowSize; slot++) {
+        // A slot without a character must carry the INVALID block, exactly as
+        // divideUpRange() does when it slides the window past the end of the text.
+        // Looking up the block of the '|' sentinel itself would yield a real block
+        // and make isValid() accept the empty slot, adding spurious UB/BB/TB features
+        // for inputs shorter than four code points.
+        UnicodeString block(INVALID);
+        if (window[slot] != INVALID) {
+            block = getUnicodeBlock(window[slot], status);
+        }
+        elementList[slot].setCharAndUblock(window[slot], block);
+    }
+    return index;
+}
+
+UnicodeString MlBreakEngine::getUnicodeBlock(UChar32 ch, UErrorCode &status) const {
+    // The INVALID marker is the answer on failure and for code points without a block.
+    UnicodeString blockIndex(INVALID);
+    if (U_SUCCESS(status)) {
+        const UBlockCode block = ublock_getCode(ch);
+        if (block != UBLOCK_NO_BLOCK && block != UBLOCK_INVALID_CODE) {
+            // Replace the marker with the block code as decimal digits, zero-padded
+            // to at least three places. Same as sprintf("%03d", block)
+            blockIndex.remove();
+            ICU_Utility::appendNumber(blockIndex, static_cast<int32_t>(block), 10, 3);
+        }
+    }
+    return blockIndex;
+}
+
+void MlBreakEngine::loadMLModel(UErrorCode &error) {
+    // BudouX's model consists of pairs of the feature and its score. In jaml.txt, modelKeys
+    // holds the ML features and modelValues the score of the feature at the same index.
+    // Fails with U_INVALID_FORMAT_ERROR if there are more keys than scores.
+
+    if (U_FAILURE(error)) return;
+
+    int32_t keySize = 0;
+    int32_t valueSize = 0;
+    int32_t stringLength = 0;
+    UnicodeString key;
+    StackUResourceBundle stackTempBundle;
+    ResourceDataValue modelKey;
+
+    LocalUResourceBundlePointer rbp(ures_openDirect(U_ICUDATA_BRKITR, "jaml", &error));
+    UResourceBundle* rb = rbp.getAlias();  // rbp stays the owner and closes the bundle
+    // get modelValues
+    LocalUResourceBundlePointer modelValue(ures_getByKey(rb, "modelValues", nullptr, &error));
+    const int32_t* value = ures_getIntVector(modelValue.getAlias(), &valueSize, &error);
+    if (U_FAILURE(error)) return;
+
+    // get modelKeys
+    ures_getValueWithFallback(rb, "modelKeys", stackTempBundle.getAlias(), modelKey, error);
+    ResourceArray stringArray = modelKey.getArray(error);
+    keySize = stringArray.getSize();
+    if (U_SUCCESS(error) && keySize > valueSize) error = U_INVALID_FORMAT_ERROR;
+    if (U_FAILURE(error)) return;  // also taken when value[idx] would not exist for some key
+
+    for (int32_t idx = 0; idx < keySize; idx++) {
+        stringArray.getValue(idx, modelKey);
+        key = UnicodeString(modelKey.getString(stringLength, error));
+        if (U_SUCCESS(error)) {
+            // fNegativeSum accumulates minus the sum of all scores put into fModel.
+            fNegativeSum -= value[idx];
+            fModel.puti(key, value[idx], error);
+        }
+    }
+}
+
+U_NAMESPACE_END
+
+#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
diff --git a/icu4c/source/common/mlbe.h b/icu4c/source/common/mlbe.h
new file mode 100644
index 00000000000..8943fa3414f
--- /dev/null
+++ b/icu4c/source/common/mlbe.h
@@ -0,0 +1,152 @@
+// © 2022 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#ifndef MLBREAKENGINE_H
+#define MLBREAKENGINE_H
+
+#include "hash.h"
+#include "unicode/uniset.h"
+#include "unicode/utext.h"
+#include "uvectr32.h"
+
+U_NAMESPACE_BEGIN
+
+#if !UCONFIG_NO_BREAK_ITERATION
+
+/**
+ * A class used to encapsulate a character and its unicode block index
+ */
+class Element : public UMemory {
+   public:
+    /**
+     * Default constructor. Sets the block length to 0.
+     */
+    Element();
+
+    /**
+     * Set the character and its unicode block string.
+     * The block string must be at most 3 code units long (checked with U_ASSERT).
+     * @param ch A unicode character.
+     * @param ublock The unicode block of the character, as a string.
+     */
+    void setCharAndUblock(UChar32 ch, const UnicodeString& ublock);
+
+    /**
+     * Get the unicode character.
+     *
+     * @return The unicode character passed to the last setCharAndUblock() call.
+     */
+    UChar32 getCharacter() const;
+
+    /**
+     * Get the unicode character's unicode block.
+     *
+     * @return Pointer to the internal, NUL-terminated unicode block string.
+     */
+    char16_t* getUblock() const;
+
+    /**
+     * Get the length of the unicode block.
+     *
+     * @return The number of code units in the block string, excluding the terminator.
+     */
+    uint16_t getLength() const;
+
+   private:
+    UChar32 character;  // character stored by setCharAndUblock()
+    char16_t ublock[4];  // block string: up to 3 code units plus the terminating NUL
+    uint16_t length;  // code units used in ublock
+};
+
+/**
+ * A machine learning break engine for the phrase breaking in Japanese.
+ */
+class MlBreakEngine : public UMemory {
+   public:
+    /**
+     * Constructor. Copies both sets and, if status is a success, loads the ML model.
+     *
+     * @param digitOrOpenPunctuationOrAlphabetSet An UnicodeSet with the digit, open punctuation and
+     * alphabet.
+     * @param closePunctuationSet An UnicodeSet with close punctuation.
+     * @param status Information on any errors encountered.
+     */
+    MlBreakEngine(const UnicodeSet &digitOrOpenPunctuationOrAlphabetSet,
+                    const UnicodeSet &closePunctuationSet, UErrorCode &status);
+
+    /**
+     * Virtual destructor.
+     */
+    virtual ~MlBreakEngine();
+
+   public:
+    /**
+     * Divide up a range of characters handled by this break engine.
+     *
+     * @param inText A UText representing the text
+     * @param rangeStart The start of the range of the characters
+     * @param rangeEnd The end of the range of the characters; must be greater than rangeStart
+     * @param foundBreaks Output vector; the native indices of the breaks found are pushed onto it
+     * @param inString The normalized string of text ranging from rangeStart to rangeEnd
+     * @param inputMap Maps positions in inString to native indices of inText; may be invalid
+     * @param status Information on any errors encountered.
+     * @return The number of breaks added to foundBreaks
+     */
+    int32_t divideUpRange(UText *inText, int32_t rangeStart, int32_t rangeEnd,
+                          UVector32 &foundBreaks, const UnicodeString &inString,
+                          const LocalPointer<UVector32> &inputMap, UErrorCode &status) const;
+
+   private:
+    /**
+     * Load the machine learning's model file.
+     *
+     * @param error Information on any errors encountered.
+     */
+    void loadMLModel(UErrorCode &error);
+
+    /**
+     * Get the character's unicode block code defined in UBlockCode.
+     *
+     * @param ch A character.
+     * @param status Information on any errors encountered.
+     * @return The unicode block code which is 3 digits with '0' added in the beginning if the code
+     * is less than 3 digits, or the one-character string "|" if ch has no block or status
+     * already indicates a failure.
+     */
+    UnicodeString getUnicodeBlock(UChar32 ch, UErrorCode &status) const;
+
+    /**
+     * Initialize the element list from the input string.
+     *
+     * @param inString A input string to be segmented.
+     * @param elementList A list of six elements; the leading characters of inString go to slots 2-5.
+     * @param status Information on any errors encountered.
+     * @return The number of code units of the characters of inString stored in elementList.
+     */
+    int32_t initElementList(const UnicodeString &inString, Element* elementList,
+                            UErrorCode &status) const;
+
+    /**
+     * Evaluate whether the index is a potential breakpoint.
+     *
+     * @param elementList A list including 6 elements for the breakpoint evaluation.
+     * @param index The breakpoint index to be evaluated.
+     * @param numBreaks The accumulated number of breakpoints; incremented when index is added.
+     * @param boundary A vector including the index of the breakpoint.
+     * @param status Information on any errors encountered.
+     */
+    void evaluateBreakpoint(Element* elementList, int32_t index, int32_t &numBreaks,
+                            UVector32 &boundary, UErrorCode &status) const;
+
+    UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;  // copy of the set given to the constructor
+    UnicodeSet fClosePunctuationSet;  // copy of the set given to the constructor
+    Hashtable fModel;  // feature string -> score, filled by loadMLModel()
+    int32_t fNegativeSum;  // minus the sum of all scores stored in fModel
+};
+
+#endif
+
+U_NAMESPACE_END
+
+/* MLBREAKENGINE_H */
+#endif
diff --git a/icu4c/source/common/sources.txt b/icu4c/source/common/sources.txt
index e5c39dd2ce3..90171fe9bd4 100644
--- a/icu4c/source/common/sources.txt
+++ b/icu4c/source/common/sources.txt
@@ -43,6 +43,7 @@ locutil.cpp
 lsr.cpp
 lstmbe.cpp
 messagepattern.cpp
+mlbe.cpp
 normalizer2.cpp
 normalizer2impl.cpp
 normlzr.cpp
diff --git a/icu4c/source/common/unicode/uconfig.h b/icu4c/source/common/unicode/uconfig.h
index bbc232d1ed8..3818ca02ef8 100644
--- a/icu4c/source/common/unicode/uconfig.h
+++ b/icu4c/source/common/unicode/uconfig.h
@@ -323,6 +323,16 @@
 #   define UCONFIG_NO_NORMALIZATION 0
 #endif
 
+/**
+ * \def UCONFIG_USE_ML_PHRASE_BREAKING
+ * This switch turns on BudouX ML phrase-based line breaking, rather than using the dictionary.
+ * Defaults to 0 (off) when not already defined before this point.
+ * @internal
+ */
+#ifndef UCONFIG_USE_ML_PHRASE_BREAKING
+#   define UCONFIG_USE_ML_PHRASE_BREAKING 0
+#endif
+
 #if UCONFIG_NO_NORMALIZATION
     /* common library */
     /* ICU 50 CJK dictionary BreakIterator uses normalization */
diff --git a/icu4c/source/data/BUILDRULES.py b/icu4c/source/data/BUILDRULES.py
index 899cba25b48..2608cb0227b 100644
--- a/icu4c/source/data/BUILDRULES.py
+++ b/icu4c/source/data/BUILDRULES.py
@@ -27,6 +27,7 @@ def generate(config, io, common_vars):
     requests += generate_conversion_mappings(config, io, common_vars)
     requests += generate_brkitr_brk(config, io, common_vars)
     requests += generate_brkitr_lstm(config, io, common_vars)
+    requests += generate_brkitr_adaboost(config, io, common_vars)
     requests += generate_stringprep(config, io, common_vars)
     requests += generate_brkitr_dictionaries(config, io, common_vars)
     requests += generate_normalization(config, io, common_vars)
@@ -184,7 +185,7 @@ def generate_brkitr_brk(config, io, common_vars):
             category = "brkitr_rules",
             dep_targets =
                 [DepTarget("cnvalias"),
-                    DepTarget("ulayout"), DepTarget("uemoji"), DepTarget("lstm_res")],
+                    DepTarget("ulayout"), DepTarget("uemoji"), DepTarget("lstm_res"), DepTarget("adaboost_res")],
             input_files = input_files,
             output_files = output_files,
             tool = IcuTool("genbrk"),
@@ -506,6 +507,32 @@ def generate_brkitr_lstm(config, io, common_vars):
         )
     ]
 
+def generate_brkitr_adaboost(config, io, common_vars):  # genrb requests for the brkitr/adaboost/*.txt model files
+    input_files = [InFile(filename) for filename in io.glob("brkitr/adaboost/*.txt")]
+    input_basenames = [v.filename[16:] for v in input_files]  # 16 == len("brkitr/adaboost/"); assumes filenames keep the glob's directory prefix
+    output_files = [
+        OutFile("brkitr/%s.res" % v[:-4])  # v[:-4] drops the ".txt" extension matched by the glob
+        for v in input_basenames
+    ]
+    return [
+        RepeatedOrSingleExecutionRequest(
+            name = "adaboost_res",
+            category = "brkitr_adaboost",
+            dep_targets = [],
+            input_files = input_files,
+            output_files = output_files,
+            tool = IcuTool("genrb"),
+            args = "-s {IN_DIR}/brkitr/adaboost -d {OUT_DIR}/brkitr -i {OUT_DIR} "
+                "-k "
+                "{INPUT_BASENAME}",
+            format_with = {
+            },
+            repeat_with = {
+                "INPUT_BASENAME": utils.SpaceSeparatedList(input_basenames)
+            }
+        )
+    ]
+
 def generate_tree(
         config,
         io,
diff --git a/icu4c/source/data/brkitr/adaboost/jaml.txt b/icu4c/source/data/brkitr/adaboost/jaml.txt
new file mode 100644
index 00000000000..0500ff73fbf
--- /dev/null
+++ b/icu4c/source/data/brkitr/adaboost/jaml.txt
@@ -0,0 +1,940 @@
+// © 2022 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+jaml {
+    modelKeys {
+        "BB2:062071",
+        "UB3:061",
+        "UB3:071",
+        "TB2:062062062",
+        "TB4:062062062",
+        "UB3:063",
+        "UB4:071",
+        "BB3:062062",
+        "UB4:062",
+        "BB1:062071",
+        "BB1:062061",
+        "UB4:061",
+        "TB1:071071062",
+        "TB3:062063063",
+        "UB2:061",
+        "TB1:062071062",
+        "TB3:062062062",
+        "BB2:063063",
+        "UW3:ã¯",
+        "UW3:ã«",
+        "TB3:062071062",
+        "UW3:ã",
+        "UW4:ã",
+        "UB5:061",
+        "UW3:ã¨",
+        "TB4:063063063",
+        "UW4:ã¦",
+        "TB2:062062061",
+        "UW3:ã",
+        "UW4:ã",
+        "UW3:ã®",
+        "BB3:071071",
+        "BB3:062071",
+        "UW3:ã",
+        "UW3:ã",
+        "UW4:ã",
+        "UW4:ã®",
+        "UW3:ã",
+        "UW4:ã",
+        "UW3:ã",
+        "UW5:ã§",
+        "UW4:ã",
+        "BB2:062062",
+        "UW4:ã£",
+        "UW5:ã£",
+        "UW3:ã",
+        "UW5:ã",
+        "UW3:ã",
+        "UW5:ãª",
+        "UW4:ã",
+        "UW4:ã",
+        "UW3:ã£",
+        "UW4:ã",
+        "UW4:ã",
+        "BB2:087087",
+        "UB4:087",
+        "UW5:ã«",
+        "BW3:ãã®",
+        "UW5:ã",
+        "UW6:ã",
+        "BW2:ã¨ã",
+        "UW4:ã«",
+        "UW3:ã",
+        "TB2:071062071",
+        "UW4:ã§",
+        "UW5:ã",
+        "BB1:071071",
+        "UW5:ã¯",
+        "UW4:ã¯",
+        "UW4:ã",
+        "UW5:ã",
+        "BB2:071062",
+        "BB2:071071",
+        "UW3:ã»",
+        "BB2:071087",
+        "BB2:061062",
+        "TB1:062061062",
+        "UW3:ã",
+        "BB2:087062",
+        "TB2:087087087",
+        "UW4:ã",
+        "TB1:071071071",
+        "UB2:071",
+        "TB1:062062087",
+        "UW5:ã",
+        "UW5:ã",
+        "UW3:ã§",
+        "UW4:ã",
+        "UW3:ã",
+        "TB4:071062062",
+        "UW3:ã",
+        "UW6:ã«",
+        "UW6:ã",
+        "UW3:ã",
+        "TB1:061071071",
+        "UW5:ã",
+        "UB1:063",
+        "UW1:ã",
+        "UW3:ã",
+        "BW3:ã¨ã",
+        "BW3:ã¨ã",
+        "UW3:ã¾",
+        "BW3:ãã¨",
+        "UW2:ã£",
+        "UW5:ã»",
+        "TB3:062062061",
+        "UW3:ã",
+        "UW4:ã",
+        "UB3:062",
+        "UW3:ã",
+        "UW3:ã",
+        "UW5:ã",
+        "BB2:062087",
+        "BW3:ãã",
+        "UW5:ã",
+        "UW2:ä¸",
+        "UW3:ï¼",
+        "UW1:ã«",
+        "UW2:ã¨",
+        "TB2:071071062",
+        "TB2:071071071",
+        "UW5:ã",
+        "UW4:ã",
+        "BW1:ãã",
+        "UW3:ã¡",
+        "BW3:ãã",
+        "UW2:ã¯",
+        "UW6:ã",
+        "TB1:063063062",
+        "UW4:ï¼",
+        "UW4:ã",
+        "UW2:ã",
+        "UW3:ï¼½",
+        "UW4:ã»",
+        "TB3:062087087",
+        "BW2:ã§ã",
+        "UW4:ã ",
+        "BB3:071062",
+        "TB1:087087087",
+        "BW3:ã»ã»",
+        "BW3:ã¨ã",
+        "UW4:ã",
+        "UW3:ã¦",
+        "UW4:ã",
+        "UW2:ã",
+        "TB4:071071062",
+        "TB2:062061071",
+        "UW2:ã",
+        "UW4:ã",
+        "UW2:ã§",
+        "TB3:071071071",
+        "BB1:087087",
+        "UW2:ã",
+        "UW4:åº",
+        "UW2:ã¾",
+        "UW4:ï¼",
+        "UW5:ã¨",
+        "UW4:ã©",
+        "BW3:ãã¦",
+        "UW1:ã§",
+        "BB2:061071",
+        "BW3:ãã",
+        "BW2:ã¨ã",
+        "BW2:ãªã",
+        "BW2:ã¦ã",
+        "UW3:é",
+        "UW3:ï¼",
+        "UW5:ã¼",
+        "UW4:ã",
+        "UW4:ï¼",
+        "BW1:ã¨ã",
+        "UW5:ã®",
+        "TB4:062062071",
+        "TB2:061071071",
+        "UW6:ã»",
+        "UW3:ï¼",
+        "UW2:ã¦",
+        "UW3:ç¬",
+        "UW2:ã",
+        "UW5:ã",
+        "BW3:ãã",
+        "UW3:äºº",
+        "UW2:ã®",
+        "UW3:ã",
+        "UW3:æ¥",
+        "UW1:ã",
+        "BW2:ã¨ã",
+        "UW4:ç§",
+        "UW3:â¦",
+        "UW2:ã«",
+        "UW3:ä»",
+        "BB3:087062",
+        "UB3:055",
+        "UW4:ï¼",
+        "BB1:087071",
+        "UW1:ãª",
+        "BB3:063063",
+        "UW5:æ¥",
+        "UW3:ï¼",
+        "TW3:ã¦ãã",
+        "UW4:ã",
+        "UW4:å",
+        "BW1:ãã",
+        "UW4:ã¤",
+        "UW3:ï¼",
+        "BW1:ã§ã¯",
+        "UW2:ã",
+        "UW5:ã",
+        "UW4:ã¼",
+        "TW2:æ°ã«å¥",
+        "UW4:ç¬",
+        "UW4:ã²",
+        "TB4:087087087",
+        "UW4:ã",
+        "UW2:ã",
+        "BW3:ã¡ã",
+        "BW3:åºæ¥",
+        "TB2:062071062",
+        "UW4:ã",
+        "UW3:ï¼»",
+        "UW4:ï¼",
+        "UW5:ã¤",
+        "TB1:061071062",
+        "UW3:ï¼",
+        "BW3:ãã",
+        "UB5:071",
+        "UW4:ã¾",
+        "UW3:ã°",
+        "UW3:ã",
+        "BW3:ãã®",
+        "UW3:ã",
+        "UW4:ã",
+        "BW2:ã¦ã",
+        "TB2:071062062",
+        "BW1:ãªã",
+        "UW2:ã",
+        "UB2:087",
+        "UW6:ã®",
+        "UW2:æ¯",
+        "UW2:çµ",
+        "TW4:ã®äº¬é½",
+        "UW3:ã",
+        "UW2:æ",
+        "BW2:ã§ã",
+        "UW2:ã",
+        "UW5:ã",
+        "UW3:ã ",
+        "TW4:ã¨ãã",
+        "UW4:ï¼",
+        "UB1:062",
+        "UW6:ã¦",
+        "UW1:ã",
+        "BW2:ãã¨",
+        "UW3:ï¼",
+        "UW3:ã",
+        "UW3:ä¸­",
+        "UW4:ã",
+        "BW3:ãã®",
+        "UW2:ã",
+        "UW3:ã¿",
+        "TW2:ã§ã¯ãª",
+        "UW6:ã¨",
+        "UW4:ï¼»",
+        "TW3:ããã",
+        "BW3:ãã",
+        "UW4:ï¼",
+        "UW6:ã",
+        "UW4:é»",
+        "BB1:062040",
+        "UW3:å¾",
+        "UW5:ã",
+        "UW2:ã",
+        "UW5:ã¦",
+        "BB2:062040",
+        "UW3:ç",
+        "UW3:ã",
+        "UW5:ã",
+        "UB5:087",
+        "TW3:ã¨ãã",
+        "UW3:å",
+        "UB6:071",
+        "BW3:ãªã£",
+        "UW4:ã",
+        "BB2:061061",
+        "TW3:ã¨ãã",
+        "UB1:071",
+        "UW1:ã",
+        "BW1:ã¨ã",
+        "UW3:ãª",
+        "UW6:ã",
+        "UW4:é",
+        "UW3:ã¹",
+        "UW5:ã¹",
+        "TB4:062071062",
+        "UW4:ï¼½",
+        "BW2:ã«ã¯",
+        "UW5:ã",
+        "BW1:ãã»",
+        "BW1:ãã®",
+        "UW1:ã",
+        "UW4:ï¼",
+        "UW6:ã£",
+        "TB3:063063063",
+        "TB3:062071071",
+        "UB5:063",
+        "BW1:ãã",
+        "UW6:ã",
+        "TB4:062063063",
+        "UW3:ã©",
+        "TW3:ã§ãã",
+        "TW4:ããã",
+        "BW1:æè¿",
+        "BW1:ãã",
+        "BW1:ã¨ã",
+        "BW2:ã¨å",
+        "TW1:ã¨ãã",
+        "UW2:ã",
+        "BW2:å¸¯é»",
+        "TB1:071062062",
+        "BW3:ãã",
+        "UW2:ã",
+        "UW5:ã",
+        "UW5:ã",
+        "BW3:ãªã",
+        "BW1:ããª",
+        "BW2:ã§ã",
+        "UW4:ï¼",
+        "UW3:ã",
+        "TW4:ãã¨ã",
+        "BW1:ãã¨",
+        "UB3:087",
+        "UW3:é»",
+        "UW3:ã",
+        "BW1:ãã¨",
+        "UW5:ã¾",
+        "UW5:ã",
+        "UW5:ã¡",
+        "UW2:ã",
+        "UW5:ã ",
+        "UW3:åº¦",
+        "BW1:ãã",
+        "UW4:ä½¿",
+        "UW2:ã",
+        "TW4:ããªã",
+        "UB6:063",
+        "BB1:062062",
+        "UW4:è¾¼",
+        "TW3:ã¨è¨ã£",
+        "UW6:ã ",
+        "UW5:ã",
+        "UW5:ã",
+        "BW3:ã©ã",
+        "UW4:â¦",
+        "UW3:ã",
+        "BW1:ãã",
+        "BW3:ãã£",
+        "UW4:ä»",
+        "UW3:ã",
+        "UW4:æ",
+        "UB2:063",
+        "UW4:ã",
+        "UW3:äº¬",
+        "UW6:ã¼",
+        "UW1:ã",
+        "BW1:ããª",
+        "TB2:062061061",
+        "UW1:ã¨",
+        "TB4:062063062",
+        "TB2:061062062",
+        "BW1:ãã®",
+        "BW2:ã®ã§",
+        "UW4:ã¿",
+        "UW5:ã",
+        "UW6:ã",
+        "BW1:ãã¦",
+        "UW2:ã",
+        "UW6:ã",
+        "UW4:ãª",
+        "UW5:ã",
+        "BW1:ãã",
+        "TB4:071062071",
+        "BW1:ãã",
+        "UW4:å",
+        "UW6:ã",
+        "BW1:å°ã",
+        "BW2:ã§ã",
+        "UW4:ã¨",
+        "TB1:063063063",
+        "UW3:ã¼",
+        "BW2:ããª",
+        "UW2:ã",
+        "UW2:æ",
+        "BW2:ãã",
+        "BW3:ãã",
+        "TB2:071063071",
+        "UW4:ã",
+        "UW1:ã",
+        "UW4:æ",
+        "BW1:ãã®",
+        "UW2:å¨",
+        "UW6:ï¼",
+        "UW4:æ¾",
+        "UW4:äº¬",
+        "BW3:ãã",
+        "UW2:å°",
+        "BW3:ãã",
+        "UW2:å¤",
+        "UW2:ã",
+        "TB1:062062040",
+        "UW1:ã",
+        "UW3:å",
+        "BW1:ï¼ï¼",
+        "UW2:ã£",
+        "BW3:ãã",
+        "UW4:å¸¯",
+        "UW6:ã",
+        "BW3:ã§ã",
+        "BW2:ãã",
+        "TB3:071087087",
+        "TB2:063062071",
+        "UW3:ã",
+        "UB4:063",
+        "TB4:071071071",
+        "UW5:é½",
+        "UW5:ã",
+        "UW2:ã",
+        "UW2:äº¬",
+        "UW3:ã",
+        "BW1:ãã",
+        "BW3:ãã",
+        "BW1:ãã",
+        "BW2:ã®ã",
+        "UW2:æ",
+        "BW1:ãã«",
+        "BW1:ã®é",
+        "UW6:ã",
+        "UW6:ã",
+        "BW1:ã£ã",
+        "TW3:ããã¨",
+        "BW3:ã¨ã¦",
+        "TW1:ãããª",
+        "UW6:ã±",
+        "TB3:063071062",
+        "TW4:ã£ã¦ã",
+        "TW4:ãªãã¦",
+        "TW2:ãã®å¾",
+        "UW6:ã",
+        "TW4:ãã¨ã«",
+        "UW3:ï¼",
+        "TW3:ã¦ãã¾",
+        "UW3:ã",
+        "TB4:071062061",
+        "UW2:ã²",
+        "UW6:ã",
+        "UW6:ã§",
+        "BW3:ãªã",
+        "UW5:ã",
+        "BW2:ãã",
+        "UW6:é»",
+        "UW1:ã¯",
+        "BW1:ãã",
+        "BW3:ãã",
+        "UW4:é",
+        "BW3:ãã",
+        "BW3:ãã",
+        "BW1:æã",
+    }
+    modelValues:intvector {
+        1800,
+        271,
+        -857,
+        -417,
+        285,
+        -583,
+        388,
+        828,
+        -853,
+        -820,
+        502,
+        -708,
+        358,
+        1341,
+        -586,
+        -451,
+        257,
+        -1876,
+        2052,
+        1698,
+        -458,
+        2048,
+        1182,
+        -551,
+        980,
+        773,
+        -1453,
+        -152,
+        3201,
+        2865,
+        1203,
+        144,
+        -369,
+        -2539,
+        -613,
+        -3574,
+        -1111,
+        3110,
+        -3022,
+        2039,
+        -1091,
+        1241,
+        -560,
+        -1412,
+        625,
+        1350,
+        297,
+        -2404,
+        -595,
+        1007,
+        -1829,
+        -1662,
+        3213,
+        270,
+        -911,
+        178,
+        -727,
+        2716,
+        -484,
+        -344,
+        929,
+        -1236,
+        760,
+        -299,
+        -419,
+        -728,
+        122,
+        -704,
+        -605,
+        -1507,
+        545,
+        -68,
+        -320,
+        1498,
+        953,
+        -323,
+        -575,
+        -673,
+        520,
+        -450,
+        -1767,
+        -247,
+        56,
+        231,
+        -764,
+        536,
+        794,
+        -703,
+        -566,
+        51,
+        390,
+        52,
+        -182,
+        466,
+        133,
+        354,
+        107,
+        492,
+        488,
+        -1194,
+        1145,
+        -847,
+        812,
+        151,
+        -517,
+        -314,
+        -553,
+        -783,
+        -117,
+        736,
+        -88,
+        -598,
+        569,
+        606,
+        287,
+        744,
+        1739,
+        -217,
+        -219,
+        -144,
+        234,
+        -649,
+        -757,
+        834,
+        -819,
+        869,
+        -275,
+        -267,
+        154,
+        653,
+        594,
+        255,
+        1018,
+        1124,
+        284,
+        -1624,
+        -372,
+        440,
+        -184,
+        -1936,
+        1318,
+        -1124,
+        453,
+        -92,
+        -343,
+        175,
+        182,
+        -886,
+        930,
+        -223,
+        -57,
+        -113,
+        103,
+        -200,
+        510,
+        -2099,
+        -498,
+        385,
+        80,
+        -156,
+        360,
+        1289,
+        771,
+        -1114,
+        -399,
+        870,
+        1230,
+        79,
+        472,
+        -1596,
+        -1092,
+        -572,
+        55,
+        -151,
+        -124,
+        1316,
+        -248,
+        1280,
+        -125,
+        -284,
+        -1023,
+        862,
+        84,
+        417,
+        568,
+        -88,
+        -528,
+        910,
+        674,
+        -212,
+        894,
+        -121,
+        1108,
+        762,
+        260,
+        -197,
+        91,
+        -53,
+        1117,
+        -645,
+        -868,
+        -611,
+        220,
+        422,
+        1431,
+        -532,
+        -157,
+        -476,
+        -846,
+        -1309,
+        -1614,
+        1225,
+        302,
+        -738,
+        -260,
+        892,
+        -778,
+        -193,
+        1221,
+        -779,
+        489,
+        420,
+        -85,
+        -525,
+        -830,
+        26,
+        270,
+        439,
+        -120,
+        1263,
+        -795,
+        291,
+        -1310,
+        -23,
+        347,
+        312,
+        -107,
+        -114,
+        701,
+        830,
+        1309,
+        -451,
+        260,
+        -1080,
+        536,
+        188,
+        -60,
+        643,
+        -1184,
+        31,
+        -194,
+        -51,
+        -514,
+        -442,
+        -120,
+        649,
+        410,
+        882,
+        -75,
+        -341,
+        -718,
+        -128,
+        340,
+        -1245,
+        -164,
+        -1052,
+        70,
+        -256,
+        279,
+        786,
+        40,
+        -177,
+        97,
+        -411,
+        222,
+        -89,
+        -277,
+        -146,
+        414,
+        483,
+        21,
+        -339,
+        -406,
+        -360,
+        -450,
+        -14,
+        -36,
+        513,
+        252,
+        54,
+        -501,
+        -478,
+        450,
+        -36,
+        -644,
+        -392,
+        714,
+        643,
+        -341,
+        91,
+        -1018,
+        34,
+        -177,
+        123,
+        80,
+        -695,
+        -44,
+        -357,
+        253,
+        -389,
+        613,
+        515,
+        418,
+        -396,
+        -553,
+        193,
+        298,
+        -334,
+        -57,
+        -315,
+        -77,
+        33,
+        88,
+        137,
+        280,
+        -448,
+        196,
+        -136,
+        -295,
+        -329,
+        -92,
+        -360,
+        -132,
+        -288,
+        -45,
+        -43,
+        174,
+        75,
+        -60,
+        330,
+        360,
+        217,
+        130,
+        473,
+        -41,
+        -23,
+        -340,
+        -530,
+        -69,
+        -71,
+        -115,
+        297,
+        -240,
+        229,
+        507,
+        -348,
+        171,
+        -320,
+        239,
+        16,
+        -195,
+        -277,
+        -41,
+        69,
+        280,
+        -264,
+        30,
+        249,
+        -97,
+        -163,
+        -221,
+        96,
+        83,
+        82,
+        -218,
+        -93,
+        -53,
+        40,
+        28,
+        285,
+        27,
+        283,
+        -211,
+        -92,
+        214,
+        -225,
+        -54,
+        53,
+        105,
+        -198,
+        -53,
+        -277,
+        198,
+        184,
+        -264,
+        -106,
+        14,
+        185,
+        -155,
+        185,
+        106,
+        -119,
+        53,
+        208,
+        92,
+        262,
+        106,
+        -52,
+        105,
+        -25,
+        -79,
+        104,
+        141,
+        129,
+        -114,
+        26,
+        64,
+        -113,
+        26,
+        77,
+        -64,
+        13,
+        13,
+        26,
+        89,
+        115,
+        -49,
+        89,
+        -114,
+        51,
+        64,
+        -64,
+        -51,
+        -38,
+        89,
+        13,
+        -64,
+        13,
+        -48,
+        76,
+        63,
+        62,
+        13,
+        112,
+        -76,
+        -50,
+        -13,
+        -49,
+        63,
+        -50,
+        13,
+        13,
+        -50,
+        24,
+        -12,
+        24,
+        12,
+        24,
+        12,
+        -12,
+        -24,
+        12,
+        -12,
+        -12,
+        12,
+        -12,
+    }
+}
\ No newline at end of file
diff --git a/icu4c/source/python/icutools/databuilder/filtration.py b/icu4c/source/python/icutools/databuilder/filtration.py
index 27d08b0a772..e9339a08955 100644
--- a/icu4c/source/python/icutools/databuilder/filtration.py
+++ b/icu4c/source/python/icutools/databuilder/filtration.py
@@ -273,8 +273,8 @@ def _preprocess_file_filters(requests, config, io):
     default_filter_json = "exclude" if config.strategy == "additive" else "include"
     for category in all_categories:
         filter_json = default_filter_json
-        # Special default for category "brkitr_lstm" as "exclude" for now.
-        if "brkitr_lstm" == category:
+        # Special default for categories "brkitr_lstm" and "brkitr_adaboost": "exclude" for now.
+        if "brkitr_lstm" == category or "brkitr_adaboost" == category:
             filter_json = "exclude"
         # Figure out the correct filter to create for now.
         if "featureFilters" in json_data and category in json_data["featureFilters"]:
diff --git a/icu4c/source/test/depstest/dependencies.txt b/icu4c/source/test/depstest/dependencies.txt
index 9676ed4856d..7460caa7cdd 100644
--- a/icu4c/source/test/depstest/dependencies.txt
+++ b/icu4c/source/test/depstest/dependencies.txt
@@ -211,7 +211,7 @@ group: breakiterator
     brkiter.o brkeng.o ubrk.o
     rbbi.o rbbinode.o rbbiscan.o rbbisetb.o rbbistbl.o rbbitblb.o
     rbbidata.o rbbirb.o rbbi_cache.o
-    dictionarydata.o dictbe.o lstmbe.o
+    dictionarydata.o dictbe.o lstmbe.o mlbe.o
     # BreakIterator::makeInstance() factory implementation makes for circular dependency
     # between BreakIterator base and FilteredBreakIteratorBuilder.
     filteredbrk.o
diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp
index 17c05fb0d44..7afdb9ab828 100644
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@@ -42,6 +42,7 @@
 #include "charstr.h"
 #include "cmemory.h"
 #include "cstr.h"
+#include "cstring.h"
 #include "intltest.h"
 #include "lstmbe.h"
 #include "rbbitst.h"
@@ -835,9 +836,28 @@ void RBBITest::TestExtended() {
                 delete tp.bi;
                 tp.bi = BreakIterator::createLineInstance(locale,  status);
                 skipTest = false;
+#if UCONFIG_USE_ML_PHRASE_BREAKING
+                if(uprv_strcmp(locale.getName(), "ja@lw=phrase") == 0) {
+                    // skip <line> test cases of JP's phrase breaking when ML is enabled.
+                    skipTest = true;
+                }
+#endif
                 charIdx += 5;
                 break;
             }
+            if (testString.compare(charIdx-1, 8, u"<lineML>") == 0) {
+                delete tp.bi;
+                tp.bi = BreakIterator::createLineInstance(locale,  status);
+                skipTest = false;
+#if !UCONFIG_USE_ML_PHRASE_BREAKING
+                if(uprv_strcmp(locale.getName(), "ja@lw=phrase") == 0) {
+                    // skip <lineML> test cases of JP's phrase breaking when ML is disabled.
+                    skipTest = true;
+                }
+#endif
+                charIdx += 7;
+                break;
+            }
             if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
                 delete tp.bi;
                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt
index 72bd15803d6..40c6745dd06 100644
--- a/icu4c/source/test/testdata/rbbitst.txt
+++ b/icu4c/source/test/testdata/rbbitst.txt
@@ -1913,6 +1913,26 @@ Bangkok)â¢</data>
 <data>â¢\U0001F469\u200D\U0001F680â¢\U0001F469\U0001F3FD\u200D\U0001F680\u0020â¢</data>
 
 <locale ja@lw=phrase>
+#phrase breaking test cases for the ML solution
+<lineML>
+#ï¼æã«æ±äº¬ããåéãéã³ã«æ¥ã -> ï¼æã«â¢æ±äº¬ããâ¢åéãâ¢éã³ã«â¢æ¥ãâ¢
+<data>â¢\uff19\u6708\u306bâ¢\u6771\u4eac\u304b\u3089â¢\u53cb\u9054\u304câ¢\u904a\u3073\u306bâ¢\u6765\u305fâ¢</data>
+#ãæå­ãããã ãäº¬é½ã-> ãâ¢æå­â¢ãããã ãâ¢äº¬é½ãâ¢
+<data>â¢\u308bâ¢\u6587\u5b57â¢\u300c\u305d\u3046\u3060\u3001â¢\u4eac\u90fd\u300dâ¢</data>
+#Kana supplement: ð(U+1B048) -> \uD82C\uDC48, ð¸(U+1B038) -> \uD82C\uDC38, ð(U+1B019)-> \uD82C\uDC19</data>
+#ðãð¸ï¼ãããï¼ããðãããï¼ãããããï¼ -> ðãð¸â¢ï¼ãããï¼ãâ¢ãðãããâ¢ï¼ãããããï¼
+<data>â¢\U0001B048\u308B\U0001B038â¢\uFF08\u3057\u308B\u3053\uFF09\u3001â¢\u3042\U0001B019\u3088\u308D\u3057â¢\uFF08\u3042\u304B\u3088\u308D\u3057\uFF09â¢</data>
+#ä¸­å½ã®æºå¸¯ã¯ç´ï¼ï¼ï¼åããï¼ï¼ï¼ï¼åã§ã -> ä¸­å½ã®âæºå¸¯ã¯âç´âï¼ï¼ï¼åããâï¼ï¼ï¼ï¼åã§ã
+<data>â¢\u4E2D\u56FD\u306Eâ¢\u643A\u5E2F\u306Fâ¢\u7D04â¢\uFF15\uFF10\uFF10\u5143\u304B\u3089â¢\uFF15\uFF10\uFF10\uFF10\u5143\u3067\u3059â¢</data>
+#ãããã­ã´ãï¼µï½ï½ï½ï½ï½ï½ï¼ï¼ -> ãããâã­ã´ãâï¼µï½ï½ï½ï½ï½ï½ï¼ï¼
+<data>â¢\u3057\u304B\u3082â¢\u30ED\u30B4\u304Câ¢\uFF35\uFF4E\uFF49\uFF43\uFF4F\uFF44\uFF45\uFF01\uFF01â¢</data>
+#ããããªã¼ãé·æã¡ãããåé»ãæé©åãã¾ã -> ããããªã¼ãâé·æã¡ãããâåé»ãâæé©åãã¾ã
+<data>â¢\u30D0\u30C3\u30C6\u30EA\u30FC\u3092â¢\u9577\u6301\u3061\u3055\u305B\u3001â¢\u5145\u96FB\u3092â¢\u6700\u9069\u5316\u3057\u307E\u3059â¢</data>
+#ãã¼ã¿ã®ã³ãã¼ãã¹ãã¼ããã©ã³ã§ã®ãæ¯æããªã© -> ãã¼ã¿ã®âã³ãã¼ãâã¹ãã¼ããã©ã³ã§ã®âãæ¯æããªã©
+<data>â¢\u30C7\u30FC\u30BF\u306Eâ¢\u30B3\u30D4\u30FC\u3001â¢\u30B9\u30DE\u30FC\u30C8\u30D5\u30A9\u30F3\u3067\u306Eâ¢\u304A\u652F\u6255\u3044\u306A\u3069â¢</data>
+
+<locale ja@lw=phrase>
+#phrase breaking test cases for the dictionary based solution
 <line>
 #[äº¬é½è¦³åï¼½æé¨æ®¿ã«è¡ã£ãã-> [äº¬é½â¢è¦³åï¼½â¢æé¨â¢æ®¿ã«â¢è¡ã£ããâ¢
 <data>â¢\uff3b\u4eac\u90fdâ¢\u89b3\u5149\uff3dâ¢\u6642\u96e8â¢\u6bbf\u306bâ¢\u884c\u3063\u305f\u3002â¢</data>
@@ -2005,8 +2025,8 @@ Bangkok)â¢</data>
 #å¤§éæ°åì æ°ä¸»å±ååì´ë¤
 #<data>â¢å¤§éæ°åì â¢æ°ä¸»â¢å±ååì´ë¤â¢</data>
 # All the tests for ja@lw=phrase should also work in Korean.
-#[äº¬é½è¦³åï¼½æé¨æ®¿ã«è¡ã£ãã-> [äº¬é½â¢è¦³åï¼½â¢æé¨â¢æ®¿ã«â¢è¡ã£ããâ¢
-<data>â¢\uff3b\u4eac\u90fdâ¢\u89b3\u5149\uff3dâ¢\u6642\u96e8â¢\u6bbf\u306bâ¢\u884c\u3063\u305f\u3002â¢</data>
+#ãæå­ãããã ãäº¬é½ã-> ãâ¢æå­â¢ãããã ãâ¢äº¬é½ãâ¢
+<data>â¢\u308bâ¢\u6587\u5b57â¢\u300c\u305d\u3046\u3060\u3001â¢\u4eac\u90fd\u300dâ¢</data>
 #ï¼æã«æ±äº¬ããåéãéã³ã«æ¥ã -> ï¼æã«â¢æ±äº¬ããâ¢åéãâ¢éã³ã«â¢æ¥ãâ¢
 <data>â¢\uff19\u6708\u306bâ¢\u6771\u4eac\u304b\u3089â¢\u53cb\u9054\u304câ¢\u904a\u3073\u306bâ¢\u6765\u305fâ¢</data>
 
-- 
2.40.0