ICU-22100 Incorporate BudouX into ICU (C++)

author Shuhei Iitsuka <tushuhei@google.com>

Fri, 29 Jul 2022 04:08:01 +0000 (12:08 +0800)

committer Markus Scherer <markus.icu@gmail.com>

Fri, 2 Dec 2022 18:11:06 +0000 (10:11 -0800)
author Shuhei Iitsuka <tushuhei@google.com>
Fri, 29 Jul 2022 04:08:01 +0000 (12:08 +0800)
committer Markus Scherer <markus.icu@gmail.com>
Fri, 2 Dec 2022 18:11:06 +0000 (10:11 -0800)
diff --git a/.github/adaboost.json b/.github/adaboost.json

new file mode 100644 (file)

index 0000000..639fd6a
--- /dev/null
+++ b/.github/adaboost.json
@@ -0,0 +1,14 @@
+// © 2022 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+// Generated using tools/cldr/cldr-to-icu/build-icu-data.xml
+//
+// Include Japanese adaboost model.
+{
+  "featureFilters": {
+    "brkitr_adaboost": {
+      "includelist": [
+           "jaml"
+      ]
+    }
+  }
+}
diff --git a/.github/workflows/icu_ci.yml b/.github/workflows/icu_ci.yml

index 90bce1ed542c07b3b3a03582f381d175a27d98b5..1293e5edbb0cf4e36a90a471c62ff777cc4a8e9a 100644 (file)
--- a/.github/workflows/icu_ci.yml
+++ b/.github/workflows/icu_ci.yml
@@ -334,6 +334,17 @@ jobs:
            make clean;
            make -j2 check
  
+  # Test adaboost
+  adaboost-test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - run: |
+          cd icu4c/source;
+          ICU_DATA_FILTER_FILE=../../.github/adaboost.json CPPFLAGS=-DUCONFIG_USE_ML_PHRASE_BREAKING=1 ./runConfigureICU --enable-debug --disable-release Linux -disable-layoutex;
+          make clean;
+          make -j2 check
+
    # Build and run testmap
    testmap:
      runs-on: ubuntu-latest
diff --git a/icu4c/source/common/BUILD.bazel b/icu4c/source/common/BUILD.bazel

index e385d3b243faeb279d3bb4aee22d5e6e0cd8bd67..47d3d24bf5186ee815157a43300ec3cdf675a6a1 100644 (file)
--- a/icu4c/source/common/BUILD.bazel
+++ b/icu4c/source/common/BUILD.bazel
@@ -342,6 +342,7 @@ cc_library(
          "dictionarydata.cpp",
          "filteredbrk.cpp",
          "lstmbe.cpp",
+        "mlbe.cpp",
          "rbbi.cpp",
          "rbbi_cache.cpp",
          "rbbidata.cpp",
diff --git a/icu4c/source/common/common.vcxproj b/icu4c/source/common/common.vcxproj

index e35e1b0cff758740eb2fd5d8578187ab8553fc43..2b4cc05357acabeb90dcb9d2e5a7ade2279056bd 100644 (file)
--- a/icu4c/source/common/common.vcxproj
+++ b/icu4c/source/common/common.vcxproj
@@ -88,6 +88,7 @@
      <ClCompile Include="brkiter.cpp" />
      <ClCompile Include="dictbe.cpp" />
      <ClCompile Include="lstmbe.cpp" />
+    <ClCompile Include="mlbe.cpp" />
      <ClCompile Include="pluralmap.cpp" />
      <ClCompile Include="rbbi.cpp" />
      <ClCompile Include="rbbidata.cpp" />
@@ -282,6 +283,7 @@
      <ClInclude Include="brkeng.h" />
      <ClInclude Include="dictbe.h" />
      <ClInclude Include="lstmbe.h" />
+    <ClInclude Include="mlbe.h" />
      <ClInclude Include="rbbidata.h" />
      <ClInclude Include="rbbinode.h" />
      <ClInclude Include="rbbirb.h" />
diff --git a/icu4c/source/common/common.vcxproj.filters b/icu4c/source/common/common.vcxproj.filters

index 38bc0c1b869430b6e54757c271b30a065583c8b1..28a5d903429fd0d040a175311fd0852fdcf6c6a7 100644 (file)
--- a/icu4c/source/common/common.vcxproj.filters
+++ b/icu4c/source/common/common.vcxproj.filters
@@ -76,6 +76,9 @@
      <ClCompile Include="lstmbe.cpp">
        <Filter>break iteration</Filter>
      </ClCompile>
+    <ClCompile Include="mlbe.cpp">
+      <Filter>break iteration</Filter>
+    </ClCompile>
      <ClCompile Include="rbbi.cpp">
        <Filter>break iteration</Filter>
      </ClCompile>
@@ -660,6 +663,9 @@
      <ClInclude Include="lstmbe.h">
        <Filter>break iteration</Filter>
      </ClInclude>
+    <ClInclude Include="mlbe.h">
+      <Filter>break iteration</Filter>
+    </ClInclude>
      <ClInclude Include="rbbidata.h">
        <Filter>break iteration</Filter>
      </ClInclude>
diff --git a/icu4c/source/common/common_uwp.vcxproj b/icu4c/source/common/common_uwp.vcxproj

index fc165629f295297fceb540961749aca2b5db1fca..5df0d57a7de41d8f702c62f9179a0a6cf7fb6729 100644 (file)
--- a/icu4c/source/common/common_uwp.vcxproj
+++ b/icu4c/source/common/common_uwp.vcxproj
@@ -222,6 +222,7 @@
      <ClCompile Include="brkiter.cpp" />
      <ClCompile Include="dictbe.cpp" />
      <ClCompile Include="lstmbe.cpp" />
+    <ClCompile Include="mlbe.cpp" />
      <ClCompile Include="pluralmap.cpp" />
      <ClCompile Include="rbbi.cpp" />
      <ClCompile Include="rbbidata.cpp" />
@@ -417,6 +418,7 @@
      <ClInclude Include="brkeng.h" />
      <ClInclude Include="dictbe.h" />
      <ClInclude Include="lstmbe.h" />
+    <ClInclude Include="mlbe.h" />
      <ClInclude Include="rbbidata.h" />
      <ClInclude Include="rbbinode.h" />
      <ClInclude Include="rbbirb.h" />
diff --git a/icu4c/source/common/dictbe.cpp b/icu4c/source/common/dictbe.cpp

index 9b5434d995a4384927c3e8f299a3a586ed98134f..0e420c67c5d2ac1fbeb4da013979cb43e0463ba7 100644 (file)
--- a/icu4c/source/common/dictbe.cpp
+++ b/icu4c/source/common/dictbe.cpp
@@ -1054,9 +1054,10 @@ foundBest:
   */
  static const uint32_t kuint32max = 0xFFFFFFFF;
  CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
-: DictionaryBreakEngine(), fDictionary(adoptDictionary) {
+: DictionaryBreakEngine(), fDictionary(adoptDictionary), isCj(false) {
      UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
      UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Hani");
+    fMlBreakEngine = nullptr;
      nfkcNorm2 = Normalizer2::getNFKCInstance(status);
      // Korean dictionary only includes Hangul syllables
      fHangulWordSet.applyPattern(UnicodeString(u"[\\uac00-\\ud7a3]"), status);
@@ -1073,11 +1074,20 @@ CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType
          if (U_SUCCESS(status)) {
              setCharacters(fHangulWordSet);
          }
-    } else { //Chinese and Japanese
+    } else { // Chinese and Japanese
          UnicodeSet cjSet(UnicodeString(u"[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]"), status);
+        isCj = true;
          if (U_SUCCESS(status)) {
              setCharacters(cjSet);
+#if UCONFIG_USE_ML_PHRASE_BREAKING
+            fMlBreakEngine = new MlBreakEngine(fDigitOrOpenPunctuationOrAlphabetSet,
+                                               fClosePunctuationSet, status);
+            if (fMlBreakEngine == nullptr) {
+                status = U_MEMORY_ALLOCATION_ERROR;
+            }
+#else
              initJapanesePhraseParameter(status);
+#endif
          }
      }
      UTRACE_EXIT_STATUS(status);
@@ -1085,6 +1095,7 @@ CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType
  
  CjkBreakEngine::~CjkBreakEngine(){
      delete fDictionary;
+    delete fMlBreakEngine;
  }
  
  // The katakanaCost values below are based on the length frequencies of all
@@ -1251,7 +1262,15 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
              }
          }
      }
-                
+
+#if UCONFIG_USE_ML_PHRASE_BREAKING
+    // PhraseBreaking is supported in ja and ko; MlBreakEngine only supports ja.
+    if (isPhraseBreaking && isCj) {
+        return fMlBreakEngine->divideUpRange(inText, rangeStart, rangeEnd, foundBreaks, inString,
+                                             inputMap, status);
+    }
+#endif
+
      // bestSnlp[i] is the snlp of the best segmentation of the first i
      // code points in the range to be matched.
      UVector32 bestSnlp(numCodePts + 1, status);
diff --git a/icu4c/source/common/dictbe.h b/icu4c/source/common/dictbe.h

index ca1a3c28b7be80b78b1ae1e677c632cf164444b6..a2c761bdc3ac39d5290b9c664b643689101e6b62 100644 (file)
--- a/icu4c/source/common/dictbe.h
+++ b/icu4c/source/common/dictbe.h
@@ -16,11 +16,13 @@
  
  #include "brkeng.h"
  #include "hash.h"
+#include "mlbe.h"
  #include "uvectr32.h"
  
  U_NAMESPACE_BEGIN
  
  class DictionaryMatcher;
+class MlBreakEngine;
  class Normalizer2;
  
  /*******************************************************************
@@ -374,6 +376,8 @@ class CjkBreakEngine : public DictionaryBreakEngine {
  
    DictionaryMatcher        *fDictionary;
    const Normalizer2        *nfkcNorm2;
+  MlBreakEngine            *fMlBreakEngine;
+  bool                      isCj;
  
   private:
    // Load Japanese extensions.
diff --git a/icu4c/source/common/mlbe.cpp b/icu4c/source/common/mlbe.cpp

new file mode 100644 (file)

index 0000000..3ccf470
--- /dev/null
+++ b/icu4c/source/common/mlbe.cpp
@@ -0,0 +1,452 @@
+// © 2022 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_BREAK_ITERATION
+
+#include "cmemory.h"
+#include "mlbe.h"
+#include "uassert.h"
+#include "ubrkimpl.h"
+#include "unicode/resbund.h"
+#include "unicode/udata.h"
+#include "unicode/utf16.h"
+#include "uresimp.h"
+#include "util.h"
+#include "uvectr32.h"
+
+U_NAMESPACE_BEGIN
+
+// Default constructor. Puts every member into a defined state (code point 0, empty block
+// string) so that an Element read before setCharAndUblock() is called never exposes
+// indeterminate storage.
+Element::Element() : character(0), length(0) {
+    ublock[0] = u'\0';
+}
+
+// Stores a code point together with the string form of its Unicode block.
+//
+// @param ch  The code point to store.
+// @param idx The block string for ch; expected to be at most 3 code units long.
+void Element::setCharAndUblock(UChar32 ch, const UnicodeString &idx) {
+    character = ch;
+    U_ASSERT(idx.length() <= 3);
+    // ublock has room for 3 code units plus the terminating NUL. U_ASSERT only fires in debug
+    // builds, so clamp here to make sure a longer string can never overflow the buffer.
+    const int32_t maxLength = UPRV_LENGTHOF(ublock) - 1;
+    length = static_cast<uint16_t>(idx.length() <= maxLength ? idx.length() : maxLength);
+    idx.extract(0, length, ublock);
+    ublock[length] = '\0';
+}
+
+// Returns the code point stored by setCharAndUblock().
+UChar32 Element::getCharacter() const {
+    return character;
+}
+
+// Returns a pointer to the NUL-terminated block string held inside this element.
+// The declared return type is non-const, so constness is dropped with an explicit named cast;
+// the pointer refers to this element's own storage.
+char16_t* Element::getUblock() const {
+    return const_cast<char16_t*>(ublock);
+}
+
+// Returns the number of code units in the block string, excluding the terminating NUL.
+uint16_t Element::getLength() const {
+    return length;
+}
+
+// Builds the engine: copies the two character sets, creates the (initially empty) feature
+// table and then fills that table from the model data.
+//
+// @param digitOrOpenPunctuationOrAlphabetSet Set of digits, open punctuation and alphabet.
+// @param closePunctuationSet                 Set of close punctuation.
+// @param status                              In/out error code; the model is not loaded if
+//                                            it already reports a failure.
+MlBreakEngine::MlBreakEngine(const UnicodeSet &digitOrOpenPunctuationOrAlphabetSet,
+                                 const UnicodeSet &closePunctuationSet, UErrorCode &status)
+    : fDigitOrOpenPunctuationOrAlphabetSet(digitOrOpenPunctuationOrAlphabetSet),
+      fClosePunctuationSet(closePunctuationSet),
+      fModel(status),
+      fNegativeSum(0) {
+    // Load the model only when the member initialization above succeeded.
+    if (U_SUCCESS(status)) {
+        loadMLModel(status);
+    }
+}
+
+// No explicit cleanup is performed here.
+MlBreakEngine::~MlBreakEngine() {
+}
+
+namespace {
+    // Marker used for a window slot that has no character / no Unicode block.
+    const char16_t INVALID = u'|';
+    // Number of rows in the per-breakpoint feature list.
+    const int32_t MAX_FEATURE = 26;
+    // Capacity of one feature string in UTF-16 code units, terminating NUL included.
+    const int32_t MAX_FEATURE_LENGTH = 14;
+
+    // An element is valid unless its block string is exactly the one-unit INVALID marker.
+    bool isValid(const Element& element) {
+        const bool isMarker = element.getLength() == 1 && element.getUblock()[0] == INVALID;
+        return !isMarker;
+    }
+
+    // Writes the prefix `str` followed by the first `length` code points of `arr` into
+    // `feature` as a NUL-terminated string. Does nothing when `status` already reports failure.
+    void concatChar(const char16_t *str, const UChar32 *arr, int32_t length, char16_t *feature, UErrorCode &status) {
+        if (U_SUCCESS(status)) {
+            UnicodeString text(str);
+            int pos = 0;
+            while (pos < length) {
+                text.append(arr[pos]);
+                ++pos;
+            }
+            U_ASSERT(text.length() < MAX_FEATURE_LENGTH);
+            // extract() also writes the terminating NUL.
+            text.extract(feature, MAX_FEATURE_LENGTH, status);
+        }
+    }
+
+    // Copies `str` into `feature` as a NUL-terminated string of at most MAX_FEATURE_LENGTH
+    // code units (terminator included).
+    void writeString(const UnicodeString &str, char16_t *feature, UErrorCode &status) {
+        U_ASSERT(str.length() < MAX_FEATURE_LENGTH);
+        // extract() also writes the terminating NUL.
+        str.extract(feature, MAX_FEATURE_LENGTH, status);
+    }
+}
+
+// Finds the phrase breakpoints of inString with the ML model and appends them, converted to
+// native indexes of inText, to foundBreaks.
+//
+// @param inText      The text being segmented; used to inspect the characters around a break.
+// @param rangeStart  Native start index of the range; must be less than rangeEnd.
+// @param rangeEnd    Native end index of the range.
+// @param foundBreaks Receives the break positions (native indexes), in ascending order.
+// @param inString    The string to segment (the text of the range).
+// @param inputMap    If valid, maps a code point index of inString to a native index of inText;
+//                    otherwise the native index is the code point index plus rangeStart.
+// @param status      In/out error code; set to U_ILLEGAL_ARGUMENT_ERROR for an empty range.
+// @return The number of breaks appended to foundBreaks, or 0 on failure.
+int32_t MlBreakEngine::divideUpRange(UText *inText, int32_t rangeStart, int32_t rangeEnd,
+                                       UVector32 &foundBreaks, const UnicodeString &inString,
+                                       const LocalPointer<UVector32> &inputMap,
+                                       UErrorCode &status) const {
+    if (U_FAILURE(status)) {
+        return 0;
+    }
+    if (rangeStart >= rangeEnd) {
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+
+    // Number of code points in inString. countChar32() has to walk the string, so it is
+    // computed once here instead of on every loop iteration.
+    const int32_t length = inString.countChar32();
+
+    UVector32 boundary(length + 1, status);
+    if (U_FAILURE(status)) {
+        return 0;
+    }
+    int32_t numBreaks = 0;
+    UChar32 ch;
+    UnicodeString index;
+    // The ML model groups six characters to evaluate if the 4th character is a breakpoint.
+    // Like a sliding window, the elementList removes the first character and appends the new
+    // character from inString in each iteration so that its size always remains at six.
+    Element elementList[6];
+
+    // codeUts is the UTF-16 offset in inString of the next character to shift into the window.
+    int32_t codeUts = initElementList(inString, elementList, status);
+
+    // Add a break for the start.
+    boundary.addElement(0, status);
+    numBreaks++;
+    if (U_FAILURE(status)) return 0;
+
+    for (int32_t i = 1; i < length && U_SUCCESS(status); i++) {
+        evaluateBreakpoint(elementList, i, numBreaks, boundary, status);
+        if (i + 1 >= length) break;
+        // Remove the first element and append a new element
+        uprv_memmove(elementList, elementList + 1, 5 * sizeof(Element));
+        // codeUts always lies on a code point boundary, so "code points remain" is the same
+        // as "code units remain"; this avoids re-counting the prefix on every iteration.
+        ch = codeUts < inString.length() ? inString.char32At(codeUts) : INVALID;
+        index = (ch != INVALID) ? getUnicodeBlock(ch, status) : UnicodeString(INVALID);
+        elementList[5].setCharAndUblock(ch, index);
+        if (ch != INVALID) {
+            codeUts += U16_LENGTH(ch);
+        }
+    }
+    if (U_FAILURE(status)) return 0;
+
+    // Add a break for the end if there is not one there already.
+    if (boundary.lastElementi() != length) {
+        boundary.addElement(length, status);
+        numBreaks++;
+        if (U_FAILURE(status)) return 0;
+    }
+
+    int32_t prevCPPos = -1;
+    int32_t prevUTextPos = -1;
+    int32_t correctedNumBreaks = 0;
+    for (int32_t i = 0; i < numBreaks; i++) {
+        int32_t cpPos = boundary.elementAti(i);
+        int32_t utextPos = inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
+        U_ASSERT(cpPos > prevCPPos);
+        U_ASSERT(utextPos >= prevUTextPos);
+
+        if (utextPos > prevUTextPos) {
+            // A break at the very start of the range is reported only when the character
+            // before it is close punctuation; every other break is always reported.
+            if (utextPos != rangeStart ||
+                (utextPos > 0 &&
+                 fClosePunctuationSet.contains(utext_char32At(inText, utextPos - 1)))) {
+                foundBreaks.push(utextPos, status);
+                correctedNumBreaks++;
+            }
+        }
+        // Otherwise normalization expanded the input text and a boundary was found within
+        // the expansion, giving two boundaries with the same index in the original text.
+        // Ignore the second. See ticket #12918.
+        // numBreaks must not be decremented here: it bounds this ascending loop, so shrinking
+        // it would skip the last entries of `boundary`.
+        prevCPPos = cpPos;
+        prevUTextPos = utextPos;
+    }
+    (void)prevCPPos;  // suppress compiler warnings about unused variable
+
+    UChar32 nextChar = utext_char32At(inText, rangeEnd);
+    if (!foundBreaks.isEmpty() && foundBreaks.peeki() == rangeEnd) {
+        // In phrase breaking, there has to be a breakpoint between Cj character and
+        // the number/open punctuation.
+        // E.g. る文字「そうだ、京都」->る▁文字▁「そうだ、▁京都」-> breakpoint between 字 and「
+        // E.g. 乗車率９０％程度だろうか -> 乗車▁率▁９０％▁程度だろうか -> breakpoint between 率 and ９
+        // E.g. しかもロゴがＵｎｉｃｏｄｅ！ -> しかも▁ロゴが▁Ｕｎｉｃｏｄｅ！-> breakpoint between が and Ｕ
+        if (!fDigitOrOpenPunctuationOrAlphabetSet.contains(nextChar)) {
+            foundBreaks.popi();
+            correctedNumBreaks--;
+        }
+    }
+
+    return correctedNumBreaks;
+}
+
+// Decides whether position `index` is a breakpoint, using the six-element window.
+//
+// Builds up to MAX_FEATURE feature strings from the window: the characters themselves
+// ("UW", "BW", "TW" prefixes, using one, two and three characters) and their block strings
+// ("UB", "BB", "TB" prefixes, using one, two and three blocks). A feature is skipped when any
+// slot it uses is INVALID. The score starts at fNegativeSum and gains twice the model value of
+// every feature found in fModel; when the score is positive, `index` is appended to `boundary`
+// and `numBreaks` is incremented.
+//
+// @param elementList The window of six elements.
+// @param index       The position being evaluated; stored in `boundary` if it is a break.
+// @param numBreaks   In/out count of the entries added to `boundary`.
+// @param boundary    Receives `index` when the score is positive.
+// @param status      In/out error code; nothing is evaluated if it already reports failure.
+void MlBreakEngine::evaluateBreakpoint(Element* elementList, int32_t index, int32_t &numBreaks,
+                                         UVector32 &boundary, UErrorCode &status) const {
+    char16_t featureList[MAX_FEATURE][MAX_FEATURE_LENGTH];
+    if (U_FAILURE(status)) {
+        return;
+    }
+
+    // arr is the scratch list of code points handed to concatChar(); `length` is how many of
+    // its entries are used, and listLength counts the features written so far.
+    UChar32 arr[4] = {-1, -1, -1, -1};
+    int32_t length = 0, listLength = 0;
+
+    const UChar32 w1 = elementList[0].getCharacter();
+    const UChar32 w2 = elementList[1].getCharacter();
+    const UChar32 w3 = elementList[2].getCharacter();
+    const UChar32 w4 = elementList[3].getCharacter();
+    const UChar32 w5 = elementList[4].getCharacter();
+    const UChar32 w6 = elementList[5].getCharacter();
+
+    // Single-character features: UW1..UW6.
+    length = 1;
+    if (w1 != INVALID) {
+        arr[0] = w1;
+        concatChar(u"UW1:", arr, length, featureList[listLength++], status);
+    }
+    if (w2 != INVALID) {
+        arr[0] = w2;
+        concatChar(u"UW2:", arr, length, featureList[listLength++], status);
+    }
+    if (w3 != INVALID) {
+        arr[0] = w3;
+        concatChar(u"UW3:", arr, length, featureList[listLength++], status);
+    }
+    if (w4 != INVALID) {
+        arr[0] = w4;
+        concatChar(u"UW4:", arr, length, featureList[listLength++], status);
+    }
+    if (w5 != INVALID) {
+        arr[0] = w5;
+        concatChar(u"UW5:", arr, length, featureList[listLength++], status);
+    }
+    if (w6 != INVALID) {
+        arr[0] = w6;
+        concatChar(u"UW6:", arr, length, featureList[listLength++], status);
+    }
+    // Two-character features: BW1..BW3.
+    length = 2;
+    if (w2 != INVALID && w3 != INVALID) {
+        arr[0] = w2;
+        arr[1] = w3;
+        concatChar(u"BW1:", arr, length, featureList[listLength++], status);
+    }
+    if (w3 != INVALID && w4 != INVALID) {
+        arr[0] = w3;
+        arr[1] = w4;
+        concatChar(u"BW2:", arr, length, featureList[listLength++], status);
+    }
+    if (w4 != INVALID && w5 != INVALID) {
+        arr[0] = w4;
+        arr[1] = w5;
+        concatChar(u"BW3:", arr, length, featureList[listLength++], status);
+    }
+    // Three-character features: TW1..TW4.
+    length = 3;
+    if (w1 != INVALID && w2 != INVALID && w3 != INVALID) {
+        arr[0] = w1;
+        arr[1] = w2;
+        arr[2] = w3;
+        concatChar(u"TW1:", arr, length, featureList[listLength++], status);
+    }
+    if (w2 != INVALID && w3 != INVALID && w4 != INVALID) {
+        arr[0] = w2;
+        arr[1] = w3;
+        arr[2] = w4;
+        concatChar(u"TW2:", arr, length, featureList[listLength++], status);
+    }
+    if (w3 != INVALID && w4 != INVALID && w5 != INVALID) {
+        arr[0] = w3;
+        arr[1] = w4;
+        arr[2] = w5;
+        concatChar(u"TW3:", arr, length, featureList[listLength++], status);
+    }
+    if (w4 != INVALID && w5 != INVALID && w6 != INVALID) {
+        arr[0] = w4;
+        arr[1] = w5;
+        arr[2] = w6;
+        concatChar(u"TW4:", arr, length, featureList[listLength++], status);
+    }
+    // Single-block features: UB1..UB6.
+    if (isValid(elementList[0])) {
+        writeString(UnicodeString(u"UB1:").append(elementList[0].getUblock(), 0,
+                                                  elementList[0].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[1])) {
+        writeString(UnicodeString(u"UB2:").append(elementList[1].getUblock(), 0,
+                                                  elementList[1].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[2])) {
+        writeString(UnicodeString(u"UB3:").append(elementList[2].getUblock(), 0,
+                                                  elementList[2].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[3])) {
+        writeString(UnicodeString(u"UB4:").append(elementList[3].getUblock(), 0,
+                                                  elementList[3].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[4])) {
+        writeString(UnicodeString(u"UB5:").append(elementList[4].getUblock(), 0,
+                                                  elementList[4].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[5])) {
+        writeString(UnicodeString(u"UB6:").append(elementList[5].getUblock(), 0,
+                                                  elementList[5].getLength()),
+                    featureList[listLength++], status);
+    }
+    // Two-block features: BB1..BB3.
+    if (isValid(elementList[1]) && isValid(elementList[2])) {
+        writeString(UnicodeString(u"BB1:")
+                        .append(elementList[1].getUblock(), 0, elementList[1].getLength())
+                        .append(elementList[2].getUblock(), 0, elementList[2].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[2]) && isValid(elementList[3])) {
+        writeString(UnicodeString(u"BB2:")
+                        .append(elementList[2].getUblock(), 0, elementList[2].getLength())
+                        .append(elementList[3].getUblock(), 0, elementList[3].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[3]) && isValid(elementList[4])) {
+        writeString(UnicodeString(u"BB3:")
+                        .append(elementList[3].getUblock(), 0, elementList[3].getLength())
+                        .append(elementList[4].getUblock(), 0, elementList[4].getLength()),
+                    featureList[listLength++], status);
+    }
+    // Three-block features: TB1..TB4.
+    if (isValid(elementList[0]) && isValid(elementList[1]) && isValid(elementList[2])) {
+        writeString(UnicodeString(u"TB1:")
+                        .append(elementList[0].getUblock(), 0, elementList[0].getLength())
+                        .append(elementList[1].getUblock(), 0, elementList[1].getLength())
+                        .append(elementList[2].getUblock(), 0, elementList[2].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[1]) && isValid(elementList[2]) && isValid(elementList[3])) {
+        writeString(UnicodeString(u"TB2:")
+                        .append(elementList[1].getUblock(), 0, elementList[1].getLength())
+                        .append(elementList[2].getUblock(), 0, elementList[2].getLength())
+                        .append(elementList[3].getUblock(), 0, elementList[3].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[2]) && isValid(elementList[3]) && isValid(elementList[4])) {
+        writeString(UnicodeString(u"TB3:")
+                        .append(elementList[2].getUblock(), 0, elementList[2].getLength())
+                        .append(elementList[3].getUblock(), 0, elementList[3].getLength())
+                        .append(elementList[4].getUblock(), 0, elementList[4].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (isValid(elementList[3]) && isValid(elementList[4]) && isValid(elementList[5])) {
+        writeString(UnicodeString(u"TB4:")
+                        .append(elementList[3].getUblock(), 0, elementList[3].getLength())
+                        .append(elementList[4].getUblock(), 0, elementList[4].getLength())
+                        .append(elementList[5].getUblock(), 0, elementList[5].getLength()),
+                    featureList[listLength++], status);
+    }
+    if (U_FAILURE(status)) {
+        return;
+    }
+    // Start from fNegativeSum and add twice the stored value of each feature that is present
+    // in the model; features that are not in the model contribute nothing.
+    int32_t score = fNegativeSum;
+    for (int32_t j = 0; j < listLength; j++) {
+        UnicodeString key(featureList[j]);
+        if (fModel.containsKey(key)) {
+            score += (2 * fModel.geti(key));
+        }
+    }
+    // A positive score marks `index` as a breakpoint.
+    if (score > 0) {
+        boundary.addElement(index, status);
+        numBreaks++;
+    }
+}
+
+// Fills the six-element window for the first evaluation.
+//
+// Slots 0 and 1 are always padding. Slots 2..5 receive the first four code points of
+// inString; slots for which inString has no character are padding as well. A padding slot
+// holds the INVALID marker both as its character and as its block string, which is the same
+// rule divideUpRange() applies when it shifts new elements into the window. (Looking up the
+// Unicode block of the marker character instead would yield a real block code and create
+// block features for characters that do not exist.)
+//
+// @param inString    The string to be segmented.
+// @param elementList The window to fill; must have room for six elements.
+// @param status      In/out error code; nothing is done if it already reports failure.
+// @return The number of UTF-16 code units of inString consumed by the window.
+int32_t MlBreakEngine::initElementList(const UnicodeString &inString, Element* elementList,
+                                         UErrorCode &status) const {
+    if (U_FAILURE(status)) {
+        return 0;
+    }
+    const UnicodeString invalidBlock(INVALID);
+    elementList[0].setCharAndUblock(INVALID, invalidBlock);
+    elementList[1].setCharAndUblock(INVALID, invalidBlock);
+
+    // index is the UTF-16 offset of the next unread code point; it always lies on a code point
+    // boundary, so comparing it with the code unit length tells whether a character remains.
+    int32_t index = 0;
+    const int32_t numCodeUnits = inString.length();
+    for (int32_t slot = 2; slot < 6; slot++) {
+        if (index < numCodeUnits) {
+            const UChar32 ch = inString.char32At(index);
+            index += U16_LENGTH(ch);
+            if (ch != INVALID) {
+                elementList[slot].setCharAndUblock(ch, getUnicodeBlock(ch, status));
+            } else {
+                elementList[slot].setCharAndUblock(ch, invalidBlock);
+            }
+        } else {
+            elementList[slot].setCharAndUblock(INVALID, invalidBlock);
+        }
+    }
+
+    return index;
+}
+
+// Returns the Unicode block of `ch` as a decimal string padded with leading zeros to at least
+// three digits, or a one-character string holding the INVALID marker when `status` already
+// reports failure or `ch` has no block.
+UnicodeString MlBreakEngine::getUnicodeBlock(UChar32 ch, UErrorCode &status) const {
+    if (U_FAILURE(status)) {
+        return UnicodeString(INVALID);
+    }
+
+    const UBlockCode code = ublock_getCode(ch);
+    const bool hasBlock = (code != UBLOCK_NO_BLOCK) && (code != UBLOCK_INVALID_CODE);
+    if (!hasBlock) {
+        return UnicodeString(INVALID);
+    }
+
+    // Same as sprintf("%03d", code)
+    UnicodeString digits;
+    return ICU_Utility::appendNumber(digits, (int32_t)code, 10, 3);
+}
+
+// Loads the model from the "jaml" resource bundle into fModel and accumulates fNegativeSum.
+//
+// @param error In/out error code. Nothing is loaded if it already reports failure. It is set
+//              to U_INVALID_FORMAT_ERROR when the bundle has more keys than values.
+void MlBreakEngine::loadMLModel(UErrorCode &error) {
+    // BudouX's model consists of pairs of the feature and its score.
+    // As integrating it into jaml.txt, modelKeys denotes the ML feature; modelValues means the
+    // corresponding feature's score.
+
+    if (U_FAILURE(error)) return;
+
+    int32_t keySize = 0;
+    int32_t valueSize = 0;
+    int32_t stringLength = 0;
+    UnicodeString key;
+    StackUResourceBundle stackTempBundle;
+    ResourceDataValue modelKey;
+
+    // The bundle stays owned by rbp so that it is closed on every return path; rb is only a
+    // borrowed pointer for the C API calls below.
+    LocalUResourceBundlePointer rbp(ures_openDirect(U_ICUDATA_BRKITR, "jaml", &error));
+    if (U_FAILURE(error)) return;
+    UResourceBundle* rb = rbp.getAlias();
+    // get modelValues
+    LocalUResourceBundlePointer modelValue(ures_getByKey(rb, "modelValues", nullptr, &error));
+    const int32_t* value = ures_getIntVector(modelValue.getAlias(), &valueSize, &error);
+    if (U_FAILURE(error)) return;
+
+    // get modelKeys
+    ures_getValueWithFallback(rb, "modelKeys", stackTempBundle.getAlias(), modelKey, error);
+    ResourceArray stringArray = modelKey.getArray(error);
+    keySize = stringArray.getSize();
+    if (U_FAILURE(error)) return;
+
+    // Every key is paired with the value at the same index. Reject data with missing values
+    // instead of reading past the end of the value vector.
+    if (keySize > valueSize) {
+        error = U_INVALID_FORMAT_ERROR;
+        return;
+    }
+
+    for (int32_t idx = 0; idx < keySize; idx++) {
+        stringArray.getValue(idx, modelKey);
+        key = UnicodeString(modelKey.getString(stringLength, error));
+        if (U_SUCCESS(error)) {
+            // fNegativeSum ends up as minus the sum of all values stored in the model.
+            fNegativeSum -= value[idx];
+            fModel.puti(key, value[idx], error);
+        }
+    }
+}
+
+U_NAMESPACE_END
+
+#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
diff --git a/icu4c/source/common/mlbe.h b/icu4c/source/common/mlbe.h

new file mode 100644 (file)

index 0000000..8943fa3
--- /dev/null
+++ b/icu4c/source/common/mlbe.h
@@ -0,0 +1,152 @@
+// © 2022 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#ifndef MLBREAKENGINE_H
+#define MLBREAKENGINE_H
+
+#include "hash.h"
+#include "unicode/uniset.h"
+#include "unicode/utext.h"
+#include "uvectr32.h"
+
+U_NAMESPACE_BEGIN
+
+#if !UCONFIG_NO_BREAK_ITERATION
+
+/**
+ * A class used to encapsulate a character and its unicode block index.
+ *
+ * The block is kept as a short NUL-terminated string of at most three code units, stored
+ * inline in the object.
+ */
+class Element : public UMemory {
+   public:
+    /**
+     * Default constructor. The element holds an empty unicode block string (length 0).
+     */
+    Element();
+
+    /**
+     * Set the character and its unicode block.
+     *
+     * @param ch A unicode character.
+     * @param ublock The unicode block of the character, as a string of at most three code
+     *               units.
+     */
+    void setCharAndUblock(UChar32 ch, const UnicodeString& ublock);
+
+    /**
+     * Get the unicode character.
+     *
+     * @return The unicode character.
+     */
+    UChar32 getCharacter() const;
+
+    /**
+     * Get the unicode character's unicode block.
+     *
+     * @return A pointer to the NUL-terminated unicode block string stored in this element.
+     */
+    char16_t* getUblock() const;
+
+    /**
+     * Get the length of the unicode block.
+     *
+     * @return The number of code units in the unicode block string, without the terminator.
+     */
+    uint16_t getLength() const;
+
+   private:
+    // The character set by setCharAndUblock().
+    UChar32 character;
+    // The unicode block string: up to three code units followed by a terminating NUL.
+    char16_t ublock[4];
+    // The number of code units of ublock in use, without the terminator.
+    uint16_t length;
+};
+
+/**
+ * A machine learning break engine for the phrase breaking in Japanese.
+ */
+class MlBreakEngine : public UMemory {
+   public:
+    /**
+     * Constructor. Copies the two sets and loads the machine learning model.
+     *
+     * @param digitOrOpenPunctuationOrAlphabetSet A UnicodeSet with the digit, open punctuation
+     * and alphabet.
+     * @param closePunctuationSet A UnicodeSet with close punctuation.
+     * @param status Information on any errors encountered.
+     */
+    MlBreakEngine(const UnicodeSet &digitOrOpenPunctuationOrAlphabetSet,
+                    const UnicodeSet &closePunctuationSet, UErrorCode &status);
+
+    /**
+     * Virtual destructor.
+     */
+    virtual ~MlBreakEngine();
+
+   public:
+    /**
+     * Divide up a range of characters handled by this break engine.
+     *
+     * @param inText A UText representing the text
+     * @param rangeStart The start of the range of the characters
+     * @param rangeEnd The end of the range of the characters
+     * @param foundBreaks Output vector to which the break positions (native indexes of inText)
+     * are appended
+     * @param inString The normalized string of text ranging from rangeStart to rangeEnd
+     * @param inputMap The vector mapping a code point index of inString to the native index of
+     * inText; if it is not valid, the native index is the code point index plus rangeStart
+     * @param status Information on any errors encountered.
+     * @return The number of breaks found
+     */
+    int32_t divideUpRange(UText *inText, int32_t rangeStart, int32_t rangeEnd,
+                          UVector32 &foundBreaks, const UnicodeString &inString,
+                          const LocalPointer<UVector32> &inputMap, UErrorCode &status) const;
+
+   private:
+    /**
+     * Load the machine learning's model file.
+     *
+     * @param error Information on any errors encountered.
+     */
+    void loadMLModel(UErrorCode &error);
+
+    /**
+     * Get the character's unicode block code defined in UBlockCode.
+     *
+     * @param ch A character.
+     * @param status Information on any errors encountered.
+     * @return The unicode block code which is 3 digits with '0' added in the beginning if the code
+     * is less than 3 digits, or a one-character marker string if the character has no block
+     * or status already reports a failure.
+     *
+     */
+    UnicodeString getUnicodeBlock(UChar32 ch, UErrorCode &status) const;
+
+    /**
+     * Initialize the element list from the input string.
+     *
+     * @param inString A input string to be segmented.
+     * @param elementList A list of six elements. The first two are filled with padding and the
+     * remaining four with the first (up to) four characters of inString and their unicode
+     * block codes.
+     * @param status Information on any errors encountered.
+     * @return The number of code units of the characters of inString stored in the list.
+     */
+    int32_t initElementList(const UnicodeString &inString, Element* elementList,
+                            UErrorCode &status) const;
+
+    /**
+     * Evaluate whether the index is a potential breakpoint.
+     *
+     * @param elementList A list including 6 elements for the breakpoint evaluation.
+     * @param index The breakpoint index to be evaluated.
+     * @param numBreaks The accumulated number of breakpoints; incremented when index is a
+     * breakpoint.
+     * @param boundary A vector including the index of the breakpoint; index is appended when it
+     * is a breakpoint.
+     * @param status Information on any errors encountered.
+     */
+    void evaluateBreakpoint(Element* elementList, int32_t index, int32_t &numBreaks,
+                            UVector32 &boundary, UErrorCode &status) const;
+
+    // Digits, open punctuation and alphabet; checked against the character following the range.
+    UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;
+    // Close punctuation; checked against the character preceding a break at the range start.
+    UnicodeSet fClosePunctuationSet;
+    // The model: maps a feature string to its integer score.
+    Hashtable fModel;
+    // Minus the sum of all scores in fModel; the starting score of every evaluation.
+    int32_t fNegativeSum;
+};
+
+#endif
+
+U_NAMESPACE_END
+
+/* MLBREAKENGINE_H */
+#endif
diff --git a/icu4c/source/common/sources.txt b/icu4c/source/common/sources.txt

index e5c39dd2ce3c3537a388d0a640df3097bedf3a16..90171fe9bd4b9aa84dcd1d628636382cbec1c90e 100644 (file)
--- a/icu4c/source/common/sources.txt
+++ b/icu4c/source/common/sources.txt
@@ -43,6 +43,7 @@ locutil.cpp
  lsr.cpp
  lstmbe.cpp
  messagepattern.cpp
+mlbe.cpp
  normalizer2.cpp
  normalizer2impl.cpp
  normlzr.cpp
diff --git a/icu4c/source/common/unicode/uconfig.h b/icu4c/source/common/unicode/uconfig.h

index bbc232d1ed8fdf94ed8b6f4e5ff9c3aeea7a5e88..3818ca02ef85d2a90f1ce0a52fdb8bc7f01f5fc6 100644 (file)
--- a/icu4c/source/common/unicode/uconfig.h
+++ b/icu4c/source/common/unicode/uconfig.h
@@ -323,6 +323,16 @@
  #   define UCONFIG_NO_NORMALIZATION 0
  #endif
  
+/**
+ * \def UCONFIG_USE_ML_PHRASE_BREAKING
+ * This switch turns on BudouX ML phrase-based line breaking, rather than using the dictionary.
+ *
+ * @internal
+ */
+#ifndef UCONFIG_USE_ML_PHRASE_BREAKING
+#   define UCONFIG_USE_ML_PHRASE_BREAKING 0
+#endif
+
  #if UCONFIG_NO_NORMALIZATION
      /* common library */
      /* ICU 50 CJK dictionary BreakIterator uses normalization */
diff --git a/icu4c/source/data/BUILDRULES.py b/icu4c/source/data/BUILDRULES.py

index 899cba25b48067330d7b14d05c8bb893a9358322..2608cb0227b4761f9f4a9bdc12f768d0cea40c39 100644 (file)
--- a/icu4c/source/data/BUILDRULES.py
+++ b/icu4c/source/data/BUILDRULES.py
@@ -27,6 +27,7 @@ def generate(config, io, common_vars):
      requests += generate_conversion_mappings(config, io, common_vars)
      requests += generate_brkitr_brk(config, io, common_vars)
      requests += generate_brkitr_lstm(config, io, common_vars)
+    requests += generate_brkitr_adaboost(config, io, common_vars)
      requests += generate_stringprep(config, io, common_vars)
      requests += generate_brkitr_dictionaries(config, io, common_vars)
      requests += generate_normalization(config, io, common_vars)
@@ -184,7 +185,7 @@ def generate_brkitr_brk(config, io, common_vars):
              category = "brkitr_rules",
              dep_targets =
                  [DepTarget("cnvalias"),
-                    DepTarget("ulayout"), DepTarget("uemoji"), DepTarget("lstm_res")],
+                    DepTarget("ulayout"), DepTarget("uemoji"), DepTarget("lstm_res"), DepTarget("adaboost_res")],
              input_files = input_files,
              output_files = output_files,
              tool = IcuTool("genbrk"),
@@ -506,6 +507,32 @@ def generate_brkitr_lstm(config, io, common_vars):
          )
      ]
  
+def generate_brkitr_adaboost(config, io, common_vars):
+    input_files = [InFile(filename) for filename in io.glob("brkitr/adaboost/*.txt")]
+    input_basenames = [v.filename[16:] for v in input_files]  # 16 == len("brkitr/adaboost/"); presumably strips the glob's directory prefix
+    output_files = [
+        OutFile("brkitr/%s.res" % v[:-4])  # [:-4] drops the last 4 chars, i.e. ".txt" for names matched by the glob
+        for v in input_basenames
+    ]
+    return [
+        RepeatedOrSingleExecutionRequest(
+            name = "adaboost_res",
+            category = "brkitr_adaboost",
+            dep_targets = [],
+            input_files = input_files,
+            output_files = output_files,
+            tool = IcuTool("genrb"),
+            args = "-s {IN_DIR}/brkitr/adaboost -d {OUT_DIR}/brkitr -i {OUT_DIR} "
+                "-k "
+                "{INPUT_BASENAME}",
+            format_with = {
+            },
+            repeat_with = {
+                "INPUT_BASENAME": utils.SpaceSeparatedList(input_basenames)
+            }
+        )
+    ]
+
  def generate_tree(
          config,
          io,
diff --git a/icu4c/source/data/brkitr/adaboost/jaml.txt b/icu4c/source/data/brkitr/adaboost/jaml.txt

new file mode 100644 (file)

index 0000000..0500ff7
--- /dev/null
+++ b/icu4c/source/data/brkitr/adaboost/jaml.txt
@@ -0,0 +1,940 @@
+// © 2022 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+jaml {
+    modelKeys {
+        "BB2:062071",
+        "UB3:061",
+        "UB3:071",
+        "TB2:062062062",
+        "TB4:062062062",
+        "UB3:063",
+        "UB4:071",
+        "BB3:062062",
+        "UB4:062",
+        "BB1:062071",
+        "BB1:062061",
+        "UB4:061",
+        "TB1:071071062",
+        "TB3:062063063",
+        "UB2:061",
+        "TB1:062071062",
+        "TB3:062062062",
+        "BB2:063063",
+        "UW3:は",
+        "UW3:に",
+        "TB3:062071062",
+        "UW3:が",
+        "UW4:こ",
+        "UB5:061",
+        "UW3:と",
+        "TB4:063063063",
+        "UW4:て",
+        "TB2:062062061",
+        "UW3:。",
+        "UW4:お",
+        "UW3:の",
+        "BB3:071071",
+        "BB3:062071",
+        "UW3:お",
+        "UW3:し",
+        "UW4:、",
+        "UW4:の",
+        "UW3:を",
+        "UW4:。",
+        "UW3:、",
+        "UW5:で",
+        "UW4:あ",
+        "BB2:062062",
+        "UW4:っ",
+        "UW5:っ",
+        "UW3:も",
+        "UW5:う",
+        "UW3:「",
+        "UW5:な",
+        "UW4:そ",
+        "UW4:る",
+        "UW3:っ",
+        "UW4:「",
+        "UW4:い",
+        "BB2:087087",
+        "UB4:087",
+        "UW5:に",
+        "BW3:もの",
+        "UW5:し",
+        "UW6:う",
+        "BW2:とい",
+        "UW4:に",
+        "UW3:る",
+        "TB2:071062071",
+        "UW4:で",
+        "UW5:が",
+        "BB1:071071",
+        "UW5:は",
+        "UW4:は",
+        "UW4:れ",
+        "UW5:き",
+        "BB2:071062",
+        "BB2:071071",
+        "UW3:・",
+        "BB2:071087",
+        "BB2:061062",
+        "TB1:062061062",
+        "UW3:れ",
+        "BB2:087062",
+        "TB2:087087087",
+        "UW4:ら",
+        "TB1:071071071",
+        "UB2:071",
+        "TB1:062062087",
+        "UW5:す",
+        "UW5:ん",
+        "UW3:で",
+        "UW4:が",
+        "UW3:こ",
+        "TB4:071062062",
+        "UW3:ら",
+        "UW6:に",
+        "UW6:。",
+        "UW3:た",
+        "TB1:061071071",
+        "UW5:く",
+        "UB1:063",
+        "UW1:そ",
+        "UW3:う",
+        "BW3:とい",
+        "BW3:とこ",
+        "UW3:ま",
+        "BW3:こと",
+        "UW2:っ",
+        "UW5:・",
+        "TB3:062062061",
+        "UW3:き",
+        "UW4:ん",
+        "UB3:062",
+        "UW3:く",
+        "UW3:」",
+        "UW5:あ",
+        "BB2:062087",
+        "BW3:いう",
+        "UW5:れ",
+        "UW2:一",
+        "UW3:，",
+        "UW1:に",
+        "UW2:と",
+        "TB2:071071062",
+        "TB2:071071071",
+        "UW5:を",
+        "UW4:り",
+        "BW1:から",
+        "UW3:ち",
+        "BW3:いい",
+        "UW2:は",
+        "UW6:た",
+        "TB1:063063062",
+        "UW4:１",
+        "UW4:や",
+        "UW2:ん",
+        "UW3:］",
+        "UW4:ほ",
+        "TB3:062087087",
+        "BW2:であ",
+        "UW4:だ",
+        "BB3:071062",
+        "TB1:087087087",
+        "BW3:・・",
+        "BW3:とき",
+        "UW4:を",
+        "UW3:て",
+        "UW4:か",
+        "UW2:そ",
+        "TB4:071071062",
+        "TB2:062061071",
+        "UW2:を",
+        "UW4:ご",
+        "UW2:で",
+        "TB3:071071071",
+        "BB1:087087",
+        "UW2:し",
+        "UW4:出",
+        "UW2:ま",
+        "UW4:，",
+        "UW5:と",
+        "UW4:ど",
+        "BW3:して",
+        "UW1:で",
+        "BB2:061071",
+        "BW3:ため",
+        "BW2:とし",
+        "BW2:ない",
+        "BW2:てい",
+        "UW3:間",
+        "UW3:！",
+        "UW5:ー",
+        "UW4:す",
+        "UW4:！",
+        "BW1:とが",
+        "UW5:の",
+        "TB4:062062071",
+        "TB2:061071071",
+        "UW6:・",
+        "UW3:．",
+        "UW2:て",
+        "UW3:笑",
+        "UW2:こ",
+        "UW5:も",
+        "BW3:よう",
+        "UW3:人",
+        "UW2:の",
+        "UW3:か",
+        "UW3:日",
+        "UW1:い",
+        "BW2:とこ",
+        "UW4:私",
+        "UW3:…",
+        "UW2:に",
+        "UW3:今",
+        "BB3:087062",
+        "UB3:055",
+        "UW4:（",
+        "BB1:087071",
+        "UW1:な",
+        "BB3:063063",
+        "UW5:来",
+        "UW3:？",
+        "TW3:ている",
+        "UW4:」",
+        "UW4:前",
+        "BW1:いう",
+        "UW4:つ",
+        "UW3:）",
+        "BW1:では",
+        "UW2:る",
+        "UW5:そ",
+        "UW4:ー",
+        "TW2:気に入",
+        "UW4:笑",
+        "UW4:ひ",
+        "TB4:087087087",
+        "UW4:け",
+        "UW2:も",
+        "BW3:ちょ",
+        "BW3:出来",
+        "TB2:062071062",
+        "UW4:『",
+        "UW3:［",
+        "UW4:２",
+        "UW5:つ",
+        "TB1:061071062",
+        "UW3:１",
+        "BW3:から",
+        "UB5:071",
+        "UW4:ま",
+        "UW3:ば",
+        "UW3:り",
+        "BW3:その",
+        "UW3:ご",
+        "UW4:わ",
+        "BW2:てお",
+        "TB2:071062062",
+        "BW1:ない",
+        "UW2:よ",
+        "UB2:087",
+        "UW6:の",
+        "UW2:毎",
+        "UW2:結",
+        "TW4:の京都",
+        "UW3:さ",
+        "UW2:最",
+        "BW2:です",
+        "UW2:」",
+        "UW5:え",
+        "UW3:だ",
+        "TW4:ところ",
+        "UW4:．",
+        "UB1:062",
+        "UW6:て",
+        "UW1:が",
+        "BW2:、と",
+        "UW3:０",
+        "UW3:ん",
+        "UW3:中",
+        "UW4:よ",
+        "BW3:この",
+        "UW2:が",
+        "UW3:み",
+        "TW2:ではな",
+        "UW6:と",
+        "UW4:［",
+        "TW3:、ある",
+        "BW3:ころ",
+        "UW4:？",
+        "UW6:、",
+        "UW4:電",
+        "BB1:062040",
+        "UW3:後",
+        "UW5:い",
+        "UW2:、",
+        "UW5:て",
+        "BB2:062040",
+        "UW3:真",
+        "UW3:そ",
+        "UW5:さ",
+        "UB5:087",
+        "TW3:という",
+        "UW3:分",
+        "UB6:071",
+        "BW3:なっ",
+        "UW4:ろ",
+        "BB2:061061",
+        "TW3:ところ",
+        "UB1:071",
+        "UW1:、",
+        "BW1:とか",
+        "UW3:な",
+        "UW6:り",
+        "UW4:間",
+        "UW3:べ",
+        "UW5:べ",
+        "TB4:062071062",
+        "UW4:］",
+        "BW2:には",
+        "UW5:々",
+        "BW1:。・",
+        "BW1:その",
+        "UW1:す",
+        "UW4:）",
+        "UW6:っ",
+        "TB3:063063063",
+        "TB3:062071071",
+        "UB5:063",
+        "BW1:かも",
+        "UW6:る",
+        "TB4:062063063",
+        "UW3:ど",
+        "TW3:である",
+        "TW4:くらい",
+        "BW1:最近",
+        "BW1:しい",
+        "BW1:とも",
+        "BW2:と同",
+        "TW1:という",
+        "UW2:さ",
+        "BW2:帯電",
+        "TB1:071062062",
+        "BW3:そし",
+        "UW2:。",
+        "UW5:か",
+        "UW5:こ",
+        "BW3:ない",
+        "BW1:んな",
+        "BW2:でき",
+        "UW4:３",
+        "UW3:け",
+        "TW4:ことが",
+        "BW1:こと",
+        "UB3:087",
+        "UW3:電",
+        "UW3:よ",
+        "BW1:たと",
+        "UW5:ま",
+        "UW5:た",
+        "UW5:ち",
+        "UW2:け",
+        "UW5:だ",
+        "UW3:度",
+        "BW1:たい",
+        "UW4:使",
+        "UW2:き",
+        "TW4:かなり",
+        "UB6:063",
+        "BB1:062062",
+        "UW4:込",
+        "TW3:と言っ",
+        "UW6:だ",
+        "UW5:り",
+        "UW5:よ",
+        "BW3:どう",
+        "UW4:…",
+        "UW3:や",
+        "BW1:かし",
+        "BW3:かっ",
+        "UW4:今",
+        "UW3:『",
+        "UW4:思",
+        "UB2:063",
+        "UW4:く",
+        "UW3:京",
+        "UW6:ー",
+        "UW1:ん",
+        "BW1:うな",
+        "TB2:062061061",
+        "UW1:と",
+        "TB4:062063062",
+        "TB2:061062062",
+        "BW1:この",
+        "BW2:ので",
+        "UW4:み",
+        "UW5:わ",
+        "UW6:や",
+        "BW1:れて",
+        "UW2:や",
+        "UW6:こ",
+        "UW4:な",
+        "UW5:め",
+        "BW1:もう",
+        "TB4:071062071",
+        "BW1:より",
+        "UW4:合",
+        "UW6:け",
+        "BW1:少し",
+        "BW2:でし",
+        "UW4:と",
+        "TB1:063063063",
+        "UW3:ー",
+        "BW2:くな",
+        "UW2:く",
+        "UW2:我",
+        "BW2:いも",
+        "BW3:わか",
+        "TB2:071063071",
+        "UW4:も",
+        "UW1:あ",
+        "UW4:最",
+        "BW1:るの",
+        "UW2:全",
+        "UW6:０",
+        "UW4:放",
+        "UW4:京",
+        "BW3:かけ",
+        "UW2:少",
+        "BW3:もう",
+        "UW2:多",
+        "UW2:う",
+        "TB1:062062040",
+        "UW1:を",
+        "UW3:光",
+        "BW1:！！",
+        "UW2:ャ",
+        "BW3:すぐ",
+        "UW4:帯",
+        "UW6:し",
+        "BW3:でも",
+        "BW2:、そ",
+        "TB3:071087087",
+        "TB2:063062071",
+        "UW3:わ",
+        "UB4:063",
+        "TB4:071071071",
+        "UW5:都",
+        "UW5:ず",
+        "UW2:バ",
+        "UW2:京",
+        "UW3:ゃ",
+        "BW1:い、",
+        "BW3:よく",
+        "BW1:たら",
+        "BW2:のよ",
+        "UW2:思",
+        "BW1:うに",
+        "BW1:の間",
+        "UW6:ん",
+        "UW6:ず",
+        "BW1:った",
+        "TW3:ること",
+        "BW3:とて",
+        "TW1:ような",
+        "UW6:ぱ",
+        "TB3:063071062",
+        "TW4:って、",
+        "TW4:なんて",
+        "TW2:その後",
+        "UW6:ら",
+        "TW4:ことに",
+        "UW3:＞",
+        "TW3:てしま",
+        "UW3:い",
+        "TB4:071062061",
+        "UW2:ひ",
+        "UW6:め",
+        "UW6:で",
+        "BW3:なる",
+        "UW5:ご",
+        "BW2:りし",
+        "UW6:電",
+        "UW1:は",
+        "BW1:いも",
+        "BW3:すご",
+        "UW4:通",
+        "BW3:おり",
+        "BW3:かか",
+        "BW1:思い",
+    }
+    modelValues:intvector {
+        1800,
+        271,
+        -857,
+        -417,
+        285,
+        -583,
+        388,
+        828,
+        -853,
+        -820,
+        502,
+        -708,
+        358,
+        1341,
+        -586,
+        -451,
+        257,
+        -1876,
+        2052,
+        1698,
+        -458,
+        2048,
+        1182,
+        -551,
+        980,
+        773,
+        -1453,
+        -152,
+        3201,
+        2865,
+        1203,
+        144,
+        -369,
+        -2539,
+        -613,
+        -3574,
+        -1111,
+        3110,
+        -3022,
+        2039,
+        -1091,
+        1241,
+        -560,
+        -1412,
+        625,
+        1350,
+        297,
+        -2404,
+        -595,
+        1007,
+        -1829,
+        -1662,
+        3213,
+        270,
+        -911,
+        178,
+        -727,
+        2716,
+        -484,
+        -344,
+        929,
+        -1236,
+        760,
+        -299,
+        -419,
+        -728,
+        122,
+        -704,
+        -605,
+        -1507,
+        545,
+        -68,
+        -320,
+        1498,
+        953,
+        -323,
+        -575,
+        -673,
+        520,
+        -450,
+        -1767,
+        -247,
+        56,
+        231,
+        -764,
+        536,
+        794,
+        -703,
+        -566,
+        51,
+        390,
+        52,
+        -182,
+        466,
+        133,
+        354,
+        107,
+        492,
+        488,
+        -1194,
+        1145,
+        -847,
+        812,
+        151,
+        -517,
+        -314,
+        -553,
+        -783,
+        -117,
+        736,
+        -88,
+        -598,
+        569,
+        606,
+        287,
+        744,
+        1739,
+        -217,
+        -219,
+        -144,
+        234,
+        -649,
+        -757,
+        834,
+        -819,
+        869,
+        -275,
+        -267,
+        154,
+        653,
+        594,
+        255,
+        1018,
+        1124,
+        284,
+        -1624,
+        -372,
+        440,
+        -184,
+        -1936,
+        1318,
+        -1124,
+        453,
+        -92,
+        -343,
+        175,
+        182,
+        -886,
+        930,
+        -223,
+        -57,
+        -113,
+        103,
+        -200,
+        510,
+        -2099,
+        -498,
+        385,
+        80,
+        -156,
+        360,
+        1289,
+        771,
+        -1114,
+        -399,
+        870,
+        1230,
+        79,
+        472,
+        -1596,
+        -1092,
+        -572,
+        55,
+        -151,
+        -124,
+        1316,
+        -248,
+        1280,
+        -125,
+        -284,
+        -1023,
+        862,
+        84,
+        417,
+        568,
+        -88,
+        -528,
+        910,
+        674,
+        -212,
+        894,
+        -121,
+        1108,
+        762,
+        260,
+        -197,
+        91,
+        -53,
+        1117,
+        -645,
+        -868,
+        -611,
+        220,
+        422,
+        1431,
+        -532,
+        -157,
+        -476,
+        -846,
+        -1309,
+        -1614,
+        1225,
+        302,
+        -738,
+        -260,
+        892,
+        -778,
+        -193,
+        1221,
+        -779,
+        489,
+        420,
+        -85,
+        -525,
+        -830,
+        26,
+        270,
+        439,
+        -120,
+        1263,
+        -795,
+        291,
+        -1310,
+        -23,
+        347,
+        312,
+        -107,
+        -114,
+        701,
+        830,
+        1309,
+        -451,
+        260,
+        -1080,
+        536,
+        188,
+        -60,
+        643,
+        -1184,
+        31,
+        -194,
+        -51,
+        -514,
+        -442,
+        -120,
+        649,
+        410,
+        882,
+        -75,
+        -341,
+        -718,
+        -128,
+        340,
+        -1245,
+        -164,
+        -1052,
+        70,
+        -256,
+        279,
+        786,
+        40,
+        -177,
+        97,
+        -411,
+        222,
+        -89,
+        -277,
+        -146,
+        414,
+        483,
+        21,
+        -339,
+        -406,
+        -360,
+        -450,
+        -14,
+        -36,
+        513,
+        252,
+        54,
+        -501,
+        -478,
+        450,
+        -36,
+        -644,
+        -392,
+        714,
+        643,
+        -341,
+        91,
+        -1018,
+        34,
+        -177,
+        123,
+        80,
+        -695,
+        -44,
+        -357,
+        253,
+        -389,
+        613,
+        515,
+        418,
+        -396,
+        -553,
+        193,
+        298,
+        -334,
+        -57,
+        -315,
+        -77,
+        33,
+        88,
+        137,
+        280,
+        -448,
+        196,
+        -136,
+        -295,
+        -329,
+        -92,
+        -360,
+        -132,
+        -288,
+        -45,
+        -43,
+        174,
+        75,
+        -60,
+        330,
+        360,
+        217,
+        130,
+        473,
+        -41,
+        -23,
+        -340,
+        -530,
+        -69,
+        -71,
+        -115,
+        297,
+        -240,
+        229,
+        507,
+        -348,
+        171,
+        -320,
+        239,
+        16,
+        -195,
+        -277,
+        -41,
+        69,
+        280,
+        -264,
+        30,
+        249,
+        -97,
+        -163,
+        -221,
+        96,
+        83,
+        82,
+        -218,
+        -93,
+        -53,
+        40,
+        28,
+        285,
+        27,
+        283,
+        -211,
+        -92,
+        214,
+        -225,
+        -54,
+        53,
+        105,
+        -198,
+        -53,
+        -277,
+        198,
+        184,
+        -264,
+        -106,
+        14,
+        185,
+        -155,
+        185,
+        106,
+        -119,
+        53,
+        208,
+        92,
+        262,
+        106,
+        -52,
+        105,
+        -25,
+        -79,
+        104,
+        141,
+        129,
+        -114,
+        26,
+        64,
+        -113,
+        26,
+        77,
+        -64,
+        13,
+        13,
+        26,
+        89,
+        115,
+        -49,
+        89,
+        -114,
+        51,
+        64,
+        -64,
+        -51,
+        -38,
+        89,
+        13,
+        -64,
+        13,
+        -48,
+        76,
+        63,
+        62,
+        13,
+        112,
+        -76,
+        -50,
+        -13,
+        -49,
+        63,
+        -50,
+        13,
+        13,
+        -50,
+        24,
+        -12,
+        24,
+        12,
+        24,
+        12,
+        -12,
+        -24,
+        12,
+        -12,
+        -12,
+        12,
+        -12,
+    }
+}
+\ No newline at end of file
diff --git a/icu4c/source/python/icutools/databuilder/filtration.py b/icu4c/source/python/icutools/databuilder/filtration.py

index 27d08b0a7720cf13789ac227a95ec79db660b835..e9339a08955ceaf9ec1124d4c50a41707977c212 100644 (file)
--- a/icu4c/source/python/icutools/databuilder/filtration.py
+++ b/icu4c/source/python/icutools/databuilder/filtration.py
@@ -273,8 +273,8 @@ def _preprocess_file_filters(requests, config, io):
      default_filter_json = "exclude" if config.strategy == "additive" else "include"
      for category in all_categories:
          filter_json = default_filter_json
-        # Special default for category "brkitr_lstm" as "exclude" for now.
-        if "brkitr_lstm" == category:
+        # Special default for categories "brkitr_lstm" and "brkitr_adaboost" as "exclude" for now.
+        if "brkitr_lstm" == category or "brkitr_adaboost" == category:
              filter_json = "exclude"
          # Figure out the correct filter to create for now.
          if "featureFilters" in json_data and category in json_data["featureFilters"]:
diff --git a/icu4c/source/test/depstest/dependencies.txt b/icu4c/source/test/depstest/dependencies.txt

index 9676ed4856d7ce56523ab4330f28d3e7ed5e4ca9..7460caa7cddf9ae887e964e88fbd8612db7fae98 100644 (file)
--- a/icu4c/source/test/depstest/dependencies.txt
+++ b/icu4c/source/test/depstest/dependencies.txt
@@ -211,7 +211,7 @@ group: breakiterator
      brkiter.o brkeng.o ubrk.o
      rbbi.o rbbinode.o rbbiscan.o rbbisetb.o rbbistbl.o rbbitblb.o
      rbbidata.o rbbirb.o rbbi_cache.o
-    dictionarydata.o dictbe.o lstmbe.o
+    dictionarydata.o dictbe.o lstmbe.o mlbe.o
      # BreakIterator::makeInstance() factory implementation makes for circular dependency
      # between BreakIterator base and FilteredBreakIteratorBuilder.
      filteredbrk.o
diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp

index 17c05fb0d44f4c31622344ea27cd37422f880697..7afdb9ab8280d19b456b9fe577bb4b7229b64f0a 100644 (file)
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@@ -42,6 +42,7 @@
  #include "charstr.h"
  #include "cmemory.h"
  #include "cstr.h"
+#include "cstring.h"
  #include "intltest.h"
  #include "lstmbe.h"
  #include "rbbitst.h"
@@ -835,9 +836,28 @@ void RBBITest::TestExtended() {
                  delete tp.bi;
                  tp.bi = BreakIterator::createLineInstance(locale,  status);
                  skipTest = false;
+#if UCONFIG_USE_ML_PHRASE_BREAKING
+                if(uprv_strcmp(locale.getName(), "ja@lw=phrase") == 0) {
+                    // skip <line> test cases of JP's phrase breaking when ML is enabled.
+                    skipTest = true;
+                }
+#endif
                  charIdx += 5;
                  break;
              }
+            if (testString.compare(charIdx-1, 8, u"<lineML>") == 0) {
+                delete tp.bi;
+                tp.bi = BreakIterator::createLineInstance(locale,  status);
+                skipTest = false;
+#if !UCONFIG_USE_ML_PHRASE_BREAKING
+                if(uprv_strcmp(locale.getName(), "ja@lw=phrase") == 0) {
+                    // skip <lineML> test cases of JP's phrase breaking when ML is disabled.
+                    skipTest = true;
+                }
+#endif
+                charIdx += 7;
+                break;
+            }
              if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
                  delete tp.bi;
                  tp.bi = BreakIterator::createSentenceInstance(locale,  status);
diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt

index 72bd15803d6854839c16517cde04725ff244e8c2..40c6745dd06666d0b0e3328f15e43a98172ab9af 100644 (file)
--- a/icu4c/source/test/testdata/rbbitst.txt
+++ b/icu4c/source/test/testdata/rbbitst.txt
@@ -1913,6 +1913,26 @@ Bangkok)•</data>
  <data>•\U0001F469\u200D\U0001F680•\U0001F469\U0001F3FD\u200D\U0001F680\u0020•</data>
  
  <locale ja@lw=phrase>
+#phrase breaking test cases for the ML solution
+<lineML>
+#９月に東京から友達が遊びに来た -> ９月に•東京から•友達が•遊びに•来た•
+<data>•\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f•</data>
+#る文字「そうだ、京都」-> る•文字•「そうだ、•京都」•
+<data>•\u308b•\u6587\u5b57•\u300c\u305d\u3046\u3060\u3001•\u4eac\u90fd\u300d•</data>
+#Kana supplement: 𛁈(U+1B048) -> \uD82C\uDC48, 𛀸(U+1B038) -> \uD82C\uDC38, 𛀙(U+1B019) -> \uD82C\uDC19
+#𛁈る𛀸（しるこ）、あ𛀙よろし（あかよろし） -> 𛁈る𛀸•（しるこ）、•あ𛀙よろし•（あかよろし）
+<data>•\U0001B048\u308B\U0001B038•\uFF08\u3057\u308B\u3053\uFF09\u3001•\u3042\U0001B019\u3088\u308D\u3057•\uFF08\u3042\u304B\u3088\u308D\u3057\uFF09•</data>
+#中国の携帯は約５００元から５０００元です -> 中国の▁携帯は▁約▁５００元から▁５０００元です
+<data>•\u4E2D\u56FD\u306E•\u643A\u5E2F\u306F•\u7D04•\uFF15\uFF10\uFF10\u5143\u304B\u3089•\uFF15\uFF10\uFF10\uFF10\u5143\u3067\u3059•</data>
+#しかもロゴがＵｎｉｃｏｄｅ！！ -> しかも▁ロゴが▁Ｕｎｉｃｏｄｅ！！
+<data>•\u3057\u304B\u3082•\u30ED\u30B4\u304C•\uFF35\uFF4E\uFF49\uFF43\uFF4F\uFF44\uFF45\uFF01\uFF01•</data>
+#バッテリーを長持ちさせ、充電を最適化します -> バッテリーを▁長持ちさせ、▁充電を▁最適化します
+<data>•\u30D0\u30C3\u30C6\u30EA\u30FC\u3092•\u9577\u6301\u3061\u3055\u305B\u3001•\u5145\u96FB\u3092•\u6700\u9069\u5316\u3057\u307E\u3059•</data>
+#データのコピー、スマートフォンでのお支払いなど -> データの▁コピー、▁スマートフォンでの▁お支払いなど
+<data>•\u30C7\u30FC\u30BF\u306E•\u30B3\u30D4\u30FC\u3001•\u30B9\u30DE\u30FC\u30C8\u30D5\u30A9\u30F3\u3067\u306E•\u304A\u652F\u6255\u3044\u306A\u3069•</data>
+
+<locale ja@lw=phrase>
+#phrase breaking test cases for the dictionary based solution
  <line>
  #[京都観光］時雨殿に行った。-> [京都•観光］•時雨•殿に•行った。•
  <data>•\uff3b\u4eac\u90fd•\u89b3\u5149\uff3d•\u6642\u96e8•\u6bbf\u306b•\u884c\u3063\u305f\u3002•</data>
@@ -2005,8 +2025,8 @@ Bangkok)•</data>
  #大韓民國은 民主共和國이다
  #<data>•大韓民國은 •民主•共和國이다•</data>
  # All the tests for ja@lw=phrase should also work in Korean.
-#[京都観光］時雨殿に行った。-> [京都•観光］•時雨•殿に•行った。•
-<data>•\uff3b\u4eac\u90fd•\u89b3\u5149\uff3d•\u6642\u96e8•\u6bbf\u306b•\u884c\u3063\u305f\u3002•</data>
+#る文字「そうだ、京都」-> る•文字•「そうだ、•京都」•
+<data>•\u308b•\u6587\u5b57•\u300c\u305d\u3046\u3060\u3001•\u4eac\u90fd\u300d•</data>
  #９月に東京から友達が遊びに来た -> ９月に•東京から•友達が•遊びに•来た•
  <data>•\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f•</data>
author	Shuhei Iitsuka <tushuhei@google.com>
	Fri, 29 Jul 2022 04:08:01 +0000 (12:08 +0800)
committer	Markus Scherer <markus.icu@gmail.com>
	Fri, 2 Dec 2022 18:11:06 +0000 (10:11 -0800)
.github/adaboost.json	[new file with mode: 0644]	patch \| blob
.github/workflows/icu_ci.yml		patch \| blob \| history
icu4c/source/common/BUILD.bazel		patch \| blob \| history
icu4c/source/common/common.vcxproj		patch \| blob \| history
icu4c/source/common/common.vcxproj.filters		patch \| blob \| history
icu4c/source/common/common_uwp.vcxproj		patch \| blob \| history
icu4c/source/common/dictbe.cpp		patch \| blob \| history
icu4c/source/common/dictbe.h		patch \| blob \| history
icu4c/source/common/mlbe.cpp	[new file with mode: 0644]	patch \| blob
icu4c/source/common/mlbe.h	[new file with mode: 0644]	patch \| blob
icu4c/source/common/sources.txt		patch \| blob \| history
icu4c/source/common/unicode/uconfig.h		patch \| blob \| history
icu4c/source/data/BUILDRULES.py		patch \| blob \| history
icu4c/source/data/brkitr/adaboost/jaml.txt	[new file with mode: 0644]	patch \| blob
icu4c/source/python/icutools/databuilder/filtration.py		patch \| blob \| history
icu4c/source/test/depstest/dependencies.txt		patch \| blob \| history
icu4c/source/test/intltest/rbbitst.cpp		patch \| blob \| history
icu4c/source/test/testdata/rbbitst.txt		patch \| blob \| history