ICU-22100 Modify ML model to improve Japanese phrase breaking performance

author allenwtsu <allenwtsu@google.com>

Tue, 31 Jan 2023 10:17:02 +0000 (18:17 +0800)

committer Frank Yung-Fong Tang <ftang@google.com>

Fri, 3 Feb 2023 21:07:53 +0000 (13:07 -0800)
author allenwtsu <allenwtsu@google.com>
Tue, 31 Jan 2023 10:17:02 +0000 (18:17 +0800)
committer Frank Yung-Fong Tang <ftang@google.com>
Fri, 3 Feb 2023 21:07:53 +0000 (13:07 -0800)
diff --git a/icu4c/source/common/mlbe.cpp b/icu4c/source/common/mlbe.cpp

index 14f68d2a12623fd6c2bff9e57dd48d6f2b14756b..7e734f2c8adfbf70f6d2c2e8979bbca3a951b9ec 100644 (file)
--- a/icu4c/source/common/mlbe.cpp
+++ b/icu4c/source/common/mlbe.cpp
@@ -18,11 +18,12 @@
  
  U_NAMESPACE_BEGIN
  
+enum class ModelIndex { kUWStart = 0, kBWStart = 6, kTWStart = 9 };
+
  MlBreakEngine::MlBreakEngine(const UnicodeSet &digitOrOpenPunctuationOrAlphabetSet,
-                                 const UnicodeSet &closePunctuationSet, UErrorCode &status)
+                             const UnicodeSet &closePunctuationSet, UErrorCode &status)
      : fDigitOrOpenPunctuationOrAlphabetSet(digitOrOpenPunctuationOrAlphabetSet),
        fClosePunctuationSet(closePunctuationSet),
-      fModel(status),
        fNegativeSum(0) {
      if (U_FAILURE(status)) {
          return;
@@ -32,14 +33,10 @@ MlBreakEngine::MlBreakEngine(const UnicodeSet &digitOrOpenPunctuationOrAlphabetS
  
  MlBreakEngine::~MlBreakEngine() {}
  
-namespace {
-    const char16_t INVALID = u'|';
-}
-
  int32_t MlBreakEngine::divideUpRange(UText *inText, int32_t rangeStart, int32_t rangeEnd,
-                                       UVector32 &foundBreaks, const UnicodeString &inString,
-                                       const LocalPointer<UVector32> &inputMap,
-                                       UErrorCode &status) const {
+                                     UVector32 &foundBreaks, const UnicodeString &inString,
+                                     const LocalPointer<UVector32> &inputMap,
+                                     UErrorCode &status) const {
      if (U_FAILURE(status)) {
          return 0;
      }
@@ -53,30 +50,35 @@ int32_t MlBreakEngine::divideUpRange(UText *inText, int32_t rangeStart, int32_t
          return 0;
      }
      int32_t numBreaks = 0;
-    UnicodeString index;
-    // The ML model groups six char to evaluate if the 4th char is a breakpoint.
-    // Like a sliding window, the elementList removes the first char and appends the new char from
-    // inString in each iteration so that its size always remains at six.
-    UChar32 elementList[6];
-
-    int32_t codeUts = initElementList(inString, elementList, status);
-    int32_t length = inString.countChar32();
+    int32_t codePointLength = inString.countChar32();
+    // The ML algorithm groups six char and evaluates whether the 4th char is a breakpoint.
+    // In each iteration, it evaluates the 4th char and then moves forward one char like a sliding
+    // window. Initially, the first six values in the indexList are [-1, -1, 0, 1, 2, 3]. After
+    // moving forward, finally the last six values in the indexList are
+    // [length-4, length-3, length-2, length-1, -1, -1]. The "+4" here means four extra "-1".
+    int32_t indexSize = codePointLength + 4;
+    int32_t *indexList = (int32_t *)uprv_malloc(indexSize * sizeof(int32_t));
+    if (indexList == nullptr) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+        return 0;
+    }
+    int32_t numCodeUnits = initIndexList(inString, indexList, status);
  
      // Add a break for the start.
      boundary.addElement(0, status);
      numBreaks++;
      if (U_FAILURE(status)) return 0;
  
-    for (int32_t i = 1; i < length && U_SUCCESS(status); i++) {
-        evaluateBreakpoint(elementList, i, numBreaks, boundary, status);
-        if (i + 1 >= inString.countChar32()) break;
-        // Remove the first element and append a new element
-        uprv_memmove(elementList, elementList + 1, 5 * sizeof(UChar32));
-        elementList[5] = inString.countChar32(0, codeUts) < length ? inString.char32At(codeUts) : INVALID;
-        if (elementList[5] != INVALID) {
-            codeUts += U16_LENGTH(elementList[5]);
+    for (int32_t idx = 0; idx + 1 < codePointLength && U_SUCCESS(status); idx++) {
+        numBreaks =
+            evaluateBreakpoint(inString, indexList, idx, numCodeUnits, numBreaks, boundary, status);
+        if (idx + 4 < codePointLength) {
+            indexList[idx + 6] = numCodeUnits;
+            numCodeUnits += U16_LENGTH(inString.char32At(indexList[idx + 6]));
          }
      }
+    uprv_free(indexList);
+
      if (U_FAILURE(status)) return 0;
  
      // Add a break for the end if there is not one there already.
@@ -128,119 +130,112 @@ int32_t MlBreakEngine::divideUpRange(UText *inText, int32_t rangeStart, int32_t
      return correctedNumBreaks;
  }
  
-void MlBreakEngine::evaluateBreakpoint(UChar32* elementList, int32_t index, int32_t &numBreaks,
-                                         UVector32 &boundary, UErrorCode &status) const {
+int32_t MlBreakEngine::evaluateBreakpoint(const UnicodeString &inString, int32_t *indexList,
+                                          int32_t startIdx, int32_t numCodeUnits, int32_t numBreaks,
+                                          UVector32 &boundary, UErrorCode &status) const {
      if (U_FAILURE(status)) {
-        return;
+        return numBreaks;
      }
-
-    UnicodeString feature;
+    int32_t start = 0, end = 0;
      int32_t score = fNegativeSum;
  
-    if (elementList[0] != INVALID) {
-        // When the key doesn't exist, Hashtable.geti(key) returns 0  and 2 * 0 = 0.
-        // So, we can skip to check whether fModel includes key featureList[j] or not.
-        score += (2 * fModel.geti(feature.setTo(u"UW1:", 4).append(elementList[0])));
-    }
-    if (elementList[1] != INVALID) {
-        score += (2 * fModel.geti(feature.setTo(u"UW2:", 4).append(elementList[1])));
-    }
-    if (elementList[2] != INVALID) {
-        score += (2 * fModel.geti(feature.setTo(u"UW3:", 4).append(elementList[2])));
-    }
-    if (elementList[3] != INVALID) {
-        score += (2 * fModel.geti(feature.setTo(u"UW4:", 4).append(elementList[3])));
-    }
-    if (elementList[4] != INVALID) {
-        score += (2 * fModel.geti(feature.setTo(u"UW5:", 4).append(elementList[4])));
-    }
-    if (elementList[5] != INVALID) {
-        score += (2 * fModel.geti(feature.setTo(u"UW6:", 4).append(elementList[5])));
-    }
-    if (elementList[1] != INVALID && elementList[2] != INVALID) {
-        score += (2 * fModel.geti(
-                          feature.setTo(u"BW1:", 4).append(elementList[1]).append(elementList[2])));
-    }
-    if (elementList[2] != INVALID && elementList[3] != INVALID) {
-        score += (2 * fModel.geti(
-                          feature.setTo(u"BW2:", 4).append(elementList[2]).append(elementList[3])));
-    }
-    if (elementList[3] != INVALID && elementList[4] != INVALID) {
-        score += (2 * fModel.geti(
-                          feature.setTo(u"BW3:", 4).append(elementList[3]).append(elementList[4])));
-    }
-    if (elementList[0] != INVALID && elementList[1] != INVALID && elementList[2] != INVALID) {
-        score += (2 * fModel.geti(feature.setTo(u"TW1:", 4)
-                                      .append(elementList[0])
-                                      .append(elementList[1])
-                                      .append(elementList[2])));
-    }
-    if (elementList[1] != INVALID && elementList[2] != INVALID && elementList[3] != INVALID) {
-        score += (2 * fModel.geti(feature.setTo(u"TW2:", 4)
-                                      .append(elementList[1])
-                                      .append(elementList[2])
-                                      .append(elementList[3])));
+    for (int i = 0; i < 6; i++) {
+        // UW1 ~ UW6
+        start = startIdx + i;
+        if (indexList[start] != -1) {
+            end = (indexList[start + 1] != -1) ? indexList[start + 1] : numCodeUnits;
+            score += fModel[static_cast<int32_t>(ModelIndex::kUWStart) + i].geti(
+                inString.tempSubString(indexList[start], end - indexList[start]));
+        }
      }
-    if (elementList[2] != INVALID && elementList[3] != INVALID && elementList[4] != INVALID) {
-        score += (2 * fModel.geti(feature.setTo(u"TW3:", 4)
-                                      .append(elementList[2])
-                                      .append(elementList[3])
-                                      .append(elementList[4])));
+    for (int i = 0; i < 3; i++) {
+        // BW1 ~ BW3
+        start = startIdx + i + 1;
+        if (indexList[start] != -1 && indexList[start + 1] != -1) {
+            end = (indexList[start + 2] != -1) ? indexList[start + 2] : numCodeUnits;
+            score += fModel[static_cast<int32_t>(ModelIndex::kBWStart) + i].geti(
+                inString.tempSubString(indexList[start], end - indexList[start]));
+        }
      }
-    if (elementList[3] != INVALID && elementList[4] != INVALID && elementList[5] != INVALID) {
-        score += (2 * fModel.geti(feature.setTo(u"TW4:", 4)
-                                      .append(elementList[3])
-                                      .append(elementList[4])
-                                      .append(elementList[5])));
+    for (int i = 0; i < 4; i++) {
+        // TW1 ~ TW4
+        start = startIdx + i;
+        if (indexList[start] != -1 && indexList[start + 1] != -1 && indexList[start + 2] != -1) {
+            end = (indexList[start + 3] != -1) ? indexList[start + 3] : numCodeUnits;
+            score += fModel[static_cast<int32_t>(ModelIndex::kTWStart) + i].geti(
+                inString.tempSubString(indexList[start], end - indexList[start]));
+        }
      }
+
      if (score > 0) {
-        boundary.addElement(index, status);
+        boundary.addElement(startIdx + 1, status);
          numBreaks++;
      }
+    return numBreaks;
  }
  
-int32_t MlBreakEngine::initElementList(const UnicodeString &inString, UChar32* elementList,
-                                         UErrorCode &status) const {
+int32_t MlBreakEngine::initIndexList(const UnicodeString &inString, int32_t *indexList,
+                                     UErrorCode &status) const {
      if (U_FAILURE(status)) {
          return 0;
      }
      int32_t index = 0;
      int32_t length = inString.countChar32();
-    UChar32 w1, w2, w3, w4, w5, w6;
-    w1 = w2 = w3 = w4 = w5 = w6 = INVALID;
+    // Set all (length+4) items inside indexList to -1, presuming -1 is 4 bytes of 0xff.
+    uprv_memset(indexList, 0xff, (length + 4) * sizeof(int32_t));
      if (length > 0) {
-        w3 = inString.char32At(0);
-        index += U16_LENGTH(w3);
+        indexList[2] = 0;
+        index = U16_LENGTH(inString.char32At(0));
          if (length > 1) {
-            w4 = inString.char32At(index);
-            index += U16_LENGTH(w4);
+            indexList[3] = index;
+            index += U16_LENGTH(inString.char32At(index));
              if (length > 2) {
-                w5 = inString.char32At(index);
-                index += U16_LENGTH(w5);
+                indexList[4] = index;
+                index += U16_LENGTH(inString.char32At(index));
                  if (length > 3) {
-                    w6 = inString.char32At(index);
-                    index += U16_LENGTH(w6);
+                    indexList[5] = index;
+                    index += U16_LENGTH(inString.char32At(index));
                  }
              }
          }
      }
-    elementList[0] = w1;
-    elementList[1] = w2;
-    elementList[2] = w3;
-    elementList[3] = w4;
-    elementList[4] = w5;
-    elementList[5] = w6;
-
      return index;
  }
  
  void MlBreakEngine::loadMLModel(UErrorCode &error) {
-    // BudouX's model consists of pairs of the feature and its score.
-    // As integrating it into jaml.txt, modelKeys denotes the ML feature; modelValues means the
-    // corresponding feature's score.
+    // BudouX's model consists of thirteen categories, each of which is made up of pairs of a
+    // feature and its score. To integrate it into jaml.txt, we define thirteen kinds of keys and
+    // values to represent the features and their corresponding scores respectively.
+
+    if (U_FAILURE(error)) return;
  
+    UnicodeString key;
+    StackUResourceBundle stackTempBundle;
+    ResourceDataValue modelKey;
+
+    LocalUResourceBundlePointer rbp(ures_openDirect(U_ICUDATA_BRKITR, "jaml", &error));
+    UResourceBundle *rb = rbp.getAlias();
      if (U_FAILURE(error)) return;
  
+    int32_t index = 0;
+    initKeyValue(rb, "UW1Keys", "UW1Values", fModel[index++], error);
+    initKeyValue(rb, "UW2Keys", "UW2Values", fModel[index++], error);
+    initKeyValue(rb, "UW3Keys", "UW3Values", fModel[index++], error);
+    initKeyValue(rb, "UW4Keys", "UW4Values", fModel[index++], error);
+    initKeyValue(rb, "UW5Keys", "UW5Values", fModel[index++], error);
+    initKeyValue(rb, "UW6Keys", "UW6Values", fModel[index++], error);
+    initKeyValue(rb, "BW1Keys", "BW1Values", fModel[index++], error);
+    initKeyValue(rb, "BW2Keys", "BW2Values", fModel[index++], error);
+    initKeyValue(rb, "BW3Keys", "BW3Values", fModel[index++], error);
+    initKeyValue(rb, "TW1Keys", "TW1Values", fModel[index++], error);
+    initKeyValue(rb, "TW2Keys", "TW2Values", fModel[index++], error);
+    initKeyValue(rb, "TW3Keys", "TW3Values", fModel[index++], error);
+    initKeyValue(rb, "TW4Keys", "TW4Values", fModel[index++], error);
+    fNegativeSum /= 2;
+}
+
+void MlBreakEngine::initKeyValue(UResourceBundle *rb, const char *keyName, const char *valueName,
+                                 Hashtable &model, UErrorCode &error) {
      int32_t keySize = 0;
      int32_t valueSize = 0;
      int32_t stringLength = 0;
@@ -248,15 +243,13 @@ void MlBreakEngine::loadMLModel(UErrorCode &error) {
      StackUResourceBundle stackTempBundle;
      ResourceDataValue modelKey;
  
-    LocalUResourceBundlePointer rbp(ures_openDirect(U_ICUDATA_BRKITR, "jaml", &error));
-    UResourceBundle* rb = rbp.orphan();
      // get modelValues
-    LocalUResourceBundlePointer modelValue(ures_getByKey(rb, "modelValues", nullptr, &error));
-    const int32_t* value = ures_getIntVector(modelValue.getAlias(), &valueSize, &error);
+    LocalUResourceBundlePointer modelValue(ures_getByKey(rb, valueName, nullptr, &error));
+    const int32_t *value = ures_getIntVector(modelValue.getAlias(), &valueSize, &error);
      if (U_FAILURE(error)) return;
  
      // get modelKeys
-    ures_getValueWithFallback(rb, "modelKeys", stackTempBundle.getAlias(), modelKey, error);
+    ures_getValueWithFallback(rb, keyName, stackTempBundle.getAlias(), modelKey, error);
      ResourceArray stringArray = modelKey.getArray(error);
      keySize = stringArray.getSize();
      if (U_FAILURE(error)) return;
@@ -267,7 +260,7 @@ void MlBreakEngine::loadMLModel(UErrorCode &error) {
          if (U_SUCCESS(error)) {
              U_ASSERT(idx < valueSize);
              fNegativeSum -= value[idx];
-            fModel.puti(key, value[idx], error);
+            model.puti(key, value[idx], error);
          }
      }
  }
diff --git a/icu4c/source/common/mlbe.h b/icu4c/source/common/mlbe.h

index 2f0edd6c4f26e4564dc42326b6e92a36a2353723..38de47e5f5702be38ecdc71b266097ba07cf6e72 100644 (file)
--- a/icu4c/source/common/mlbe.h
+++ b/icu4c/source/common/mlbe.h
@@ -5,6 +5,7 @@
  #define MLBREAKENGINE_H
  
  #include "hash.h"
+#include "unicode/resbund.h"
  #include "unicode/uniset.h"
  #include "unicode/utext.h"
  #include "uvectr32.h"
@@ -27,7 +28,7 @@ class MlBreakEngine : public UMemory {
       * @param status Information on any errors encountered.
       */
      MlBreakEngine(const UnicodeSet &digitOrOpenPunctuationOrAlphabetSet,
-                    const UnicodeSet &closePunctuationSet, UErrorCode &status);
+                  const UnicodeSet &closePunctuationSet, UErrorCode &status);
  
      /**
       * Virtual destructor.
@@ -60,31 +61,50 @@ class MlBreakEngine : public UMemory {
      void loadMLModel(UErrorCode &error);
  
      /**
-     * Initialize the element list from the input string.
+     * Load one category of features and their scores from the machine learning model file,
+     * using the given key name and value name.
+     *
+     * @param rb A ResourceBundle corresponding to the model file.
+     * @param keyName The key name in the model file.
+     * @param valueName The value name in the model file.
+     * @param model A hashtable to store the pairs of the feature and its score.
+     * @param error Information on any errors encountered.
+     */
+    void initKeyValue(UResourceBundle *rb, const char *keyName, const char *valueName,
+                      Hashtable &model, UErrorCode &error);
+
+    /**
+     * Initialize the index list from the input string.
       *
       * @param inString A input string to be segmented.
-     * @param elementList A list to store the first six characters.
+     * @param indexList A code unit index list of inString.
       * @param status Information on any errors encountered.
-     * @return The number of code units of the first six characters in inString.
+     * @return The number of code units of the first four characters in inString.
       */
-    int32_t initElementList(const UnicodeString &inString, UChar32* elementList,
-                            UErrorCode &status) const;
+    int32_t initIndexList(const UnicodeString &inString, int32_t *indexList,
+                          UErrorCode &status) const;
  
      /**
       * Evaluate whether the index is a potential breakpoint.
       *
-     * @param elementList A list including six elements for the breakpoint evaluation.
-     * @param index The breakpoint index to be evaluated.
+     * @param inString An input string to be segmented.
+     * @param indexList A code unit index list of the inString.
+     * @param startIdx The start index of the indexList.
+     * @param numCodeUnits The current code unit boundary of the indexList.
       * @param numBreaks The accumulated number of breakpoints.
       * @param boundary A vector including the index of the breakpoint.
       * @param status Information on any errors encountered.
+     * @return The number of breakpoints
       */
-    void evaluateBreakpoint(UChar32* elementList, int32_t index, int32_t &numBreaks,
-                            UVector32 &boundary, UErrorCode &status) const;
+    int32_t evaluateBreakpoint(const UnicodeString &inString, int32_t *indexList, int32_t startIdx,
+                               int32_t numCodeUnits, int32_t numBreaks, UVector32 &boundary,
+                               UErrorCode &status) const;
+
+    void printUnicodeString(const UnicodeString &s) const;
  
      UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;
      UnicodeSet fClosePunctuationSet;
-    Hashtable fModel;
+    Hashtable fModel[13];  // {UW1, UW2, ... UW6, BW1, ... BW3, TW1, TW2, ... TW4} 6+3+4= 13
      int32_t fNegativeSum;
  };
  
diff --git a/icu4c/source/data/brkitr/adaboost/jaml.txt b/icu4c/source/data/brkitr/adaboost/jaml.txt

index 4ddea6c78b2ab277b1c0fbf1aa4f2419e50dff87..f931331229d3f2449a2fdc8a5f910892adee5615 100644 (file)
--- a/icu4c/source/data/brkitr/adaboost/jaml.txt
+++ b/icu4c/source/data/brkitr/adaboost/jaml.txt
@@ -1,728 +1,776 @@
  // © 2022 and later: Unicode, Inc. and others.
  // License & terms of use: http://www.unicode.org/copyright.html
  jaml {
-    modelKeys {
-        "UW3:、",
-        "UW3:。",
-        "UW4:の",
-        "UW4:、",
-        "UW3:の",
-        "UW4:。",
-        "UW3:に",
-        "UW5:。",
-        "UW4:て",
-        "UW3:は",
-        "UW4:に",
-        "UW3:を",
-        "UW5:、",
-        "UW2:、",
-        "UW3:が",
-        "UW2:。",
-        "UW4:で",
-        "UW3:と",
-        "UW4:は",
-        "UW4:が",
-        "UW4:る",
-        "UW4:っ",
-        "UW3:も",
-        "UW5:な",
-        "UW3:で",
-        "UW3:る",
-        "UW5:で",
-        "UW4:を",
-        "UW4:か",
-        "UW3:っ",
-        "UW2:の",
-        "UW4:と",
-        "UW5:っ",
-        "UW4:な",
-        "UW3:て",
-        "UW4:た",
-        "UW4:こ",
-        "UW6:に",
-        "UW4:ら",
-        "UW3:た",
-        "UW2:を",
-        "UW3:ら",
-        "UW6:。",
-        "UW4:し",
-        "UW3:な",
-        "UW2:に",
-        "UW4:い",
-        "UW4:り",
-        "UW6:う",
-        "UW3:う",
-        "UW3:く",
-        "UW4:れ",
-        "UW2:は",
-        "UW4:だ",
-        "UW4:う",
-        "UW3:い",
-        "UW6:い",
-        "UW4:ん",
-        "UW2:か",
-        "UW4:ー",
-        "UW6:を",
-        "UW2:も",
-        "UW5:き",
-        "UW3:り",
-        "UW6:で",
-        "UW2:る",
-        "UW2:と",
-        "UW3:］",
-        "UW4:そ",
-        "UW3:，",
-        "UW4:も",
-        "UW4:く",
-        "UW3:か",
-        "BW2:とい",
-        "UW4:お",
-        "UW4:ま",
-        "UW6:が",
-        "UW4:き",
-        "UW2:し",
-        "UW2:て",
-        "UW3:！",
-        "UW2:ま",
-        "UW5:に",
-        "UW3:や",
-        "UW6:て",
-        "BW3:もの",
-        "UW6:の",
-        "UW2:ん",
-        "UW2:が",
-        "UW5:が",
-        "BW1:いう",
-        "UW2:で",
-        "UW5:す",
-        "UW3:？",
-        "UW5:と",
-        "UW6:は",
-        "UW3:．",
-        "UW4:「",
-        "UW3:ば",
-        "UW5:ん",
-        "BW3:いう",
-        "UW4:す",
-        "BW1:から",
-        "UW3:ど",
-        "UW5:し",
-        "UW2:っ",
-        "UW4:思",
-        "UW3:…",
-        "UW5:る",
-        "BW2:てい",
-        "BW3:よう",
-        "UW5:え",
-        "UW4:私",
-        "UW3:・",
-        "UW4:人",
-        "UW5:く",
-        "UW3:）",
-        "UW4:京",
-        "BW2:ない",
-        "UW3:ー",
-        "BW3:とこ",
-        "UW5:は",
-        "UW4:」",
-        "UW2:一",
-        "UW4:よ",
-        "BW3:こと",
-        "UW5:ー",
-        "UW6:し",
-        "UW4:け",
-        "BW1:ない",
-        "BW2:です",
-        "UW4:一",
-        "UW5:帯",
-        "UW5:を",
-        "UW6:な",
-        "UW5:べ",
-        "BW3:いい",
-        "BW2:であ",
-        "BW2:ので",
-        "UW4:，",
-        "UW5:れ",
-        "UW5:ろ",
-        "UW1:そ",
-        "UW5:い",
-        "UW1:い",
-        "UW5:・",
-        "UW5:わ",
-        "UW4:１",
-        "UW5:う",
-        "UW4:大",
-        "UW3:ま",
-        "BW2:とこ",
-        "UW4:！",
-        "UW4:見",
-        "UW4:行",
-        "BW1:こと",
-        "UW1:な",
-        "UW2:さ",
-        "UW3:☆",
-        "UW4:さ",
-        "UW2:よ",
-        "BW1:とか",
-        "UW4:（",
-        "BW3:でも",
-        "UW5:の",
-        "UW4:・",
-        "UW5:た",
-        "UW1:す",
-        "UW5:か",
-        "UW4:使",
-        "UW3:♪",
-        "UW4:え",
-        "UW4:今",
-        "BW2:、と",
-        "BW3:とき",
-        "UW4:ろ",
-        "UW5:つ",
-        "UW1:に",
-        "UW5:じ",
-        "UW1:で",
-        "UW4:ン",
-        "UW3:ず",
-        "BW3:して",
-        "UW4:食",
-        "UW4:気",
-        "UW4:時",
-        "UW3:日",
-        "BW1:しい",
-        "UW4:自",
-        "UW3:笑",
-        "UW2:毎",
-        "TW1:という",
-        "UW4:み",
-        "UW4:…",
-        "TW2:ではな",
-        "UW6:さ",
-        "UW5:め",
-        "UW2:少",
-        "UW5:あ",
-        "UW4:２",
-        "UW3:へ",
-        "TW3:という",
-        "UW4:何",
-        "UW2:く",
-        "UW2:結",
-        "BW1:うな",
-        "BW1:もう",
-        "UW1:が",
-        "UW4:じ",
-        "UW2:う",
-        "UW4:ル",
-        "UW3:」",
-        "BW1:とが",
-        "UW2:最",
-        "BW1:るの",
-        "UW3:間",
-        "UW6:た",
-        "UW3:つ",
-        "UW4:ど",
-        "UW1:と",
-        "UW3:ん",
-        "UW4:．",
-        "UW3:だ",
-        "UW4:わ",
-        "UW4:最",
-        "UW4:？",
-        "UW3:ろ",
-        "UW4:ば",
-        "TW3:ている",
-        "BW3:この",
-        "UW5:も",
-        "UW3:人",
-        "BW3:とい",
-        "UW4:つ",
-        "BW3:その",
-        "BW3:もう",
-        "UW2:そ",
-        "BW2:には",
-        "BW3:かけ",
-        "TW4:の京都",
-        "TW4:ところ",
-        "UW3:京",
-        "UW4:携",
-        "BW1:かも",
-        "BW1:では",
-        "UW4:ち",
-        "UW3:分",
-        "UW4:べ",
-        "BW3:ころ",
-        "UW3:ゃ",
-        "UW2:す",
-        "BW1:。・",
-        "UW3:電",
-        "BW3:なっ",
-        "UW3:す",
-        "BW1:最近",
-        "UW4:め",
-        "UW3:ぐ",
-        "UW2:お",
-        "BW3:そし",
-        "BW1:かし",
-        "BW1:同じ",
-        "BW3:メー",
-        "UW5:て",
-        "UW6:り",
-        "TW4:くらい",
-        "UW3:今",
-        "UW5:そ",
-        "UW4:や",
-        "UW5:」",
-        "UW4:帯",
-        "UW6:ー",
-        "BW2:とし",
-        "TW1:ような",
-        "BW2:てお",
-        "UW4:笑",
-        "UW1:は",
-        "BW3:かか",
-        "TW4:かなり",
-        "UW4:）",
-        "BW1:んな",
-        "UW1:ち",
-        "TW2:気に入",
-        "TW1:・・・",
-        "UW6:と",
-        "UW5:ち",
-        "BW3:ため",
-        "UW4:ず",
-        "UW3:０",
-        "BW1:んで",
-        "UW3:中",
-        "UW3:々",
-        "BW2:のよ",
-        "BW2:帯電",
-        "BW2:でも",
-        "BW1:には",
-        "BW3:ちょ",
-        "UW4:せ",
-        "UW3:度",
-        "BW1:でも",
-        "BW1:が、",
-        "UW2:な",
-        "UW5:思",
-        "UW6:０",
-        "UW6:寺",
-        "BW3:とて",
-        "BW3:ある",
-        "BW2:もし",
-        "UW4:ッ",
-        "UW1:て",
-        "BW2:にも",
-        "BW1:れた",
-        "UW4:ひ",
-        "TW3:ること",
-        "BW1:てい",
-        "UW4:』",
-        "BW1:だけ",
-        "UW3:お",
-        "BW1:少し",
-        "TW3:、ある",
-        "UW5:！",
-        "UW6:ル",
-        "UW2:多",
-        "UW6:ご",
-        "UW6:や",
-        "UW3:後",
-        "BW2:てみ",
-        "BW1:とき",
-        "UW4:ゃ",
-        "BW1:たい",
-        "UW3:き",
-        "TW4:ことが",
-        "UW3:真",
-        "BW2:など",
-        "UW6:ぱ",
-        "BW1:った",
-        "BW1:ても",
-        "UW5:日",
-        "BW1:たと",
-        "UW4:］",
-        "UW3:ッ",
-        "TW4:メール",
-        "BW2:はな",
-        "BW3:・・",
-        "BW3:なる",
-        "BW1:とい",
-        "UW2:全",
-        "BW1:にも",
-        "BW1:たら",
-        "BW2:くな",
-        "UW3:「",
-        "BW1:その",
-        "UW3:観",
-        "BW1:うに",
-        "UW3:イ",
-        "BW3:もん",
-        "UW5:ず",
-        "BW3:しま",
-        "BW1:より",
-        "UW5:分",
+    BW1Keys {
+        "。・",
+        "いう",
+        "うな",
+        "うに",
+        "かし",
+        "かも",
+        "から",
+        "が、",
+        "こと",
+        "しい",
+        "その",
+        "たい",
+        "たと",
+        "たら",
+        "だけ",
+        "った",
+        "てい",
+        "ても",
+        "では",
+        "でも",
+        "とい",
+        "とか",
+        "とが",
+        "とき",
+        "ない",
+        "には",
+        "にも",
+        "もう",
+        "より",
+        "るの",
+        "れた",
+        "んで",
+        "んな",
+        "同じ",
+        "少し",
+        "最近",
      }
-    modelValues:intvector {
-        3634,
-        4347,
-        -2581,
-        -4812,
-        2538,
-        -4206,
-        2701,
-        -1455,
-        -2403,
-        2977,
-        -2678,
-        4165,
-        -818,
-        -1011,
-        2996,
-        -904,
-        -1808,
-        2064,
-        -2164,
-        -2180,
-        -2760,
-        -2310,
-        2360,
-        -388,
-        1842,
-        1706,
-        -706,
-        -2408,
-        -1628,
-        -1005,
-        -434,
-        -1442,
-        543,
-        -1091,
-        1355,
-        -1056,
-        258,
-        277,
-        -2999,
-        1331,
-        -1305,
-        1242,
-        -337,
-        -1073,
-        1392,
-        -576,
-        -886,
-        -2405,
-        -386,
-        1031,
-        1470,
-        -2105,
-        -594,
-        -1461,
-        -1160,
-        964,
-        -48,
-        -2158,
-        110,
-        -1750,
-        228,
-        -603,
-        801,
-        972,
-        102,
-        -395,
-        -508,
-        1640,
-        191,
-        2468,
-        -1580,
-        -1529,
-        1148,
-        515,
-        539,
-        -774,
-        111,
-        -1275,
-        113,
-        -432,
-        1736,
-        588,
-        -413,
-        1360,
-        49,
-        2322,
-        48,
-        255,
-        -521,
-        -366,
+    BW1Values:intvector {
+        567,
          529,
-        -493,
-        -557,
-        1719,
-        -476,
-        104,
-        1311,
-        1314,
-        1307,
-        520,
-        666,
-        -412,
+        280,
+        -13,
+        468,
+        -533,
          627,
-        1098,
-        -209,
-        163,
-        955,
-        1798,
+        192,
+        -695,
+        423,
+        -26,
+        53,
+        -52,
+        13,
+        122,
+        13,
+        -67,
+        39,
+        -91,
+        95,
+        -13,
+        784,
+        -679,
+        91,
+        485,
+        109,
+        26,
+        767,
+        26,
+        -407,
+        95,
+        -206,
+        102,
+        438,
+        134,
+        365,
+    }
+    BW2Keys {
+        "、と",
+        "くな",
+        "てい",
+        "てお",
+        "てみ",
+        "であ",
+        "です",
+        "でも",
+        "とい",
+        "とこ",
+        "とし",
+        "ない",
+        "など",
+        "には",
+        "にも",
+        "ので",
+        "のよ",
+        "はな",
+        "もし",
+        "帯電",
+    }
+    BW2Values:intvector {
+        -517,
          -39,
          -753,
-        -1262,
-        411,
-        1247,
-        914,
-        522,
-        348,
-        2156,
-        510,
+        -558,
+        -92,
+        -1495,
+        -1445,
+        -207,
+        515,
+        -1044,
+        143,
          -1522,
-        -243,
-        1337,
-        -378,
-        -1957,
-        834,
-        -450,
+        -64,
+        -426,
+        -120,
+        -756,
+        -207,
+        -26,
+        -67,
+        -224,
+    }
+    BW3Keys {
+        "ある",
+        "いい",
+        "いう",
+        "かか",
+        "かけ",
+        "こと",
+        "この",
+        "ころ",
+        "して",
+        "しま",
+        "そし",
+        "その",
+        "ため",
+        "ちょ",
+        "でも",
+        "とい",
+        "とき",
+        "とこ",
+        "とて",
+        "なっ",
+        "なる",
+        "もう",
+        "もの",
+        "もん",
+        "よう",
+        "メー",
+        "・・",
+    }
+    BW3Values:intvector {
+        -28,
+        647,
+        666,
+        456,
+        720,
          235,
-        87,
-        236,
-        -1615,
-        485,
-        -1445,
-        488,
          404,
          -333,
-        66,
-        787,
-        647,
-        -1495,
-        -756,
-        -1700,
-        279,
+        249,
+        -13,
+        -526,
+        502,
+        294,
+        316,
+        767,
+        -277,
+        799,
+        1337,
+        230,
+        -309,
+        13,
+        766,
+        2322,
+        39,
+        -1262,
+        136,
+        -39,
+    }
+    TW1Keys {
+        "という",
+        "ような",
+        "・・・",
+    }
+    TW1Values:intvector {
+        292,
+        361,
+        325,
+    }
+    TW2Keys {
+        "ではな",
+        "気に入",
+    }
+    TW2Values:intvector {
+        -814,
+        -466,
+    }
+    TW3Keys {
+        "、ある",
+        "ている",
+        "という",
+        "ること",
+    }
+    TW3Values:intvector {
+        -200,
+        -389,
+        387,
          -81,
-        260,
-        162,
+    }
+    TW4Keys {
+        "かなり",
+        "くらい",
+        "ことが",
+        "ところ",
+        "の京都",
+        "メール",
+    }
+    TW4Values:intvector {
+        441,
+        585,
          -51,
-        -851,
-        462,
-        493,
-        161,
-        396,
-        -238,
-        -1044,
-        -1685,
-        433,
-        276,
-        -695,
+        422,
+        1005,
+        26,
+    }
+    UW1Keys {
+        "い",
+        "が",
+        "す",
+        "そ",
+        "ち",
+        "て",
+        "で",
+        "と",
+        "な",
+        "に",
+        "は",
+    }
+    UW1Values:intvector {
+        -51,
+        -53,
+        152,
+        260,
+        112,
+        14,
+        -56,
+        36,
          -148,
+        -118,
+        -56,
+    }
+    UW2Keys {
+        "、",
+        "。",
+        "う",
+        "お",
+        "か",
+        "が",
+        "く",
+        "さ",
+        "し",
+        "す",
+        "そ",
+        "っ",
+        "て",
+        "で",
+        "と",
+        "な",
+        "に",
+        "の",
+        "は",
+        "ま",
+        "も",
+        "よ",
+        "る",
+        "を",
+        "ん",
+        "一",
+        "全",
+        "多",
+        "少",
+        "最",
+        "毎",
+        "結",
+    }
+    UW2Values:intvector {
+        -1011,
+        -904,
+        -191,
+        -235,
+        110,
+        -521,
+        -183,
          416,
-        1235,
-        -748,
+        113,
+        31,
+        -182,
+        163,
+        -432,
+        -493,
+        -508,
+        -40,
+        -576,
+        -434,
+        -594,
+        588,
+        -603,
          257,
-        784,
-        748,
-        767,
-        -262,
-        -490,
-        -26,
-        152,
-        186,
-        544,
+        -395,
+        -1305,
+        255,
+        834,
+        39,
+        67,
+        571,
+        279,
+        628,
+        661,
+    }
+    UW3Keys {
+        "…",
+        "☆",
+        "♪",
+        "、",
+        "。",
+        "々",
+        "「",
+        "」",
+        "い",
+        "う",
+        "お",
+        "か",
+        "が",
+        "き",
+        "く",
+        "ぐ",
+        "す",
+        "ず",
+        "た",
+        "だ",
+        "っ",
+        "つ",
+        "て",
+        "で",
+        "と",
+        "ど",
+        "な",
+        "に",
+        "の",
+        "は",
+        "ば",
+        "へ",
+        "ま",
+        "も",
+        "ゃ",
+        "や",
+        "ら",
+        "り",
+        "る",
+        "ろ",
+        "を",
+        "ん",
+        "イ",
+        "ッ",
+        "・",
+        "ー",
+        "中",
+        "京",
+        "人",
+        "今",
+        "分",
+        "度",
+        "後",
+        "日",
+        "真",
+        "笑",
+        "観",
+        "間",
+        "電",
+        "！",
+        "）",
+        "，",
+        "．",
+        "０",
+        "？",
+        "］",
+    }
+    UW3Values:intvector {
+        1798,
+        1235,
          1035,
-        -711,
-        549,
-        -517,
-        799,
-        -1024,
-        542,
-        -118,
-        432,
-        -56,
-        -694,
+        3634,
+        4347,
+        209,
+        -26,
+        526,
+        964,
+        1031,
+        -81,
+        1148,
+        2996,
+        40,
+        1470,
+        411,
+        251,
          668,
-        249,
-        175,
-        329,
-        305,
+        1331,
+        424,
+        -1005,
+        365,
+        1355,
+        1842,
+        2064,
+        1098,
+        1392,
+        2701,
+        2538,
+        2977,
+        1307,
+        701,
+        -238,
+        2360,
+        434,
+        1360,
+        1242,
+        972,
+        1706,
+        452,
+        4165,
+        284,
+        -13,
+        -52,
+        914,
+        -243,
+        252,
+        -396,
+        415,
+        324,
+        333,
+        222,
+        118,
          287,
-        423,
-        438,
+        39,
          934,
-        628,
-        292,
-        -536,
-        -995,
-        -814,
-        237,
-        263,
-        571,
-        -138,
-        402,
-        701,
-        387,
-        474,
-        -183,
-        661,
-        280,
-        767,
-        -53,
-        -793,
-        -191,
-        -401,
-        526,
-        -679,
-        279,
-        -407,
+        -26,
          493,
-        -82,
-        365,
+        -356,
+        1736,
+        2156,
+        2468,
+        1311,
+        -224,
+        1719,
+        1640,
+    }
+    UW4Keys {
+        "…",
+        "、",
+        "。",
+        "「",
+        "」",
+        "』",
+        "い",
+        "う",
+        "え",
+        "お",
+        "か",
+        "が",
+        "き",
+        "く",
+        "け",
+        "こ",
+        "さ",
+        "し",
+        "じ",
+        "す",
+        "ず",
+        "せ",
+        "そ",
+        "た",
+        "だ",
+        "ち",
+        "っ",
+        "つ",
+        "て",
+        "で",
+        "と",
+        "ど",
+        "な",
+        "に",
+        "の",
+        "は",
+        "ば",
+        "ひ",
+        "べ",
+        "ま",
+        "み",
+        "め",
+        "も",
+        "ゃ",
+        "や",
+        "よ",
+        "ら",
+        "り",
+        "る",
+        "れ",
+        "ろ",
+        "わ",
+        "を",
+        "ん",
+        "ッ",
+        "ル",
+        "ン",
+        "・",
+        "ー",
+        "一",
+        "京",
+        "人",
+        "今",
+        "何",
+        "使",
+        "大",
+        "帯",
+        "思",
+        "携",
+        "時",
+        "最",
+        "気",
+        "私",
+        "笑",
+        "自",
+        "行",
+        "見",
+        "食",
+        "！",
+        "（",
+        "）",
+        "，",
+        "．",
+        "１",
+        "２",
+        "？",
+        "］",
+    }
+    UW4Values:intvector {
+        -995,
+        -4812,
+        -4206,
+        1314,
+        -1957,
+        -296,
+        -886,
+        -1160,
+        -711,
+        539,
+        -1628,
+        -2180,
+        -1275,
+        -1529,
+        -1615,
+        258,
+        -748,
+        -1073,
+        -793,
+        -412,
+        -321,
+        -234,
+        191,
+        -1056,
+        -1461,
+        -355,
+        -2310,
+        -400,
+        -2403,
+        -1808,
+        -1442,
          -334,
-        36,
-        284,
-        -813,
-        424,
-        -425,
-        423,
-        -796,
-        452,
+        -1091,
+        -2678,
+        -2581,
+        -2164,
          -635,
-        -389,
-        404,
-        -141,
-        415,
-        -277,
-        -400,
-        502,
-        766,
-        -182,
-        -426,
-        720,
-        1005,
-        422,
-        -396,
-        123,
-        -533,
-        -91,
-        -355,
-        333,
+        122,
          -596,
-        -333,
-        434,
-        31,
-        567,
-        -356,
-        -309,
-        251,
-        365,
+        -774,
+        -536,
          -399,
-        411,
-        -235,
-        -526,
-        468,
-        438,
-        136,
-        103,
-        74,
-        585,
-        324,
-        -115,
+        -1580,
+        -105,
          -219,
-        -217,
+        -450,
+        -2999,
+        -2405,
+        -2760,
+        -2105,
+        -1024,
+        -425,
+        -2408,
+        -2158,
+        -149,
+        -401,
+        -694,
+        -490,
+        -1750,
+        488,
+        510,
+        522,
+        549,
+        474,
+        544,
+        396,
          -289,
-        -88,
-        143,
-        361,
-        -558,
+        955,
+        123,
+        305,
+        423,
+        329,
+        1247,
          -614,
-        -56,
-        456,
-        441,
+        438,
+        276,
+        433,
+        175,
+        -1685,
+        748,
          -566,
-        102,
-        112,
-        -466,
-        325,
-        -27,
-        128,
-        294,
-        -321,
-        -224,
-        -206,
-        252,
-        209,
-        -207,
-        -224,
-        -207,
-        109,
-        316,
-        -234,
-        222,
-        95,
-        192,
-        -40,
-        -98,
-        82,
-        68,
-        230,
-        -28,
-        -67,
-        -149,
-        14,
-        -120,
-        95,
-        122,
-        -81,
-        -67,
-        -296,
-        122,
-        -81,
-        134,
-        -200,
-        -67,
-        14,
-        67,
-        119,
-        40,
-        118,
-        -92,
-        91,
-        -105,
-        53,
-        40,
-        -51,
-        39,
-        -64,
-        105,
-        13,
-        39,
-        26,
-        -52,
-        -52,
+        -1700,
+        -813,
+        493,
+        402,
+        -796,
          -52,
+    }
+    UW5Keys {
+        "、",
+        "。",
+        "」",
+        "あ",
+        "い",
+        "う",
+        "え",
+        "か",
+        "が",
+        "き",
+        "く",
+        "し",
+        "じ",
+        "す",
+        "ず",
+        "そ",
+        "た",
+        "ち",
+        "っ",
+        "つ",
+        "て",
+        "で",
+        "と",
+        "な",
+        "に",
+        "の",
+        "は",
+        "べ",
+        "め",
+        "も",
+        "る",
+        "れ",
+        "ろ",
+        "わ",
+        "を",
+        "ん",
+        "・",
+        "ー",
+        "分",
+        "帯",
+        "思",
+        "日",
+        "！",
+    }
+    UW5Values:intvector {
+        -818,
+        -1455,
+        -217,
+        -138,
+        162,
+        161,
+        411,
+        186,
+        -366,
+        801,
+        348,
+        -209,
+        432,
+        -557,
          26,
+        -115,
          -26,
+        128,
+        543,
+        542,
+        103,
+        -706,
+        -476,
+        -388,
+        -413,
+        -262,
+        -378,
+        787,
+        263,
+        -141,
          -39,
+        279,
+        -81,
+        462,
+        -333,
+        520,
+        -851,
+        87,
          13,
-        -13,
-        39,
-        26,
-        13,
-        -39,
-        -26,
-        -26,
-        -26,
-        -13,
-        -13,
-        39,
-        26,
-        -13,
+        404,
+        -98,
          26,
-        13,
+        -67,
+    }
+    UW6Keys {
+        "。",
+        "い",
+        "う",
+        "が",
+        "ご",
+        "さ",
+        "し",
+        "た",
+        "て",
+        "で",
+        "と",
+        "な",
+        "に",
+        "の",
+        "は",
+        "ぱ",
+        "や",
+        "り",
+        "を",
+        "ル",
+        "ー",
+        "寺",
+        "０",
+    }
+    UW6Values:intvector {
+        -337,
+        -48,
+        -386,
+        111,
+        119,
+        237,
+        236,
+        -82,
+        49,
+        102,
+        -27,
+        66,
+        277,
+        48,
+        104,
+        105,
+        40,
+        74,
+        228,
+        14,
+        -88,
+        68,
+        82,
      }
-}
-\ No newline at end of file
+}
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/MlBreakEngine.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/MlBreakEngine.java

index 196579d0a58c20978f36ca7f93562bfedcf40f1e..e09c1763d5fb63c53e19161995d8620389f543f0 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/MlBreakEngine.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/MlBreakEngine.java
@@ -8,26 +8,36 @@ import static com.ibm.icu.impl.CharacterIteration.current32;
  import static com.ibm.icu.impl.CharacterIteration.next32;
  import static com.ibm.icu.impl.CharacterIteration.previous32;
  
-import com.ibm.icu.impl.Assert;
  import com.ibm.icu.impl.ICUData;
-import com.ibm.icu.lang.UCharacter;
  import com.ibm.icu.text.UnicodeSet;
  import com.ibm.icu.util.UResourceBundle;
  import com.ibm.icu.util.UResourceBundleIterator;
  
-import java.lang.System;
  import java.text.CharacterIterator;
+import java.util.Arrays;
  import java.util.ArrayList;
+import java.util.List;
  import java.util.HashMap;
  
-public class MlBreakEngine {
+enum ModelIndex {
+    kUWStart(0), kBWStart(6), kTWStart(9);
+    private final int value;
+
+    private ModelIndex(int value) {
+        this.value = value;
+    }
+
+    public int getValue() {
+        return value;
+    }
+}
  
-    private static final int INVALID = '|';
-    private static final String INVALID_STRING = "|";
+public class MlBreakEngine {
+    // {UW1, UW2, ... UW6, BW1, ... BW3, TW1, TW2, ... TW4} 6+3+4= 13
      private static final int MAX_FEATURE = 13;
      private UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;
      private UnicodeSet fClosePunctuationSet;
-    private HashMap<String, Integer> fModel;
+    private List<HashMap<String, Integer>> fModel;
      private int fNegativeSum;
  
      /**
@@ -41,7 +51,10 @@ public class MlBreakEngine {
              UnicodeSet closePunctuationSet) {
          fDigitOrOpenPunctuationOrAlphabetSet = digitOrOpenPunctuationOrAlphabetSet;
          fClosePunctuationSet = closePunctuationSet;
-        fModel = new HashMap<String, Integer>();
+        fModel = new ArrayList<HashMap<String, Integer>>(MAX_FEATURE);
+        for (int i = 0; i < MAX_FEATURE; i++) {
+            fModel.add(new HashMap<String, Integer>());
+        }
          fNegativeSum = 0;
          loadMLModel();
      }
@@ -49,42 +62,47 @@ public class MlBreakEngine {
      /**
       * Divide up a range of characters handled by this break engine.
       *
-     * @param inText        A input text.
-     * @param startPos      The start index of the input text.
-     * @param endPos        The end index of the input text.
-     * @param inString      A input string normalized from inText from startPos to endPos
-     * @param numCodePts    The number of code points of inString
-     * @param charPositions A map that transforms inString's code point index to code unit index.
-     * @param foundBreaks   A list to store the breakpoint.
+     * @param inText          An input text.
+     * @param startPos        The start index of the input text.
+     * @param endPos          The end index of the input text.
+     * @param inString        An input string normalized from inText from startPos to endPos
+     * @param codePointLength The number of code points of inString
+     * @param charPositions   A map that transforms inString's code point index to code unit index.
+     * @param foundBreaks     A list to store the breakpoint.
       * @return The number of breakpoints
       */
      public int divideUpRange(CharacterIterator inText, int startPos, int endPos,
-            CharacterIterator inString, int numCodePts, int[] charPositions,
+            CharacterIterator inString, int codePointLength, int[] charPositions,
              DictionaryBreakEngine.DequeI foundBreaks) {
          if (startPos >= endPos) {
              return 0;
          }
-        ArrayList<Integer> boundary = new ArrayList<Integer>(numCodePts);
-        // The ML model groups six char to evaluate if the 4th char is a breakpoint.
-        // Like a sliding window, the elementList removes the first char and appends the new char
-        // from inString in each iteration so that its size always remains at six.
-        int elementList[] = new int[6];
-        initElementList(inString, elementList, numCodePts);
+        ArrayList<Integer> boundary = new ArrayList<Integer>(codePointLength);
+        String inputStr = transform(inString);
+        // The ML algorithm groups six chars and evaluates whether the 4th char is a breakpoint.
+        // In each iteration, it evaluates the 4th char and then moves forward one char, like a
+        // sliding window. Initially, the first six values in the indexList are
+        // [-1, -1, 0, 1, 2, 3]. After moving forward, the last six values in the indexList end up
+        // as [length-4, length-3, length-2, length-1, -1, -1]. The "+4" here accounts for the
+        // four extra "-1" entries.
+        int indexSize = codePointLength + 4;
+        int indexList[] = new int[indexSize];
+        int numCodeUnits = initIndexList(inString, indexList, codePointLength);
  
          // Add a break for the start.
          boundary.add(0, 0);
-        for (int i = 1; i < numCodePts; i++) {
-            evaluateBreakpoint(elementList, i, boundary);
-            if (i + 1 > numCodePts) {
-                break;
+
+        for (int idx = 0; idx + 1 < codePointLength; idx++) {
+            evaluateBreakpoint(inputStr, indexList, idx, numCodeUnits, boundary);
+            if (idx + 4 < codePointLength) {
+                indexList[idx + 6] = numCodeUnits;
+                numCodeUnits += Character.charCount(next32(inString));
              }
-            shiftLeftOne(elementList);
-            elementList[5] = (i + 3) < numCodePts ? next32(inString) : INVALID;
          }
  
          // Add a break for the end if there is not one there already.
-        if (boundary.get(boundary.size() - 1) != numCodePts) {
-            boundary.add(numCodePts);
+        if (boundary.get(boundary.size() - 1) != codePointLength) {
+            boundary.add(codePointLength);
          }
  
          int correctedNumBreaks = 0;
@@ -127,137 +145,94 @@ public class MlBreakEngine {
          return correctedNumBreaks;
      }
  
-    private void shiftLeftOne(int[] elementList) {
-        int length = elementList.length;
-        for (int i = 1; i < length; i++) {
-            elementList[i - 1] = elementList[i];
+    /**
+     * Transform a CharacterIterator into a String.
+     */
+    private String transform(CharacterIterator inString) {
+        StringBuilder sb = new StringBuilder();
+        inString.setIndex(0);
+        for (char c = inString.first(); c != CharacterIterator.DONE; c = inString.next()) {
+            sb.append(c);
          }
+        return sb.toString();
      }
  
      /**
-     * Evaluate whether the index is a potential breakpoint.
+     * Evaluate whether index startIdx + 1 is a potential breakpoint.
       *
-     * @param elementList A list including six elements for the breakpoint evaluation.
-     * @param index       The breakpoint index to be evaluated.
-     * @param boundary    An list including the index of the breakpoint.
+     * @param inputStr     An input string to be segmented.
+     * @param indexList    A code unit index list of the inputStr.
+     * @param startIdx     The start index of the indexList.
+     * @param numCodeUnits The current code unit boundary of the indexList.
+     * @param boundary     A list including the index of the breakpoint.
       */
-    private void evaluateBreakpoint(int[] elementList, int index, ArrayList<Integer> boundary) {
-        String[] featureList = new String[MAX_FEATURE];
-        final int w1 = elementList[0];
-        final int w2 = elementList[1];
-        final int w3 = elementList[2];
-        final int w4 = elementList[3];
-        final int w5 = elementList[4];
-        final int w6 = elementList[5];
+    private void evaluateBreakpoint(String inputStr, int[] indexList, int startIdx,
+            int numCodeUnits, ArrayList<Integer> boundary) {
+        int start = 0, end = 0;
+        int score = fNegativeSum;
  
-        StringBuilder sb = new StringBuilder();
-        int idx = 0;
-        if (w1 != INVALID) {
-            featureList[idx++] = sb.append("UW1:").appendCodePoint(w1).toString();
-        }
-        if (w2 != INVALID) {
-            sb.setLength(0);
-            featureList[idx++] = sb.append("UW2:").appendCodePoint(w2).toString();
-        }
-        if (w3 != INVALID) {
-            sb.setLength(0);
-            featureList[idx++] = sb.append("UW3:").appendCodePoint(w3).toString();
-        }
-        if (w4 != INVALID) {
-            sb.setLength(0);
-            featureList[idx++] = sb.append("UW4:").appendCodePoint(w4).toString();
-        }
-        if (w5 != INVALID) {
-            sb.setLength(0);
-            featureList[idx++] = sb.append("UW5:").appendCodePoint(w5).toString();
-        }
-        if (w6 != INVALID) {
-            sb.setLength(0);
-            featureList[idx++] = sb.append("UW6:").appendCodePoint(w6).toString();
-        }
-        if (w2 != INVALID && w3 != INVALID) {
-            sb.setLength(0);
-            featureList[idx++] = sb.append("BW1:").appendCodePoint(w2).appendCodePoint(
-                    w3).toString();
-        }
-        if (w3 != INVALID && w4 != INVALID) {
-            sb.setLength(0);
-            featureList[idx++] = sb.append("BW2:").appendCodePoint(w3).appendCodePoint(
-                    w4).toString();
-        }
-        if (w4 != INVALID && w5 != INVALID) {
-            sb.setLength(0);
-            featureList[idx++] = sb.append("BW3:").appendCodePoint(w4).appendCodePoint(
-                    w5).toString();
-        }
-        if (w1 != INVALID && w2 != INVALID && w3 != INVALID) {
-            sb.setLength(0);
-            featureList[idx++] = sb.append("TW1:").appendCodePoint(w1).appendCodePoint(
-                    w2).appendCodePoint(w3).toString();
-        }
-        if (w2 != INVALID && w3 != INVALID && w4 != INVALID) {
-            sb.setLength(0);
-            featureList[idx++] = sb.append("TW2:").appendCodePoint(w2).appendCodePoint(
-                    w3).appendCodePoint(w4).toString();
-        }
-        if (w3 != INVALID && w4 != INVALID && w5 != INVALID) {
-            sb.setLength(0);
-            featureList[idx++] = sb.append("TW3:").appendCodePoint(w3).appendCodePoint(
-                    w4).appendCodePoint(w5).toString();
+        for (int i = 0; i < 6; i++) {
+            // UW1 ~ UW6
+            start = startIdx + i;
+            if (indexList[start] != -1) {
+                end = (indexList[start + 1] != -1) ? indexList[start + 1] : numCodeUnits;
+                score += fModel.get(ModelIndex.kUWStart.getValue() + i).getOrDefault(
+                        inputStr.substring(indexList[start], end), 0);
+            }
          }
-        if (w4 != INVALID && w5 != INVALID && w6 != INVALID) {
-            sb.setLength(0);
-            featureList[idx++] = sb.append("TW4:").appendCodePoint(w4).appendCodePoint(
-                    w5).appendCodePoint(w6).toString();
+        for (int i = 0; i < 3; i++) {
+            // BW1 ~ BW3
+            start = startIdx + i + 1;
+            if (indexList[start] != -1 && indexList[start + 1] != -1) {
+                end = (indexList[start + 2] != -1) ? indexList[start + 2] : numCodeUnits;
+                score += fModel.get(ModelIndex.kBWStart.getValue() + i).getOrDefault(
+                        inputStr.substring(indexList[start], end), 0);
+            }
          }
-
-        int score = fNegativeSum;
-        for (int j = 0; j < idx; j++) {
-            if (fModel.containsKey(featureList[j])) {
-                score += (2 * fModel.get(featureList[j]));
+        for (int i = 0; i < 4; i++) {
+            // TW1 ~ TW4
+            start = startIdx + i;
+            if (indexList[start] != -1
+                    && indexList[start + 1] != -1
+                    && indexList[start + 2] != -1) {
+                end = (indexList[start + 3] != -1) ? indexList[start + 3] : numCodeUnits;
+                score += fModel.get(ModelIndex.kTWStart.getValue() + i).getOrDefault(
+                        inputStr.substring(indexList[start], end), 0);
              }
          }
          if (score > 0) {
-            boundary.add(index);
+            boundary.add(startIdx + 1);
          }
      }
  
      /**
-     * Initialize the element list from the input string.
+     * Initialize the index list from the input string.
       *
-     * @param inString    A input string to be segmented.
-     * @param elementList A list to store the first six characters.
-     * @param numCodePts  The number of code points of input string
+     * @param inString        An input string to be segmented.
+     * @param indexList       A code unit index list of the inString.
+     * @param codePointLength The number of code points of the input string
       * @return The number of the code units of the first six characters in inString.
       */
-    private int initElementList(CharacterIterator inString, int[] elementList, int numCodePts) {
+    private int initIndexList(CharacterIterator inString, int[] indexList, int codePointLength) {
          int index = 0;
          inString.setIndex(index);
-        int w1, w2, w3, w4, w5, w6;
-        w1 = w2 = w3 = w4 = w5 = w6 = INVALID;
-        if (numCodePts > 0) {
-            w3 = current32(inString);
-            index += Character.charCount(w3);
-            if (numCodePts > 1) {
-                w4 = next32(inString);
-                index += Character.charCount(w3);
-                if (numCodePts > 2) {
-                    w5 = next32(inString);
-                    index += Character.charCount(w5);
-                    if (numCodePts > 3) {
-                        w6 = next32(inString);
-                        index += Character.charCount(w6);
+        Arrays.fill(indexList, -1);
+        if (codePointLength > 0) {
+            indexList[2] = 0;
+            index += Character.charCount(current32(inString));
+            if (codePointLength > 1) {
+                indexList[3] = index;
+                index += Character.charCount(next32(inString));
+                if (codePointLength > 2) {
+                    indexList[4] = index;
+                    index += Character.charCount(next32(inString));
+                    if (codePointLength > 3) {
+                        indexList[5] = index;
+                        index += Character.charCount(next32(inString));
                      }
                  }
              }
          }
-        elementList[0] = w1;
-        elementList[1] = w2;
-        elementList[2] = w3;
-        elementList[3] = w4;
-        elementList[4] = w5;
-        elementList[5] = w6;
-
          return index;
      }
  
@@ -268,13 +243,41 @@ public class MlBreakEngine {
          int index = 0;
          UResourceBundle rb = UResourceBundle.getBundleInstance(ICUData.ICU_BRKITR_BASE_NAME,
                  "jaml");
-        UResourceBundle keyBundle = rb.get("modelKeys");
-        UResourceBundle valueBundle = rb.get("modelValues");
+        initKeyValue(rb, "UW1Keys", "UW1Values", fModel.get(index++));
+        initKeyValue(rb, "UW2Keys", "UW2Values", fModel.get(index++));
+        initKeyValue(rb, "UW3Keys", "UW3Values", fModel.get(index++));
+        initKeyValue(rb, "UW4Keys", "UW4Values", fModel.get(index++));
+        initKeyValue(rb, "UW5Keys", "UW5Values", fModel.get(index++));
+        initKeyValue(rb, "UW6Keys", "UW6Values", fModel.get(index++));
+        initKeyValue(rb, "BW1Keys", "BW1Values", fModel.get(index++));
+        initKeyValue(rb, "BW2Keys", "BW2Values", fModel.get(index++));
+        initKeyValue(rb, "BW3Keys", "BW3Values", fModel.get(index++));
+        initKeyValue(rb, "TW1Keys", "TW1Values", fModel.get(index++));
+        initKeyValue(rb, "TW2Keys", "TW2Values", fModel.get(index++));
+        initKeyValue(rb, "TW3Keys", "TW3Values", fModel.get(index++));
+        initKeyValue(rb, "TW4Keys", "TW4Values", fModel.get(index++));
+        fNegativeSum /= 2;
+    }
+
+    /**
+     * Load the features and their scores from the machine learning model file, given the names
+     * of the key and value resources.
+     *
+     * @param rb        A ResourceBundle corresponding to the model file.
+     * @param keyName   The key name in the model file.
+     * @param valueName The value name in the model file.
+     * @param map       A HashMap to store the pairs of the feature and its score.
+     */
+    private void initKeyValue(UResourceBundle rb, String keyName, String valueName,
+            HashMap<String, Integer> map) {
+        int idx = 0;
+        UResourceBundle keyBundle = rb.get(keyName);
+        UResourceBundle valueBundle = rb.get(valueName);
          int[] value = valueBundle.getIntVector();
          UResourceBundleIterator iterator = keyBundle.getIterator();
          while (iterator.hasNext()) {
-            fNegativeSum -= value[index];
-            fModel.put(iterator.nextString(), value[index++]);
+            fNegativeSum -= value[idx];
+            map.put(iterator.nextString(), value[idx++]);
          }
      }
  }
author	allenwtsu <allenwtsu@google.com>
	Tue, 31 Jan 2023 10:17:02 +0000 (18:17 +0800)
committer	Frank Yung-Fong Tang <ftang@google.com>
	Fri, 3 Feb 2023 21:07:53 +0000 (13:07 -0800)
icu4c/source/common/mlbe.cpp		patch \| blob \| history
icu4c/source/common/mlbe.h		patch \| blob \| history
icu4c/source/data/brkitr/adaboost/jaml.txt		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/MlBreakEngine.java		patch \| blob \| history