ICU-22100 Incorporate BudouX into ICU (Java)

author allenwtsu <allenwtsu@google.com>

Tue, 20 Dec 2022 16:34:42 +0000 (16:34 +0000)

committer Markus Scherer <markus.icu@gmail.com>

Tue, 20 Dec 2022 22:27:04 +0000 (14:27 -0800)
author allenwtsu <allenwtsu@google.com>
Tue, 20 Dec 2022 16:34:42 +0000 (16:34 +0000)
committer Markus Scherer <markus.icu@gmail.com>
Tue, 20 Dec 2022 22:27:04 +0000 (14:27 -0800)
diff --git a/.github/workflows/icu_ci.yml b/.github/workflows/icu_ci.yml

index 2153fe1fa548f0ecbd8ea94404ff7f13eb935531..dd3222d821ec8924f2cc2d343fd1b5987decce26 100644 (file)
--- a/.github/workflows/icu_ci.yml
+++ b/.github/workflows/icu_ci.yml
@@ -190,6 +190,38 @@ jobs:
            [ -d icu4j/out/junit-results ] && cd icu4j && cat `find out/junit-results -name "*.txt" -exec grep -l FAILED {} \;`;
          if: ${{ failure() }}
  
+  # ICU4J build and unit test under adaboost
+  adaboost-icu4j-build-and-test:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout and setup
+        uses: actions/checkout@v2
+        with:
+          lfs: true
+      - name: Checkout lfs objects
+        run: git lfs pull
+      - uses: actions/setup-java@v3
+        with:
+          distribution: 'temurin'
+          java-version: '11'
+      - name: Config Adaboost and Rebuild data jar
+        run: |
+          cd icu4c/source;
+          ICU_DATA_BUILDTOOL_OPTS=--include_uni_core_data ICU_DATA_FILTER_FILE=../../.github/adaboost.json CPPFLAGS=-DUCONFIG_USE_ML_PHRASE_BREAKING=1 ./runConfigureICU --enable-debug --disable-release Linux -disable-layoutex;
+          make clean;
+          make -j2 ICU4J_ROOT=../../../icu4j icu4j-data-install;
+          cd ../..
+      - name: ICU4J
+        run: |
+          cd icu4j;
+          ant init;
+          ant -Dcom.ibm.icu.impl.breakiter.useMLPhraseBreaking=true check;
+          ant localespiCheck
+      - name: List failures (if any)
+        run: |
+          [ -d icu4j/out/junit-results ] && cd icu4j && cat `find out/junit-results -name "*.txt" -exec grep -l FAILED {} \;`;
+        if: ${{ failure() }}
+
    # gcc debug build.
    # Includes dependency checker.
    # Note - the dependency checker needs to be run on both a debug and an optimized build.
diff --git a/icu4j/build.xml b/icu4j/build.xml

index 566d10f6f0276c0d31fce48d09825b43204b3672..b9027dbc7a19158e3eb4814298ee836a989351b6 100644 (file)
--- a/icu4j/build.xml
+++ b/icu4j/build.xml
@@ -338,11 +338,13 @@
              <!--set the property - if it was set before it won't override-->
              <property name="user-jvm-options" value=""/>
              <property name="internal-jvm-options" value=""/>
+            <property name="com.ibm.icu.impl.breakiter.useMLPhraseBreaking" value=""/>
              <delete dir="${junit.out.dir}/@{test-name}"/>
              <mkdir  dir="${junit.out.dir}/@{test-name}"/>
  
              <junit fork="yes" forkmode="once" printsummary="yes" haltonfailure="no"
                  failureproperty="@{failure-status}" tempdir="${junit.out.dir}">
+                <sysproperty key="com.ibm.icu.impl.breakiter.useMLPhraseBreaking" value="${com.ibm.icu.impl.breakiter.useMLPhraseBreaking}" />
                  <jvmarg value="-Xss4m"/>
                  <jvmarg value="-ea"/>
                  <jvmarg value="-Djava.awt.headless=true"/>
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/ICUConfig.properties b/icu4j/main/classes/core/src/com/ibm/icu/ICUConfig.properties

index bc56ef6cf0dbe8d2c0976f331be2ce50e6c50070..e6c585f40c0fe2a3d02fdc0f483f4d3e1426b0d9 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/ICUConfig.properties
+++ b/icu4j/main/classes/core/src/com/ibm/icu/ICUConfig.properties
@@ -63,3 +63,9 @@ com.ibm.icu.impl.ICUResourceBundle.skipRuntimeLocaleResourceScan = false
  # LocaleDisplayNames implementation class
  # @internal
  # com.ibm.icu.text.LocaleDisplayNames.impl = com.ibm.icu.impl.LocaleDisplayNamesImpl
+
+#
+# [Internal Use Only]
+# Enable ML phrase breaking
+# @internal
+com.ibm.icu.impl.breakiter.useMLPhraseBreaking = false
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/ICUConfig.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/ICUConfig.java

index 61018c356db81131f263f12bc6bb28267a0855ba..6ca912cb2c503bbb55b5f80f14753fabbbd7b37d 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/ICUConfig.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/ICUConfig.java
@@ -76,7 +76,7 @@ public class ICUConfig {
              val = System.getProperty(name);
          }
  
-        if (val == null) {
+        if (val == null || val.equals("")) {
              val = CONFIG_PROPS.getProperty(name, def);
          }
          return val;
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/CjkBreakEngine.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/CjkBreakEngine.java

index ee66c46da0c31079fa6eb06e5719fe73fe43e315..cf8da008d5d9b10a5a501add47f415f147781ec4 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/CjkBreakEngine.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/CjkBreakEngine.java
@@ -18,6 +18,7 @@ import java.text.CharacterIterator;
  import java.util.HashSet;
  
  import com.ibm.icu.impl.Assert;
+import com.ibm.icu.impl.ICUConfig;
  import com.ibm.icu.impl.ICUData;
  import com.ibm.icu.text.Normalizer;
  import com.ibm.icu.text.UnicodeSet;
@@ -31,6 +32,8 @@ public class CjkBreakEngine extends DictionaryBreakEngine {
      private UnicodeSet fClosePunctuationSet;
      private DictionaryMatcher fDictionary = null;
      private HashSet<String> fSkipSet;
+    private MlBreakEngine fMlBreakEngine;
+    private boolean isCj = false;
  
      public CjkBreakEngine(boolean korean) throws IOException {
          fHangulWordSet = new UnicodeSet("[\\uac00-\\ud7a3]");
@@ -47,9 +50,16 @@ public class CjkBreakEngine extends DictionaryBreakEngine {
          if (korean) {
              setCharacters(fHangulWordSet);
          } else { //Chinese and Japanese
+            isCj = true;
              UnicodeSet cjSet = new UnicodeSet("[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]");
              setCharacters(cjSet);
-            initializeJapanesePhraseParamater();
+            if (Boolean.parseBoolean(
+                    ICUConfig.get("com.ibm.icu.impl.breakiter.useMLPhraseBreaking", "false"))) {
+                fMlBreakEngine = new MlBreakEngine(fDigitOrOpenPunctuationOrAlphabetSet,
+                        fClosePunctuationSet);
+            } else {
+                initializeJapanesePhraseParamater();
+            }
          }
      }
  
@@ -151,6 +161,15 @@ public class CjkBreakEngine extends DictionaryBreakEngine {
                  charPositions[numCodePts] = index;
              }
          }
+        // Use ML phrase breaking
+        if (Boolean.parseBoolean(
+                ICUConfig.get("com.ibm.icu.impl.breakiter.useMLPhraseBreaking", "false"))) {
+            // PhraseBreaking is supported in ja and ko; MlBreakEngine only supports ja.
+            if (isPhraseBreaking && isCj) {
+                return fMlBreakEngine.divideUpRange(inText, startPos, endPos, text,
+                        numCodePts, charPositions, foundBreaks);
+            }
+        }
  
          // From here on out, do the algorithm. Note that our indices
          // refer to indices within the normalized string.
@@ -276,10 +295,11 @@ public class CjkBreakEngine extends DictionaryBreakEngine {
              // In phrase breaking, there has to be a breakpoint between Cj character and close
              // punctuation.
              // E.g.［携帯電話］正しい選択 -> ［携帯▁電話］▁正しい▁選択 -> breakpoint between ］ and 正
+            inText.setIndex(pos);
              if (pos > previous) {
                  if (pos != startPos
                          || (isPhraseBreaking && pos > 0
-                        && fClosePunctuationSet.contains(inText.setIndex(pos - 1)))) {
+                        && fClosePunctuationSet.contains(previous32(inText)))) {
                      foundBreaks.push(charPositions[t_boundary[i]] + startPos);
                      correctedNumBreaks++;
                  }
@@ -294,7 +314,9 @@ public class CjkBreakEngine extends DictionaryBreakEngine {
              // E.g. 乗車率９０％程度だろうか -> 乗車▁率▁９０％▁程度だろうか -> breakpoint between 率 and ９
              // E.g. しかもロゴがＵｎｉｃｏｄｅ！ -> しかも▁ロゴが▁Ｕｎｉｃｏｄｅ！-> breakpoint between が and Ｕ
              if (isPhraseBreaking) {
-                if (!fDigitOrOpenPunctuationOrAlphabetSet.contains(inText.setIndex(endPos))) {
+                inText.setIndex(endPos);
+                int current = current32(inText);
+                if (current != DONE32 && !fDigitOrOpenPunctuationOrAlphabetSet.contains(current)) {
                      foundBreaks.pop();
                      correctedNumBreaks--;
                  }
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/MlBreakEngine.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/MlBreakEngine.java

new file mode 100644 (file)

index 0000000..ceeb487
--- /dev/null
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/MlBreakEngine.java
@@ -0,0 +1,436 @@
+// © 2022 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+package com.ibm.icu.impl.breakiter;
+
+import static com.ibm.icu.impl.CharacterIteration.DONE32;
+import static com.ibm.icu.impl.CharacterIteration.current32;
+import static com.ibm.icu.impl.CharacterIteration.next32;
+import static com.ibm.icu.impl.CharacterIteration.previous32;
+
+import com.ibm.icu.impl.Assert;
+import com.ibm.icu.impl.ICUData;
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.util.UResourceBundle;
+import com.ibm.icu.util.UResourceBundleIterator;
+
+import java.lang.System;
+import java.text.CharacterIterator;
+import java.util.ArrayList;
+import java.util.HashMap;
+
+public class MlBreakEngine {
+
+    private static final int INVALID = '|';
+    private static final String INVALID_STRING = "|";
+    private static final int MAX_FEATURE = 26;
+    private UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;
+    private UnicodeSet fClosePunctuationSet;
+    private HashMap<String, Integer> fModel;
+
+    private int fNegativeSum;
+
+    /**
+     * One slot of the sliding window used by the model: a code point paired with the string
+     * code of its Unicode block.
+     */
+    static class Element {
+        private int character = 0;
+        private String ublock = null;
+
+        /**
+         * Creates an empty element (character 0 and no block code).
+         */
+        public Element() {
+        }
+
+        /**
+         * Stores a code point and its block code in this element.
+         *
+         * @param ch  A unicode character.
+         * @param str The unicode block code of the character; asserted to be at most three
+         *            chars long.
+         */
+        public void setCharAndUblock(int ch, String str) {
+            // Validate before touching any state so a failed assertion leaves the slot as is.
+            Assert.assrt(str.length() <= 3);
+            character = ch;
+            this.ublock = str;
+        }
+
+        /**
+         * @return The code point held by this element.
+         */
+        public int getCharacter() {
+            return this.character;
+        }
+
+        /**
+         * @return The block code held by this element.
+         */
+        public String getUblock() {
+            return this.ublock;
+        }
+    }
+
+    /**
+     * Tells whether an element carries a real block code rather than the placeholder.
+     *
+     * @param element The element to inspect.
+     * @return false only when the element's block code is exactly the placeholder string.
+     */
+    private static boolean isValid(Element element) {
+        return !element.getUblock().equals(INVALID_STRING);
+    }
+
+    /**
+     * Creates the engine used for Chinese and Japanese phrase breaking and loads its model.
+     *
+     * @param digitOrOpenPunctuationOrAlphabetSet A Unicode set holding digits, open punctuation
+     *                                            and alphabetic characters.
+     * @param closePunctuationSet                 A Unicode set holding close punctuation.
+     */
+    public MlBreakEngine(UnicodeSet digitOrOpenPunctuationOrAlphabetSet,
+            UnicodeSet closePunctuationSet) {
+        // The model map and its running sum must exist before loadMLModel() fills them.
+        this.fNegativeSum = 0;
+        this.fModel = new HashMap<String, Integer>();
+        this.fClosePunctuationSet = closePunctuationSet;
+        this.fDigitOrOpenPunctuationOrAlphabetSet = digitOrOpenPunctuationOrAlphabetSet;
+        loadMLModel();
+    }
+
+    /**
+     * Divide up a range of characters handled by this break engine.
+     *
+     * <p>Every code point index from 1 to numCodePts - 1 is scored with the model; the indices
+     * that score positive, plus the start and the end of the range, are then mapped back to
+     * positions in inText and pushed onto foundBreaks.
+     *
+     * @param inText        An input text.
+     * @param startPos      The start index of the input text.
+     * @param endPos        The end index of the input text.
+     * @param inString      An input string normalized from inText from startPos to endPos
+     * @param numCodePts    The number of code points of inString
+     * @param charPositions A map that transforms inString's code point index to code unit index.
+     * @param foundBreaks   A list to store the breakpoint.
+     * @return The number of breakpoints pushed onto foundBreaks (and still on it on return);
+     *         0 when startPos >= endPos.
+     */
+    public int divideUpRange(CharacterIterator inText, int startPos, int endPos,
+            CharacterIterator inString, int numCodePts, int[] charPositions,
+            DictionaryBreakEngine.DequeI foundBreaks) {
+        if (startPos >= endPos) {
+            return 0;
+        }
+        ArrayList<Integer> boundary = new ArrayList<Integer>(numCodePts);
+        // The ML model groups six chars to evaluate if the 4th char is a breakpoint.
+        // Like a sliding window, the elementList removes the first char and appends the new char
+        // from inString in each iteration so that its size always remains at six.
+        Element elementList[] = new Element[6];
+        initElementList(inString, elementList, numCodePts);
+
+        // Add a break for the start.
+        boundary.add(0);
+        for (int i = 1; i < numCodePts; i++) {
+            evaluateBreakpoint(elementList, i, boundary);
+            shiftLeftOne(elementList);
+
+            // Refill the last slot: the window already holds code points up to index i + 2, so
+            // the next one to read is i + 3; past the end the slot gets the placeholder.
+            int ch = (i + 3) < numCodePts ? next32(inString) : INVALID;
+            String ublock = (ch != INVALID) ? getUnicodeBlock(ch) : INVALID_STRING;
+            elementList[5].setCharAndUblock(ch, ublock);
+        }
+
+        // Add a break for the end if there is not one there already.
+        if (boundary.get(boundary.size() - 1) != numCodePts) {
+            boundary.add(numCodePts);
+        }
+
+        int correctedNumBreaks = 0;
+        int previous = -1;
+        int numBreaks = boundary.size();
+        for (int i = 0; i < numBreaks; i++) {
+            // Map the code point index back to a position in inText.
+            int pos = charPositions[boundary.get(i)] + startPos;
+            // In phrase breaking, there has to be a breakpoint between Cj character and close
+            // punctuation.
+            // E.g.［携帯電話］正しい選択 -> ［携帯▁電話］▁正しい▁選択 -> breakpoint between ］ and 正
+            inText.setIndex(pos);
+            if (pos > previous) {
+                // A break at startPos itself is only kept when the preceding character in
+                // inText is close punctuation.
+                if (pos != startPos
+                        || (pos > 0
+                        && fClosePunctuationSet.contains(previous32(inText)))) {
+                    foundBreaks.push(pos);
+                    correctedNumBreaks++;
+                }
+            }
+            previous = pos;
+        }
+
+        if (!foundBreaks.isEmpty() && foundBreaks.peek() == endPos) {
+            // In phrase breaking, there has to be a breakpoint between Cj character and
+            // the number/open punctuation.
+            // E.g. る文字「そうだ、京都」->る▁文字▁「そうだ、▁京都」-> breakpoint between 字 and「
+            // E.g. 乗車率９０％程度だろうか -> 乗車▁率▁９０％▁程度だろうか -> breakpoint between 率 and ９
+            // E.g. しかもロゴがＵｎｉｃｏｄｅ！ -> しかも▁ロゴが▁Ｕｎｉｃｏｄｅ！-> breakpoint between が and Ｕ
+            inText.setIndex(endPos);
+            int current = current32(inText);
+            // Otherwise the break at endPos is dropped again.
+            if (current != DONE32 && !fDigitOrOpenPunctuationOrAlphabetSet.contains(current)) {
+                foundBreaks.pop();
+                correctedNumBreaks--;
+            }
+
+        }
+        // Leave inText positioned at the last break that was kept.
+        if (!foundBreaks.isEmpty()) {
+            inText.setIndex(foundBreaks.peek());
+        }
+        return correctedNumBreaks;
+    }
+
+    /**
+     * Slides the window one slot to the left: each element takes over the character and block
+     * code of its right-hand neighbour. The last element is left unchanged.
+     *
+     * @param elementList The window to shift in place.
+     */
+    private void shiftLeftOne(Element[] elementList) {
+        for (int dst = 0; dst + 1 < elementList.length; dst++) {
+            Element next = elementList[dst + 1];
+            elementList[dst].character = next.character;
+            elementList[dst].ublock = next.ublock;
+        }
+    }
+
+    /**
+     * Evaluate whether the index is a potential breakpoint.
+     *
+     * <p>The score starts at fNegativeSum and gains twice the model weight of every feature
+     * that can be built from the window and is present in the model. The features are the
+     * unigrams (UW1-6, UB1-6), bigrams (BW1-3, BB1-3) and trigrams (TW1-4, TB1-4) over the
+     * characters (W) and block codes (B) of the window. The index is recorded when the score
+     * is positive.
+     *
+     * @param elementList A list including six elements for the breakpoint evaluation.
+     * @param index       The breakpoint index to be evaluated.
+     * @param boundary    A list including the index of the breakpoint.
+     */
+    private void evaluateBreakpoint(Element[] elementList, int index, ArrayList<Integer> boundary) {
+        int score = fNegativeSum;
+        // Unigrams: UW1..UW6 / UB1..UB6 describe window slots 0..5.
+        for (int slot = 0; slot < 6; slot++) {
+            score += featureWeight("UW", slot + 1, elementList, slot, 1, false);
+            score += featureWeight("UB", slot + 1, elementList, slot, 1, true);
+        }
+        // Bigrams: BW1..BW3 / BB1..BB3 start at window slots 1..3.
+        for (int slot = 1; slot <= 3; slot++) {
+            score += featureWeight("BW", slot, elementList, slot, 2, false);
+            score += featureWeight("BB", slot, elementList, slot, 2, true);
+        }
+        // Trigrams: TW1..TW4 / TB1..TB4 start at window slots 0..3.
+        for (int slot = 0; slot <= 3; slot++) {
+            score += featureWeight("TW", slot + 1, elementList, slot, 3, false);
+            score += featureWeight("TB", slot + 1, elementList, slot, 3, true);
+        }
+        if (score > 0) {
+            boundary.add(index);
+        }
+    }
+
+    /**
+     * Builds one feature key, e.g. "BW2:" followed by two code points, and looks it up.
+     *
+     * @param kind     The two-letter feature family ("UW", "BB", ...).
+     * @param ordinal  The number that follows the family in the key.
+     * @param window   The six-element window.
+     * @param first    The first window slot that contributes to the key.
+     * @param count    The number of consecutive slots that contribute to the key.
+     * @param useBlock true to build the key from block codes, false to build it from characters.
+     * @return Twice the model weight of the feature; 0 when a contributing slot holds the
+     *         placeholder or the model has no entry for the key.
+     */
+    private int featureWeight(String kind, int ordinal, Element[] window, int first, int count,
+            boolean useBlock) {
+        StringBuilder key = new StringBuilder(kind).append(ordinal).append(':');
+        for (int slot = first; slot < first + count; slot++) {
+            Element element = window[slot];
+            if (useBlock) {
+                if (!isValid(element)) {
+                    return 0;
+                }
+                key.append(element.getUblock());
+            } else {
+                if (element.getCharacter() == INVALID) {
+                    return 0;
+                }
+                key.appendCodePoint(element.getCharacter());
+            }
+        }
+        // A single lookup replaces the containsKey()/get() pair.
+        Integer weight = fModel.get(key.toString());
+        return weight == null ? 0 : 2 * weight;
+    }
+
+    /**
+     * Initialize the element list from the input string.
+     *
+     * <p>The first two slots always hold the placeholder; slots 2..5 take the first four code
+     * points of inString, or the placeholder character where the string is shorter.
+     *
+     * @param inString    An input string to be segmented.
+     * @param elementList A list to store the first six characters and their unicode block codes.
+     * @param numCodePts  The number of code points of input string
+     * @return The number of code units of the code points read from inString (at most four).
+     */
+    private int initElementList(CharacterIterator inString, Element[] elementList,
+            int numCodePts) {
+        int index = 0;
+        inString.setIndex(index);
+        int w1, w2, w3, w4, w5, w6;
+        w1 = w2 = w3 = w4 = w5 = w6 = INVALID;
+        if (numCodePts > 0) {
+            w3 = current32(inString);
+            index += Character.charCount(w3);
+        }
+        if (numCodePts > 1) {
+            w4 = next32(inString);
+            // Count the code units of w4 itself (this used to count w3 a second time).
+            index += Character.charCount(w4);
+        }
+        if (numCodePts > 2) {
+            w5 = next32(inString);
+            index += Character.charCount(w5);
+        }
+        if (numCodePts > 3) {
+            w6 = next32(inString);
+            index += Character.charCount(w6);
+        }
+
+        final String b1 = INVALID_STRING;
+        final String b2 = b1;
+        // NOTE(review): slots still holding the placeholder character are passed to
+        // getUnicodeBlock() here, whereas divideUpRange() gives such slots INVALID_STRING
+        // directly - confirm this difference is intended.
+        final String b3 = getUnicodeBlock(w3);
+        final String b4 = getUnicodeBlock(w4);
+        final String b5 = getUnicodeBlock(w5);
+        final String b6 = getUnicodeBlock(w6);
+
+        elementList[0] = new Element();
+        elementList[0].setCharAndUblock(w1, b1);
+        elementList[1] = new Element();
+        elementList[1].setCharAndUblock(w2, b2);
+        elementList[2] = new Element();
+        elementList[2].setCharAndUblock(w3, b3);
+        elementList[3] = new Element();
+        elementList[3].setCharAndUblock(w4, b4);
+        elementList[4] = new Element();
+        elementList[4].setCharAndUblock(w5, b5);
+        elementList[5] = new Element();
+        elementList[5].setCharAndUblock(w6, b6);
+
+        return index;
+    }
+
+    /**
+     * Get the character's unicode block code defined in UBlockCode.
+     *
+     * @param ch A code point.
+     * @return The block id as decimal ASCII digits, left-padded with '0' to at least three
+     * digits, or the placeholder string when the character has no block or the id is the
+     * invalid-code id.
+     */
+    private String getUnicodeBlock(int ch) {
+        int blockId = UCharacter.UnicodeBlock.of(ch).getID();
+        if (blockId == UCharacter.UnicodeBlock.NO_BLOCK.getID()
+                || blockId == UCharacter.UnicodeBlock.INVALID_CODE_ID) {
+            return INVALID_STRING;
+        }
+        // %d is localized by the default locale (it may emit non-ASCII digits), which would
+        // make the result differ from the ASCII form; Locale.ROOT pins the digits to ASCII.
+        return String.format(java.util.Locale.ROOT, "%03d", blockId);
+    }
+
+    /**
+     * Load the machine learning model: read the "modelKeys" and "modelValues" resources of the
+     * "jaml" break iterator bundle into fModel, pairing them by position, and subtract every
+     * value from fNegativeSum along the way.
+     */
+    private void loadMLModel() {
+        UResourceBundle bundle = UResourceBundle.getBundleInstance(ICUData.ICU_BRKITR_BASE_NAME,
+                "jaml");
+        UResourceBundle keyBundle = bundle.get("modelKeys");
+        UResourceBundle valueBundle = bundle.get("modelValues");
+        int[] weights = valueBundle.getIntVector();
+        UResourceBundleIterator keys = keyBundle.getIterator();
+        for (int i = 0; keys.hasNext(); i++) {
+            int weight = weights[i];
+            fNegativeSum -= weight;
+            fModel.put(keys.nextString(), weight);
+        }
+    }
+}
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java

index 4bea6cc0ec886fda3cd1f4b98f1875eb6dff1ff5..427955bfab7dc9a08d237c237271099aa1efe774 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java
@@ -20,6 +20,7 @@ import org.junit.runners.JUnit4;
  
  import com.ibm.icu.dev.test.TestFmwk;
  import com.ibm.icu.dev.test.TestUtil;
+import com.ibm.icu.impl.ICUConfig;
  import com.ibm.icu.impl.Utility;
  import com.ibm.icu.lang.UCharacter;
  import com.ibm.icu.text.BreakIterator;
@@ -124,6 +125,7 @@ public void TestExtended() {
      int             rulesFirstLine = 0;              // Line number of the start of current <rules> block
  
      int    len = testString.length();
+    boolean skipTest = false;
  
      for (charIdx = 0; charIdx < len; ) {
          int c = testString.codePointAt(charIdx);
@@ -157,6 +159,7 @@ public void TestExtended() {
                  break;
              }
             if (testString.startsWith("<word>", charIdx-1)) {
+                skipTest = false;
                  tp.bi = BreakIterator.getWordInstance(tp.currentLocale);
                  charIdx += 5;
                  break;
@@ -167,22 +170,46 @@ public void TestExtended() {
                  break;
              }
              if (testString.startsWith("<line>", charIdx-1)) {
+                skipTest = false;
                  tp.bi = BreakIterator.getLineInstance(tp.currentLocale);
+                if (Boolean.parseBoolean(
+                        ICUConfig.get("com.ibm.icu.impl.breakiter.useMLPhraseBreaking", "false"))) {
+                    if (tp.currentLocale.getName().equals("ja@lw=phrase")) {
+                        // skip <line> test cases of JP's phrase breaking when ML is enabled.
+                        skipTest = true;
+                    }
+                }
                  charIdx += 5;
                  break;
              }
+            if (testString.startsWith("<lineML>", charIdx-1)) {
+                skipTest = false;
+                tp.bi = BreakIterator.getLineInstance(tp.currentLocale);
+                if (!Boolean.parseBoolean(
+                        ICUConfig.get("com.ibm.icu.impl.breakiter.useMLPhraseBreaking", "false"))) {
+                    if (tp.currentLocale.getName().equals("ja@lw=phrase")) {
+                        // skip <lineML> test cases of JP's phrase breaking when ML is disabled.
+                        skipTest = true;
+                    }
+                }
+                charIdx += 7;
+                break;
+            }
              if (testString.startsWith("<sent>", charIdx-1)) {
+                skipTest = false;
                  tp.bi = BreakIterator.getSentenceInstance(tp.currentLocale);
                  charIdx += 5;
                  break;
              }
              if (testString.startsWith("<title>", charIdx-1)) {
+                skipTest = false;
                  tp.bi = BreakIterator.getTitleInstance(tp.currentLocale);
                  charIdx += 6;
                  break;
              }
              if (testString.startsWith("<rules>", charIdx-1) ||
                      testString.startsWith("<badrules>", charIdx-1)) {
+                skipTest = false;
                  charIdx = testString.indexOf('>', charIdx) + 1;
                  parseState = PARSE_RULES;
                  rules.setLength(0);
@@ -272,7 +299,9 @@ public void TestExtended() {
                  charIdx += 6;
  
                  // RUN THE TEST!
-                executeTest(tp);
+                if (!skipTest) {
+                    executeTest(tp);
+                }
                  break;
              }
  
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt

index 72bd15803d6854839c16517cde04725ff244e8c2..40c6745dd06666d0b0e3328f15e43a98172ab9af 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt
@@ -1913,6 +1913,26 @@ Bangkok)•</data>
  <data>•\U0001F469\u200D\U0001F680•\U0001F469\U0001F3FD\u200D\U0001F680\u0020•</data>
  
  <locale ja@lw=phrase>
+#phrase breaking test cases for the ML solution
+<lineML>
+#９月に東京から友達が遊びに来た -> ９月に•東京から•友達が•遊びに•来た•
+<data>•\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f•</data>
+#る文字「そうだ、京都」-> る•文字•「そうだ、•京都」•
+<data>•\u308b•\u6587\u5b57•\u300c\u305d\u3046\u3060\u3001•\u4eac\u90fd\u300d•</data>
+#Kana supplement: 𛁈(U+1B048) -> \uD82C\uDC48, 𛀸(U+1B038) -> \uD82C\uDC38, 𛀙(U+1B019)-> \uD82C\uDC19</data>
+#𛁈る𛀸（しるこ）、あ𛀙よろし（あかよろし） -> 𛁈る𛀸•（しるこ）、•あ𛀙よろし•（あかよろし）
+<data>•\U0001B048\u308B\U0001B038•\uFF08\u3057\u308B\u3053\uFF09\u3001•\u3042\U0001B019\u3088\u308D\u3057•\uFF08\u3042\u304B\u3088\u308D\u3057\uFF09•</data>
+#中国の携帯は約５００元から５０００元です -> 中国の▁携帯は▁約▁５００元から▁５０００元です
+<data>•\u4E2D\u56FD\u306E•\u643A\u5E2F\u306F•\u7D04•\uFF15\uFF10\uFF10\u5143\u304B\u3089•\uFF15\uFF10\uFF10\uFF10\u5143\u3067\u3059•</data>
+#しかもロゴがＵｎｉｃｏｄｅ！！ -> しかも▁ロゴが▁Ｕｎｉｃｏｄｅ！！
+<data>•\u3057\u304B\u3082•\u30ED\u30B4\u304C•\uFF35\uFF4E\uFF49\uFF43\uFF4F\uFF44\uFF45\uFF01\uFF01•</data>
+#バッテリーを長持ちさせ、充電を最適化します -> バッテリーを▁長持ちさせ、▁充電を▁最適化します
+<data>•\u30D0\u30C3\u30C6\u30EA\u30FC\u3092•\u9577\u6301\u3061\u3055\u305B\u3001•\u5145\u96FB\u3092•\u6700\u9069\u5316\u3057\u307E\u3059•</data>
+#データのコピー、スマートフォンでのお支払いなど -> データの▁コピー、▁スマートフォンでの▁お支払いなど
+<data>•\u30C7\u30FC\u30BF\u306E•\u30B3\u30D4\u30FC\u3001•\u30B9\u30DE\u30FC\u30C8\u30D5\u30A9\u30F3\u3067\u306E•\u304A\u652F\u6255\u3044\u306A\u3069•</data>
+
+<locale ja@lw=phrase>
+#phrase breaking test cases for the dictionary based solution
  <line>
  #[京都観光］時雨殿に行った。-> [京都•観光］•時雨•殿に•行った。•
  <data>•\uff3b\u4eac\u90fd•\u89b3\u5149\uff3d•\u6642\u96e8•\u6bbf\u306b•\u884c\u3063\u305f\u3002•</data>
@@ -2005,8 +2025,8 @@ Bangkok)•</data>
  #大韓民國은 民主共和國이다
  #<data>•大韓民國은 •民主•共和國이다•</data>
  # All the tests for ja@lw=phrase should also work in Korean.
-#[京都観光］時雨殿に行った。-> [京都•観光］•時雨•殿に•行った。•
-<data>•\uff3b\u4eac\u90fd•\u89b3\u5149\uff3d•\u6642\u96e8•\u6bbf\u306b•\u884c\u3063\u305f\u3002•</data>
+#る文字「そうだ、京都」-> る•文字•「そうだ、•京都」•
+<data>•\u308b•\u6587\u5b57•\u300c\u305d\u3046\u3060\u3001•\u4eac\u90fd\u300d•</data>
  #９月に東京から友達が遊びに来た -> ９月に•東京から•友達が•遊びに•来た•
  <data>•\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f•</data>
author	allenwtsu <allenwtsu@google.com>
	Tue, 20 Dec 2022 16:34:42 +0000 (16:34 +0000)
committer	Markus Scherer <markus.icu@gmail.com>
	Tue, 20 Dec 2022 22:27:04 +0000 (14:27 -0800)
.github/workflows/icu_ci.yml		patch \| blob \| history
icu4j/build.xml		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/ICUConfig.properties		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/ICUConfig.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/CjkBreakEngine.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/MlBreakEngine.java	[new file with mode: 0644]	patch \| blob
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt		patch \| blob \| history