]> granicus.if.org Git - icu/commitdiff
ICU-21699 Phrase based breaking(Java)
authorallenwtsu <allenwtsu@google.com>
Mon, 27 Dec 2021 04:20:19 +0000 (04:20 +0000)
committerFrank Yung-Fong Tang <ftang@google.com>
Fri, 21 Jan 2022 21:11:59 +0000 (13:11 -0800)
See #1955

16 files changed:
icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/BurmeseBreakEngine.java
icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/CjkBreakEngine.java
icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/DictionaryBreakEngine.java
icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/KhmerBreakEngine.java
icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/LSTMBreakEngine.java
icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/LanguageBreakEngine.java
icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/LaoBreakEngine.java
icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/ThaiBreakEngine.java
icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/UnhandledBreakEngine.java
icu4j/main/classes/core/src/com/ibm/icu/text/BreakIteratorFactory.java
icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java
icu4j/main/shared/data/icudata.jar
icu4j/main/shared/data/icutzdata.jar
icu4j/main/shared/data/testdata.jar
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/LSTMBreakEngineTest.java
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt

index e9f0299c765f730cb765dbbefce4b34f86fe932b..12352e4dc4798ba8a69888a0d8feedab6daa8b6c 100644 (file)
@@ -77,7 +77,7 @@ public class BurmeseBreakEngine extends DictionaryBreakEngine {
 
     @Override
     public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
-            DequeI foundBreaks) {
+            DequeI foundBreaks, boolean isPhraseBreaking) {
 
 
         if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD) {
index 0404e031cc2367b7eea27c7d730d4269bea2ab30..06b93683771304fd32537bf3baf900c4d9c117bd 100644 (file)
@@ -14,18 +14,31 @@ import static com.ibm.icu.impl.CharacterIteration.next32;
 
 import java.io.IOException;
 import java.text.CharacterIterator;
+import java.util.HashSet;
 
 import com.ibm.icu.impl.Assert;
+import com.ibm.icu.impl.ICUData;
 import com.ibm.icu.text.Normalizer;
 import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.text.UnicodeSetIterator;
+import com.ibm.icu.util.UResourceBundle;
+import com.ibm.icu.util.UResourceBundleIterator;
 
 public class CjkBreakEngine extends DictionaryBreakEngine {
     private UnicodeSet fHangulWordSet;
+    private UnicodeSet fNumberOrOpenPunctuationSet;
+    private UnicodeSet fClosePunctuationSet;
     private DictionaryMatcher fDictionary = null;
+    private HashSet<String> fSkipSet;
 
     public CjkBreakEngine(boolean korean) throws IOException {
         fHangulWordSet = new UnicodeSet("[\\uac00-\\ud7a3]");
         fHangulWordSet.freeze();
+        fNumberOrOpenPunctuationSet = new UnicodeSet("[[:Nd:][:Pi:][:Ps:]]");
+        fNumberOrOpenPunctuationSet.freeze();
+        fClosePunctuationSet = new UnicodeSet("[[:Pc:][:Pd:][:Pe:][:Pf:][:Po:]]");
+        fClosePunctuationSet.freeze();
+        fSkipSet = new HashSet<String>();
 
         fDictionary = DictionaryData.loadDictionaryFor("Hira");
         if (korean) {
@@ -33,6 +46,33 @@ public class CjkBreakEngine extends DictionaryBreakEngine {
         } else { //Chinese and Japanese
             UnicodeSet cjSet = new UnicodeSet("[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]");
             setCharacters(cjSet);
+            initializeJapanesePhraseParamater();
+        }
+    }
+
+    private void initializeJapanesePhraseParamater() { // fills fSkipSet via the two loaders below
+        loadJapaneseParticleAndAuxVerbs(); // strings under "particles" and "auxVerbs" in the "ja" bundle
+        loadHiragana(); // each element of the [:Hiragana:] set
+    }
+
+    private void loadJapaneseParticleAndAuxVerbs() { // adds resource-bundle strings to fSkipSet
+        UResourceBundle rb = UResourceBundle.getBundleInstance(ICUData.ICU_BRKITR_BASE_NAME, "ja");
+        final String[] tags = {"particles", "auxVerbs"};
+        for (String tag : tags) {
+            UResourceBundle bundle = rb.get(tag); // sub-resource of the "ja" bundle opened above
+            UResourceBundleIterator iterator = bundle.getIterator();
+            while (iterator.hasNext()) {
+                fSkipSet.add(iterator.nextString()); // every string under the tag goes into fSkipSet
+            }
+        }
+    }
+
+    private void loadHiragana() { // adds each element of [:Hiragana:] to fSkipSet
+        UnicodeSet hiraganaWordSet = new UnicodeSet("[:Hiragana:]");
+        hiraganaWordSet.freeze();
+        UnicodeSetIterator iterator = new UnicodeSetIterator(hiraganaWordSet);
+        while (iterator.next()) {
+            fSkipSet.add(iterator.getString()); // element taken as a String, matching fSkipSet's type
         }
     }
 
@@ -66,7 +106,7 @@ public class CjkBreakEngine extends DictionaryBreakEngine {
 
     @Override
     public int divideUpDictionaryRange(CharacterIterator inText, int startPos, int endPos,
-            DequeI foundBreaks) {
+            DequeI foundBreaks, boolean isPhraseBreaking) {
         if (startPos >= endPos) {
             return 0;
         }
@@ -196,6 +236,25 @@ public class CjkBreakEngine extends DictionaryBreakEngine {
         if (bestSnlp[numCodePts] == kint32max) {
             t_boundary[numBreaks] = numCodePts;
             numBreaks++;
+        } else if (isPhraseBreaking) {
+            t_boundary[numBreaks] = numCodePts;
+            numBreaks++;
+            int prevIdx = numCodePts;
+            int codeUnitIdx = 0, length = 0;
+            for (int i = prev[numCodePts]; i > 0; i = prev[i]) {
+                codeUnitIdx = prenormstr.offsetByCodePoints(0, i);
+                length = prevIdx - i;
+                prevIdx = i;
+                String pattern = getPatternFromText(text, s, codeUnitIdx, length);
+                // Keep the breakpoint if the pattern is not in the fSkipSet and continuous Katakana
+                // characters don't occur.
+                text.setIndex(codeUnitIdx - 1);
+                if (!fSkipSet.contains(pattern)
+                        && (!isKatakana(current32(text)) || !isKatakana(next32(text)))) {
+                    t_boundary[numBreaks] = i;
+                    numBreaks++;
+                }
+            }
         } else {
             for (int i = numCodePts; i > 0; i = prev[i]) {
                 t_boundary[numBreaks] = i;
@@ -212,19 +271,50 @@ public class CjkBreakEngine extends DictionaryBreakEngine {
         int previous = -1;
         for (int i = numBreaks - 1; i >= 0; i--) {
             int pos = charPositions[t_boundary[i]] + startPos;
-            if (pos > previous && pos != startPos) {
-                foundBreaks.push(pos);
-                correctedNumBreaks++;
+            // In phrase breaking, there has to be a breakpoint between Cj character and close
+            // punctuation.
+            // E.g.[携帯電話]正しい選択 -> [携帯▁電話]▁正しい▁選択 -> breakpoint between ] and 正
+            if (pos > previous) {
+                if (pos != startPos
+                        || (isPhraseBreaking && pos > 0
+                        && fClosePunctuationSet.contains(inText.setIndex(pos - 1)))) {
+                    foundBreaks.push(charPositions[t_boundary[i]] + startPos);
+                    correctedNumBreaks++;
+                }
             }
             previous = pos;
         }
 
         if (!foundBreaks.isEmpty() && foundBreaks.peek() == endPos) {
-            foundBreaks.pop();
-            correctedNumBreaks--;
+            // In phrase breaking, there has to be a breakpoint between Cj character and
+            // the number/open punctuation.
+            // E.g. る文字「そうだ、京都」->る▁文字▁「そうだ、▁京都」-> breakpoint between 字 and「
+            // E.g. 乗車率90%程度だろうか -> 乗車▁率▁90%▁程度だ▁ろうか -> breakpoint between 率 and 9
+            if (isPhraseBreaking) {
+                if (!fNumberOrOpenPunctuationSet.contains(inText.setIndex(endPos))) {
+                    foundBreaks.pop();
+                    correctedNumBreaks--;
+                }
+            } else {
+                foundBreaks.pop();
+                correctedNumBreaks--;
+            }
         }
         if (!foundBreaks.isEmpty())
             inText.setIndex(foundBreaks.peek());
         return correctedNumBreaks;
     }
+
+    private String getPatternFromText(CharacterIterator text, StringBuffer sb, int start,
+            int length) { // builds a String from `length` appendCodePoint calls, reading from `start`
+        sb.setLength(0); // sb is caller-supplied scratch; its previous contents are discarded
+        if(length > 0) { // length <= 0 returns ""
+            text.setIndex(start); // repositions the caller's iterator; it is not restored here
+            sb.appendCodePoint(current32(text)); // presumably the code point at `start` - confirm in CharacterIteration
+            for (int j = 1; j < length; j++) { // the remaining length - 1 appends
+                sb.appendCodePoint(next32(text));
+            }
+        }
+        return sb.toString();
+    }
 }
index 208d1de44c3f6371f68257ee4c6ac8861db6e662..443badcdb72a8ecaee66357c54ef591b597c716e 100644 (file)
@@ -183,7 +183,7 @@ public abstract class DictionaryBreakEngine implements LanguageBreakEngine {
 
     @Override
     public int findBreaks(CharacterIterator text, int startPos, int endPos,
-            DequeI foundBreaks) {
+            DequeI foundBreaks, boolean isPhraseBreaking) {
         int result = 0;
 
          // Find the span of characters included in the set.
@@ -202,7 +202,7 @@ public abstract class DictionaryBreakEngine implements LanguageBreakEngine {
         rangeStart = start;
         rangeEnd = current;
 
-        result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
+        result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks, isPhraseBreaking);
         text.setIndex(current);
 
         return result;
@@ -226,5 +226,6 @@ public abstract class DictionaryBreakEngine implements LanguageBreakEngine {
      abstract int divideUpDictionaryRange(CharacterIterator text,
                                           int               rangeStart,
                                           int               rangeEnd,
-                                          DequeI            foundBreaks );
+                                          DequeI            foundBreaks,
+                                          boolean isPhraseBreaking);
 }
index 02401f8ed67884728331c923630e8c6ecd622ab0..bd3fa9f2ded98656c47806d67865c71ef0deefd3 100644 (file)
@@ -85,7 +85,7 @@ public class KhmerBreakEngine extends DictionaryBreakEngine {
 
     @Override
     public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
-            DequeI foundBreaks) {
+            DequeI foundBreaks, boolean isPhraseBreaking) {
 
         if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
             return 0;  // Not enough characters for word
index 7028a86559837035817ec61ac016a61d5993f051..267ada824ac90e3c90bb4e1ad5da46e2b0a7fe60 100644 (file)
@@ -343,7 +343,7 @@ public class LSTMBreakEngine extends DictionaryBreakEngine {
 
     @Override
     public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
-            DequeI foundBreaks) {
+            DequeI foundBreaks, boolean isPhraseBreaking) {
         int beginSize = foundBreaks.size();
 
         if ((rangeEnd - rangeStart) < MIN_WORD_SPAN) {
index ede94b76dad38b55655b5bcb5719117251a0fd9d..ca8e8e6c57e8a5bd405ec5cfb9bb9b9817b39857 100644 (file)
@@ -32,7 +32,7 @@ public interface LanguageBreakEngine {
      * @return the number of breaks found
      */
     int findBreaks(CharacterIterator text, int startPos, int endPos,
-            DictionaryBreakEngine.DequeI foundBreaks);
+            DictionaryBreakEngine.DequeI foundBreaks, boolean isPhraseBreaking);
 }
 
 
index 95a8ef3762ee39a61a8125facb52bf472079d4b7..e60271aa5a4390f28065951131549eeff0c51d12 100644 (file)
@@ -85,7 +85,7 @@ public class LaoBreakEngine extends DictionaryBreakEngine {
 
     @Override
     public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
-            DequeI foundBreaks) {
+            DequeI foundBreaks, boolean isPhraseBreaking) {
 
 
         if ((rangeEnd - rangeStart) < LAO_MIN_WORD) {
index 71ba5096e5a076e721d445b290e3f60eac299a83..3f96705109b0e76a366110ae02ddb5cbf2576849 100644 (file)
@@ -96,7 +96,7 @@ public class ThaiBreakEngine extends DictionaryBreakEngine {
 
     @Override
     public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
-            DequeI foundBreaks) {
+            DequeI foundBreaks, boolean isPhraseBreaking) {
 
         if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) {
             return 0;  // Not enough characters for word
index b00cca0815b24108034b28a27becef7df54d8bdf..cd5fb092e452f79fa891c54ab69e5b3f32e74bef 100644 (file)
@@ -44,7 +44,7 @@ public final class UnhandledBreakEngine implements LanguageBreakEngine {
 
     @Override
     public int findBreaks(CharacterIterator text, int startPos, int endPos,
-            DictionaryBreakEngine.DequeI foundBreaks) {
+            DictionaryBreakEngine.DequeI foundBreaks, boolean isPhraseBreaking) {
 
         UnicodeSet uniset = fHandled;
         int c = CharacterIteration.current32(text);
index 2594c1b134742b2271a0d836feebd3df2a58410e..3de520597aa7d0e796ccbcc2cf3282670139801e 100644 (file)
@@ -129,17 +129,25 @@ final class BreakIteratorFactory extends BreakIterator.BreakIteratorServiceShim
         //  Get the binary rules.
         //
         ByteBuffer bytes = null;
-        String typeKeyExt = null;
+        String typeKeyExt = "";
         if (kind == BreakIterator.KIND_LINE) {
-            String lbKeyValue = locale.getKeywordValue("lb");
-            if ( lbKeyValue != null && (lbKeyValue.equals("strict") || lbKeyValue.equals("normal") || lbKeyValue.equals("loose")) ) {
-                typeKeyExt = "_" + lbKeyValue;
+            String keyValue = locale.getKeywordValue("lb");
+            if ( keyValue != null && (keyValue.equals("strict") || keyValue.equals("normal") || keyValue.equals("loose")) ) {
+                typeKeyExt = "_" + keyValue;
+            }
+            String language = locale.getLanguage();
+            if (language != null && language.equals("ja")) {
+                keyValue = locale.getKeywordValue("lw");
+                if (keyValue != null && keyValue.equals("phrase")) {
+                    typeKeyExt += "_" + keyValue;
+                }
             }
         }
 
+        String brkfname;
         try {
-            String         typeKey       = (typeKeyExt == null)? KIND_NAMES[kind]: KIND_NAMES[kind] + typeKeyExt;
-            String         brkfname      = rb.getStringWithFallback("boundaries/" + typeKey);
+            String         typeKey       = typeKeyExt.isEmpty() ? KIND_NAMES[kind] : KIND_NAMES[kind] + typeKeyExt;
+                           brkfname      = rb.getStringWithFallback("boundaries/" + typeKey);
             String         rulesFileName = ICUData.ICU_BRKITR_NAME+ '/' + brkfname;
                            bytes         = ICUBinary.getData(rulesFileName);
         }
@@ -151,7 +159,8 @@ final class BreakIteratorFactory extends BreakIterator.BreakIteratorServiceShim
         // Create a normal RuleBasedBreakIterator.
         //
         try {
-            iter = RuleBasedBreakIterator.getInstanceFromCompiledRules(bytes);
+            boolean isPhraseBreaking = (brkfname != null) && brkfname.contains("phrase");
+            iter = RuleBasedBreakIterator.getInstanceFromCompiledRules(bytes, isPhraseBreaking);
         }
         catch (IOException e) {
             // Shouldn't be possible to get here.
index 6bf2a26413606c0c43a419f9f042c4333a89ad58..507677579fcf14108a8fbf237e39fc301ef667d7 100644 (file)
@@ -84,6 +84,32 @@ public class RuleBasedBreakIterator extends BreakIterator {
         return This;
     }
 
+    /**
+     * Create a break iterator from precompiled break rules and set its phrase-breaking flag.
+     *
+     * This factory method has no access modifier; it is only accessible in the same package.
+     *
+     * Creating a break iterator from the binary rules is much faster than
+     * creating one from source rules.
+     *
+     * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
+     * Binary break iterator rules are not guaranteed to be compatible between
+     * different versions of ICU.
+     *
+     * @param bytes a buffer supplying the compiled binary rules.
+     * @param phraseBreaking a flag indicating if phrase breaking is required.
+     * @return the new break iterator, with its phrase-breaking flag set to phraseBreaking.
+     * @throws IOException if there is an error while reading the rules from the buffer.
+     * @see    #compileRules(String, OutputStream)
+     * @internal
+     */
+    /* package-private */ static RuleBasedBreakIterator getInstanceFromCompiledRules(
+            ByteBuffer bytes, boolean phraseBreaking) throws IOException {
+        RuleBasedBreakIterator instance = getInstanceFromCompiledRules(bytes);
+        instance.fPhraseBreaking = phraseBreaking;
+        return instance;
+    }
+
     /**
      * Create a break iterator from a precompiled set of break rules.
      *
@@ -274,6 +300,11 @@ public class RuleBasedBreakIterator extends BreakIterator {
      */
     private BreakCache         fBreakCache = new BreakCache();
 
+    /**
+     * Flag used to indicate if phrase breaking is required.
+     */
+    private boolean            fPhraseBreaking = false;
+
 
     /**
      * Counter for the number of characters encountered with the "dictionary"
@@ -1205,7 +1236,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
                 // Ask the language object if there are any breaks. It will add them to the cache and
                 // leave the text pointer on the other side of its range, ready to search for the next one.
                 if (lbe != null) {
-                    foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks);
+                    foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks, fPhraseBreaking);
                 }
 
                 // Reload the loop variables for the next go-round
index 84083a72a13ed6e6567134bc202ef4fbf9b9eb7c..2840f866456c3b27f49064ddf8ff70ad51cb58f7 100644 (file)
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:44951f88294c06e433a3b61238d9bb5f59ba01f091fcfb8fe4966f98f0748ef7
-size 13627084
+oid sha256:65125c8b8176c083a7597fed4c895fa263a185593bda5309753b95e8a5ec0dda
+size 13650605
index b4c58beb7abf8c053dee449ac21e293caa876219..69bf00a16b8a546af504176d19971d1a89e87ee7 100644 (file)
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d13d3b8e7c58f0e41e4b6ff6f2bfa43529de382ecf2c1e3944429b1c1a761361
-size 96439
+oid sha256:31a470c8a209305fd98faf5ed0f20bf79cf57cfcb2281041b20d98ad742c7b5e
+size 96440
index 8dc53bbbf05b6c776d57e9feb5939ccfde0988ed..4728fd9d4c255613fa33449c4261bf34f5e7480a 100644 (file)
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cf33f21346eea88c0282a4960f19f27e475554449f52ef4f25889e2b8a34a1c0
-size 826063
+oid sha256:2c951a44c5d9726ea4532cb840309d8503c380094b7fd0e56b96094187ce0a24
+size 826064
index 8d248a551a2c35fda3468b946c6035c89f4c02e4..1cedfd8e91d25d217da75bb22fbb7ff6dc578209 100644 (file)
@@ -80,7 +80,7 @@ public class LSTMBreakEngineTest extends TestFmwk {
                     int length = fields[1].length();
                     CharacterIterator input = new StringCharacterIterator(fields[1]);
                     DictionaryBreakEngine.DequeI foundBreaks = new DictionaryBreakEngine.DequeI();
-                    int ret = engine.findBreaks(input, 0, length, foundBreaks);
+                    int ret = engine.findBreaks(input, 0, length, foundBreaks, false);
                     StringBuilder sb = new StringBuilder();
                     sb.append('{');
                     for (int i = 0; i < foundBreaks.size(); i++) {
index 1948360277d03048c7bb6405c8121ce76dcf8cb9..346da988d7dd0b4baf4e78506f68f977013158c5 100644 (file)
@@ -1884,6 +1884,21 @@ Bangkok)•</data>
 # woman astronaut, woman astronaut / fitz4
 <data>•\U0001F469\u200D\U0001F680•\U0001F469\U0001F3FD\u200D\U0001F680\u0020•</data>
 
+<locale ja@lw=phrase>
+<line>
+#[京都観光]時雨殿に行った。-> [京都•観光]•時雨•殿に•行った。•
+<data>•\uff3b\u4eac\u90fd•\u89b3\u5149\uff3d•\u6642\u96e8•\u6bbf\u306b•\u884c\u3063\u305f\u3002•</data>
+#9月に東京から友達が遊びに来た -> 9月に•東京から•友達が•遊びに•来た
+<data>•\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f•</data>
+#る文字「そうだ、京都」-> る•文字•「そうだ、•京都」•
+<data>•\u308b•\u6587\u5b57•\u300c\u305d\u3046\u3060\u3001•\u4eac\u90fd\u300d•</data>
+#乗車率90%程度だろうか。 -> 乗車•率•90%•程度だ•ろうか。•
+<data>•\u4e57\u8eca•\u7387•\uff19\uff10\uff05•\u7a0b\u5ea6\u3060•\u308d\u3046\u304b\u3002•</data>
+#[携帯電話]正しい選択 -> [携帯•電話]•正しい•選択•
+<data>•\uff3b\u643a\u5e2f•\u96fb\u8a71\uff3d•\u6b63\u3057\u3044•\u9078\u629e•</data>
+#純金製百人一首にサッカーボール -> 純金•製•百人一首に•サッカーボール
+<data>•\u7D14\u91D1•\u88FD•\u767E\u4EBA\u4E00\u9996\u306B•\u30B5\u30C3\u30AB\u30FC\u30DC\u30FC\u30EB•</data>
+
 
 ####################################################################################
 #