@Override
public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
- DequeI foundBreaks) {
+ DequeI foundBreaks, boolean isPhraseBreaking) {
if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD) {
import java.io.IOException;
import java.text.CharacterIterator;
+import java.util.HashSet;
import com.ibm.icu.impl.Assert;
+import com.ibm.icu.impl.ICUData;
import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.text.UnicodeSetIterator;
+import com.ibm.icu.util.UResourceBundle;
+import com.ibm.icu.util.UResourceBundleIterator;
public class CjkBreakEngine extends DictionaryBreakEngine {
private UnicodeSet fHangulWordSet;
+ private UnicodeSet fNumberOrOpenPunctuationSet;
+ private UnicodeSet fClosePunctuationSet;
private DictionaryMatcher fDictionary = null;
+ private HashSet<String> fSkipSet;
public CjkBreakEngine(boolean korean) throws IOException {
fHangulWordSet = new UnicodeSet("[\\uac00-\\ud7a3]");
fHangulWordSet.freeze();
+ fNumberOrOpenPunctuationSet = new UnicodeSet("[[:Nd:][:Pi:][:Ps:]]");
+ fNumberOrOpenPunctuationSet.freeze();
+ fClosePunctuationSet = new UnicodeSet("[[:Pc:][:Pd:][:Pe:][:Pf:][:Po:]]");
+ fClosePunctuationSet.freeze();
+ fSkipSet = new HashSet<String>();
fDictionary = DictionaryData.loadDictionaryFor("Hira");
if (korean) {
} else { //Chinese and Japanese
UnicodeSet cjSet = new UnicodeSet("[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]");
setCharacters(cjSet);
+ initializeJapanesePhraseParamater();
+ }
+ }
+
+ private void initializeJapanesePhraseParamater() { // populates fSkipSet; NOTE(review): "Paramater" is misspelled
+ loadJapaneseParticleAndAuxVerbs(); // adds the "particles" and "auxVerbs" strings of the "ja" bundle
+ loadHiragana(); // adds the string form of each element of [:Hiragana:]
+ }
+
+    /**
+     * Adds every string found under the "particles" and "auxVerbs" resources of the "ja"
+     * bundle (looked up under ICUData.ICU_BRKITR_BASE_NAME) to fSkipSet.
+     */
+    private void loadJapaneseParticleAndAuxVerbs() {
+        final String[] resourceTags = {"particles", "auxVerbs"};
+        UResourceBundle jaBundle =
+                UResourceBundle.getBundleInstance(ICUData.ICU_BRKITR_BASE_NAME, "ja");
+        for (int t = 0; t < resourceTags.length; t++) {
+            // Walk the strings stored under this tag and remember each one.
+            UResourceBundleIterator entries = jaBundle.get(resourceTags[t]).getIterator();
+            while (entries.hasNext()) {
+                fSkipSet.add(entries.nextString());
+            }
+        }
+    }
+
+    /**
+     * Adds the string form of each element of the [:Hiragana:] UnicodeSet to fSkipSet.
+     */
+    private void loadHiragana() {
+        UnicodeSet hiragana = new UnicodeSet("[:Hiragana:]");
+        hiragana.freeze();
+        // The iterator's next() both advances and reports whether an element is available.
+        for (UnicodeSetIterator it = new UnicodeSetIterator(hiragana); it.next(); ) {
+            fSkipSet.add(it.getString());
}
}
@Override
public int divideUpDictionaryRange(CharacterIterator inText, int startPos, int endPos,
- DequeI foundBreaks) {
+ DequeI foundBreaks, boolean isPhraseBreaking) {
if (startPos >= endPos) {
return 0;
}
if (bestSnlp[numCodePts] == kint32max) {
t_boundary[numBreaks] = numCodePts;
numBreaks++;
+ } else if (isPhraseBreaking) {
+ t_boundary[numBreaks] = numCodePts;
+ numBreaks++;
+ int prevIdx = numCodePts;
+ int codeUnitIdx = 0, length = 0;
+ for (int i = prev[numCodePts]; i > 0; i = prev[i]) {
+ codeUnitIdx = prenormstr.offsetByCodePoints(0, i);
+ length = prevIdx - i;
+ prevIdx = i;
+ String pattern = getPatternFromText(text, s, codeUnitIdx, length);
+ // Keep the breakpoint if the pattern is not in the fSkipSet and continuous Katakana
+ // characters don't occur.
+ text.setIndex(codeUnitIdx - 1);
+ if (!fSkipSet.contains(pattern)
+ && (!isKatakana(current32(text)) || !isKatakana(next32(text)))) {
+ t_boundary[numBreaks] = i;
+ numBreaks++;
+ }
+ }
} else {
for (int i = numCodePts; i > 0; i = prev[i]) {
t_boundary[numBreaks] = i;
int previous = -1;
for (int i = numBreaks - 1; i >= 0; i--) {
int pos = charPositions[t_boundary[i]] + startPos;
- if (pos > previous && pos != startPos) {
- foundBreaks.push(pos);
- correctedNumBreaks++;
+ // In phrase breaking, there has to be a breakpoint between Cj character and close
+ // punctuation.
+ // E.g.[携帯電話]正しい選択 -> [携帯▁電話]▁正しい▁選択 -> breakpoint between ] and 正
+ if (pos > previous) {
+ if (pos != startPos
+ || (isPhraseBreaking && pos > 0
+ && fClosePunctuationSet.contains(inText.setIndex(pos - 1)))) {
+ foundBreaks.push(charPositions[t_boundary[i]] + startPos);
+ correctedNumBreaks++;
+ }
}
previous = pos;
}
if (!foundBreaks.isEmpty() && foundBreaks.peek() == endPos) {
- foundBreaks.pop();
- correctedNumBreaks--;
+ // In phrase breaking, there has to be a breakpoint between Cj character and
+ // the number/open punctuation.
+ // E.g. る文字「そうだ、京都」->る▁文字▁「そうだ、▁京都」-> breakpoint between 字 and「
+ // E.g. 乗車率90%程度だろうか -> 乗車▁率▁90%▁程度だ▁ろうか -> breakpoint between 率 and 9
+ if (isPhraseBreaking) {
+ if (!fNumberOrOpenPunctuationSet.contains(inText.setIndex(endPos))) {
+ foundBreaks.pop();
+ correctedNumBreaks--;
+ }
+ } else {
+ foundBreaks.pop();
+ correctedNumBreaks--;
+ }
}
if (!foundBreaks.isEmpty())
inText.setIndex(foundBreaks.peek());
return correctedNumBreaks;
}
+
+    /**
+     * Copies {@code length} code points from {@code text}, beginning at index {@code start},
+     * into {@code sb} and returns the result as a String.
+     *
+     * The buffer is cleared before use. When {@code length} is not positive the iterator is
+     * left untouched and an empty string is returned; otherwise the iterator is repositioned
+     * to {@code start} and advanced by the reads.
+     *
+     * @param text   the iterator to read from (its index is modified when length > 0).
+     * @param sb     scratch buffer that receives the code points; its prior content is dropped.
+     * @param start  the index passed to {@code text.setIndex} before reading.
+     * @param length the number of code points to collect.
+     * @return the collected code points as a String.
+     */
+    private String getPatternFromText(CharacterIterator text, StringBuffer sb, int start,
+            int length) {
+        sb.setLength(0);
+        if (length <= 0) {
+            return sb.toString();
+        }
+        text.setIndex(start);
+        // The first code point is the one at the current position; the rest come from advancing.
+        sb.appendCodePoint(current32(text));
+        int remaining = length - 1;
+        while (remaining-- > 0) {
+            sb.appendCodePoint(next32(text));
+        }
+        return sb.toString();
+    }
}
@Override
public int findBreaks(CharacterIterator text, int startPos, int endPos,
- DequeI foundBreaks) {
+ DequeI foundBreaks, boolean isPhraseBreaking) {
int result = 0;
// Find the span of characters included in the set.
rangeStart = start;
rangeEnd = current;
- result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
+ result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks, isPhraseBreaking);
text.setIndex(current);
return result;
abstract int divideUpDictionaryRange(CharacterIterator text,
int rangeStart,
int rangeEnd,
- DequeI foundBreaks );
+ DequeI foundBreaks,
+ boolean isPhraseBreaking);
}
@Override
public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
- DequeI foundBreaks) {
+ DequeI foundBreaks, boolean isPhraseBreaking) {
if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
return 0; // Not enough characters for word
@Override
public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
- DequeI foundBreaks) {
+ DequeI foundBreaks, boolean isPhraseBreaking) {
int beginSize = foundBreaks.size();
if ((rangeEnd - rangeStart) < MIN_WORD_SPAN) {
* @return the number of breaks found
*/
int findBreaks(CharacterIterator text, int startPos, int endPos,
- DictionaryBreakEngine.DequeI foundBreaks);
+ DictionaryBreakEngine.DequeI foundBreaks, boolean isPhraseBreaking);
}
@Override
public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
- DequeI foundBreaks) {
+ DequeI foundBreaks, boolean isPhraseBreaking) {
if ((rangeEnd - rangeStart) < LAO_MIN_WORD) {
@Override
public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
- DequeI foundBreaks) {
+ DequeI foundBreaks, boolean isPhraseBreaking) {
if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) {
return 0; // Not enough characters for word
@Override
public int findBreaks(CharacterIterator text, int startPos, int endPos,
- DictionaryBreakEngine.DequeI foundBreaks) {
+ DictionaryBreakEngine.DequeI foundBreaks, boolean isPhraseBreaking) {
UnicodeSet uniset = fHandled;
int c = CharacterIteration.current32(text);
// Get the binary rules.
//
ByteBuffer bytes = null;
- String typeKeyExt = null;
+ String typeKeyExt = "";
if (kind == BreakIterator.KIND_LINE) {
- String lbKeyValue = locale.getKeywordValue("lb");
- if ( lbKeyValue != null && (lbKeyValue.equals("strict") || lbKeyValue.equals("normal") || lbKeyValue.equals("loose")) ) {
- typeKeyExt = "_" + lbKeyValue;
+ String keyValue = locale.getKeywordValue("lb");
+ if ( keyValue != null && (keyValue.equals("strict") || keyValue.equals("normal") || keyValue.equals("loose")) ) {
+ typeKeyExt = "_" + keyValue;
+ }
+ String language = locale.getLanguage();
+ if (language != null && language.equals("ja")) {
+ keyValue = locale.getKeywordValue("lw");
+ if (keyValue != null && keyValue.equals("phrase")) {
+ typeKeyExt += "_" + keyValue;
+ }
}
}
+ String brkfname;
try {
- String typeKey = (typeKeyExt == null)? KIND_NAMES[kind]: KIND_NAMES[kind] + typeKeyExt;
- String brkfname = rb.getStringWithFallback("boundaries/" + typeKey);
+ String typeKey = typeKeyExt.isEmpty() ? KIND_NAMES[kind] : KIND_NAMES[kind] + typeKeyExt;
+ brkfname = rb.getStringWithFallback("boundaries/" + typeKey);
String rulesFileName = ICUData.ICU_BRKITR_NAME+ '/' + brkfname;
bytes = ICUBinary.getData(rulesFileName);
}
// Create a normal RuleBasedBreakIterator.
//
try {
- iter = RuleBasedBreakIterator.getInstanceFromCompiledRules(bytes);
+ boolean isPhraseBreaking = (brkfname != null) && brkfname.contains("phrase");
+ iter = RuleBasedBreakIterator.getInstanceFromCompiledRules(bytes, isPhraseBreaking);
}
catch (IOException e) {
// Shouldn't be possible to get here.
return This;
}
+ /**
+ * Create a break iterator from a precompiled set of break rules, recording whether
+ * phrase breaking is required.
+ *
+ * This factory method doesn't have an access modifier; it is only accessible in the same
+ * package. It builds the iterator with the single-argument overload of this method and
+ * then stores {@code phraseBreaking} in the new instance's fPhraseBreaking field.
+ *
+ * Creating a break iterator from the binary rules is much faster than
+ * creating one from source rules.
+ *
+ * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
+ * Binary break iterator rules are not guaranteed to be compatible between
+ * different versions of ICU.
+ *
+ * @param bytes a buffer supplying the compiled binary rules.
+ * @param phraseBreaking a flag indicating if phrase breaking is required.
+ * @return the newly created break iterator, with its phrase-breaking flag set to
+ * {@code phraseBreaking}.
+ * @throws IOException if there is an error while reading the rules from the buffer.
+ * @see #compileRules(String, OutputStream)
+ * @internal
+ */
+ /* package-private */ static RuleBasedBreakIterator getInstanceFromCompiledRules(
+ ByteBuffer bytes, boolean phraseBreaking) throws IOException {
+ RuleBasedBreakIterator instance = getInstanceFromCompiledRules(bytes);
+ instance.fPhraseBreaking = phraseBreaking;
+ return instance;
+ }
+
/**
* Create a break iterator from a precompiled set of break rules.
*
*/
private BreakCache fBreakCache = new BreakCache();
+ /**
+ * Flag used to indicate if phrase breaking is required.
+ */
+ private boolean fPhraseBreaking = false;
+
/**
* Counter for the number of characters encountered with the "dictionary"
// Ask the language object if there are any breaks. It will add them to the cache and
// leave the text pointer on the other side of its range, ready to search for the next one.
if (lbe != null) {
- foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks);
+ foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks, fPhraseBreaking);
}
// Reload the loop variables for the next go-round
version https://git-lfs.github.com/spec/v1
-oid sha256:44951f88294c06e433a3b61238d9bb5f59ba01f091fcfb8fe4966f98f0748ef7
-size 13627084
+oid sha256:65125c8b8176c083a7597fed4c895fa263a185593bda5309753b95e8a5ec0dda
+size 13650605
version https://git-lfs.github.com/spec/v1
-oid sha256:d13d3b8e7c58f0e41e4b6ff6f2bfa43529de382ecf2c1e3944429b1c1a761361
-size 96439
+oid sha256:31a470c8a209305fd98faf5ed0f20bf79cf57cfcb2281041b20d98ad742c7b5e
+size 96440
version https://git-lfs.github.com/spec/v1
-oid sha256:cf33f21346eea88c0282a4960f19f27e475554449f52ef4f25889e2b8a34a1c0
-size 826063
+oid sha256:2c951a44c5d9726ea4532cb840309d8503c380094b7fd0e56b96094187ce0a24
+size 826064
int length = fields[1].length();
CharacterIterator input = new StringCharacterIterator(fields[1]);
DictionaryBreakEngine.DequeI foundBreaks = new DictionaryBreakEngine.DequeI();
- int ret = engine.findBreaks(input, 0, length, foundBreaks);
+ int ret = engine.findBreaks(input, 0, length, foundBreaks, false);
StringBuilder sb = new StringBuilder();
sb.append('{');
for (int i = 0; i < foundBreaks.size(); i++) {
# woman astronaut, woman astronaut / fitz4
<data>•\U0001F469\u200D\U0001F680•\U0001F469\U0001F3FD\u200D\U0001F680\u0020•</data>
+<locale ja@lw=phrase>
+<line>
+#[京都観光]時雨殿に行った。-> [京都•観光]•時雨•殿に•行った。•
+<data>•\uff3b\u4eac\u90fd•\u89b3\u5149\uff3d•\u6642\u96e8•\u6bbf\u306b•\u884c\u3063\u305f\u3002•</data>
+#9月に東京から友達が遊びに来た -> 9月に•東京から•友達が•遊びに•来た•
+<data>•\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f•</data>
+#る文字「そうだ、京都」-> る•文字•「そうだ、•京都」•
+<data>•\u308b•\u6587\u5b57•\u300c\u305d\u3046\u3060\u3001•\u4eac\u90fd\u300d•</data>
+#乗車率90%程度だろうか。 -> 乗車•率•90%•程度だ•ろうか。•
+<data>•\u4e57\u8eca•\u7387•\uff19\uff10\uff05•\u7a0b\u5ea6\u3060•\u308d\u3046\u304b\u3002•</data>
+#[携帯電話]正しい選択 -> [携帯•電話]•正しい•選択•
+<data>•\uff3b\u643a\u5e2f•\u96fb\u8a71\uff3d•\u6b63\u3057\u3044•\u9078\u629e•</data>
+#純金製百人一首にサッカーボール -> 純金•製•百人一首に•サッカーボール•
+<data>•\u7D14\u91D1•\u88FD•\u767E\u4EBA\u4E00\u9996\u306B•\u30B5\u30C3\u30AB\u30FC\u30DC\u30FC\u30EB•</data>
+
####################################################################################
#