}
public BurmeseBreakEngine() throws IOException {
- super(BreakIterator.KIND_WORD, BreakIterator.KIND_LINE);
setCharacters(fBurmeseWordSet);
// Initialize dictionary
fDictionary = DictionaryData.loadDictionaryFor("Mymr");
private DictionaryMatcher fDictionary = null;
public CjkBreakEngine(boolean korean) throws IOException {
- super(BreakIterator.KIND_WORD);
fDictionary = DictionaryData.loadDictionaryFor("Hira");
if (korean) {
setCharacters(fHangulWordSet);
}
UnicodeSet fSet = new UnicodeSet();
- private BitSet fTypes = new BitSet(32);
/**
- * @param breakTypes The types of break iterators that can use this engine.
- * For example, BreakIterator.KIND_LINE
+ * Constructor. Takes no arguments and has an empty body, so it performs no
+ * initialization of its own.
*/
- public DictionaryBreakEngine(Integer... breakTypes) {
- for (Integer type: breakTypes) {
- fTypes.set(type);
- }
+ public DictionaryBreakEngine() {
}
@Override
import com.ibm.icu.lang.UScript;
class KhmerBreakEngine extends DictionaryBreakEngine {
-
+
// Constants for KhmerBreakIterator
// How many words in a row are "good enough"?
private static final byte KHMER_LOOKAHEAD = 3;
private static final byte KHMER_MIN_WORD = 2;
// Minimum number of characters for two words
private static final byte KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
-
-
+
+
private DictionaryMatcher fDictionary;
private static UnicodeSet fKhmerWordSet;
private static UnicodeSet fEndWordSet;
private static UnicodeSet fBeginWordSet;
private static UnicodeSet fMarkSet;
-
+
static {
// Initialize UnicodeSets
fKhmerWordSet = new UnicodeSet();
fMarkSet.compact();
fEndWordSet.compact();
fBeginWordSet.compact();
-
+
// Freeze the static UnicodeSet
fKhmerWordSet.freeze();
fMarkSet.freeze();
fEndWordSet.freeze();
fBeginWordSet.freeze();
}
-
+
/**
 * Creates the Khmer break engine: passes fKhmerWordSet to setCharacters and
 * stores the dictionary returned by DictionaryData.loadDictionaryFor("Khmr")
 * in fDictionary.
 *
 * @throws IOException declared by this constructor; presumably raised when the
 *         dictionary data cannot be loaded (TODO confirm against DictionaryData)
 */
public KhmerBreakEngine() throws IOException {
- super(BreakIterator.KIND_WORD, BreakIterator.KIND_LINE);
setCharacters(fKhmerWordSet);
// Initialize dictionary
fDictionary = DictionaryData.loadDictionaryFor("Khmr");
}
/**
 * Treats every KhmerBreakEngine instance as equal to every other one; no
 * instance state is compared.
 *
 * @param obj the object to compare with
 * @return true if obj is an instance of KhmerBreakEngine; false otherwise
 *         (including when obj is null)
 */
+ @Override
public boolean equals(Object obj) {
// Normally is a singleton, but it's possible to have duplicates
// during initialization. All are equivalent.
return obj instanceof KhmerBreakEngine;
}
/**
 * Returns the hash code of this object's runtime class, so the value does not
 * depend on any instance state.
 * NOTE(review): equals() accepts anything that is instanceof KhmerBreakEngine,
 * so a subclass instance would compare equal yet hash differently; this is
 * harmless only while no subclass exists -- confirm.
 *
 * @return getClass().hashCode()
 */
+ @Override
public int hashCode() {
return getClass().hashCode();
}
-
- public boolean handles(int c, int breakType) {
- if (breakType == BreakIterator.KIND_WORD || breakType == BreakIterator.KIND_LINE) {
- int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
- return (script == UScript.KHMER);
- }
- return false;
+
/**
 * Tells whether this engine handles the given character, based solely on its
 * script property.
 *
 * @param c the character (code point) to test
 * @return true if the UProperty.SCRIPT property value of c equals UScript.KHMER
 */
+ @Override
+ public boolean handles(int c) {
+ int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
+ return (script == UScript.KHMER);
}
- public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
+ @Override
+ public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
DequeI foundBreaks) {
-
+
if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
return 0; // Not enough characters for word
}
// no preceding word, or the non-word shares less than the minimum threshold
// of characters with a dictionary word, then scan to resynchronize
if (words[wordsFound%KHMER_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) <= 0 &&
- (wordLength == 0 ||
+ (wordLength == 0 ||
words[wordsFound%KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
// Look for a plausible word boundary
int remaining = rangeEnd - (current + wordLength);
// Look ahead for possible suffixes if a dictionary word does not follow.
// We do this in code rather than using a rule so that the heuristic
- // resynch continues to function. For example, one of the suffix characters
+ // resynch continues to function. For example, one of the suffix characters
// could be a typo in the middle of a word.
// NOT CURRENTLY APPLICABLE TO KHMER
}
public LaoBreakEngine() throws IOException {
- super(BreakIterator.KIND_WORD, BreakIterator.KIND_LINE);
setCharacters(fLaoWordSet);
// Initialize dictionary
fDictionary = DictionaryData.loadDictionaryFor("Laoo");
import com.ibm.icu.lang.UScript;
class ThaiBreakEngine extends DictionaryBreakEngine {
-
+
// Constants for ThaiBreakIterator
// How many words in a row are "good enough"?
private static final byte THAI_LOOKAHEAD = 3;
private static final byte THAI_MIN_WORD = 2;
// Minimum number of characters for two words
private static final byte THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
-
+
private DictionaryMatcher fDictionary;
private static UnicodeSet fThaiWordSet;
private static UnicodeSet fEndWordSet;
private static UnicodeSet fBeginWordSet;
private static UnicodeSet fSuffixSet;
private static UnicodeSet fMarkSet;
-
+
static {
// Initialize UnicodeSets
fThaiWordSet = new UnicodeSet();
fEndWordSet.compact();
fBeginWordSet.compact();
fSuffixSet.compact();
-
+
// Freeze the static UnicodeSet
fThaiWordSet.freeze();
fMarkSet.freeze();
fBeginWordSet.freeze();
fSuffixSet.freeze();
}
-
+
/**
 * Creates the Thai break engine: passes fThaiWordSet to setCharacters and
 * stores the dictionary returned by DictionaryData.loadDictionaryFor("Thai")
 * in fDictionary.
 *
 * @throws IOException declared by this constructor; presumably raised when the
 *         dictionary data cannot be loaded (TODO confirm against DictionaryData)
 */
public ThaiBreakEngine() throws IOException {
- super(BreakIterator.KIND_WORD, BreakIterator.KIND_LINE);
setCharacters(fThaiWordSet);
// Initialize dictionary
fDictionary = DictionaryData.loadDictionaryFor("Thai");
}
-
+
/**
 * Treats every ThaiBreakEngine instance as equal to every other one; no
 * instance state is compared.
 *
 * @param obj the object to compare with
 * @return true if obj is an instance of ThaiBreakEngine; false otherwise
 *         (including when obj is null)
 */
+ @Override
public boolean equals(Object obj) {
// Normally is a singleton, but it's possible to have duplicates
// during initialization. All are equivalent.
return obj instanceof ThaiBreakEngine;
}
/**
 * Returns the hash code of this object's runtime class, so the value does not
 * depend on any instance state.
 * NOTE(review): equals() accepts anything that is instanceof ThaiBreakEngine,
 * so a subclass instance would compare equal yet hash differently; this is
 * harmless only while no subclass exists -- confirm.
 *
 * @return getClass().hashCode()
 */
+ @Override
public int hashCode() {
return getClass().hashCode();
}
-
- public boolean handles(int c, int breakType) {
- if (breakType == BreakIterator.KIND_WORD || breakType == BreakIterator.KIND_LINE) {
- int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
- return (script == UScript.THAI);
- }
- return false;
+
/**
 * Tells whether this engine handles the given character, based solely on its
 * script property.
 *
 * @param c the character (code point) to test
 * @return true if the UProperty.SCRIPT property value of c equals UScript.THAI
 */
+ @Override
+ public boolean handles(int c) {
+ int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
+ return (script == UScript.THAI);
}
+ @Override
public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
DequeI foundBreaks) {
for (int i = 0; i < THAI_LOOKAHEAD; i++) {
words[i] = new PossibleWord();
}
-
+
int uc;
fIter.setIndex(rangeStart);
int current;
}
} while (words[(wordsFound+1)%THAI_LOOKAHEAD].backUp(fIter));
}
- }
+ }
while (words[wordsFound%THAI_LOOKAHEAD].backUp(fIter));
// foundBest: end of loop
}
// no preceding word, or the non-word shares less than the minimum threshold
// of characters with a dictionary word, then scan to resynchronize
if (words[wordsFound%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) <= 0 &&
- (wordLength == 0 ||
+ (wordLength == 0 ||
words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
// Look for a plausible word boundary
int remaining = rangeEnd - (current + wordLength);
// Look ahead for possible suffixes if a dictionary word does not follow.
// We do this in code rather than using a rule so that the heuristic
- // resynch continues to function. For example, one of the suffix characters
+ // resynch continues to function. For example, one of the suffix characters
// could be a typo in the middle of a word.
if (fIter.getIndex() < rangeEnd && wordLength > 0) {
if (words[wordsFound%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) <= 0 &&