<include name="**/pnames.icu"/>
<include name="**/*.res"/>
<include name="**/*.brk"/>
- <include name="**/*.ctd"/>
+ <include name="**/*.dict"/>
<include name="**/*.nrm"/>
<exclude name="**/coll/*.res"/>
<exclude name="**/translit/*.res"/>
<include name="**/unames.icu"/>
<include name="**/pnames.icu"/>
<include name="**/*.brk"/>
- <include name="**/*.ctd"/>
+ <include name="**/*.dict"/>
<include name="**/*.nrm"/>
<include name="**/brkitr/*.res"/>
<include name="**/translit/*.res"/>
--- /dev/null
+ *******************************************************************************
+ * Copyright (C) 2012, International Business Machines Corporation and *
+ * others. All Rights Reserved. *
+ *******************************************************************************
+ */
+package com.ibm.icu.impl;
+import java.text.CharacterIterator;
+import com.ibm.icu.text.UTF16;
+public final class CharacterIteration {
+ // disallow instantiation
+ private CharacterIteration() { }
+    /**
+     * 32-bit code point value returned when an iterator has run out of range.
+     * It is a positive value so that the fast case (not at the end, not a
+     * surrogate) can be checked with a single comparison.
+     * Declared final: it is a sentinel and must never be reassigned.
+     */
+    public static final int DONE32 = 0x7fffffff;
+ /**
+ * Move the iterator forward to the next code point, and return that code point,
+ * leaving the iterator positioned at char returned.
+ * For Supplementary chars, the iterator is left positioned at the lead surrogate.
+ * @param ci The character iterator
+ * @return The next code point.
+ */
+ public static int next32(CharacterIterator ci) {
+ // If the current position is at a surrogate pair, move to the trail surrogate
+ // which leaves it in position for the underlying iterator's next() to work.
+ int c= ci.current();
+ c = ci.next();
+ c = ci.previous();
+ }
+ }
+ // For BMP chars, this next() is the real deal.
+ c = ci.next();
+ // If we might have a lead surrogate, we need to peek ahead to get the trail
+ // even though we don't want to really be positioned there.
+ c = nextTrail32(ci, c);
+ }
+ if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c != DONE32) {
+ // We got a supplementary char. Back the iterator up to the position
+ // of the lead surrogate.
+ ci.previous();
+ }
+ return c;
+ }
+ // Out-of-line portion of the in-line Next32 code.
+ // The call site does an initial ci.next() and calls this function
+ // if the 16 bit value it gets is >= LEAD_SURROGATE_MIN_VALUE.
+ // NOTE: we leave the underlying char iterator positioned in the
+ // middle of a surrogate pair. ci.next() will work correctly
+ // from there, but the ci.getIndex() will be wrong, and needs
+ // adjustment.
+ public static int nextTrail32(CharacterIterator ci, int lead) {
+ int retVal = lead;
+ char cTrail = ci.next();
+ if (UTF16.isTrailSurrogate(cTrail)) {
+ retVal = ((lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +
+ } else {
+ ci.previous();
+ }
+ } else {
+ if (lead == CharacterIterator.DONE && ci.getIndex() >= ci.getEndIndex()) {
+ retVal = DONE32;
+ }
+ }
+ return retVal;
+ }
+ public static int previous32(CharacterIterator ci) {
+ if (ci.getIndex() <= ci.getBeginIndex()) {
+ return DONE32;
+ }
+ char trail = ci.previous();
+ int retVal = trail;
+ if (UTF16.isTrailSurrogate(trail) && ci.getIndex()>ci.getBeginIndex()) {
+ char lead = ci.previous();
+ if (UTF16.isLeadSurrogate(lead)) {
+ retVal = (((int)lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +
+ ((int)trail - UTF16.TRAIL_SURROGATE_MIN_VALUE) +
+ } else {
+ ci.next();
+ }
+ }
+ return retVal;
+ }
+ public static int current32(CharacterIterator ci) {
+ char lead = ci.current();
+ int retVal = lead;
+ return retVal;
+ }
+ if (UTF16.isLeadSurrogate(lead)) {
+ int trail = (int)ci.next();
+ ci.previous();
+ if (UTF16.isTrailSurrogate((char)trail)) {
+ retVal = ((lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +
+ }
+ } else {
+ if (lead == CharacterIterator.DONE) {
+ if (ci.getIndex() >= ci.getEndIndex()) {
+ retVal = DONE32;
+ }
+ }
+ }
+ return retVal;
+ }
BreakIteratorCache cache = new BreakIteratorCache(where, result);
iterCache[kind] = new SoftReference<BreakIteratorCache>(cache);
+ if (result instanceof RuleBasedBreakIterator) {
+ RuleBasedBreakIterator rbbi = (RuleBasedBreakIterator)result;
+ rbbi.setBreakType(kind);
+ }
return result;
- * Copyright (C) 2002-2010, International Business Machines Corporation and *
+ * Copyright (C) 2002-2012, International Business Machines Corporation and *
* others. All Rights Reserved. *
* pre-compiled break rules. The resource bundle name is "boundaries".
* The value for each key will be the rules to be used for the
* specified locale - "word" -> "word_th" for Thai, for example.
- * DICTIONARY_POSSIBLE indexes in the same way, and indicates whether a
- * dictionary is a possibility for that type of break. This is just
- * an optimization to avoid a resource lookup where no dictionary is
- * ever possible.
private static final String[] KIND_NAMES = {
"grapheme", "word", "line", "sentence", "title"
- };
- private static final boolean[] DICTIONARY_POSSIBLE = {
- false, true, true, false, false
private static BreakIterator createBreakInstance(ULocale locale, int kind) {
- BreakIterator iter = null;
- ICUResourceBundle rb = (ICUResourceBundle)UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_BRKITR_BASE_NAME, locale);
+ RuleBasedBreakIterator iter = null;
+ ICUResourceBundle rb = (ICUResourceBundle)UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_BRKITR_BASE_NAME, locale);
- // Get the binary rules. These are needed for both normal RulesBasedBreakIterators
- // and for Dictionary iterators.
- //
+ // Get the binary rules.
+ //
InputStream ruleStream = null;
try {
String typeKey = KIND_NAMES[kind];
catch (Exception e) {
throw new MissingResourceException(e.toString(),"","");
- // Check whether a dictionary exists, and create a DBBI iterator is
- // one does.
+ // Create a normal RuleBasedBreakIterator.
- // This type of break iterator could potentially use a dictionary.
- //
- try {
- if (locale.getLanguage().equals("th")){
- // If the language is Thai, load the thai compact trie dictionary.
- String dictType = "Thai";
- String dictFileName = rb.getStringWithFallback("dictionaries/" + dictType);
- dictFileName = ICUResourceBundle.ICU_BUNDLE +ICUResourceBundle.ICU_BRKITR_NAME+ "/" + dictFileName;
- InputStream is = ICUData.getStream(dictFileName);
- iter = new ThaiBreakIterator(ruleStream, is);
- }
- } catch (MissingResourceException e) {
- // Couldn't find a dictionary.
- // This is normal, and will occur whenever creating a word or line
- // break iterator for a locale that does not have a BreakDictionaryData
- // resource - meaning for all but Thai.
- // Fall through to creating a normal RulebasedBreakIterator.
- } catch (IOException e) {
- Assert.fail(e);
- }
- }
- if (iter == null) {
- //
- // Create a normal RuleBasedBreakIterator.
- // We have determined that this is not supposed to be a dictionary iterator.
- //
- try {
- iter = RuleBasedBreakIterator.getInstanceFromCompiledRules(ruleStream);
- }
- catch (IOException e) {
- // Shouldn't be possible to get here.
- // If it happens, the compiled rules are probably corrupted in some way.
- Assert.fail(e);
- }
+ try {
+ iter = RuleBasedBreakIterator.getInstanceFromCompiledRules(ruleStream);
+ }
+ catch (IOException e) {
+ // Shouldn't be possible to get here.
+ // If it happens, the compiled rules are probably corrupted in some way.
+ Assert.fail(e);
// TODO: Determine valid and actual locale correctly.
ULocale uloc = ULocale.forLocale(rb.getLocale());
iter.setLocale(uloc, uloc);
+ iter.setBreakType(kind);
return iter;
--- /dev/null
+ *******************************************************************************
+ * Copyright (C) 2012, International Business Machines Corporation and *
+ * others. All Rights Reserved. *
+ *******************************************************************************
+ */
+package com.ibm.icu.text;
+import java.text.CharacterIterator;
+import com.ibm.icu.impl.Assert;
+import com.ibm.icu.util.BytesTrie;
+import com.ibm.icu.util.BytesTrie.Result;
+class BytesDictionaryMatcher extends DictionaryMatcher {
+    /** Byte array handed to BytesTrie each time matches() runs. */
+    private final byte[] characters;
+    /** Complete transform constant (type bits and offset bits together). */
+    private final int transform;
+
+    /**
+     * Creates a matcher over byte-trie dictionary data.
+     *
+     * @param chars the byte array containing the trie
+     * @param transform the transform constant; its type bits must be
+     *        DictionaryData.TRANSFORM_TYPE_OFFSET
+     */
+    public BytesDictionaryMatcher(byte[] chars, int transform) {
+        final int transformType = transform & DictionaryData.TRANSFORM_TYPE_MASK;
+        Assert.assrt(transformType == DictionaryData.TRANSFORM_TYPE_OFFSET);
+        // Only one transform type exists so far. The whole constant is stored so
+        // that supporting another type later only means touching transform() and
+        // the assertion above, not adding a separate "transform type" field.
+        this.characters = chars;
+        this.transform = transform;
+    }
+    /**
+     * Maps a code point to the byte value used as a key unit in the trie.
+     * U+200D and U+200C get the reserved values 0xFF and 0xFE; every other code
+     * point is expressed as its distance from the offset stored in the
+     * transform constant.
+     *
+     * @param c the code point to map
+     * @return a value in 0..0xFF, or -1 when the code point falls outside the
+     *         0..0xFD window that starts at the offset
+     */
+    private int transform(int c) {
+        switch (c) {
+        case 0x200D:
+            return 0xFF;
+        case 0x200C:
+            return 0xFE;
+        default:
+            break;
+        }
+        final int base = transform & DictionaryData.TRANSFORM_OFFSET_MASK;
+        final int offsetFromBase = c - base;
+        return (offsetFromBase >= 0 && offsetFromBase <= 0xFD) ? offsetFromBase : -1;
+    }
+    /**
+     * Walks the byte trie along the text, starting at the iterator's current
+     * position, and records every dictionary entry that is a prefix of the text.
+     *
+     * @param text_ the text to match; reading starts at its current position
+     * @param maxLength the maximum number of code points to examine
+     * @param lengths receives, for each recorded match, its length in code points
+     * @param count_ output parameter; count_[0] receives the number of matches recorded
+     * @param limit the maximum number of matches to record
+     * @param values if not null, receives the trie value of each recorded match
+     * @return the number of code points consumed from the text, or 0 when the
+     *         text was already exhausted
+     */
+    public int matches(CharacterIterator text_, int maxLength, int[] lengths, int[] count_, int limit, int[] values) {
+        UCharacterIterator text = UCharacterIterator.getInstance(text_);
+        BytesTrie bt = new BytesTrie(characters, 0);
+        int c = text.nextCodePoint();
+        if (c == UCharacterIterator.DONE) {
+            // The end-of-text sentinel is not a character: it must not be run
+            // through transform() and handed to the trie as a key unit.
+            count_[0] = 0;
+            return 0;
+        }
+        Result result = bt.first(transform(c));
+        // TODO: should numChars count Character.charCount() ?
+        int numChars = 1;
+        int count = 0;
+        for (;;) {
+            if (result.hasValue()) {
+                if (count < limit) {
+                    if (values != null) {
+                        values[count] = bt.getValue();
+                    }
+                    lengths[count] = numChars;
+                    count++;
+                }
+                if (result == Result.FINAL_VALUE) {
+                    // No longer entry can exist below this node.
+                    break;
+                }
+            } else if (result == Result.NO_MATCH) {
+                break;
+            }
+            if (numChars >= maxLength) {
+                break;
+            }
+            c = text.nextCodePoint();
+            if (c == UCharacterIterator.DONE) {
+                // Ran out of text before the trie did; stop without counting
+                // the sentinel as a consumed character.
+                break;
+            }
+            ++numChars;
+            result = bt.next(transform(c));
+        }
+        count_[0] = count;
+        return numChars;
+    }
+    /**
+     * Reports which kind of trie backs this matcher.
+     *
+     * @return DictionaryData.TRIE_TYPE_BYTES
+     */
+    public int getType() {
+        final int trieType = DictionaryData.TRIE_TYPE_BYTES;
+        return trieType;
+    }
--- /dev/null
+ *******************************************************************************
+ * Copyright (C) 2012, International Business Machines Corporation and *
+ * others. All Rights Reserved. *
+ *******************************************************************************
+ */
+package com.ibm.icu.text;
+import java.text.CharacterIterator;
+import com.ibm.icu.util.BytesTrie.Result;
+import com.ibm.icu.util.CharsTrie;
+class CharsDictionaryMatcher extends DictionaryMatcher {
+    /**
+     * Character sequence handed to CharsTrie each time matches() runs.
+     * Assigned once in the constructor, hence final (as in BytesDictionaryMatcher).
+     */
+    private final CharSequence characters;
+
+    /**
+     * Creates a matcher over char-trie dictionary data.
+     *
+     * @param chars the character sequence containing the trie
+     */
+    public CharsDictionaryMatcher(CharSequence chars) {
+        characters = chars;
+    }
+    /**
+     * Walks the char trie along the text, starting at the iterator's current
+     * position, and records every dictionary entry that is a prefix of the text.
+     *
+     * @param text_ the text to match; reading starts at its current position
+     * @param maxLength the maximum number of code points to examine
+     * @param lengths receives, for each recorded match, its length in code points
+     * @param count_ output parameter; count_[0] receives the number of matches recorded
+     * @param limit the maximum number of matches to record
+     * @param values if not null, receives the trie value of each recorded match
+     * @return the number of code points consumed from the text, or 0 when the
+     *         text was already exhausted
+     */
+    public int matches(CharacterIterator text_, int maxLength, int[] lengths, int[] count_, int limit, int[] values) {
+        UCharacterIterator text = UCharacterIterator.getInstance(text_);
+        CharsTrie uct = new CharsTrie(characters, 0);
+        int c = text.nextCodePoint();
+        if (c == UCharacterIterator.DONE) {
+            // The end-of-text sentinel is not a code point and must not be
+            // looked up in the trie.
+            count_[0] = 0;
+            return 0;
+        }
+        Result result = uct.firstForCodePoint(c);
+        // TODO: should numChars count Character.charCount?
+        int numChars = 1;
+        int count = 0;
+        for (;;) {
+            if (result.hasValue()) {
+                if (count < limit) {
+                    if (values != null) {
+                        values[count] = uct.getValue();
+                    }
+                    lengths[count] = numChars;
+                    count++;
+                }
+                if (result == Result.FINAL_VALUE) {
+                    // No longer entry can exist below this node.
+                    break;
+                }
+            } else if (result == Result.NO_MATCH) {
+                break;
+            }
+            if (numChars >= maxLength) {
+                break;
+            }
+            c = text.nextCodePoint();
+            if (c == UCharacterIterator.DONE) {
+                // Ran out of text before the trie did; stop without counting
+                // the sentinel as a consumed character.
+                break;
+            }
+            ++numChars;
+            result = uct.nextForCodePoint(c);
+        }
+        count_[0] = count;
+        return numChars;
+    }
+    /**
+     * Reports which kind of trie backs this matcher.
+     *
+     * @return DictionaryData.TRIE_TYPE_UCHARS
+     */
+    public int getType() {
+        final int trieType = DictionaryData.TRIE_TYPE_UCHARS;
+        return trieType;
+    }
--- /dev/null
+ *******************************************************************************
+ * Copyright (C) 2012, International Business Machines Corporation and *
+ * others. All Rights Reserved. *
+ *******************************************************************************
+ */
+package com.ibm.icu.text;
+import java.io.IOException;
+import java.text.CharacterIterator;
+import java.util.Stack;
+import com.ibm.icu.impl.Assert;
+import static com.ibm.icu.impl.CharacterIteration.*;
+public class CjkBreakEngine implements LanguageBreakEngine {
+    // Shared character classes. Each one is populated from a pattern and then
+    // frozen in the static initializer below.
+    private static final UnicodeSet fHangulWordSet = new UnicodeSet();
+    private static final UnicodeSet fHanWordSet = new UnicodeSet();
+    private static final UnicodeSet fKatakanaWordSet = new UnicodeSet();
+    private static final UnicodeSet fHiraganaWordSet = new UnicodeSet();
+    static {
+        // The range U+AC00..U+D7A3.
+        fHangulWordSet.applyPattern("[\\uac00-\\ud7a3]");
+        fHangulWordSet.freeze();
+
+        fHanWordSet.applyPattern("[:Han:]");
+        fHanWordSet.freeze();
+
+        // The Katakana property plus U+FF9E and U+FF9F.
+        fKatakanaWordSet.applyPattern("[[:Katakana:]\\uff9e\\uff9f]");
+        fKatakanaWordSet.freeze();
+
+        fHiraganaWordSet.applyPattern("[:Hiragana:]");
+        fHiraganaWordSet.freeze();
+    }
+    /** Characters for which handles() answers true (for word breaking). */
+    private final UnicodeSet fWordSet;
+    /** Dictionary consulted by findBreaks(); loaded once in the constructor. */
+    private final DictionaryMatcher fDictionary;
+
+    /**
+     * Creates a break engine for either Korean or Chinese/Japanese text.
+     * The "Hira" dictionary is loaded in both cases.
+     *
+     * @param korean true to handle only the Hangul set; false to handle Han,
+     *        Katakana and Hiragana plus the two prolonged sound marks
+     * @throws IOException if the dictionary data cannot be loaded
+     */
+    public CjkBreakEngine(boolean korean) throws IOException {
+        fDictionary = DictionaryData.loadDictionaryFor("Hira");
+        if (korean) {
+            fWordSet = fHangulWordSet;
+        } else {
+            UnicodeSet cjSet = new UnicodeSet();
+            cjSet.addAll(fHanWordSet);
+            cjSet.addAll(fKatakanaWordSet);
+            cjSet.addAll(fHiraganaWordSet);
+            // Add the two code points individually. The previous call,
+            // add("\\uff70\\u30fc"), passed a Java string whose backslashes are
+            // escaped, so it added the literal 12-character text as a single
+            // multi-character string element and neither code point ended up
+            // in the set.
+            cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
+            cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
+            fWordSet = cjSet;
+        }
+    }
+    /**
+     * Tells whether this engine is responsible for a character.
+     *
+     * @param c the code point in question
+     * @param breakType the kind of break iteration being performed
+     * @return true only for word breaking and only if the code point is in
+     *         this engine's word set
+     */
+    public boolean handles(int c, int breakType) {
+        if (breakType != BreakIterator.KIND_WORD) {
+            return false;
+        }
+        return fWordSet.contains(c);
+    }
+    // Longest Katakana run that has its own entry in the cost table.
+    private static final int kMaxKatakanaLength = 8;
+    // Katakana runs reaching this length are not proposed as candidate words.
+    private static final int kMaxKatakanaGroupLength = 20;
+    // Cost given to a single character that has no one-character dictionary match.
+    private static final int maxSnlp = 255;
+    // Marks a position that no segmentation has reached yet.
+    private static final int kint32max = Integer.MAX_VALUE;
+
+    /**
+     * Returns the default cost of treating a Katakana run as one word.
+     *
+     * @param wordlength length of the run
+     * @return the table entry for that length, or 8192 when the run is longer
+     *         than kMaxKatakanaLength
+     */
+    private static int getKatakanaCost(int wordlength) {
+        if (wordlength > kMaxKatakanaLength) {
+            return 8192;
+        }
+        final int[] costByLength = { 8192, 984, 408, 240, 204, 252, 300, 372, 480 };
+        return costByLength[wordlength];
+    }
+    /**
+     * Tests whether a code point counts as Katakana for the run heuristic in
+     * findBreaks(): U+30A1..U+30FE except U+30FB, or U+FF66..U+FF9F.
+     *
+     * @param value the code point to test
+     * @return true if the code point lies in one of those ranges
+     */
+    private static boolean isKatakana(int value) {
+        if (value >= 0xFF66 && value <= 0xFF9F) {
+            return true;
+        }
+        if (value < 0x30A1 || value > 0x30FE) {
+            return false;
+        }
+        return value != 0x30FB;
+    }
+    /**
+     * Divides the range [startPos, endPos) of the text into words and pushes the
+     * boundaries found onto foundBreaks. The range is copied out, NFKC-normalized
+     * if necessary, and segmented by dynamic programming over dictionary match
+     * costs; a default cost is additionally applied to runs of Katakana.
+     *
+     * @param inText the text being analyzed
+     * @param startPos start of the range, as an index into inText
+     * @param endPos end of the range (exclusive), as an index into inText
+     * @param reverse not used by this method
+     * @param breakType not used by this method
+     * @param foundBreaks stack that receives the boundary positions, expressed as
+     *        indices into inText
+     * @return always 0
+     */
+    public int findBreaks(CharacterIterator inText, int startPos, int endPos,
+            boolean reverse, int breakType, Stack<Integer> foundBreaks) {
+        if (startPos >= endPos) {
+            return 0;
+        }
+
+        // Copy the range out of the iterator so that it can be normalized.
+        inText.setIndex(startPos);
+        StringBuilder s = new StringBuilder(endPos - startPos);
+        while (inText.getIndex() < endPos) {
+            s.append(inText.current());
+            inText.next();
+        }
+        String prenormstr = s.toString();
+        boolean isNormalized = Normalizer.quickCheck(prenormstr, Normalizer.NFKC) == Normalizer.YES ||
+                Normalizer.isNormalized(prenormstr, Normalizer.NFKC, 0);
+
+        // 'text' is what the segmentation loop reads. charPositions[k] is the
+        // offset, relative to startPos, of the k-th character boundary in the
+        // original (un-normalized) range.
+        CharacterIterator text;
+        int[] charPositions;
+        int numChars = 0;
+        if (isNormalized) {
+            // Iterate over the extracted range rather than over inText itself:
+            // the loop below positions the iterator with indices that start at 0
+            // for the beginning of the range, which would address the wrong
+            // characters of inText whenever startPos is not 0.
+            text = new java.text.StringCharacterIterator(prenormstr);
+            charPositions = new int[prenormstr.length() + 1];
+            int index = 0;
+            charPositions[0] = 0;
+            while (index < prenormstr.length()) {
+                int codepoint = prenormstr.codePointAt(index);
+                index += Character.charCount(codepoint);
+                numChars++;
+                charPositions[numChars] = index;
+            }
+        } else {
+            String normStr = Normalizer.normalize(prenormstr, Normalizer.NFKC);
+            text = new java.text.StringCharacterIterator(normStr);
+            charPositions = new int[normStr.length() + 1];
+            Normalizer normalizer = new Normalizer(prenormstr, Normalizer.NFKC, 0);
+            int index = 0;
+            charPositions[0] = 0;
+            while (index < normalizer.endIndex()) {
+                normalizer.next();
+                numChars++;
+                index = normalizer.getIndex();
+                charPositions[numChars] = index;
+            }
+        }
+
+        // From here on out, do the algorithm. Note that our indices
+        // refer to indices within the normalized string.
+        // bestSnlp[k] is the lowest total cost found so far for segmenting the
+        // first k characters; kint32max means "not reached".
+        int[] bestSnlp = new int[numChars + 1];
+        bestSnlp[0] = 0;
+        for (int i = 1; i <= numChars; i++) {
+            bestSnlp[i] = kint32max;
+        }
+
+        // prev[k] is the start of the last word in the best segmentation ending at k.
+        int[] prev = new int[numChars + 1];
+        for (int i = 0; i <= numChars; i++) {
+            prev[i] = -1;
+        }
+
+        final int maxWordSize = 20;
+        int values[] = new int[numChars];
+        int lengths[] = new int[numChars];
+
+        // dynamic programming to find the best segmentation
+        boolean is_prev_katakana = false;
+        for (int i = 0; i < numChars; i++) {
+            text.setIndex(i);
+            if (bestSnlp[i] == kint32max) {
+                continue;
+            }
+
+            int maxSearchLength = (i + maxWordSize < numChars) ? maxWordSize : (numChars - i);
+            int[] count_ = new int[1];
+            fDictionary.matches(text, maxSearchLength, lengths, count_, maxSearchLength, values);
+            int count = count_[0];
+
+            // if there are no single character matches found in the dictionary
+            // starting with this character, treat character as a 1-character word
+            // with the highest value possible (i.e. the least likely to occur).
+            // Exclude Korean characters from this treatment, as they should be
+            // left together by default.
+            if ((count == 0 || lengths[0] != 1) && current32(text) != DONE32 && !fHangulWordSet.contains(current32(text))) {
+                values[count] = maxSnlp;
+                lengths[count] = 1;
+                count++;
+            }
+
+            for (int j = 0; j < count; j++) {
+                int newSnlp = bestSnlp[i] + values[j];
+                if (newSnlp < bestSnlp[lengths[j] + i]) {
+                    bestSnlp[lengths[j] + i] = newSnlp;
+                    prev[lengths[j] + i] = i;
+                }
+            }
+
+            // In Japanese, single-character Katakana words are pretty rare.
+            // So we apply the following heuristic to Katakana: any continuous
+            // run of Katakana characters is considered a candidate word with
+            // a default cost specified in the katakanaCost table according
+            // to its length.
+            text.setIndex(i);
+            boolean is_katakana = isKatakana(current32(text));
+            if (!is_prev_katakana && is_katakana) {
+                int j = i + 1;
+                next32(text);
+                while (j < numChars && (j - i) < kMaxKatakanaGroupLength && isKatakana(current32(text))) {
+                    next32(text);
+                    ++j;
+                }
+
+                if ((j - i) < kMaxKatakanaGroupLength) {
+                    int newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
+                    if (newSnlp < bestSnlp[j]) {
+                        bestSnlp[j] = newSnlp;
+                        prev[j] = i;
+                    }
+                }
+            }
+            is_prev_katakana = is_katakana;
+        }
+
+        // Walk the prev[] chain backwards from the end to collect the boundaries
+        // (in reverse order) of the best segmentation.
+        int t_boundary[] = new int[numChars + 1];
+        int numBreaks = 0;
+        if (bestSnlp[numChars] == kint32max) {
+            // The end was never reached: report only the end of the range.
+            t_boundary[numBreaks] = numChars;
+            numBreaks++;
+        } else {
+            for (int i = numChars; i > 0; i = prev[i]) {
+                t_boundary[numBreaks] = i;
+                numBreaks++;
+            }
+            Assert.assrt(prev[t_boundary[numBreaks - 1]] == 0);
+        }
+
+        if (foundBreaks.size() == 0 || foundBreaks.peek() < startPos) {
+            t_boundary[numBreaks++] = 0;
+        }
+
+        // Map the boundaries back to positions in inText and push them in
+        // ascending order, skipping startPos itself and anything already present.
+        for (int i = numBreaks - 1; i >= 0; i--) {
+            int pos = charPositions[t_boundary[i]] + startPos;
+            if (!(foundBreaks.contains(pos) || pos == startPos)) {
+                foundBreaks.push(pos);
+            }
+        }
+
+        // The end of the range is not reported as a break found by this method.
+        if (!foundBreaks.empty() && foundBreaks.peek() == endPos) {
+            foundBreaks.pop();
+        }
+        if (!foundBreaks.empty()) {
+            inText.setIndex(foundBreaks.peek());
+        }
+        return 0;
+    }
+++ /dev/null
- *******************************************************************************
- * Copyright (C) 1996-2010, International Business Machines Corporation and *
- * others. All Rights Reserved. *
- *******************************************************************************
- */
-package com.ibm.icu.text;
-import java.io.IOException;
-import java.io.InputStream;
-import java.text.CharacterIterator;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Stack;
-import com.ibm.icu.impl.Assert;
- * A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary
- * to further subdivide ranges of text beyond what is possible using just the
- * state-table-based algorithm. This is necessary, for example, to handle
- * word and line breaking in Thai, which doesn't use spaces between words. The
- * state-table-based algorithm used by RuleBasedBreakIterator_Old is used to divide
- * up text as far as possible, and then contiguous ranges of letters are
- * repeatedly compared against a list of known words (i.e., the dictionary)
- * to divide them up into words.
- *
- * DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator_Old,
- * but adds one more special substitution name: _dictionary_. This substitution
- * name is used to identify characters in words in the dictionary. The idea is that
- * if the iterator passes over a chunk of text that includes two or more characters
- * in a row that are included in _dictionary_, it goes back through that range and
- * derives additional break positions (if possible) using the dictionary.
- *
- * DictionaryBasedBreakIterator is also constructed with the filename of a dictionary
- * file. It uses Class.getResource() to locate the dictionary file. The
- * dictionary file is in a serialized binary format. We have a very primitive (and
- * slow) BuildDictionaryFile utility for creating dictionary files, but aren't
- * currently making it public. Contact us for help.
- *
- * @stable ICU 2.0
- */
-public class DictionaryBasedBreakIterator extends RuleBasedBreakIterator {
- /**
- * Keeps track of if we are using the compact trie dictionary.
- */
- private boolean usingCTDictionary = false;
- /**
- * a list of known words that is used to divide up contiguous ranges of letters,
- * stored in a compressed, indexed, format that offers fast access
- */
- private BreakDictionary dictionary;
- /*
- * a list of flags indicating which character categories are contained in
- * the dictionary file (this is used to determine which ranges of characters
- * to apply the dictionary to)
- */
- //private boolean[] categoryFlags;
- /**
- * when a range of characters is divided up using the dictionary, the break
- * positions that are discovered are stored here, preventing us from having
- * to use either the dictionary or the state table again until the iterator
- * leaves this range of text
- */
- int[] cachedBreakPositions;
- /**
- * if cachedBreakPositions is not null, this indicates which item in the
- * cache the current iteration position refers to
- */
- int positionInCache;
- /**
- * Special variable name for characters in words in dictionary
- */
- /**
- * Construct a DictionarBasedBreakIterator from precompiled rules. Use by ThaiBreakEngine
- * uses the BreakCTDictionary.
- * @param compiledRules an input stream containing the binary (flattened) compiled rules.
- * @internal
- * @deprecated This API is ICU internal only.
- */
- protected DictionaryBasedBreakIterator(InputStream compiledRules) throws IOException {
- fRData = RBBIDataWrapper.get(compiledRules); // Init the RBBI part of this iterator.
- dictionary = null;
- usingCTDictionary = true;
- }
- /**
- * Constructs a DictionaryBasedBreakIterator.
- * @param rules Same as the rules parameter on RuleBasedBreakIterator,
- * except for the special meaning of "_dictionary_". This parameter is just
- * passed through to RuleBasedBreakIterator constructor.
- * @param dictionaryStream the stream containing the dictionary data
- * @stable ICU 2.0
- */
- public DictionaryBasedBreakIterator(String rules,
- InputStream dictionaryStream) throws IOException {
- super(rules);
- dictionary = new BreakDictionary(dictionaryStream);
- }
- /**
- * Construct a DictionarBasedBreakIterator from precompiled rules.
- * @param compiledRules an input stream containing the binary (flattened) compiled rules.
- * @param dictionaryStream an input stream containing the dictionary data
- * @internal
- * @deprecated This API is ICU internal only.
- */
- public DictionaryBasedBreakIterator(InputStream compiledRules,
- InputStream dictionaryStream) throws IOException {
- fRData = RBBIDataWrapper.get(compiledRules); // Init the RBBI part of this iterator.
- dictionary = new BreakDictionary(dictionaryStream);
- }
- /** @stable ICU 2.0 */
- public void setText(CharacterIterator newText) {
- super.setText(newText);
- cachedBreakPositions = null;
- fDictionaryCharCount = 0;
- positionInCache = 0;
- }
- /**
- * Sets the current iteration position to the beginning of the text.
- * (i.e., the CharacterIterator's starting offset).
- * @return The offset of the beginning of the text.
- * @stable ICU 2.0
- */
- public int first() {
- cachedBreakPositions = null;
- fDictionaryCharCount = 0;
- positionInCache = 0;
- return super.first();
- }
- /**
- * Sets the current iteration position to the end of the text.
- * (i.e., the CharacterIterator's ending offset).
- * @return The text's past-the-end offset.
- * @stable ICU 2.0
- */
- public int last() {
- cachedBreakPositions = null;
- fDictionaryCharCount = 0;
- positionInCache = 0;
- return super.last();
- }
- /**
- * Advances the iterator one step backwards.
- * @return The position of the last boundary position before the
- * current iteration position
- * @stable ICU 2.0
- */
- public int previous() {
- CharacterIterator text = getText();
- // if we have cached break positions and we're still in the range
- // covered by them, just move one step backward in the cache
- if (cachedBreakPositions != null && positionInCache > 0) {
- --positionInCache;
- text.setIndex(cachedBreakPositions[positionInCache]);
- return cachedBreakPositions[positionInCache];
- }
- // otherwise, dump the cache and use the inherited previous() method to move
- // backward. This may fill up the cache with new break positions, in which
- // case we have to mark our position in the cache. If it doesn't, use next()
- // to move forward until we hit or pass the current position. This *will* fill
- // the cache.
- else {
- cachedBreakPositions = null;
- int offset = current();
- int result = super.previous();
- if (cachedBreakPositions != null) {
- positionInCache = cachedBreakPositions.length - 2;
- return result;
- }
- while (result < offset) {
- int nextResult = next();
- if (nextResult >= offset) {
- break;
- }
- result = nextResult;
- }
- if (cachedBreakPositions != null) {
- positionInCache = cachedBreakPositions.length - 2;
- }
- if (result != BreakIterator.DONE) {
- text.setIndex(result);
- }
- return result;
- }
- }
- /**
- * Sets the current iteration position to the last boundary position
- * before the specified position.
- * @param offset The position to begin searching from
- * @return The position of the last boundary before "offset"
- * @stable ICU 2.0
- */
- public int preceding(int offset) {
- CharacterIterator text = getText();
- checkOffset(offset, text);
- // if we have no cached break positions, or "offset" is outside the
- // range covered by the cache, we can just call the inherited routine
- // (which will eventually call other routines in this class that may
- // refresh the cache)
- if (cachedBreakPositions == null || offset <= cachedBreakPositions[0] ||
- offset > cachedBreakPositions[cachedBreakPositions.length - 1]) {
- cachedBreakPositions = null;
- return super.preceding(offset);
- }
- // on the other hand, if "offset" is within the range covered by the cache,
- // then all we have to do is search the cache for the last break position
- // before "offset"
- else {
- positionInCache = 0;
- while (positionInCache < cachedBreakPositions.length
- && offset > cachedBreakPositions[positionInCache])
- ++positionInCache;
- --positionInCache;
- text.setIndex(cachedBreakPositions[positionInCache]);
- return text.getIndex();
- }
- }
- /**
- * Sets the current iteration position to the first boundary position after
- * the specified position.
- * @param offset The position to begin searching forward from
- * @return The position of the first boundary after "offset"
- * @stable ICU 2.0
- */
- public int following(int offset) {
- CharacterIterator text = getText();
- checkOffset(offset, text);
- // if we have no cached break positions, or if "offset" is outside the
- // range covered by the cache, then dump the cache and call our
- // inherited following() method. This will call other methods in this
- // class that may refresh the cache.
- if (cachedBreakPositions == null || offset < cachedBreakPositions[0] ||
- offset >= cachedBreakPositions[cachedBreakPositions.length - 1]) {
- cachedBreakPositions = null;
- return super.following(offset);
- }
- // on the other hand, if "offset" is within the range covered by the
- // cache, then just search the cache for the first break position
- // after "offset"
- else {
- positionInCache = 0;
- while (positionInCache < cachedBreakPositions.length
- && offset >= cachedBreakPositions[positionInCache])
- ++positionInCache;
- text.setIndex(cachedBreakPositions[positionInCache]);
- return text.getIndex();
- }
- }
- /**
- * Return the status tag from the break rule that determined the most recently
- * returned break position.
- *
- * TODO: not supported with dictionary based break iterators.
- *
- * @return the status from the break rule that determined the most recently
- * returned break position.
- * @draft ICU 3.0
- * @provisional This API might change or be removed in a future release.
- */
- public int getRuleStatus() {
- return 0;
- }
- /**
- * Get the status (tag) values from the break rule(s) that determined the most
- * recently returned break position. The values appear in the rule source
- * within brackets, {123}, for example. The default status value for rules
- * that do not explicitly provide one is zero.
- * <p>
- * TODO: not supported for dictionary based break iterator.
- *
- * @param fillInArray an array to be filled in with the status values.
- * @return The number of rule status values from rules that determined
- * the most recent boundary returned by the break iterator.
- * In the event that the array is too small, the return value
- * is the total number of status values that were available,
- * not the reduced number that were actually returned.
- * @draft ICU 3.0
- * @provisional This API might change or be removed in a future release.
- */
- public int getRuleStatusVec(int[] fillInArray) {
- if (fillInArray != null && fillInArray.length>=1) {
- fillInArray[0] = 0;
- }
- return 1;
- }
- /**
- * This is the implementation function for next().
- * @internal
- * @deprecated This API is ICU internal only.
- */
- protected int handleNext() {
- CharacterIterator text = getText();
- // if there are no cached break positions, or if we've just moved
- // off the end of the range covered by the cache, we have to dump
- // and possibly regenerate the cache
- if (cachedBreakPositions == null || positionInCache == cachedBreakPositions.length - 1) {
- // start by using the inherited handleNext() to find a tentative return
- // value. dictionaryCharCount tells us how many dictionary characters
- // we passed over on our way to the tentative return value
- int startPos = text.getIndex();
- fDictionaryCharCount = 0;
- int result = super.handleNext();
- // if we passed over more than one dictionary character, then we use
- // divideUpDictionaryRange() to regenerate the cached break positions
- // for the new range.
- if (!usingCTDictionary && fDictionaryCharCount > 1 && result - startPos > 1) {
- divideUpDictionaryRange(startPos, result);
- }
- // otherwise, the value we got back from the inherited fuction
- // is our return value, and we can dump the cache
- else {
- cachedBreakPositions = null;
- return result;
- }
- }
- // if the cache of break positions has been regenerated (or existed all
- // along), then just advance to the next break position in the cache
- // and return it
- if (cachedBreakPositions != null) {
- ++positionInCache;
- text.setIndex(cachedBreakPositions[positionInCache]);
- return cachedBreakPositions[positionInCache];
- }
- Assert.assrt(false);
- return -9999; // SHOULD NEVER GET HERE!
- }
- /**
- * This is the function that actually implements the dictionary-based
- * algorithm. Given the endpoints of a range of text, it uses the
- * dictionary to determine the positions of any boundaries in this
- * range. It stores all the boundary positions it discovers in
- * cachedBreakPositions so that we only have to do this work once
- * for each time we enter the range.
- */
- @SuppressWarnings("unchecked")
- private void divideUpDictionaryRange(int startPos, int endPos) {
- CharacterIterator text = getText();
- // the range we're dividing may begin or end with non-dictionary characters
- // (i.e., for line breaking, we may have leading or trailing punctuation
- // that needs to be kept with the word). Seek from the beginning of the
- // range to the first dictionary character
- text.setIndex(startPos);
- int c = CICurrent32(text);
- while (isDictionaryChar(c) == false) {
- c = CINext32(text);
- }
- //System.out.println("\nDividing up range from " + (text.getIndex() + 1) + " to " + endPos);
- // initialize. We maintain two stacks: currentBreakPositions contains
- // the list of break positions that will be returned if we successfully
- // finish traversing the whole range now. possibleBreakPositions lists
- // all other possible word ends we've passed along the way. (Whenever
- // we reach an error [a sequence of characters that can't begin any word
- // in the dictionary], we back up, possibly delete some breaks from
- // currentBreakPositions, move a break from possibleBreakPositions
- // to currentBreakPositions, and start over from there. This process
- // continues in this way until we either successfully make it all the way
- // across the range, or exhaust all of our combinations of break
- // positions.)
- Stack<Integer> currentBreakPositions = new Stack<Integer>();
- Stack<Integer> possibleBreakPositions = new Stack<Integer>();
- List<Integer> wrongBreakPositions = new ArrayList<Integer>();
- // the dictionary is implemented as a trie, which is treated as a state
- // machine. -1 represents the end of a legal word. Every word in the
- // dictionary is represented by a path from the root node to -1. A path
- // that ends in state 0 is an illegal combination of characters.
- int state = 0;
- // these two variables are used for error handling. We keep track of the
- // farthest we've gotten through the range being divided, and the combination
- // of breaks that got us that far. If we use up all possible break
- // combinations, the text contains an error or a word that's not in the
- // dictionary. In this case, we "bless" the break positions that got us the
- // farthest as real break positions, and then start over from scratch with
- // the character where the error occurred.
- int farthestEndPoint = text.getIndex();
- Stack<Integer> bestBreakPositions = null;
- // initialize (we always exit the loop with a break statement)
- c = CICurrent32(text);
- while (true) {
-//System.out.print("c = " + Integer.toString(c, 16) + ", pos = " + text.getIndex());
- // if we can transition to state "-1" from our current state, we're
- // on the last character of a legal word. Push that position onto
- // the possible-break-positions stack
- if (dictionary.at(state, 0) == -1) {
- possibleBreakPositions.push(Integer.valueOf(text.getIndex()));
- }
- // look up the new state to transition to in the dictionary
- // There will be no supplementaries here because the Thai dictionary
- // does not include any. This code is going away soon, not worth
- // fixing.
- state = (dictionary.at(state, (char)c)) & 0xFFFF; // TODO: fix supplementaries
-//System.out.print(", state = " + state);
- // if the character we're sitting on causes us to transition to
- // the "end of word" state, then it was a non-dictionary character
- // and we've successfully traversed the whole range. Drop out
- // of the loop.
- if (state == /*-1*/ 0xFFFF) {
- currentBreakPositions.push(Integer.valueOf(text.getIndex()));
- break;
- }
- // if the character we're sitting on causes us to transition to
- // the error state, or if we've gone off the end of the range
- // without transitioning to the "end of word" state, we've hit
- // an error...
- else if (state == 0 || text.getIndex() >= endPos) {
- // if this is the farthest we've gotten, take note of it in
- // case there's an error in the text
- if (text.getIndex() > farthestEndPoint) {
- farthestEndPoint = text.getIndex();
- bestBreakPositions = (Stack<Integer>)(currentBreakPositions.clone());
- }
- // wrongBreakPositions is a list of all break positions we've tried starting
- // that didn't allow us to traverse all the way through the text. Every time
- // we pop a break position off of currentBreakPositions, we put it into
- // wrongBreakPositions to avoid trying it again later. If we make it to this
- // spot, we're either going to back up to a break in possibleBreakPositions
- // and try starting over from there, or we've exhausted all possible break
- // positions and are going to do the fallback procedure. This loop prevents
- // us from messing with anything in possibleBreakPositions that didn't work as
- // a starting point the last time we tried it (this is to prevent a bunch of
- // repetitive checks from slowing down some extreme cases)
- // variable not used Integer newStartingSpot = null;
- while (!possibleBreakPositions.isEmpty() && wrongBreakPositions.contains(
- possibleBreakPositions.peek())) {
- possibleBreakPositions.pop();
- }
- // if we've used up all possible break-position combinations, there's
- // an error or an unknown word in the text. In this case, we start
- // over, treating the farthest character we've reached as the beginning
- // of the range, and "blessing" the break positions that got us that
- // far as real break positions
- if (possibleBreakPositions.isEmpty()) {
- if (bestBreakPositions != null) {
- currentBreakPositions = bestBreakPositions;
- if (farthestEndPoint < endPos) {
- text.setIndex(farthestEndPoint + 1);
- }
- else {
- break;
- }
- }
- else {
- if ((currentBreakPositions.size() == 0
- || currentBreakPositions.peek().intValue() != text.getIndex())
- && text.getIndex() != startPos) {
- currentBreakPositions.push(Integer.valueOf(text.getIndex()));
- }
- CINext32(text);
- currentBreakPositions.push(Integer.valueOf(text.getIndex()));
- }
- }
- // if we still have more break positions we can try, then promote the
- // last break in possibleBreakPositions into currentBreakPositions,
- // and get rid of all entries in currentBreakPositions that come after
- // it. Then back up to that position and start over from there (i.e.,
- // treat that position as the beginning of a new word)
- else {
- Integer temp = possibleBreakPositions.pop();
- Integer temp2 = null;
- while (!currentBreakPositions.isEmpty() && temp.intValue() <
- currentBreakPositions.peek().intValue()) {
- temp2 = currentBreakPositions.pop();
- wrongBreakPositions.add(temp2);
- }
- currentBreakPositions.push(temp);
- text.setIndex(currentBreakPositions.peek().intValue());
- }
- // re-sync "c" for the next go-round, and drop out of the loop if
- // we've made it off the end of the range
- c = CICurrent32(text);
- state = 0;
- if (text.getIndex() >= endPos) {
- break;
- }
- }
- // if we didn't hit any exceptional conditions on this last iteration,
- // just advance to the next character and loop
- else {
- c = CINext32(text);
- }
-//System.out.print(", possibleBreakPositions = { "); for (int i = 0; i < possibleBreakPositions.size(); i++) System.out.print(possibleBreakPositions.elementAt(i) + " "); System.out.print("}");
-//System.out.print(", currentBreakPositions = { "); for (int i = 0; i < currentBreakPositions.size(); i++) System.out.print(currentBreakPositions.elementAt(i) + " "); System.out.println("}");
- }
- // dump the last break position in the list, and replace it with the actual
- // end of the range (which may be the same character, or may be further on
- // because the range actually ended with non-dictionary characters we want to
- // keep with the word)
- if (!currentBreakPositions.isEmpty()) {
- currentBreakPositions.pop();
- }
- currentBreakPositions.push(Integer.valueOf(endPos));
- // create a regular array to hold the break positions and copy
- // the break positions from the stack to the array (in addition,
- // our starting position goes into this array as a break position).
- // This array becomes the cache of break positions used by next()
- // and previous(), so this is where we actually refresh the cache.
- cachedBreakPositions = new int[currentBreakPositions.size() + 1];
- cachedBreakPositions[0] = startPos;
- for (int i = 0; i < currentBreakPositions.size(); i++) {
- cachedBreakPositions[i + 1] = currentBreakPositions.elementAt(i).intValue();
- }
- positionInCache = 0;
- }
--- /dev/null
+ *******************************************************************************
+ * Copyright (C) 2012, International Business Machines Corporation and *
+ * others. All Rights Reserved. *
+ *******************************************************************************
+ */
+package com.ibm.icu.text;
+import java.text.CharacterIterator;
+import java.util.Stack;
+abstract class DictionaryBreakEngine implements LanguageBreakEngine {
+ protected UnicodeSet fSet = new UnicodeSet();
+ private final int fTypes;
+ /**
+ * Creates an engine that serves the break iterator kinds named in a bit mask.
+ * The mask is stored unchanged; handles() and findBreaks() later test it with
+ * (1 << breakType), so only break types 0..31 can be represented.
+ *
+ * @param breakTypes A mask of the break iterators that can use this engine.
+ * For example, (1 << KIND_WORD) | (1 << KIND_LINE) could be used by
+ * word iterators and line iterators, but not any other kind.
+ */
+ public DictionaryBreakEngine(int breakTypes) {
+ // TODO: consider using a java.util.BitSet with nbits <= 32
+ fTypes = breakTypes;
+ }
+    /**
+     * Reports whether this engine should be used for a character under a given
+     * kind of break iterator.
+     *
+     * @param c         the code point to test against this engine's character set
+     * @param breakType the kind of break iterator asking; must be in 0..31 to match
+     * @return true only if breakType is enabled in this engine's mask and
+     *         fSet contains c
+     */
+    public boolean handles(int c, int breakType) {
+        // The mask is a single int, so kinds outside 0..31 can never be ours.
+        if (breakType < 0 || breakType >= 32) {
+            return false;
+        }
+        boolean typeEnabled = (fTypes & (1 << breakType)) != 0;
+        // The set lookup is only performed once the kind is known to be enabled.
+        return typeEnabled && fSet.contains(c);
+    }
+    /**
+     * Locates the run of characters belonging to this engine's set around the
+     * iterator's current position and passes that range to
+     * divideUpDictionaryRange() for segmentation.
+     *
+     * When reverse is false the run is scanned forward from the current index and
+     * is bounded by endPos; when reverse is true it is scanned backward and is
+     * bounded by startPos. On return the iterator index is set to the position
+     * where the scan stopped.
+     *
+     * @param text_       the text to break over, positioned inside the range
+     * @param startPos    lower bound used when scanning backward
+     * @param endPos      upper bound used when scanning forward
+     * @param reverse     true to scan backward from the current position
+     * @param breakType   the kind of break iterator asking; 0 is returned at once
+     *                    if this kind is not enabled in the engine's mask
+     * @param foundBreaks stack handed through to divideUpDictionaryRange()
+     * @return the value returned by divideUpDictionaryRange(), or 0 if this
+     *         engine does not serve breakType
+     */
+    public int findBreaks(CharacterIterator text_, int startPos, int endPos,
+            boolean reverse, int breakType, Stack<Integer> foundBreaks) {
+        if (breakType < 0 || breakType >= 32 ||
+                ((1 << breakType) & fTypes) == 0) {
+            return 0;
+        }
+        UCharacterIterator text = UCharacterIterator.getInstance(text_);
+        int start = text.getIndex();
+        int current, rangeStart, rangeEnd;
+        int c = text.current();
+        if (reverse) {
+            // previous() is pre-decrement: it moves back and returns the unit it
+            // lands on, so c always describes the character at getIndex().
+            boolean isDict = fSet.contains(c);
+            while ((current = text.getIndex()) > startPos && isDict) {
+                c = text.previous();
+                isDict = fSet.contains(c);
+            }
+            rangeStart = (current < startPos) ? startPos :
+                current + (isDict ? 0 : 1);
+            rangeEnd = start + 1;
+        } else {
+            while ((current = text.getIndex()) < endPos && fSet.contains(c)) {
+                // next() is post-increment: it returns the unit we were already
+                // on. Using its return value made the set test lag one position
+                // behind the index and pushed rangeEnd one past the first
+                // non-dictionary character. Advance, then re-read the current unit.
+                text.next();
+                c = text.current();
+            }
+            rangeStart = start;
+            rangeEnd = current;
+        }
+        int result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
+        text.setIndex(current);
+        return result;
+    }
+ protected abstract int divideUpDictionaryRange(UCharacterIterator text,
+ int rangeStart, int rangeEnd, Stack<Integer> foundBreaks);
--- /dev/null
+ *******************************************************************************
+ * Copyright (C) 2012, International Business Machines Corporation and *
+ * others. All Rights Reserved. *
+ *******************************************************************************
+ */
+package com.ibm.icu.text;
+import java.io.DataInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import com.ibm.icu.impl.Assert;
+import com.ibm.icu.impl.ICUBinary;
+import com.ibm.icu.impl.ICUData;
+import com.ibm.icu.impl.ICUResourceBundle;
+import com.ibm.icu.util.UResourceBundle;
+final class DictionaryData {
+ // disallow instantiation
+ private DictionaryData() { }
+ public static final int TRIE_TYPE_BYTES = 0;
+ public static final int TRIE_TYPE_UCHARS = 1;
+ public static final int TRIE_TYPE_MASK = 7;
+ public static final int TRIE_HAS_VALUES = 8;
+ public static final int TRANSFORM_NONE = 0;
+ public static final int TRANSFORM_TYPE_OFFSET = 0x1000000;
+ public static final int TRANSFORM_TYPE_MASK = 0x7f000000;
+ public static final int TRANSFORM_OFFSET_MASK = 0x1fffff;
+ public static final int IX_STRING_TRIE_OFFSET = 0;
+ public static final int IX_RESERVED1_OFFSET = 1;
+ public static final int IX_RESERVED2_OFFSET = 2;
+ public static final int IX_TOTAL_SIZE = 3;
+ public static final int IX_TRIE_TYPE = 4;
+ public static final int IX_TRANSFORM = 5;
+ public static final int IX_RESERVED6 = 6;
+ public static final int IX_RESERVED7 = 7;
+ public static final int IX_COUNT = 8;
+ private static final byte DATA_FORMAT_ID[] = { (byte) 0x44, (byte) 0x69,
+ (byte) 0x63, (byte) 0x74 };
+    /**
+     * Loads the dictionary registered under a given key and wraps it in a matcher.
+     *
+     * The file name is looked up as "dictionaries/" + dictType in the break
+     * iterator resource bundle. The file's header is checked against the "Dict"
+     * format id, then IX_COUNT int indexes are read, any gap before the trie is
+     * consumed, and the trie data is read as bytes or as chars depending on the
+     * trie type stored in the indexes.
+     *
+     * @param dictType key appended to "dictionaries/" for the bundle lookup
+     * @return a BytesDictionaryMatcher or CharsDictionaryMatcher over the loaded
+     *         trie, or null if the trie type is neither TRIE_TYPE_BYTES nor
+     *         TRIE_TYPE_UCHARS
+     * @throws IOException if the header or data cannot be read, including when the
+     *         stream ends before the announced amount of data was delivered
+     */
+    public static DictionaryMatcher loadDictionaryFor(String dictType) throws IOException {
+        ICUResourceBundle rb = (ICUResourceBundle)UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_BRKITR_BASE_NAME);
+        String dictFileName = rb.getStringWithFallback("dictionaries/" + dictType);
+        dictFileName = ICUResourceBundle.ICU_BUNDLE +ICUResourceBundle.ICU_BRKITR_NAME+ "/" + dictFileName;
+        InputStream is = ICUData.getStream(dictFileName);
+        // Everything after opening the stream runs under try/finally so the stream
+        // is released even when a read or an assertion fails part-way through.
+        try {
+            ICUBinary.readHeader(is, DATA_FORMAT_ID, null);
+            DataInputStream s = new DataInputStream(is);
+            int[] indexes = new int[IX_COUNT];
+            // TODO: read indexes[IX_STRING_TRIE_OFFSET] first, then read a variable-length indexes[]
+            for (int i = 0; i < IX_COUNT; i++) {
+                indexes[i] = s.readInt();
+            }
+            int offset = indexes[IX_STRING_TRIE_OFFSET];
+            Assert.assrt(offset >= (4 * IX_COUNT));
+            if (offset > (4 * IX_COUNT)) {
+                // skipBytes() is allowed to skip fewer bytes than requested, which
+                // would silently misalign the trie. readFully() either consumes
+                // the whole gap or throws EOFException.
+                s.readFully(new byte[offset - (4 * IX_COUNT)]);
+            }
+            int trieType = indexes[IX_TRIE_TYPE] & TRIE_TYPE_MASK;
+            int totalSize = indexes[IX_TOTAL_SIZE] - offset;
+            if (trieType == TRIE_TYPE_BYTES) {
+                int transform = indexes[IX_TRANSFORM];
+                byte[] data = new byte[totalSize];
+                s.readFully(data);
+                return new BytesDictionaryMatcher(data, transform);
+            }
+            if (trieType == TRIE_TYPE_UCHARS) {
+                Assert.assrt(totalSize % 2 == 0);
+                char[] data = new char[totalSize / 2];
+                for (int i = 0; i < data.length; i++) {
+                    data[i] = s.readChar();
+                }
+                return new CharsDictionaryMatcher(new String(data));
+            }
+            // Unknown trie type: no matcher can be built.
+            return null;
+        } finally {
+            // The DataInputStream is an unbuffered wrapper, so closing the
+            // underlying stream is sufficient.
+            if (is != null) {
+                is.close();
+            }
+        }
+    }
--- /dev/null
+ *******************************************************************************
+ * Copyright (C) 2012, International Business Machines Corporation and *
+ * others. All Rights Reserved. *
+ *******************************************************************************
+ */
+package com.ibm.icu.text;
+import java.text.CharacterIterator;
+ * The DictionaryMatcher interface is used to allow arbitrary "types" of
+ * back-end data structures to be used with the break iteration code.
+ */
+abstract class DictionaryMatcher {
+ /**
+ * Find dictionary words that match the text.
+ *
+ * @param text A CharacterIterator representing the text. The iterator is
+ * left after the longest prefix match in the dictionary.
+ * @param maxLength The maximum number of code units to match.
+ * @param lengths An array that is filled with the lengths of words that matched.
+ * @param count Filled with the number of elements output in lengths.
+ * @param limit The maximum number of words to output. Must be less than or equal to lengths.length.
+ * @param values Filled with the weight values associated with the various words.
+ * @return The number of characters in text that were matched.
+ */
+ public abstract int matches(CharacterIterator text, int maxLength, int[] lengths,
+ int[] count, int limit, int[] values);
+    /**
+     * Convenience overload for callers that do not need the values associated
+     * with the matched words: forwards to the six-argument matches() with a
+     * null values array.
+     *
+     * @return whatever the six-argument matches() returns
+     */
+    public int matches(CharacterIterator text, int maxLength, int[] lengths,
+            int[] count, int limit) {
+        final int[] noValues = null;
+        return matches(text, maxLength, lengths, count, limit, noValues);
+    }
+ /**
+ * @return the kind of dictionary that this matcher is using
+ */
+ public abstract int getType();
--- /dev/null
+ *******************************************************************************
+ * Copyright (C) 2012, International Business Machines Corporation and *
+ * others. All Rights Reserved. *
+ *******************************************************************************
+ */
+package com.ibm.icu.text;
+import java.text.CharacterIterator;
+import java.util.Stack;
+ * The LanguageBreakEngine interface is to be used to implement any
+ * language-specific logic for break iteration.
+ */
+interface LanguageBreakEngine {
+ /**
+ * @param c A Unicode codepoint value
+ * @param breakType The kind of break iterator that is wanting to make use
+ * of this engine - character, word, line, sentence
+ * @return true if the engine can handle this character, false otherwise
+ */
+ public boolean handles(int c, int breakType);
+ /**
+ * Implements the actual breaking logic.
+ * @param text The text to break over
+ * @param startPos The index of the beginning of our range
+ * @param endPos The index of the possible end of our range. It is possible,
+ * however, that our range ends earlier
+ * @param reverse true iff we are iterating backwards (in a call to
+ * previous(), for example)
+ * @param breakType The kind of break iterator that is wanting to make use
+ * of this engine - character, word, line, sentence
+ * @param foundBreaks A Stack that the breaks found will be added to
+ * @return the number of words found
+ */
+ public int findBreaks(CharacterIterator text, int startPos, int endPos,
+ boolean reverse, int breakType, Stack<Integer> foundBreaks);
import java.io.InputStream;
import java.io.OutputStream;
import java.text.CharacterIterator;
+import java.util.Collections;
+import java.util.Set;
+import java.util.Stack;
+import java.util.concurrent.ConcurrentHashMap;
import com.ibm.icu.impl.Assert;
import com.ibm.icu.impl.ICUDebug;
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.lang.UProperty;
+import com.ibm.icu.lang.UScript;
+import static com.ibm.icu.impl.CharacterIteration.*;
* Rule Based Break Iterator
* @stable ICU 2.0
public class RuleBasedBreakIterator extends BreakIterator {
// Constructors & Factories
This.fRData = RBBIDataWrapper.get(is);
return This;
- /*private RuleBasedBreakIterator(RuleBasedBreakIterator other) {
- // TODO: check types.
- fRData = other.fRData;
- if (fText != null) {
- fText = (CharacterIterator)(other.fText.clone());
- }
- }*/
* Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
// Boilerplate
* Clones this iterator.
* @return A newly-constructed RuleBasedBreakIterator with the same
try {
RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;
- if (fRData != other.fRData && (fRData == null || other.fRData == null)) {System.out.println("GOT HERE");
+ if (fRData != other.fRData && (fRData == null || other.fRData == null)) {
return false;
if (fRData != null && other.fRData != null &&
return fRData.fRuleSource.hashCode();
* Tag value for "words" that do not fit into any of other categories.
* Includes spaces and most punctuation.
public static final int WORD_IDEO_LIMIT = 500;
private static final int START_STATE = 1; // The state number of the starting state
private static final int STOP_STATE = 0; // The state-transition value indicating "stop"
* for updating it is live. Dictionary Based break iterators (a subclass
* of us) access this field directly.
* @internal
- * @deprecated This API is ICU internal only.
- protected int fDictionaryCharCount;
+ private int fDictionaryCharCount;
* Debugging flag. Trace operation of state machine when true.
public static boolean fTrace;
+ /**
+ * What kind of break iterator this is. Set to KIND_LINE by default,
+ * since this produces sensible output.
+ */
+ private int fBreakType = KIND_LINE;
+ /**
+ * The "default" break engine - just skips over ranges of dictionary words,
+ * producing no breaks. Should only be used if characters need to be handled
+ * by a dictionary but we have no dictionary implementation for them.
+ */
+ private final UnhandledBreakEngine fUnhandledBreakEngine = new UnhandledBreakEngine();
+ /**
+ * when a range of characters is divided up using the dictionary, the break
+ * positions that are discovered are stored here, preventing us from having
+ * to use either the dictionary or the state table again until the iterator
+ * leaves this range of text
+ */
+ private int[] fCachedBreakPositions;
+ /**
+ * if fCachedBreakPositions is not null, this indicates which item in the
+ * cache the current iteration position refers to
+ */
+ private int fPositionInCache;
+ /**
+ * Whether or not we should be using the dictionary. Set to true by
+ * default - only set to false if we get an empty string as input or
+ * if our "kind" is not KIND_WORD or KIND_LINE.
+ *
+ * If this is set to false, no dictionary handling is done.
+ */
+ private boolean fUseDictionary = true;
+ private final Set<LanguageBreakEngine> fBreakEngines = Collections.newSetFromMap(new ConcurrentHashMap<LanguageBreakEngine, Boolean>());
* ICU debug argument name for RBBI
private void init() {
fLastStatusIndexValid = true;
fDictionaryCharCount = 0;
+ fBreakEngines.add(fUnhandledBreakEngine);
if (debugInitDone == false) {
fTrace = ICUDebug.enabled(RBBI_DEBUG_ARG)
* @stable ICU 2.0
public int first() {
+ fCachedBreakPositions = null;
+ fDictionaryCharCount = 0;
+ fPositionInCache = 0;
fLastRuleStatusIndex = 0;
fLastStatusIndexValid = true;
if (fText == null) {
return fText.getIndex();
* Sets the current iteration position to the end of the text.
* (i.e., the CharacterIterator's ending offset).
* @stable ICU 2.0
public int last() {
+ fCachedBreakPositions = null;
+ fDictionaryCharCount = 0;
+ fPositionInCache = 0;
if (fText == null) {
fLastRuleStatusIndex = 0;
fLastStatusIndexValid = true;
return BreakIterator.DONE;
- // I'm not sure why, but t.last() returns the offset of the last character,
+ // t.last() returns the offset of the last character,
// rather than the past-the-end offset
- //
- // (It's so a loop like for(p=it.last(); p!=DONE; p=it.previous()) ...
- // will work correctly.)
+ // so a loop like for(p=it.last(); p!=DONE; p=it.previous()) ...
+ // will work correctly.
fLastStatusIndexValid = false;
int pos = fText.getEndIndex();
return pos;
* Advances the iterator either forward or backward the specified number of steps.
* Negative values move backward, and positive values move forward. This is
return result;
* Advances the iterator to the next boundary position.
* @return The position of the first boundary after this one.
* @stable ICU 2.0
public int previous() {
+        CharacterIterator text = getText();
+        fLastStatusIndexValid = false;
+
+        // Fast path: we are still inside the cached range, so the boundary
+        // before this one is simply the preceding cache slot.
+        if (fCachedBreakPositions != null && fPositionInCache > 0) {
+            int cachedBoundary = fCachedBreakPositions[--fPositionInCache];
+            text.setIndex(cachedBoundary);
+            return cachedBoundary;
+        }
+
+        // Slow path: throw the cache away and step back with the rule tables.
+        // TODO: Try to reuse the array rather than reallocating it all the time
+        fCachedBreakPositions = null;
+        int startOffset = current();
+        int boundary = rulesPrevious();
+
+        // Nothing before us, or no dictionary characters were crossed: the
+        // rule-based answer stands as it is.
+        if (boundary == BreakIterator.DONE || fDictionaryCharCount == 0) {
+            return boundary;
+        }
+
+        // The backward step already produced a cache; record our slot in it.
+        if (fCachedBreakPositions != null) {
+            fPositionInCache = fCachedBreakPositions.length - 2;
+            return boundary;
+        }
+
+        // Otherwise walk forward from the rule-based boundary, keeping the last
+        // boundary that is still strictly before where we started.
+        while (boundary < startOffset) {
+            int candidate = handleNext();
+            if (candidate >= startOffset) {
+                break;
+            }
+            boundary = candidate;
+        }
+
+        // If walking forward built a cache, point at the slot just before the
+        // first cached position at or beyond the starting offset. When no such
+        // position exists the index is left equal to the cache length.
+        if (fCachedBreakPositions != null) {
+            int slot = 0;
+            while (slot < fCachedBreakPositions.length
+                    && fCachedBreakPositions[slot] < startOffset) {
+                slot++;
+            }
+            fPositionInCache = (slot < fCachedBreakPositions.length) ? slot - 1 : slot;
+        }
+
+        // The forward steps marked the status as valid for a different
+        // position; invalidate it so it is recomputed when the user asks.
+        fLastStatusIndexValid = false;
+        text.setIndex(boundary);
+        return boundary;
+    }
+ private int rulesPrevious() {
// if we're already sitting at the beginning of the text, return DONE
if (fText == null || current() == fText.getBeginIndex()) {
fLastRuleStatusIndex = 0;
int start = current();
- CIPrevious32(fText);
+ previous32(fText);
int lastResult = handlePrevious(fRData.fRTable);
if (lastResult == BreakIterator.DONE) {
lastResult = fText.getBeginIndex();
fLastStatusIndexValid = breakTagValid;
return lastResult;
* Sets the iterator to refer to the first boundary position following
* the specified position.
* @stable ICU 2.0
public int following(int offset) {
+        CharacterIterator text = getText();
+        int[] cache = fCachedBreakPositions;
+
+        // The cache is usable only when offset lies in [first, last) of the
+        // cached positions.
+        boolean cacheCoversOffset = cache != null
+                && offset >= cache[0]
+                && offset < cache[cache.length - 1];
+        if (!cacheCoversOffset) {
+            // No usable cache: drop it and let the rule-based search run. That
+            // search calls other methods of this class that may refill the cache.
+            fCachedBreakPositions = null;
+            return rulesFollowing(offset);
+        }
+
+        // Scan for the first cached position strictly after offset. The range
+        // check above guarantees the scan stops inside the array.
+        int slot = 0;
+        while (slot < cache.length && offset >= cache[slot]) {
+            ++slot;
+        }
+        fPositionInCache = slot;
+        text.setIndex(cache[slot]);
+        return text.getIndex();
+    }
+ private int rulesFollowing(int offset) {
// if the offset passed in is already past the end of the text,
// just return DONE; if it's before the beginning, return the
// text's starting offset
// move forward one codepoint to prepare for moving back to a
// safe point.
// this handles offset being between a supplementary character
- CINext32(fText);
+ next32(fText);
// handlePrevious will move most of the time to < 1 boundary away
result = next();
// No Safe point reverse table, but there is a safe pt forward table.
- CIPrevious32(fText);
+ previous32(fText);
// handle next will give result >= offset
// previous will give result 0 or 1 boundary away from offset,
* @stable ICU 2.0
public int preceding(int offset) {
+        CharacterIterator text = getText();
+        int[] cache = fCachedBreakPositions;
+
+        // The cache is usable only when offset lies in (first, last] of the
+        // cached positions.
+        boolean cacheCoversOffset = cache != null
+                && offset > cache[0]
+                && offset <= cache[cache.length - 1];
+        if (!cacheCoversOffset) {
+            // No usable cache: drop it and let the rule-based search run. That
+            // search calls other methods of this class that may refill the cache.
+            fCachedBreakPositions = null;
+            return rulesPreceding(offset);
+        }
+
+        // Find the first cached position at or after offset; the slot just
+        // before it holds the last boundary strictly before offset.
+        int slot = 0;
+        while (slot < cache.length && offset > cache[slot]) {
+            ++slot;
+        }
+        fPositionInCache = slot - 1;
+        text.setIndex(cache[fPositionInCache]);
+        return text.getIndex();
+    }
+ private int rulesPreceding(int offset) {
// if the offset passed in is already past the end of the text,
// just return DONE; if it's before the beginning, return the
// move backwards one codepoint to prepare for moving forwards to a
// safe point.
// this handles offset being between a supplementary character
- CIPrevious32(fText);
+ previous32(fText);
result = previous();
while (result >= offset) {
if (fRData.fSRTable != null) {
// backup plan if forward safe table is not available
- CINext32(fText);
+ next32(fText);
// handle previous will give result <= offset
- * Returns true if the specfied position is a boundary position. As a side
- * effect, leaves the iterator pointing to the first boundary position at
- * or after "offset".
- * @param offset the offset to check.
- * @return True if "offset" is a boundary position.
- * @stable ICU 2.0
- */
-public boolean isBoundary(int offset) {
- checkOffset(offset, fText);
- // the beginning index of the iterator is always a boundary position by definition
- if (offset == fText.getBeginIndex()) {
- first(); // For side effects on current position, tag values.
- return true;
- }
+ /**
+ * Returns true if the specified position is a boundary position. As a side
+ * effect, leaves the iterator pointing to the first boundary position at
+ * or after "offset".
+ * @param offset the offset to check.
+ * @return True if "offset" is a boundary position.
+ * @stable ICU 2.0
+ */
+ public boolean isBoundary(int offset) {
+ // Validate "offset" against the current text before using it.
+ // NOTE(review): checkOffset is defined elsewhere; presumably it rejects out-of-range offsets - confirm.
+ checkOffset(offset, fText);
- if (offset == fText.getEndIndex()) {
- last(); // For side effects on current position, tag values.
- return true;
- }
+ // the beginning index of the iterator is always a boundary position by definition
+ if (offset == fText.getBeginIndex()) {
+ first(); // For side effects on current position, tag values.
+ return true;
+ }
- // otherwise, we can use following() on the position before the specified
- // one and return true if the position we get back is the one the user
- // specified
- // return following(offset - 1) == offset;
- // TODO: check whether it is safe to revert to the simpler offset-1 code
- // The safe rules may take care of unpaired surrogates ok.
- fText.setIndex(offset);
- CIPrevious32(fText);
- int pos = fText.getIndex();
- boolean result = following(pos) == offset;
- return result;
+ // the end index of the text is likewise reported as a boundary without consulting the rules
+ if (offset == fText.getEndIndex()) {
+ last(); // For side effects on current position, tag values.
+ return true;
+ }
- * Returns the current iteration position.
- * @return The current iteration position.
- * @stable ICU 2.0
- */
-public int current() {
- return (fText != null) ? fText.getIndex() : BreakIterator.DONE;
- }
+ // otherwise, we can use following() on the position before the specified
+ // one and return true if the position we get back is the one the user
+ // specified
+ // return following(offset - 1) == offset;
+ // TODO: check whether it is safe to revert to the simpler offset-1 code
+ // The safe rules may take care of unpaired surrogates ok.
+ // Position the text at "offset" and step back with previous32(); the index it
+ // leaves behind is the starting point handed to following().
+ // NOTE(review): previous32 presumably moves back one whole code point - confirm.
+ fText.setIndex(offset);
+ previous32(fText);
+ int pos = fText.getIndex();
+ // "offset" is a boundary exactly when it is the first boundary after that earlier position.
+ boolean result = following(pos) == offset;
+ return result;
+ }
+ /**
+ * Returns the current iteration position.
+ * @return The current iteration position.
+ * @stable ICU 2.0
+ */
+ public int current() {
+     if (fText == null) {
+         // No text has been set, so there is no position to report.
+         return BreakIterator.DONE;
+     }
+     return fText.getIndex();
+ }
-private void makeRuleStatusValid() {
- if (fLastStatusIndexValid == false) {
- // No cached status is available.
- if (fText == null || current() == fText.getBeginIndex()) {
- // At start of text, or there is no text. Status is always zero.
- fLastRuleStatusIndex = 0;
- fLastStatusIndexValid = true;
- } else {
- // Not at start of text. Find status the tedious way.
- int pa = current();
- previous();
- int pb = next();
- Assert.assrt (pa == pb);
+ private void makeRuleStatusValid() {
+ if (fLastStatusIndexValid == false) {
+ // No cached status is available.
+ int curr = current();
+ if (curr == BreakIterator.DONE || curr == fText.getBeginIndex()) {
+ // At start of text, or there is no text. Status is always zero.
+ fLastRuleStatusIndex = 0;
+ fLastStatusIndexValid = true;
+ } else {
+ // Not at start of text. Find status the tedious way.
+ int pa = fText.getIndex();
+ first();
+ int pb = current();
+ while (fText.getIndex() < pa) {
+ pb = next();
+ }
+ Assert.assrt(pa == pb);
+ }
+ Assert.assrt(fLastStatusIndexValid == true);
+ Assert.assrt(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fRData.fStatusTable.length);
- Assert.assrt(fLastStatusIndexValid == true);
- Assert.assrt(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fRData.fStatusTable.length);
- * Return the status tag from the break rule that determined the most recently
- * returned break position. The values appear in the rule source
- * within brackets, {123}, for example. For rules that do not specify a
- * status, a default value of 0 is returned. If more than one rule applies,
- * the numerically largest of the possible status values is returned.
- * <p>
- * Of the standard types of ICU break iterators, only the word break
- * iterator provides status values. The values are defined in
- * class RuleBasedBreakIterator, and allow distinguishing between words
- * that contain alphabetic letters, "words" that appear to be numbers,
- * punctuation and spaces, words containing ideographic characters, and
- * more. Call <code>getRuleStatus</code> after obtaining a boundary
- * position from <code>next()<code>, <code>previous()</code>, or
- * any other break iterator functions that returns a boundary position.
- * <p>
- * @return the status from the break rule that determined the most recently
- * returned break position.
- *
- * @draft ICU 3.0
- * @provisional This is a draft API and might change in a future release of ICU.
- */
-public int getRuleStatus() {
- makeRuleStatusValid();
- // Status records have this form:
- // Count N <-- fLastRuleStatusIndex points here.
- // Status val 0
- // Status val 1
- // ...
- // Status val N-1 <-- the value we need to return
- // The status values are sorted in ascending order.
- // This function returns the last (largest) of the array of status values.
- int idx = fLastRuleStatusIndex + fRData.fStatusTable[fLastRuleStatusIndex];
- int tagVal = fRData.fStatusTable[idx];
- return tagVal;
+ /**
+ * Return the status tag from the break rule that determined the most recently
+ * returned break position. The values appear in the rule source
+ * within brackets, {123}, for example. For rules that do not specify a
+ * status, a default value of 0 is returned. If more than one rule applies,
+ * the numerically largest of the possible status values is returned.
+ * <p>
+ * Of the standard types of ICU break iterators, only the word break
+ * iterator provides status values. The values are defined in
+ * class RuleBasedBreakIterator, and allow distinguishing between words
+ * that contain alphabetic letters, "words" that appear to be numbers,
+ * punctuation and spaces, words containing ideographic characters, and
+ * more. Call <code>getRuleStatus</code> after obtaining a boundary
+ * position from <code>next()</code>, <code>previous()</code>, or
+ * any other break iterator function that returns a boundary position.
+ * <p>
+ * @return the status from the break rule that determined the most recently
+ * returned break position.
+ *
+ * @draft ICU 3.0
+ * @provisional This is a draft API and might change in a future release of ICU.
+ */
+ public int getRuleStatus() {
+     makeRuleStatusValid();
+     // A status record is laid out as:
+     //     [fLastRuleStatusIndex]         count N of status values
+     //     [fLastRuleStatusIndex + 1]     status value 0
+     //     ...
+     //     [fLastRuleStatusIndex + N]     status value N-1
+     // The values are sorted in ascending order, so the final slot of the
+     // record holds the largest one, which is what this method reports.
+     final int valueCount = fRData.fStatusTable[fLastRuleStatusIndex];
+     return fRData.fStatusTable[fLastRuleStatusIndex + valueCount];
+ }
- * Get the status (tag) values from the break rule(s) that determined the most
- * recently returned break position. The values appear in the rule source
- * within brackets, {123}, for example. The default status value for rules
- * that do not explicitly provide one is zero.
- * <p>
- * The status values used by the standard ICU break rules are defined
- * as public constants in class RuleBasedBreakIterator.
- * <p>
- * If the size of the output array is insufficient to hold the data,
- * the output will be truncated to the available length. No exception
- * will be thrown.
- *
- * @param fillInArray an array to be filled in with the status values.
- * @return The number of rule status values from rules that determined
- * the most recent boundary returned by the break iterator.
- * In the event that the array is too small, the return value
- * is the total number of status values that were available,
- * not the reduced number that were actually returned.
- * @draft ICU 3.0
- * @provisional This is a draft API and might change in a future release of ICU.
- */
-public int getRuleStatusVec(int[] fillInArray) {
- makeRuleStatusValid();
- int numStatusVals = fRData.fStatusTable[fLastRuleStatusIndex];
- if (fillInArray != null) {
- int numToCopy = Math.min(numStatusVals, fillInArray.length);
- for (int i=0; i<numToCopy; i++) {
- fillInArray[i] = fRData.fStatusTable[fLastRuleStatusIndex + i + 1];
+ /**
+ * Get the status (tag) values from the break rule(s) that determined the most
+ * recently returned break position. The values appear in the rule source
+ * within brackets, {123}, for example. The default status value for rules
+ * that do not explicitly provide one is zero.
+ * <p>
+ * The status values used by the standard ICU break rules are defined
+ * as public constants in class RuleBasedBreakIterator.
+ * <p>
+ * If the size of the output array is insufficient to hold the data,
+ * the output will be truncated to the available length. No exception
+ * will be thrown.
+ *
+ * @param fillInArray an array to be filled in with the status values.
+ * @return The number of rule status values from rules that determined
+ * the most recent boundary returned by the break iterator.
+ * In the event that the array is too small, the return value
+ * is the total number of status values that were available,
+ * not the reduced number that were actually returned.
+ * @draft ICU 3.0
+ * @provisional This is a draft API and might change in a future release of ICU.
+ */
+ public int getRuleStatusVec(int[] fillInArray) {
+ makeRuleStatusValid();
+ int numStatusVals = fRData.fStatusTable[fLastRuleStatusIndex];
+ if (fillInArray != null) {
+ int numToCopy = Math.min(numStatusVals, fillInArray.length);
+ for (int i=0; i<numToCopy; i++) {
+ fillInArray[i] = fRData.fStatusTable[fLastRuleStatusIndex + i + 1];
+ }
+ return numStatusVals;
- return numStatusVals;
- }
- * Return a CharacterIterator over the text being analyzed. This version
- * of this method returns the actual CharacterIterator we're using internally.
- * Changing the state of this iterator can have undefined consequences. If
- * you need to change it, clone it first.
- * @return An iterator over the text being analyzed.
- * @stable ICU 2.0
- */
+ /**
+ * Return a CharacterIterator over the text being analyzed. This version
+ * of this method returns the actual CharacterIterator we're using internally.
+ * Changing the state of this iterator can have undefined consequences. If
+ * you need to change it, clone it first.
+ * @return An iterator over the text being analyzed.
+ * @stable ICU 2.0
+ */
public CharacterIterator getText() {
return fText;
* Set the iterator to analyze a new piece of text. This function resets
* the current iteration position to the beginning of the text.
* @stable ICU 2.0
public void setText(CharacterIterator newText) {
+ if (newText != null) {
+ if ((fBreakType == KIND_WORD || fBreakType == KIND_LINE)
+ && newText.getEndIndex() != newText.getBeginIndex()) {
+ fUseDictionary = true;
+ } else {
+ fUseDictionary = false;
+ }
+ }
fText = newText;
+ fCachedBreakPositions = null;
+ fDictionaryCharCount = 0;
+ fPositionInCache = 0;
- * Control debug, trace and dump options.
* @internal
* @deprecated This API is ICU internal only.
- protected static String fDebugEnv = ICUDebug.enabled(RBBI_DEBUG_ARG) ?
- ICUDebug.value(RBBI_DEBUG_ARG) : null;
+ void setBreakType(int type) {
+     fBreakType = type;
+     // Any kind other than word or line switches dictionary use off;
+     // for word and line kinds the flag is left as it was.
+     final boolean wordOrLine = (type == KIND_WORD || type == KIND_LINE);
+     if (!wordOrLine) {
+         fUseDictionary = false;
+     }
+ }
- // 32 bit Char value returned from when an iterator has run out of range.
- // Positive value so fast case (not end, not surrogate) can be checked
- // with a single test.
- private static int CI_DONE32 = 0x7fffffff;
+ /**
+ * Returns the break type currently stored in <code>fBreakType</code>,
+ * the field assigned by <code>setBreakType(int)</code>.
+ * @return the stored break type value.
+ * @internal
+ * @deprecated This API is ICU internal only.
+ */
+ int getBreakType() {
+ return fBreakType;
+ }
+ /**
+ * Control debug, trace and dump options.
+ * @internal
+ */
+ static String fDebugEnv = ICUDebug.enabled(RBBI_DEBUG_ARG) ?
+ ICUDebug.value(RBBI_DEBUG_ARG) : null;
- * Move the iterator forward to the next code point, and return that code point,
- * leaving the iterator positioned at char returned.
- * For Supplementary chars, the iterator is left positioned at the lead surrogate.
- * @param ci The character iterator
- * @return The next code point.
+ * Finds an appropriate LanguageBreakEngine for this character and
+ * break type.
+ * @internal
+ * @deprecated This API is ICU internal only.
- static int CINext32(CharacterIterator ci) {
- // If the current position is at a surrogate pair, move to the trail surrogate
- // which leaves it in positon for underlying iterator's next() to work.
- int c= ci.current();
- c = ci.next();
- c = ci.previous();
- }
+ protected LanguageBreakEngine getEngineFor(int c) {
+ if (c == DONE32 || !fUseDictionary) {
+ return null;
- // For BMP chars, this next() is the real deal.
- c = ci.next();
- // If we might have a lead surrogate, we need to peak ahead to get the trail
- // even though we don't want to really be positioned there.
- c = CINextTrail32(ci, c);
- }
- if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c != CI_DONE32) {
- // We got a supplementary char. Back the iterator up to the postion
- // of the lead surrogate.
- ci.previous();
+ for (LanguageBreakEngine candidate : fBreakEngines) {
+ if (candidate.handles(c, fBreakType)) {
+ return candidate;
+ }
- return c;
- }
- // Out-of-line portion of the in-line Next32 code.
- // The call site does an initial ci.next() and calls this function
- // if the 16 bit value it gets is >= LEAD_SURROGATE_MIN_VALUE.
- // NOTE: we leave the underlying char iterator positioned in the
- // middle of a surroage pair. ci.next() will work correctly
- // from there, but the ci.getIndex() will be wrong, and needs
- // adjustment.
- private static int CINextTrail32(CharacterIterator ci, int lead) {
- int retVal = lead;
- char cTrail = ci.next();
- if (UTF16.isTrailSurrogate(cTrail)) {
- retVal = ((lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +
- } else {
- ci.previous();
- }
- } else {
- if (lead == CharacterIterator.DONE && ci.getIndex() >= ci.getEndIndex()) {
- retVal = CI_DONE32;
+ // if we don't have an existing engine, build one.
+ int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
+ LanguageBreakEngine eng = null;
+ try {
+ switch (script) {
+ case UScript.THAI:
+ eng = new ThaiBreakEngine();
+ break;
+ case UScript.KATAKANA:
+ case UScript.HIRAGANA:
+ case UScript.HAN:
+ if (getBreakType() == KIND_WORD)
+ eng = new CjkBreakEngine(false);
+ break;
+ case UScript.HANGUL:
+ if (getBreakType() == KIND_WORD)
+ eng = new CjkBreakEngine(true);
+ break;
+ default:
+ fUnhandledBreakEngine.handleChar(c, getBreakType());
+ eng = fUnhandledBreakEngine;
+ break;
+ } catch (IOException e) {
+ eng = null;
- return retVal;
- }
- private static int CIPrevious32(CharacterIterator ci) {
- if (ci.getIndex() <= ci.getBeginIndex()) {
- return CI_DONE32;
- }
- char trail = ci.previous();
- int retVal = trail;
- if (UTF16.isTrailSurrogate(trail) && ci.getIndex()>ci.getBeginIndex()) {
- char lead = ci.previous();
- if (UTF16.isLeadSurrogate(lead)) {
- retVal = (((int)lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +
- ((int)trail - UTF16.TRAIL_SURROGATE_MIN_VALUE) +
- } else {
- ci.next();
- }
+ if (eng != null) {
+ fBreakEngines.add(eng);
- return retVal;
+ return eng;
- static int CICurrent32(CharacterIterator ci) {
- char lead = ci.current();
- int retVal = lead;
- return retVal;
- }
- if (UTF16.isLeadSurrogate(lead)) {
- int trail = (int)ci.next();
- ci.previous();
- if (UTF16.isTrailSurrogate((char)trail)) {
- retVal = ((lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +
- }
- } else {
- if (lead == CharacterIterator.DONE) {
- if (ci.getIndex() >= ci.getEndIndex()) {
- retVal = CI_DONE32;
- }
- }
- }
- return retVal;
- }
// handleNext(void) All forward iteration vectors through this function.
- // NOTE: This function is overridden by the dictionary base break iterator.
- // User level API functions go to the dbbi implementation
- // when the break iterator type is dbbi.
- // The DBBI implementation sometimes explicitly calls back to here,
- // its inherited handleNext().
int handleNext() {
- return handleNext(fRData.fFTable);
+ CharacterIterator text = getText();
+ // if there are no cached break positions, or if we've just moved
+ // off the end of the range covered by the cache, we have to dump
+ // and possibly regenerate the cache
+ int startPos = text.getIndex();
+ if (fCachedBreakPositions == null || fPositionInCache == fCachedBreakPositions.length - 1) {
+ // start by using the rules handleNext() to find a tentative return
+ // value. dictionaryCharCount tells us how many dictionary characters
+ // we passed over on our way to the tentative return value
+ fDictionaryCharCount = 0;
+ int result = handleNext(fRData.fFTable);
+ // if we passed over more than one dictionary character, then we use
+ // divideUpDictionaryRange() to regenerate the cached break positions
+ // for the new range.
+ if (fDictionaryCharCount > 1 && result - startPos > 1) {
+ text.setIndex(startPos);
+ LanguageBreakEngine e = getEngineFor(current32(text));
+ if (e != null) {
+ // we have an engine! use it to produce breaks
+ Stack<Integer> breaks = new Stack<Integer>();
+ e.findBreaks(text, startPos, result, false, getBreakType(), breaks);
+ fCachedBreakPositions = new int[breaks.size() + 2];
+ fCachedBreakPositions[0] = startPos;
+ for (int i = 0; i < breaks.size(); i++) {
+ fCachedBreakPositions[i + 1] = breaks.elementAt(i).intValue();
+ }
+ fCachedBreakPositions[breaks.size() + 1] = result;
+ fPositionInCache = 0;
+ } else {
+ // we don't have an engine; just use the rules
+ text.setIndex(result);
+ return result;
+ }
+ }
+ else {
+ // otherwise, the value we got back from the inherited function
+ // is our return value, and we can dump the cache
+ fCachedBreakPositions = null;
+ return result;
+ }
+ }
+ // if the cache of break positions has been regenerated (or existed all
+ // along), then just advance to the next break position in the cache
+ // and return it
+ if (fCachedBreakPositions != null) {
+ ++fPositionInCache;
+ text.setIndex(fCachedBreakPositions[fPositionInCache]);
+ return fCachedBreakPositions[fPositionInCache];
+ }
+ Assert.assrt(false);
+ return -9999; // WE SHOULD NEVER GET HERE!
result = initialPosition;
c = fText.current();
- c = CINextTrail32(fText, c);
- if (c == CI_DONE32) {
+ c = nextTrail32(fText, c);
+ if (c == DONE32) {
fLastRuleStatusIndex = 0;
return BreakIterator.DONE;
mode = RBBI_START;
// loop until we reach the end of the text or transition to state 0
while (state != STOP_STATE) {
- if (c == CI_DONE32) {
+ if (c == DONE32) {
// Reached end of input string.
if (mode == RBBI_END) {
// We have already run the loop one last time with the
// Ran off end, no match found.
// move forward one
- CINext32(fText);
+ next32(fText);
if (mode == RBBI_RUN) {
c = (int)fText.next();
- c = CINextTrail32(fText, c);
+ c = nextTrail32(fText, c);
} else {
if (mode == RBBI_START) {
if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
// Match found, common case
result = fText.getIndex();
- if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c != CI_DONE32) {
+ if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c != DONE32) {
// The iterator has been left in the middle of a surrogate pair.
// We want the start of it.
lookaheadResult = fText.getIndex();
// The iterator has been left in the middle of a surrogate pair.
// We want the beginning of it.
// at least one character.)
if (result == initialPosition) {
result = fText.setIndex(initialPosition);
- CINext32(fText);
+ next32(fText);
result = fText.getIndex();
return result;
private int handlePrevious(short stateTable[]) {
if (fText == null || stateTable == null) {
return 0;
// set up the starting char
initialPosition = fText.getIndex();
result = initialPosition;
- c = CIPrevious32(fText);
+ c = previous32(fText);
// Set up the initial state for the state machine
state = START_STATE;
mainLoop: for (;;) {
innerBlock: {
- if (c == CI_DONE32) {
+ if (c == DONE32) {
// Reached end of input string.
if (mode == RBBI_END || fRData.fHeader.fVersion == 1) {
// Either this is the old (ICU 3.2 and earlier) format data which
// Ran off start, no match found.
// Move one position (towards the start, since we are doing previous.)
- CIPrevious32(fText);
+ previous32(fText);
break mainLoop;
// time.
result = lookaheadResult;
lookaheadStatus = 0;
- // TODO: make a standalone hard break in a rule work.
+ // TODO: make a stand-alone hard break in a rule work.
if (lookAheadHardBreak) {
break mainLoop;
// then move iterator position backwards one character
if (mode == RBBI_RUN) {
- c = CIPrevious32(fText);
+ c = previous32(fText);
} else {
if (mode == RBBI_START) {
mode = RBBI_RUN;
// at least one character.)
if (result == initialPosition) {
result = fText.setIndex(initialPosition);
- CIPrevious32(fText);
+ previous32(fText);
result = fText.getIndex();
return result;
- //-------------------------------------------------------------------------------
- //
- // isDictionaryChar Return true if the category lookup for this char
- // indicates that it is in the set of dictionary lookup
- // chars.
- //
- // This function is intended for use by dictionary based
- // break iterators.
- //
- //-------------------------------------------------------------------------------
- boolean isDictionaryChar(int c) {
- short category = (short) fRData.fTrie.getCodePointValue(c);
- return (category & 0x4000) != 0;
- }
- * Copyright (C) 1996-2011, International Business Machines Corporation and *
+ * Copyright (C) 2012, International Business Machines Corporation and *
* others. All Rights Reserved. *
package com.ibm.icu.text;
import java.io.IOException;
-import java.io.InputStream;
import java.text.CharacterIterator;
import java.util.Stack;
-import com.ibm.icu.impl.Assert;
-class ThaiBreakIterator extends DictionaryBasedBreakIterator {
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.lang.UProperty;
+import com.ibm.icu.lang.UScript;
+public class ThaiBreakEngine implements LanguageBreakEngine {
/* Helper class for improving readability of the Thai word break
* algorithm.
//list of word candidate lengths, in increasing length order
private int lengths[];
private int count[]; // Count of candidates
- private int prefix; // The longeset match with a dictionary word
+ private int prefix; // The longest match with a dictionary word
private int offset; // Offset in the text of these candidates
private int mark; // The preferred candidate's offset
private int current; // The candidate we're currently looking at
// Fill the list of candidates if needed, select the longest, and return the number found
- public int candidates(CharacterIterator fIter, BreakCTDictionary dict, int rangeEnd) {
+ public int candidates(CharacterIterator fIter, DictionaryMatcher dict, int rangeEnd) {
int start = fIter.getIndex();
if (start != offset) {
offset = start;
return lengths[mark];
- // Backup from the current candidate to the next shorter one; rreturn true if that exists
+ // Backup from the current candidate to the next shorter one; return true if that exists
// and point the text after it
public boolean backUp(CharacterIterator fIter) {
if (current > 0) {
mark = current;
- private static UnicodeSet fThaiWordSet;
- private static UnicodeSet fEndWordSet;
- private static UnicodeSet fBeginWordSet;
- private static UnicodeSet fSuffixSet;
- private static UnicodeSet fMarkSet;
- private BreakCTDictionary fDictionary;
// Constants for ThaiBreakIterator
// How many words in a row are "good enough"?
private static final byte THAI_LOOKAHEAD = 3;
private static final char THAI_MAIYAMOK = 0x0E46;
// Minimum word size
private static final byte THAI_MIN_WORD = 2;
- // Minimum number of characters for two words
- //private final int THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
+ private DictionaryMatcher fDictionary;
+ private static UnicodeSet fThaiWordSet;
+ private static UnicodeSet fEndWordSet;
+ private static UnicodeSet fBeginWordSet;
+ private static UnicodeSet fSuffixSet;
+ private static UnicodeSet fMarkSet;
static {
// Initialize UnicodeSets
fThaiWordSet = new UnicodeSet();
- public ThaiBreakIterator(InputStream ruleStream, InputStream dictionaryStream) throws IOException {
- super(ruleStream);
- // Initialize diciontary
- fDictionary = new BreakCTDictionary(dictionaryStream);
+ public ThaiBreakEngine() throws IOException {
+ // Initialize dictionary
+ fDictionary = DictionaryData.loadDictionaryFor("Thai");
- /**
- * This is the implementation function for next().
- */
- protected int handleNext() {
- CharacterIterator text = getText();
- // if there are no cached break positions, or if we've just moved
- // off the end of the range covered by the cache, we have to dump
- // and possibly regenerate the cache
- if (cachedBreakPositions == null || positionInCache == cachedBreakPositions.length - 1) {
- // start by using the inherited handleNext() to find a tentative return
- // value. dictionaryCharCount tells us how many dictionary characters
- // we passed over on our way to the tentative return value
- int startPos = text.getIndex();
- fDictionaryCharCount = 0;
- int result = super.handleNext();
- // if we passed over more than one dictionary character, then we use
- // divideUpDictionaryRange() to regenerate the cached break positions
- // for the new range
- if (fDictionaryCharCount > 1 && result - startPos > 1) {
- divideUpDictionaryRange(startPos, result);
- }
- // otherwise, the value we got back from the inherited fuction
- // is our return value, and we can dump the cache
- else {
- cachedBreakPositions = null;
- return result;
- }
- }
- // if the cache of break positions has been regenerated (or existed all
- // along), then just advance to the next break position in the cache
- // and return it
- if (cachedBreakPositions != null) {
- ++positionInCache;
- text.setIndex(cachedBreakPositions[positionInCache]);
- return cachedBreakPositions[positionInCache];
+ public boolean handles(int c, int breakType) {
+ if (breakType == BreakIterator.KIND_WORD || breakType == BreakIterator.KIND_LINE) {
+ int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
+ return (script == UScript.THAI);
- Assert.assrt(false);
- return -9999; // SHOULD NEVER GET HERE!
+ return false;
- /**
- * Divide up a range of known dictionary characters.
- *
- * @param rangeStart The start of the range of dictionary characters
- * @param rangeEnd The end of the range of dictionary characters
- * @return The number of breaks found
- */
- private int divideUpDictionaryRange(int rangeStart, int rangeEnd) {
+ public int findBreaks(CharacterIterator fIter, int rangeStart, int rangeEnd, boolean reverse, int breakType,
+ Stack<Integer> foundBreaks) {
if ((rangeEnd - rangeStart) < THAI_MIN_WORD) {
- return 0; // Not enough chacters for word
+ return 0; // Not enough characters for word
- CharacterIterator fIter = getText();
int wordsFound = 0;
int wordLength;
int current;
- Stack<Integer> foundBreaks = new Stack<Integer>();
PossibleWord words[] = new PossibleWord[THAI_LOOKAHEAD];
for (int i = 0; i < THAI_LOOKAHEAD; i++) {
words[i] = new PossibleWord();
wordsFound += 1;
- // If there was more than one, see which one can take use forward the most words
+ // If there was more than one, see which one can take us forward the most words
else if (candidates > 1) {
boolean foundBest = false;
// If we're already at the end of the range, we're done
} while (words[wordsFound%THAI_LOOKAHEAD].backUp(fIter) && !foundBest);
- /* foundBest: */wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(fIter);
+ wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(fIter);
wordsFound += 1;
// We come here after having either found a word or not. We look ahead to the
// next word. If it's not a dictionary word, we will combine it with the word we
// just found (if there is one), but only if the preceding word does not exceed
// two characters after uc were not 0x0E4C THANTHAKHAT before
// checking the dictionary. That is just a performance filter,
// but it's not clear it's faster than checking the trie
- int candidate = words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd);
- fIter.setIndex(current+wordLength+chars);
+ int candidate = words[(wordsFound + 1) %THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd);
+ fIter.setIndex(current + wordLength + chars);
if (candidate > 0) {
pc = uc;
- // Bump the word cound if there wasn't already one
+ // Bump the word count if there wasn't already one
if (wordLength <= 0) {
wordsFound += 1;
} else {
- fIter.setIndex(current+wordLength);
+ fIter.setIndex(current + wordLength);
// Did we find a word on this iteration? If so, push it on the break stack
if (wordLength > 0) {
- foundBreaks.push(Integer.valueOf(current+wordLength));
+ foundBreaks.push(Integer.valueOf(current + wordLength));
wordsFound -= 1;
- // Store the break points in cachedBreakPositions.
- cachedBreakPositions = new int[foundBreaks.size() + 2];
- cachedBreakPositions[0] = rangeStart;
- int i;
- for (i = 0; i < foundBreaks.size(); i++) {
- cachedBreakPositions[i + 1] = foundBreaks.elementAt(i).intValue();
- }
- cachedBreakPositions[i + 1] = rangeEnd;
- positionInCache = 0;
return wordsFound;
--- /dev/null
+ *******************************************************************************
+ * Copyright (C) 2012, International Business Machines Corporation and *
+ * others. All Rights Reserved. *
+ *******************************************************************************
+ */
+package com.ibm.icu.text;
+import java.text.CharacterIterator;
+import java.util.Stack;
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.lang.UProperty;
+import static com.ibm.icu.impl.CharacterIteration.*;
+public final class UnhandledBreakEngine implements LanguageBreakEngine {
+ // TODO: Use two arrays of UnicodeSet, one with all frozen sets, one with unfrozen.
+ // in handleChar(), update the unfrozen version, clone, freeze, replace the frozen one.
+ private final UnicodeSet[] fHandled = new UnicodeSet[BreakIterator.KIND_TITLE + 1];
+ public UnhandledBreakEngine() {
+     // Every slot of fHandled gets its own, initially empty, set.
+     int kind = 0;
+     while (kind < fHandled.length) {
+         fHandled[kind] = new UnicodeSet();
+         kind++;
+     }
+ }
+ /**
+  * Tests whether the character has been recorded as handled for the given break type.
+  * <p>
+  * The method is synchronized because handleChar() mutates the same UnicodeSet
+  * while holding this object's lock; an unlocked read could observe the set in
+  * the middle of that update.
+  *
+  * @param c         the code point to look up.
+  * @param breakType index into the per-break-type table; values outside the
+  *                  table's bounds yield false.
+  * @return true if breakType is in range and its set contains c.
+  */
+ public synchronized boolean handles(int c, int breakType) {
+     if (breakType < 0 || breakType >= fHandled.length) {
+         return false;
+     }
+     return fHandled[breakType].contains(c);
+ }
+ /**
+  * Produces no breaks for the range: the text is simply moved to
+  * <code>endPos</code> and zero is returned. <code>startPos</code>,
+  * <code>reverse</code>, <code>breakType</code> and <code>foundBreaks</code>
+  * are not used, so nothing is pushed onto the stack.
+  *
+  * @param text   the text being analyzed; left positioned at endPos.
+  * @param endPos the index the text is moved to.
+  * @return always 0.
+  */
+ public int findBreaks(CharacterIterator text, int startPos, int endPos,
+ boolean reverse, int breakType, Stack<Integer> foundBreaks) {
+ text.setIndex(endPos);
+ return 0;
+ }
+ /**
+  * Records, for the given break type, that every code point sharing the
+  * script of <code>c</code> is handled by this engine.
+  * <p>
+  * Scripts accumulate: recording a new script keeps the ones recorded
+  * earlier. Calls with a break type outside the table, with
+  * <code>DONE32</code>, or with a character that is already recorded are
+  * no-ops.
+  *
+  * @param c         the code point whose script should be recorded.
+  * @param breakType index into the per-break-type table.
+  */
+ public synchronized void handleChar(int c, int breakType) {
+     if (breakType < 0 || breakType >= fHandled.length || c == DONE32) {
+         return;
+     }
+     final UnicodeSet handled = fHandled[breakType];
+     if (handled.contains(c)) {
+         return;
+     }
+     final int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
+     // applyIntPropertyValue() replaces the contents of the set it is called on,
+     // so build the script's set separately and merge it in; applying it directly
+     // to the shared set would discard every script recorded before this one.
+     final UnicodeSet scriptSet = new UnicodeSet();
+     scriptSet.applyIntPropertyValue(UProperty.SCRIPT, script);
+     handled.addAll(scriptSet);
+ }
version https://git-lfs.github.com/spec/v1
-oid sha256:a243a8584459d751b33c922f2fbfaea27200721a1a27661b5fa2ec96bb5fc6e2
-size 7929565
+oid sha256:23641fd85dfa40f916a7a5b47a6dc8ebd591862a9fe2d62ddcd46b7f1a862d36
+size 9286396
version https://git-lfs.github.com/spec/v1
-oid sha256:fc6ebf5e136b448a03a7e74463c67d96217cc9f9d3feed4d2aa7f74dc5e25e63
+oid sha256:e951e7a3cc20e7126326db97e92ce533db611fde39c201795680246fde86c8e0
size 97666
version https://git-lfs.github.com/spec/v1
-oid sha256:2029b2752b52d544749fffea9b2574ddfd19ea278cf5f26243efd98bd3f15313
-size 719725
+oid sha256:54eeee6d7834231edb7d2d9bd3174d3c4347c737f556bc6b25915bb6860b6fe2
+size 719912
- * Copyright (C) 1996-2010, International Business Machines Corporation and *
+ * Copyright (C) 1996-2012, International Business Machines Corporation and *
* others. All Rights Reserved. *
package com.ibm.icu.dev.test.rbbi;
-import java.io.DataInputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
import java.text.StringCharacterIterator;
import java.util.ArrayList;
import java.util.List;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.text.BreakIterator;
-import com.ibm.icu.text.DictionaryBasedBreakIterator;
public class BreakIteratorTest extends TestFmwk
errln("ERR: Failed to create an instance type: " + type + " / locale: " + loc + " / exception: " + e.getMessage());
- /*
- * Tests the constructors public DictionaryBasedBreakIterator(String rules, ... public
- * DictionaryBasedBreakIterator(InputStream compiledRules, ...
- */
- public void TestDictionaryBasedBreakIterator() throws IOException {
- // The following class allows the testing of the constructor
- // public DictionaryBasedBreakIterator(String rules, ...
- class TestDictionaryBasedBreakIterator extends DictionaryBasedBreakIterator {
- public TestDictionaryBasedBreakIterator(InputStream is) throws IOException {
- super("", is);
- }
- }
- try {
- @SuppressWarnings("unused")
- TestDictionaryBasedBreakIterator td = new TestDictionaryBasedBreakIterator(null);
- errln("DictionaryBasedBreakIterator constructor is suppose to return an "
- + "exception for an empty string.");
- } catch (Exception e) {
- }
- try {
- File file = File.createTempFile("dummy", "");
- FileInputStream fis = new FileInputStream(file);
- DataInputStream dis = new DataInputStream(fis);
- @SuppressWarnings("unused")
- TestDictionaryBasedBreakIterator td = new TestDictionaryBasedBreakIterator(dis);
- errln("DictionaryBasedBreakIterator constructor is suppose to return an "
- + "exception for a temporary file with EOF.");
- } catch (Exception e) {
- }
- // The following class allows the testing of the constructor
- // public DictionaryBasedBreakIterator(InputStream compiledRules, ...
- class TestDictionaryBasedBreakIterator1 extends DictionaryBasedBreakIterator {
- public TestDictionaryBasedBreakIterator1() throws IOException {
- super((InputStream) null, (InputStream) null);
- }
- }
- try {
- @SuppressWarnings("unused")
- TestDictionaryBasedBreakIterator1 td1 = new TestDictionaryBasedBreakIterator1();
- errln("DictionaryBasedBreakIterator constructor is suppose to return an "
- + "exception for an null input stream.");
- } catch (Exception e) {
- }
- }
\ No newline at end of file
- * Copyright (C) 1996-2011, International Business Machines Corporation and
+ * Copyright (C) 1996-2012, International Business Machines Corporation and
* others. All Rights Reserved.
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.text.BreakIterator;
-import com.ibm.icu.text.DictionaryBasedBreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.util.ULocale;
errln("Incorrect following position.");
int []fillInArray = new int[2];
- if (((DictionaryBasedBreakIterator)brk).getRuleStatusVec(fillInArray) != 1 || fillInArray[0] != 0) {
+ if (((RuleBasedBreakIterator)brk).getRuleStatusVec(fillInArray) != 1 || fillInArray[0] != 0) {
errln("Error: Since getRuleStatusVec is not supported in DictionaryBasedBreakIterator, it should return 1 and fillInArray[0] == 0.");
final String posxWordText = "Can't have breaks in xx:yy or struct.field for CS-types.";
final int[] posxWordTOffsets = { 5, 6, 10, 11, 17, 18, 20, 21, 23, 24, 26, 27, 29, 30, 36, 37, 42, 43, 46, 47, 49, 50, 55, 56 };
final int[] posxWordROffsets = { 5, 6, 10, 11, 17, 18, 20, 21, 26, 27, 29, 30, 42, 43, 46, 47, 49, 50, 55, 56 };
- // KIND_WORD "ja"
- final String jaWordText = "\u79C1\u9054\u306B\u4E00\u3007\u3007\u3007\u306E\u30B3\u30F3\u30D4\u30E5\u30FC\u30BF" +
- "\u304C\u3042\u308B\u3002\u5948\u3005\u306F\u30EF\u30FC\u30C9\u3067\u3042\u308B\u3002";
- final int[] jaWordTOffsets = { 2, 3, 7, 8, 14, 17, 18, 20, 21, 24, 27, 28 };
- final int[] jaWordROffsets = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28 };
final String elSentText = "\u0391\u03B2, \u03B3\u03B4; \u0395 \u03B6\u03B7\u037E \u0398 \u03B9\u03BA. " +
"\u039B\u03BC \u03BD\u03BE! \u039F\u03C0, \u03A1\u03C2? \u03A3";
final TBItem[] tests = {
new TBItem( BreakIterator.KIND_WORD, new ULocale("en_US_POSIX"), posxWordText, posxWordTOffsets ),
new TBItem( BreakIterator.KIND_WORD, ULocale.ROOT, posxWordText, posxWordROffsets ),
- new TBItem( BreakIterator.KIND_WORD, new ULocale("ja"), jaWordText, jaWordTOffsets ),
- new TBItem( BreakIterator.KIND_WORD, ULocale.ROOT, jaWordText, jaWordROffsets ),
new TBItem( BreakIterator.KIND_SENTENCE, new ULocale("el"), elSentText, elSentTOffsets ),
new TBItem( BreakIterator.KIND_SENTENCE, ULocale.ROOT, elSentText, elSentROffsets ),
new TBItem( BreakIterator.KIND_CHARACTER, new ULocale("th"), thCharText, thCharTOffsets ),
public void TestExtended() {
TestParams tp = new TestParams();
// Run the iterator backwards, verify that the same breaks are found.
- * Copyright (C) 2003-2011 International Business Machines Corporation and
+ * Copyright (C) 2003-2012 International Business Machines Corporation and
* others. All Rights Reserved.
UnicodeSet fExtendSet;
UnicodeSet fExtendNumLetSet;
UnicodeSet fOtherSet;
+ UnicodeSet fDictionaryCjkSet;
RBBIWordMonkey() {
fCharProperty = UProperty.WORD_BREAK;
+ fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]");
fCRSet = new UnicodeSet("[\\p{Word_Break = CR}]");
fLFSet = new UnicodeSet("[\\p{Word_Break = LF}]");
fNewlineSet = new UnicodeSet("[\\p{Word_Break = Newline}]");
fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}]");
+ fALetterSet.removeAll(fDictionaryCjkSet);
fKatakanaSet = new UnicodeSet("[\\p{Word_Break = Katakana}]");
fMidNumLetSet = new UnicodeSet("[\\p{Word_Break = MidNumLet}]");
fMidLetterSet = new UnicodeSet("[\\p{Word_Break = MidLetter}]");
// Inhibit dictionary characters from being tested at all.
fOtherSet.removeAll(new UnicodeSet("[\\p{LineBreak = Complex_Context}]"));
+ fOtherSet.removeAll(fDictionaryCjkSet);
fSets = new ArrayList();
- fSets.add(fKatakanaSet);
+ //fSets.add(fKatakanaSet); // TODO: work out how to test katakana
* return the index of the next code point in the input text.
* @param i the preceding index
- * @return
static int nextCP(StringBuffer s, int i) {
if (i == -1) {
- * Copyright (C) 1996-2006, International Business Machines Corporation and *
+ * Copyright (C) 1996-2012, International Business Machines Corporation and *
* others. All Rights Reserved. *
package com.ibm.icu.dev.test.rbbi;
-import java.io.IOException;
-import java.io.InputStream;
import java.util.ListResourceBundle;
-import java.util.MissingResourceException;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.text.BreakIterator;
-import com.ibm.icu.text.DictionaryBasedBreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
// TODO: {dlf} this test currently doesn't test anything!
"Character", "Word", "Line", "Sentence"
String rulesName = kindNames[kind] + "BreakRules";
- String dictionaryName = kindNames[kind] + "BreakDictionary";
String[] classNames = bundle.getStringArray("BreakIteratorClasses");
String rules = bundle.getString(rulesName);
if (classNames[kind].equals("RuleBasedBreakIterator")) {
iter = new RuleBasedBreakIterator(rules);
- else if (classNames[kind].equals("DictionaryBasedBreakIterator")) {
- try {
- String dictionaryPath = bundle.getString(dictionaryName);
- InputStream dictionary = bundle.getClass().getResourceAsStream(dictionaryPath);
- System.out.println("looking for " + dictionaryPath + " from " + bundle.getClass() + " returned " + dictionary);
- iter = new DictionaryBasedBreakIterator(rules, dictionary);
- }
- catch(IOException e) {
- e.printStackTrace();
- errln(e.getMessage());
- System.out.println(e); // debug
- }
- catch(MissingResourceException e) {
- errln(e.getMessage());
- System.out.println(e); // debug
- }
- }
if (iter == null) {
errln("could not create iterator");
# Temp debugging tests
-<locale en>
-<data>•Hello, •World.•</data>
<data>•abc<200>\U0001D800•def<200>\U0001D3FF• •</data>
# Hiragana & Katakana stay together, but separates from each other and Latin.
+# *** what to do about theoretical combos of chars? e.g. hiragana + accent
+# test normalization/dictionary handling of halfwidth katakana: same dictionary phrase in fullwidth and halfwidth
+# Testing of word boundary for dictionary word containing both kanji and kana
# Words with interior formatting characters
# to test for bug #4097779
<data>•aa\N{COMBINING GRAVE ACCENT}a<200> •</data>
# to test for bug #4098467
# What follows is a string of Korean characters (I found it in the Yellow Pages
# ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
# precomposed syllables...
<data>•\uc0c1\ud56d<200> •\ud55c\uc778<200> •\uc5f0\ud569<200> •\uc7a5\ub85c\uad50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u110b\u1175\u11ab<200> •\u110b\u1167\u11ab\u1112\u1161\u11b8<200> •\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c<200> •</data>
-<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<200>\u4e03<400>\u4e03<400>abc<200> •</data>
+# more Korean tests (Jamo not tested here, not counted as dictionary characters)
+# Disabled for now because we don't include a Korean dictionary.
+#<data>•\ud604\uc7ac<200>\ub294<200> •\uac80\ucc30<200>\uc774<200> •\ubd84\uc2dd<200>\ud68c\uacc4<200>\ubb38\uc81c<200>\ub97c<200> •\uc870\uc0ac<200>\ud560<200> •\uac00\ub2a5\uc131<200>\uc740<200> •\uc5c6\ub2e4<200>\u002e•</data>
+#<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<400>\u4e03\u4e03<400>abc<200> •</data>
# Try some words from other scripts.
<data>•\uc0c1•\ud56d •\ud55c•\uc778 •\uc5f0•\ud569 •\uc7a5•\ub85c•\uad50•\ud68c•</data>
# conjoining jamo...
-# TODO: rules update needed
-#<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u11ab #•\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1100\u116d•\u1112\u116c•</data>
+<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u11ab •\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1100\u116d•\u1112\u116c•</data>
# to test for bug #4117554: Fullwidth .!? should be treated as postJwrd
# Test data originally from the test code source file
# // @suwit -- Thai sample data from GVT Guideline
-## Test data originally from http://bugs.icu-project.org/trac/search?q=r30327
-#<data>•กู<200> •กิน<200>กุ้ง<200> •ปิ้่<200>งอ<200>ยู่<200>ใน<200>ถ้ำ<200></data>
+# Test data originally from http://bugs.icu-project.org/trac/search?q=r30327
+<data>•กู<200> •กิน<200>กุ้ง<200> •ปิ้่<200>งอ<200>ยู่<200>ใน<200>ถ้ำ<200></data>
# @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters start
-#\u0E1E\u0E38\u0E17\u0E18\u0E28\u0E31\u0E01\u0E23\u0E32\u0E0A •\
-#2545 •\
-#\u0E23\u0E2D\u0E1A •\
-#\"\u0E52\u0E52\u0E50 •\
-#\u0E1b\u0E35\" •\
-#\u0E23\u0E31\u0E15\u0E19\u0E42\u0E01\u0E2A\u0E34\u0E19\u0E17\u0E23\u0E4C •\
-#\u0E2B\u0E23\u0E37\u0E2D •\
+\u0E1E\u0E38\u0E17\u0E18•\u0E28\u0E31\u0E01\u0E23\u0E32\u0E0A •\
+2545 •\
+\u0E23\u0E2D\u0E1A •\
+\"\u0E52\u0E52\u0E50 •\
+\u0E1b\u0E35\" •\
+\u0E23\u0E31\u0E15\u0E19•\u0E42\u0E01•\u0E2A\u0E34•\u0E19\u0E17\u0E23\u0E4C •\
+\u0E2B\u0E23\u0E37\u0E2D •\
# Data originally from RBBITest::TestMaiyamok()
# The Thai maiyamok character is a shorthand symbol that means "repeat the previous
-# Khmer Tests
-# Test data originally from http://bugs.icu-project.org/trac/search?q=r30327
-# from the file testdata/wordsegments.txt
-<locale th>
-# Jitterbug 3671 Test Case
-#<data>•สวัสดี<200>ครับ<200>สบาย<200>ดี<200>ไหม<200> •ครับ<200></data>
-# Trac ticket 5595 Test Case
# Tailored (locale specific) breaking.
<locale ja>
<locale en>
# The following data was originally in RBBITest::TestJapaneseWordBreak()
<locale ja>
# UBreakIteratorType UBRK_WORD, Locale "ja"
# Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009).
# \u79C1\u9054\u306B\u4E00\u3007\u3007\u3007\u306E\u30B3\u30F3\u30D4\u30E5\u30FC\u30BF\u304C\u3042\u308B\u3002\u5948\u3005\u306F\u30EF\u30FC\u30C9\u3067\u3042\u308B\u3002
+# modified to work with dbbi code - should verify
<locale ja>
<locale root>
# UBreakIteratorType UBRK_SENTENCE, Locale "el"
# Add break after Greek question mark (cldrbug #2069).
errln("Did not get the expected output for referencingalias");
- {
- rb = (UResourceBundle)UResourceBundle.getBundleInstance("com/ibm/icu/dev/data/testdata","testaliases",testLoader);
- sub = rb.get("boundaries");
- String word = sub.getString("word");
- if(word.equals("word_ja.brk")){
- logln("Got the expected output for boundaries/word");
- }else{
- errln("Did not get the expected type for boundaries/word");
- }
- }
UResourceBundle rb1 = (UResourceBundle)UResourceBundle.getBundleInstance("com/ibm/icu/dev/data/testdata","testaliases",testLoader);
- public void TestBreakIterator() {
- checkService("ja_JP_OSAKA", new ServiceFacade() {
- public Object create(ULocale req) {
- return BreakIterator.getWordInstance(req);
- }
- }, null, new Registrar() {
- public Object register(ULocale loc, Object prototype) {
- return BreakIterator.registerInstance(
- (BreakIterator) prototype,
- loc, BreakIterator.KIND_WORD);
- }
- public boolean unregister(Object key) {
- return BreakIterator.unregister(key);
- }
- });
- }
public void TestDateFormat() {
checkService("de_CH_ZURICH", new ServiceFacade() {
public Object create(ULocale req) {