import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.text.CharacterIterator;
-import java.util.concurrent.ConcurrentHashMap;
+import java.util.ArrayList;
+import java.util.List;
import com.ibm.icu.impl.Assert;
import com.ibm.icu.impl.CharTrie;
private RuleBasedBreakIterator() {
fLastStatusIndexValid = true;
fDictionaryCharCount = 0;
- fBreakEngines.put(-1, fUnhandledBreakEngine);
+ synchronized(gAllBreakEngines) {
+ fBreakEngines = new ArrayList<LanguageBreakEngine>(gAllBreakEngines);
+ }
}
/**
if (fText != null) {
result.fText = (CharacterIterator)(fText.clone());
}
+ synchronized (gAllBreakEngines) {
+ result.fBreakEngines = new ArrayList<LanguageBreakEngine>(gAllBreakEngines);
+ }
+ result.fLookAheadMatches = new LookAheadResults();
+ if (fCachedBreakPositions != null) {
+ result.fCachedBreakPositions = fCachedBreakPositions.clone();
+ }
return result;
}
* The "default" break engine - just skips over ranges of dictionary words,
* producing no breaks. Should only be used if characters need to be handled
* by a dictionary but we have no dictionary implementation for them.
+ *
+ * Only one instance; shared by all break iterators.
*/
- private final UnhandledBreakEngine fUnhandledBreakEngine = new UnhandledBreakEngine();
+ private static final UnhandledBreakEngine gUnhandledBreakEngine;
+
+ /**
+ * List of all known break engines, common for all break iterators.
+ * Lazily updated as break engines are needed, because instantiation of
+ * break engines is expensive.
+ *
+ * Because gAllBreakEngines can be referenced concurrently from different
+ * BreakIterator instances, all access is synchronized.
+ */
+ private static final List<LanguageBreakEngine> gAllBreakEngines;
+
+ static {
+ gUnhandledBreakEngine = new UnhandledBreakEngine();
+ gAllBreakEngines = new ArrayList<LanguageBreakEngine>();
+ gAllBreakEngines.add(gUnhandledBreakEngine);
+ }
/**
+ * List of all known break engines. Similar to gAllBreakEngines, but local to a
+ * break iterator, allowing it to be used without synchronization.
+ */
+ private List<LanguageBreakEngine> fBreakEngines;
+
+ /**
* when a range of characters is divided up using the dictionary, the break
* positions that are discovered are stored here, preventing us from having
* to use either the dictionary or the state table again until the iterator
*/
private int fPositionInCache;
-
- private final ConcurrentHashMap<Integer, LanguageBreakEngine> fBreakEngines =
- new ConcurrentHashMap<Integer, LanguageBreakEngine>();
/**
* Dumps caches and performs other actions associated with a complete change
* in text or iteration position.
// We have a dictionary character.
// Does an already instantiated break engine handle it?
- for (LanguageBreakEngine candidate : fBreakEngines.values()) {
+ for (LanguageBreakEngine candidate : fBreakEngines) {
if (candidate.handles(c, fBreakType)) {
return candidate;
}
}
- // if we don't have an existing engine, build one.
- int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
- if (script == UScript.KATAKANA || script == UScript.HIRAGANA) {
- // Katakana, Hiragana and Han are handled by the same dictionary engine.
- // Fold them together for mapping from script -> engine.
- script = UScript.HAN;
- }
+ synchronized (gAllBreakEngines) {
+ // This break iterator's list of break engines didn't handle the character.
+ // Check the global list, another break iterator may have instantiated the
+ // desired engine.
+ for (LanguageBreakEngine candidate : gAllBreakEngines) {
+ if (candidate.handles(c, fBreakType)) {
+ fBreakEngines.add(candidate);
+ return candidate;
+ }
+ }
+
+ // The global list doesn't have an existing engine, build one.
+ int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
+ if (script == UScript.KATAKANA || script == UScript.HIRAGANA) {
+ // Katakana, Hiragana and Han are handled by the same dictionary engine.
+ // Fold them together for mapping from script -> engine.
+ script = UScript.HAN;
+ }
- LanguageBreakEngine eng = fBreakEngines.get(script);
- /*
- if (eng != null && !eng.handles(c, fBreakType)) {
- fUnhandledBreakEngine.handleChar(c, getBreakType());
- eng = fUnhandledBreakEngine;
- } else */ {
+ LanguageBreakEngine eng;
try {
switch (script) {
case UScript.THAI:
eng = new CjkBreakEngine(false);
}
else {
- fUnhandledBreakEngine.handleChar(c, getBreakType());
- eng = fUnhandledBreakEngine;
+ gUnhandledBreakEngine.handleChar(c, getBreakType());
+ eng = gUnhandledBreakEngine;
}
break;
case UScript.HANGUL:
if (getBreakType() == KIND_WORD) {
eng = new CjkBreakEngine(true);
} else {
- fUnhandledBreakEngine.handleChar(c, getBreakType());
- eng = fUnhandledBreakEngine;
+ gUnhandledBreakEngine.handleChar(c, getBreakType());
+ eng = gUnhandledBreakEngine;
}
break;
default:
- fUnhandledBreakEngine.handleChar(c, getBreakType());
- eng = fUnhandledBreakEngine;
+ gUnhandledBreakEngine.handleChar(c, getBreakType());
+ eng = gUnhandledBreakEngine;
break;
}
} catch (IOException e) {
eng = null;
}
- }
- if (eng != null && eng != fUnhandledBreakEngine) {
- LanguageBreakEngine existingEngine = fBreakEngines.putIfAbsent(script, eng);
- if (existingEngine != null) {
- // There was a race & another thread was first to register an engine for this script.
- // Use theirs and discard the one we just created.
- eng = existingEngine;
+ if (eng != null && eng != gUnhandledBreakEngine) {
+ gAllBreakEngines.add(eng);
+ fBreakEngines.add(eng);
}
- // assert eng.handles(c, fBreakType);
- }
- return eng;
+ return eng;
+ } // end synchronized(gAllBreakEngines)
}
private static final int kMaxLookaheads = 8;
import static com.ibm.icu.impl.CharacterIteration.DONE32;
import java.text.CharacterIterator;
+import java.util.concurrent.atomic.AtomicReferenceArray;
import com.ibm.icu.impl.CharacterIteration;
import com.ibm.icu.lang.UCharacter;
final class UnhandledBreakEngine implements LanguageBreakEngine {
// TODO: Use two arrays of UnicodeSet, one with all frozen sets, one with unfrozen.
// in handleChar(), update the unfrozen version, clone, freeze, replace the frozen one.
- private final UnicodeSet[] fHandled = new UnicodeSet[BreakIterator.KIND_TITLE + 1];
+
+ // Note on concurrency: A single instance of UnhandledBreakEngine is shared across all
+ // RuleBasedBreakIterators in a process. They may make arbitrary concurrent calls.
+ // If handleChar() is updating the set of unhandled characters at the same time
+ // findBreaks() or handles() is referencing it, the referencing functions must see
+ // a consistent set. It doesn't matter whether they see it before or after the update,
+ // but they should not see an inconsistent, changing set.
+ //
+ // To do this, an update is made by cloning the old set, updating the clone, then
+ // replacing the old with the new. Once made visible, each set remains constant.
+
+ // TODO: it's odd that findBreaks() can produce different results, depending
+ // on which scripts have been previously seen by handleChar(). (This is not a
+ // threading specific issue). Possibly stop on script boundaries?
+
+ final AtomicReferenceArray<UnicodeSet> fHandled = new AtomicReferenceArray<UnicodeSet>(BreakIterator.KIND_TITLE + 1);
public UnhandledBreakEngine() {
- for (int i = 0; i < fHandled.length; i++) {
- fHandled[i] = new UnicodeSet();
+ for (int i = 0; i < fHandled.length(); i++) {
+ fHandled.set(i, new UnicodeSet());
}
}
-
+
+ @Override
public boolean handles(int c, int breakType) {
- return (breakType >= 0 && breakType < fHandled.length) &&
- (fHandled[breakType].contains(c));
+ return (breakType >= 0 && breakType < fHandled.length()) &&
+ (fHandled.get(breakType).contains(c));
}
+ @Override
public int findBreaks(CharacterIterator text, int startPos, int endPos,
boolean reverse, int breakType, DictionaryBreakEngine.DequeI foundBreaks) {
- if (breakType >= 0 && breakType < fHandled.length) {
- int c = CharacterIteration.current32(text);
- if (reverse) {
- while (text.getIndex() > startPos && fHandled[breakType].contains(c)) {
- CharacterIteration.previous32(text);
- c = CharacterIteration.current32(text);
- }
- } else {
- while (text.getIndex() < endPos && fHandled[breakType].contains(c)) {
- CharacterIteration.next32(text);
- c = CharacterIteration.current32(text);
- }
- }
- }
+ if (breakType >= 0 && breakType < fHandled.length()) {
+ UnicodeSet uniset = fHandled.get(breakType);
+ int c = CharacterIteration.current32(text);
+ if (reverse) {
+ while (text.getIndex() > startPos && uniset.contains(c)) {
+ CharacterIteration.previous32(text);
+ c = CharacterIteration.current32(text);
+ }
+ } else {
+ while (text.getIndex() < endPos && uniset.contains(c)) {
+ CharacterIteration.next32(text);
+ c = CharacterIteration.current32(text);
+ }
+ }
+ }
return 0;
}
- public synchronized void handleChar(int c, int breakType) {
- if (breakType >= 0 && breakType < fHandled.length && c != DONE32) {
- if (!fHandled[breakType].contains(c)) {
+ /**
+ * Update the set of unhandled characters for the specified breakType to include
+ * all that have the same script as c.
+ * May be called concurrently with handles() or findBreaks().
+ * Must not be called concurrently with itself.
+ */
+ public void handleChar(int c, int breakType) {
+ if (breakType >= 0 && breakType < fHandled.length() && c != DONE32) {
+ UnicodeSet originalSet = fHandled.get(breakType);
+ if (!originalSet.contains(c)) {
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
- fHandled[breakType].applyIntPropertyValue(UProperty.SCRIPT, script);
+ UnicodeSet newSet = new UnicodeSet();
+ newSet.applyIntPropertyValue(UProperty.SCRIPT, script);
+ newSet.addAll(originalSet);
+ fHandled.set(breakType, newSet);
}
}
}
bi.setText("abc");
bi.first();
assertEquals("Rule chaining test", 3, bi.next());
- }
}
+
+ @Test
+ public void TestBug12873() {
+ // Bug with RuleBasedBreakIterator's internal structure for recording potential look-ahead
+ // matches not being cloned when a break iterator is cloned. This resulted in usage
+ // collisions if the original break iterator and its clone were used concurrently.
+
+ // The Line Break rules for Regional Indicators make use of look-ahead rules, and
+ // show the bug. 1F1E6 = \uD83C\uDDE6 = REGIONAL INDICATOR SYMBOL LETTER A
+ // Regional indicators group into pairs, expect breaks after two code points, which
+ // is after four 16 bit code units.
+
+ final String dataToBreak = "\uD83C\uDDE6\uD83C\uDDE6\uD83C\uDDE6\uD83C\uDDE6\uD83C\uDDE6\uD83C\uDDE6";
+ final RuleBasedBreakIterator bi = (RuleBasedBreakIterator)BreakIterator.getLineInstance();
+ final AssertionError[] assertErr = new AssertionError[1]; // saves an error found from within a thread
+
+ class WorkerThread implements Runnable {
+ @Override
+ public void run() {
+ try {
+ RuleBasedBreakIterator localBI = (RuleBasedBreakIterator)bi.clone();
+ localBI.setText(dataToBreak);
+ for (int loop=0; loop<100; loop++) {
+ int nextExpectedBreak = 0;
+ for (int actualBreak = localBI.first(); actualBreak != BreakIterator.DONE;
+ actualBreak = localBI.next(), nextExpectedBreak+= 4) {
+ assertEquals("", nextExpectedBreak, actualBreak);
+ }
+ assertEquals("", dataToBreak.length()+4, nextExpectedBreak);
+ }
+ } catch (AssertionError e) {
+ assertErr[0] = e;
+ }
+ }
+ }
+
+ List<Thread> threads = new ArrayList<Thread>();
+ for (int n = 0; n<4; ++n) {
+ threads.add(new Thread(new WorkerThread()));
+ }
+ for (Thread thread: threads) {
+ thread.start();
+ }
+ for (Thread thread: threads) {
+ try {
+ thread.join();
+ } catch (InterruptedException e) {
+ fail(e.toString());
+ }
+ }
+
+ // JUnit wont see failures from within the worker threads, so
+ // check again if one occurred.
+ if (assertErr[0] != null) {
+ throw assertErr[0];
+ }
+ }
+
+ @Test
+ public void TestBreakAllChars() {
+ // Make a "word" from each code point, separated by spaces.
+ // For dictionary based breaking, runs the start-of-range
+ // logic with all possible dictionary characters.
+ StringBuilder sb = new StringBuilder();
+ for (int c=0; c<0x110000; ++c) {
+ sb.appendCodePoint(c);
+ sb.appendCodePoint(c);
+ sb.appendCodePoint(c);
+ sb.appendCodePoint(c);
+ sb.append(' ');
+ }
+ String s = sb.toString();
+
+ for (int breakKind=BreakIterator.KIND_CHARACTER; breakKind<=BreakIterator.KIND_TITLE; ++breakKind) {
+ RuleBasedBreakIterator bi =
+ (RuleBasedBreakIterator)BreakIterator.getBreakInstance(ULocale.ENGLISH, breakKind);
+ bi.setText(s);
+ int lastb = -1;
+ for (int b = bi.first(); b != BreakIterator.DONE; b = bi.next()) {
+ assertTrue("(lastb < b) : (" + lastb + " < " + b + ")", lastb < b);
+ }
+ }
+ }
+}