From: Andy Heninger Date: Tue, 17 Jan 2017 23:13:25 +0000 (+0000) Subject: ICU-12873 Race Conditions in RuleBasedBreakIterator. X-Git-Tag: milestone-59-0-1~27 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=82dfd8b26e4aa9da1e37df4c7ac97b49ddea249f;p=icu ICU-12873 Race Conditions in RuleBasedBreakIterator. X-SVN-Rev: 39572 --- diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java index 51f17bd99e5..df0f6fe9c76 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java @@ -20,7 +20,8 @@ import java.io.InputStream; import java.io.OutputStream; import java.nio.ByteBuffer; import java.text.CharacterIterator; -import java.util.concurrent.ConcurrentHashMap; +import java.util.ArrayList; +import java.util.List; import com.ibm.icu.impl.Assert; import com.ibm.icu.impl.CharTrie; @@ -48,7 +49,9 @@ public class RuleBasedBreakIterator extends BreakIterator { private RuleBasedBreakIterator() { fLastStatusIndexValid = true; fDictionaryCharCount = 0; - fBreakEngines.put(-1, fUnhandledBreakEngine); + synchronized(gAllBreakEngines) { + fBreakEngines = new ArrayList(gAllBreakEngines); + } } /** @@ -134,6 +137,13 @@ public class RuleBasedBreakIterator extends BreakIterator { if (fText != null) { result.fText = (CharacterIterator)(fText.clone()); } + synchronized (gAllBreakEngines) { + result.fBreakEngines = new ArrayList(gAllBreakEngines); + } + result.fLookAheadMatches = new LookAheadResults(); + if (fCachedBreakPositions != null) { + result.fCachedBreakPositions = fCachedBreakPositions.clone(); + } return result; } @@ -260,10 +270,34 @@ public class RuleBasedBreakIterator extends BreakIterator { * The "default" break engine - just skips over ranges of dictionary words, * producing no breaks. Should only be used if characters need to be handled * by a dictionary but we have no dictionary implementation for them. + * + * Only one instance; shared by all break iterators. */ - private final UnhandledBreakEngine fUnhandledBreakEngine = new UnhandledBreakEngine(); + private static final UnhandledBreakEngine gUnhandledBreakEngine; + + /** + * List of all known break engines, common for all break iterators. + * Lazily updated as break engines are needed, because instantiation of + * break engines is expensive. + * + * Because gAllBreakEngines can be referenced concurrently from different + * BreakIterator instances, all access is synchronized. + */ + private static final List gAllBreakEngines; + + static { + gUnhandledBreakEngine = new UnhandledBreakEngine(); + gAllBreakEngines = new ArrayList(); + gAllBreakEngines.add(gUnhandledBreakEngine); + } /** + * List of all known break engines. Similar to gAllBreakEngines, but local to a + * break iterator, allowing it to be used without synchronization. + */ + private List fBreakEngines; + + /** * when a range of characters is divided up using the dictionary, the break * positions that are discovered are stored here, preventing us from having * to use either the dictionary or the state table again until the iterator @@ -277,9 +311,6 @@ public class RuleBasedBreakIterator extends BreakIterator { */ private int fPositionInCache; - - private final ConcurrentHashMap fBreakEngines = - new ConcurrentHashMap(); /** * Dumps caches and performs other actions associated with a complete change * in text or iteration position. @@ -1107,26 +1138,32 @@ public class RuleBasedBreakIterator extends BreakIterator { // We have a dictionary character. // Does an already instantiated break engine handle it? - for (LanguageBreakEngine candidate : fBreakEngines.values()) { + for (LanguageBreakEngine candidate : fBreakEngines) { if (candidate.handles(c, fBreakType)) { return candidate; } } - // if we don't have an existing engine, build one. - int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT); - if (script == UScript.KATAKANA || script == UScript.HIRAGANA) { - // Katakana, Hiragana and Han are handled by the same dictionary engine. - // Fold them together for mapping from script -> engine. - script = UScript.HAN; - } + synchronized (gAllBreakEngines) { + // This break iterator's list of break engines didn't handle the character. + // Check the global list, another break iterator may have instantiated the + // desired engine. + for (LanguageBreakEngine candidate : gAllBreakEngines) { + if (candidate.handles(c, fBreakType)) { + fBreakEngines.add(candidate); + return candidate; + } + } + + // The global list doesn't have an existing engine, build one. + int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT); + if (script == UScript.KATAKANA || script == UScript.HIRAGANA) { + // Katakana, Hiragana and Han are handled by the same dictionary engine. + // Fold them together for mapping from script -> engine. + script = UScript.HAN; + } - LanguageBreakEngine eng = fBreakEngines.get(script); - /* - if (eng != null && !eng.handles(c, fBreakType)) { - fUnhandledBreakEngine.handleChar(c, getBreakType()); - eng = fUnhandledBreakEngine; - } else */ { + LanguageBreakEngine eng; try { switch (script) { case UScript.THAI: @@ -1146,38 +1183,33 @@ public class RuleBasedBreakIterator extends BreakIterator { eng = new CjkBreakEngine(false); } else { - fUnhandledBreakEngine.handleChar(c, getBreakType()); - eng = fUnhandledBreakEngine; + gUnhandledBreakEngine.handleChar(c, getBreakType()); + eng = gUnhandledBreakEngine; } break; case UScript.HANGUL: if (getBreakType() == KIND_WORD) { eng = new CjkBreakEngine(true); } else { - fUnhandledBreakEngine.handleChar(c, getBreakType()); - eng = fUnhandledBreakEngine; + gUnhandledBreakEngine.handleChar(c, getBreakType()); + eng = gUnhandledBreakEngine; } break; default: - fUnhandledBreakEngine.handleChar(c, getBreakType()); - eng = fUnhandledBreakEngine; + gUnhandledBreakEngine.handleChar(c, getBreakType()); + eng = gUnhandledBreakEngine; break; } } catch (IOException e) { eng = null; } - } - if (eng != null && eng != fUnhandledBreakEngine) { - LanguageBreakEngine existingEngine = fBreakEngines.putIfAbsent(script, eng); - if (existingEngine != null) { - // There was a race & another thread was first to register an engine for this script. - // Use theirs and discard the one we just created. - eng = existingEngine; + if (eng != null && eng != gUnhandledBreakEngine) { + gAllBreakEngines.add(eng); + fBreakEngines.add(eng); } - // assert eng.handles(c, fBreakType); - } - return eng; + return eng; + } // end synchronized(gAllBreakEngines) } private static final int kMaxLookaheads = 8; diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/UnhandledBreakEngine.java b/icu4j/main/classes/core/src/com/ibm/icu/text/UnhandledBreakEngine.java index f5d7a355948..642d0965cca 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/UnhandledBreakEngine.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnhandledBreakEngine.java @@ -11,6 +11,7 @@ package com.ibm.icu.text; import static com.ibm.icu.impl.CharacterIteration.DONE32; import java.text.CharacterIterator; +import java.util.concurrent.atomic.AtomicReferenceArray; import com.ibm.icu.impl.CharacterIteration; import com.ibm.icu.lang.UCharacter; @@ -19,42 +20,70 @@ import com.ibm.icu.lang.UProperty; final class UnhandledBreakEngine implements LanguageBreakEngine { // TODO: Use two arrays of UnicodeSet, one with all frozen sets, one with unfrozen. // in handleChar(), update the unfrozen version, clone, freeze, replace the frozen one. - private final UnicodeSet[] fHandled = new UnicodeSet[BreakIterator.KIND_TITLE + 1]; + + // Note on concurrency: A single instance of UnhandledBreakEngine is shared across all + // RuleBasedBreakIterators in a process. They may make arbitrary concurrent calls. + // If handleChar() is updating the set of unhandled characters at the same time + // findBreaks() or handles() is referencing it, the referencing functions must see + // a consistent set. It doesn't matter whether they see it before or after the update, + // but they should not see an inconsistent, changing set. + // + // To do this, an update is made by cloning the old set, updating the clone, then + // replacing the old with the new. Once made visible, each set remains constant. + + // TODO: it's odd that findBreaks() can produce different results, depending + // on which scripts have been previously seen by handleChar(). (This is not a + // threading specific issue). Possibly stop on script boundaries? + + final AtomicReferenceArray fHandled = new AtomicReferenceArray(BreakIterator.KIND_TITLE + 1); public UnhandledBreakEngine() { - for (int i = 0; i < fHandled.length; i++) { - fHandled[i] = new UnicodeSet(); + for (int i = 0; i < fHandled.length(); i++) { + fHandled.set(i, new UnicodeSet()); } } - + + @Override public boolean handles(int c, int breakType) { - return (breakType >= 0 && breakType < fHandled.length) && - (fHandled[breakType].contains(c)); + return (breakType >= 0 && breakType < fHandled.length()) && + (fHandled.get(breakType).contains(c)); } + @Override public int findBreaks(CharacterIterator text, int startPos, int endPos, boolean reverse, int breakType, DictionaryBreakEngine.DequeI foundBreaks) { - if (breakType >= 0 && breakType < fHandled.length) { - int c = CharacterIteration.current32(text); - if (reverse) { - while (text.getIndex() > startPos && fHandled[breakType].contains(c)) { - CharacterIteration.previous32(text); - c = CharacterIteration.current32(text); - } - } else { - while (text.getIndex() < endPos && fHandled[breakType].contains(c)) { - CharacterIteration.next32(text); - c = CharacterIteration.current32(text); - } - } - } + if (breakType >= 0 && breakType < fHandled.length()) { + UnicodeSet uniset = fHandled.get(breakType); + int c = CharacterIteration.current32(text); + if (reverse) { + while (text.getIndex() > startPos && uniset.contains(c)) { + CharacterIteration.previous32(text); + c = CharacterIteration.current32(text); + } + } else { + while (text.getIndex() < endPos && uniset.contains(c)) { + CharacterIteration.next32(text); + c = CharacterIteration.current32(text); + } + } + } return 0; } - public synchronized void handleChar(int c, int breakType) { - if (breakType >= 0 && breakType < fHandled.length && c != DONE32) { - if (!fHandled[breakType].contains(c)) { + /** + * Update the set of unhandled characters for the specified breakType to include + * all that have the same script as c. + * May be called concurrently with handles() or findBreaks(). + * Must not be called concurrently with itself. + */ + public void handleChar(int c, int breakType) { + if (breakType >= 0 && breakType < fHandled.length() && c != DONE32) { + UnicodeSet originalSet = fHandled.get(breakType); + if (!originalSet.contains(c)) { int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT); - fHandled[breakType].applyIntPropertyValue(UProperty.SCRIPT, script); + UnicodeSet newSet = new UnicodeSet(); + newSet.applyIntPropertyValue(UProperty.SCRIPT, script); + newSet.addAll(originalSet); + fHandled.set(breakType, newSet); } } } diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java index c08b078c364..c66052f91f3 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java @@ -851,6 +851,89 @@ public class RBBITest extends TestFmwk { bi.setText("abc"); bi.first(); assertEquals("Rule chaining test", 3, bi.next()); - } } + + @Test + public void TestBug12873() { + // Bug with RuleBasedBreakIterator's internal structure for recording potential look-ahead + // matches not being cloned when a break iterator is cloned. This resulted in usage + // collisions if the original break iterator and its clone were used concurrently. + + // The Line Break rules for Regional Indicators make use of look-ahead rules, and + // show the bug. 1F1E6 = \uD83C\uDDE6 = REGIONAL INDICATOR SYMBOL LETTER A + // Regional indicators group into pairs, expect breaks after two code points, which + // is after four 16 bit code units. + + final String dataToBreak = "\uD83C\uDDE6\uD83C\uDDE6\uD83C\uDDE6\uD83C\uDDE6\uD83C\uDDE6\uD83C\uDDE6"; + final RuleBasedBreakIterator bi = (RuleBasedBreakIterator)BreakIterator.getLineInstance(); + final AssertionError[] assertErr = new AssertionError[1]; // saves an error found from within a thread + + class WorkerThread implements Runnable { + @Override + public void run() { + try { + RuleBasedBreakIterator localBI = (RuleBasedBreakIterator)bi.clone(); + localBI.setText(dataToBreak); + for (int loop=0; loop<100; loop++) { + int nextExpectedBreak = 0; + for (int actualBreak = localBI.first(); actualBreak != BreakIterator.DONE; + actualBreak = localBI.next(), nextExpectedBreak+= 4) { + assertEquals("", nextExpectedBreak, actualBreak); + } + assertEquals("", dataToBreak.length()+4, nextExpectedBreak); + } + } catch (AssertionError e) { + assertErr[0] = e; + } + } + } + + List threads = new ArrayList(); + for (int n = 0; n<4; ++n) { + threads.add(new Thread(new WorkerThread())); + } + for (Thread thread: threads) { + thread.start(); + } + for (Thread thread: threads) { + try { + thread.join(); + } catch (InterruptedException e) { + fail(e.toString()); + } + } + + // JUnit wont see failures from within the worker threads, so + // check again if one occurred. + if (assertErr[0] != null) { + throw assertErr[0]; + } + } + + @Test + public void TestBreakAllChars() { + // Make a "word" from each code point, separated by spaces. + // For dictionary based breaking, runs the start-of-range + // logic with all possible dictionary characters. + StringBuilder sb = new StringBuilder(); + for (int c=0; c<0x110000; ++c) { + sb.appendCodePoint(c); + sb.appendCodePoint(c); + sb.appendCodePoint(c); + sb.appendCodePoint(c); + sb.append(' '); + } + String s = sb.toString(); + + for (int breakKind=BreakIterator.KIND_CHARACTER; breakKind<=BreakIterator.KIND_TITLE; ++breakKind) { + RuleBasedBreakIterator bi = + (RuleBasedBreakIterator)BreakIterator.getBreakInstance(ULocale.ENGLISH, breakKind); + bi.setText(s); + int lastb = -1; + for (int b = bi.first(); b != BreakIterator.DONE; b = bi.next()) { + assertTrue("(lastb < b) : (" + lastb + " < " + b + ")", lastb < b); + } + } + } +}