ICU-12873 Race Conditions in RuleBasedBreakIterator.

author Andy Heninger <andy.heninger@gmail.com>

Tue, 17 Jan 2017 23:13:25 +0000 (23:13 +0000)

committer Andy Heninger <andy.heninger@gmail.com>

Tue, 17 Jan 2017 23:13:25 +0000 (23:13 +0000)
author Andy Heninger <andy.heninger@gmail.com>
Tue, 17 Jan 2017 23:13:25 +0000 (23:13 +0000)
committer Andy Heninger <andy.heninger@gmail.com>
Tue, 17 Jan 2017 23:13:25 +0000 (23:13 +0000)
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java

index 51f17bd99e5e9e451d29e2a005c042eb2961bced..df0f6fe9c76f03e36ba097be023ba53fdcc71dad 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java
@@ -20,7 +20,8 @@ import java.io.InputStream;
  import java.io.OutputStream;
  import java.nio.ByteBuffer;
  import java.text.CharacterIterator;
-import java.util.concurrent.ConcurrentHashMap;
+import java.util.ArrayList;
+import java.util.List;
  
  import com.ibm.icu.impl.Assert;
  import com.ibm.icu.impl.CharTrie;
@@ -48,7 +49,9 @@ public class RuleBasedBreakIterator extends BreakIterator {
      private RuleBasedBreakIterator() {
          fLastStatusIndexValid = true;
          fDictionaryCharCount  = 0;
-        fBreakEngines.put(-1, fUnhandledBreakEngine);
+        synchronized(gAllBreakEngines) {
+            fBreakEngines = new ArrayList<LanguageBreakEngine>(gAllBreakEngines);
+        }
      }
  
      /**
@@ -134,6 +137,13 @@ public class RuleBasedBreakIterator extends BreakIterator {
          if (fText != null) {
              result.fText = (CharacterIterator)(fText.clone());
          }
+        synchronized (gAllBreakEngines)  {
+            result.fBreakEngines = new ArrayList<LanguageBreakEngine>(gAllBreakEngines);
+        }
+        result.fLookAheadMatches = new LookAheadResults();
+        if (fCachedBreakPositions != null) {
+            result.fCachedBreakPositions = fCachedBreakPositions.clone();
+        }
          return result;
      }
  
@@ -260,10 +270,34 @@ public class RuleBasedBreakIterator extends BreakIterator {
       * The "default" break engine - just skips over ranges of dictionary words,
       * producing no breaks. Should only be used if characters need to be handled
       * by a dictionary but we have no dictionary implementation for them.
+     *
+     * Only one instance; shared by all break iterators.
       */
-    private final UnhandledBreakEngine fUnhandledBreakEngine = new UnhandledBreakEngine();
+    private static final UnhandledBreakEngine gUnhandledBreakEngine;
+
+    /**
+     * List of all known break engines, common for all break iterators.
+     * Lazily updated as break engines are needed, because instantiation of
+     * break engines is expensive.
+     *
+     * Because gAllBreakEngines can be referenced concurrently from different
+     * BreakIterator instances, all access is synchronized.
+     */
+    private static final List<LanguageBreakEngine> gAllBreakEngines;
+
+    static {
+        gUnhandledBreakEngine = new UnhandledBreakEngine();
+        gAllBreakEngines = new ArrayList<LanguageBreakEngine>();
+        gAllBreakEngines.add(gUnhandledBreakEngine);
+    }
  
      /**
+     * List of all known break engines. Similar to gAllBreakEngines, but local to a
+     * break iterator, allowing it to be used without synchronization.
+     */
+    private List<LanguageBreakEngine> fBreakEngines;
+
+  /**
       * when a range of characters is divided up using the dictionary, the break
       * positions that are discovered are stored here, preventing us from having
       * to use either the dictionary or the state table again until the iterator
@@ -277,9 +311,6 @@ public class RuleBasedBreakIterator extends BreakIterator {
       */
      private int fPositionInCache;
  
-
-    private final ConcurrentHashMap<Integer, LanguageBreakEngine> fBreakEngines =
-            new ConcurrentHashMap<Integer, LanguageBreakEngine>();
      /**
       * Dumps caches and performs other actions associated with a complete change
       * in text or iteration position.
@@ -1107,26 +1138,32 @@ public class RuleBasedBreakIterator extends BreakIterator {
  
          // We have a dictionary character.
          // Does an already instantiated break engine handle it?
-        for (LanguageBreakEngine candidate : fBreakEngines.values()) {
+        for (LanguageBreakEngine candidate : fBreakEngines) {
              if (candidate.handles(c, fBreakType)) {
                  return candidate;
              }
          }
  
-        // if we don't have an existing engine, build one.
-        int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
-        if (script == UScript.KATAKANA || script == UScript.HIRAGANA) {
-            // Katakana, Hiragana and Han are handled by the same dictionary engine.
-            // Fold them together for mapping from script -> engine.
-            script = UScript.HAN;
-        }
+        synchronized (gAllBreakEngines) {
+            // This break iterator's list of break engines didn't handle the character.
+            // Check the global list, another break iterator may have instantiated the
+            // desired engine.
+            for (LanguageBreakEngine candidate : gAllBreakEngines) {
+                if (candidate.handles(c, fBreakType)) {
+                    fBreakEngines.add(candidate);
+                    return candidate;
+                }
+            }
+
+            // The global list doesn't have an existing engine, build one.
+            int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
+            if (script == UScript.KATAKANA || script == UScript.HIRAGANA) {
+                // Katakana, Hiragana and Han are handled by the same dictionary engine.
+                // Fold them together for mapping from script -> engine.
+                script = UScript.HAN;
+            }
  
-        LanguageBreakEngine eng = fBreakEngines.get(script);
-        /*
-        if (eng != null && !eng.handles(c, fBreakType)) {
-            fUnhandledBreakEngine.handleChar(c, getBreakType());
-            eng = fUnhandledBreakEngine;
-        } else  */  {
+            LanguageBreakEngine eng;
              try {
                  switch (script) {
                  case UScript.THAI:
@@ -1146,38 +1183,33 @@ public class RuleBasedBreakIterator extends BreakIterator {
                          eng = new CjkBreakEngine(false);
                      }
                      else {
-                        fUnhandledBreakEngine.handleChar(c, getBreakType());
-                        eng = fUnhandledBreakEngine;
+                        gUnhandledBreakEngine.handleChar(c, getBreakType());
+                        eng = gUnhandledBreakEngine;
                      }
                      break;
                  case UScript.HANGUL:
                      if (getBreakType() == KIND_WORD) {
                          eng = new CjkBreakEngine(true);
                      } else {
-                        fUnhandledBreakEngine.handleChar(c, getBreakType());
-                        eng = fUnhandledBreakEngine;
+                        gUnhandledBreakEngine.handleChar(c, getBreakType());
+                        eng = gUnhandledBreakEngine;
                      }
                      break;
                  default:
-                    fUnhandledBreakEngine.handleChar(c, getBreakType());
-                    eng = fUnhandledBreakEngine;
+                    gUnhandledBreakEngine.handleChar(c, getBreakType());
+                    eng = gUnhandledBreakEngine;
                      break;
                  }
              } catch (IOException e) {
                  eng = null;
              }
-        }
  
-        if (eng != null && eng != fUnhandledBreakEngine) {
-            LanguageBreakEngine existingEngine = fBreakEngines.putIfAbsent(script, eng);
-            if (existingEngine != null) {
-                // There was a race & another thread was first to register an engine for this script.
-                // Use theirs and discard the one we just created.
-                eng = existingEngine;
+            if (eng != null && eng != gUnhandledBreakEngine) {
+                gAllBreakEngines.add(eng);
+                fBreakEngines.add(eng);
              }
-            // assert eng.handles(c, fBreakType);
-        }
-        return eng;
+            return eng;
+        }   // end synchronized(gAllBreakEngines)
      }
  
      private static final int kMaxLookaheads = 8;
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/UnhandledBreakEngine.java b/icu4j/main/classes/core/src/com/ibm/icu/text/UnhandledBreakEngine.java

index f5d7a3559483cc35f5003bd07f70bb041664a070..642d0965ccae46b87759ee8dc752cfc70f351b6a 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/UnhandledBreakEngine.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnhandledBreakEngine.java
@@ -11,6 +11,7 @@ package com.ibm.icu.text;
  import static com.ibm.icu.impl.CharacterIteration.DONE32;
  
  import java.text.CharacterIterator;
+import java.util.concurrent.atomic.AtomicReferenceArray;
  
  import com.ibm.icu.impl.CharacterIteration;
  import com.ibm.icu.lang.UCharacter;
@@ -19,42 +20,70 @@ import com.ibm.icu.lang.UProperty;
  final class UnhandledBreakEngine implements LanguageBreakEngine {
      // TODO: Use two arrays of UnicodeSet, one with all frozen sets, one with unfrozen.
      // in handleChar(), update the unfrozen version, clone, freeze, replace the frozen one.
-    private final UnicodeSet[] fHandled = new UnicodeSet[BreakIterator.KIND_TITLE + 1];
+
+    // Note on concurrency: A single instance of UnhandledBreakEngine is shared across all
+    // RuleBasedBreakIterators in a process. They may make arbitrary concurrent calls.
+    // If handleChar() is updating the set of unhandled characters at the same time
+    // findBreaks() or handles() is referencing it, the referencing functions must see
+    // a consistent set. It doesn't matter whether they see it before or after the update,
+    // but they should not see an inconsistent, changing set.
+    //
+    // To do this, an update is made by cloning the old set, updating the clone, then
+    // replacing the old with the new. Once made visible, each set remains constant.
+
+    // TODO: it's odd that findBreaks() can produce different results, depending
+    // on which scripts have been previously seen by handleChar(). (This is not a
+    // threading specific issue). Possibly stop on script boundaries?
+
+    final AtomicReferenceArray<UnicodeSet> fHandled = new AtomicReferenceArray<UnicodeSet>(BreakIterator.KIND_TITLE + 1);
      public UnhandledBreakEngine() {
-        for (int i = 0; i < fHandled.length; i++) {
-            fHandled[i] = new UnicodeSet();
+        for (int i = 0; i < fHandled.length(); i++) {
+            fHandled.set(i, new UnicodeSet());
          }
      }
-    
+
+    @Override
      public boolean handles(int c, int breakType) {
-        return (breakType >= 0 && breakType < fHandled.length) && 
-                (fHandled[breakType].contains(c));
+        return (breakType >= 0 && breakType < fHandled.length()) &&
+                (fHandled.get(breakType).contains(c));
      }
  
+    @Override
      public int findBreaks(CharacterIterator text, int startPos, int endPos,
              boolean reverse, int breakType, DictionaryBreakEngine.DequeI foundBreaks) {
-        if (breakType >= 0 && breakType < fHandled.length) { 
-            int c = CharacterIteration.current32(text); 
-            if (reverse) { 
-                while (text.getIndex() > startPos && fHandled[breakType].contains(c)) { 
-                    CharacterIteration.previous32(text); 
-                    c = CharacterIteration.current32(text); 
-                } 
-            } else { 
-                while (text.getIndex() < endPos && fHandled[breakType].contains(c)) { 
-                    CharacterIteration.next32(text); 
-                    c = CharacterIteration.current32(text); 
-                } 
-            } 
-        } 
+        if (breakType >= 0 && breakType < fHandled.length()) {
+            UnicodeSet uniset = fHandled.get(breakType);
+            int c = CharacterIteration.current32(text);
+            if (reverse) {
+                while (text.getIndex() > startPos && uniset.contains(c)) {
+                    CharacterIteration.previous32(text);
+                    c = CharacterIteration.current32(text);
+                }
+            } else {
+                while (text.getIndex() < endPos && uniset.contains(c)) {
+                    CharacterIteration.next32(text);
+                    c = CharacterIteration.current32(text);
+                }
+            }
+        }
          return 0;
      }
  
-    public synchronized void handleChar(int c, int breakType) {
-        if (breakType >= 0 && breakType < fHandled.length && c != DONE32) {
-            if (!fHandled[breakType].contains(c)) {
+    /**
+     * Update the set of unhandled characters for the specified breakType to include
+     * all that have the same script as c.
+     * May be called concurrently with handles() or findBreaks().
+     * Must not be called concurrently with itself.
+     */
+    public void handleChar(int c, int breakType) {
+        if (breakType >= 0 && breakType < fHandled.length() && c != DONE32) {
+            UnicodeSet originalSet = fHandled.get(breakType);
+            if (!originalSet.contains(c)) {
                  int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
-                fHandled[breakType].applyIntPropertyValue(UProperty.SCRIPT, script);
+                UnicodeSet newSet = new UnicodeSet();
+                newSet.applyIntPropertyValue(UProperty.SCRIPT, script);
+                newSet.addAll(originalSet);
+                fHandled.set(breakType, newSet);
              }
          }
      }
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java

index c08b078c364f3f097277769d1a59b059261b294e..c66052f91f3308f6b6ed57aa9797fd65a2f4f35f 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java
@@ -851,6 +851,89 @@ public class RBBITest extends TestFmwk {
          bi.setText("abc");
          bi.first();
          assertEquals("Rule chaining test", 3,  bi.next());
-         }
      }
  
+
+    @Test
+    public void TestBug12873() {
+        // Bug with RuleBasedBreakIterator's internal structure for recording potential look-ahead
+        // matches not being cloned when a break iterator is cloned. This resulted in usage
+        // collisions if the original break iterator and its clone were used concurrently.
+
+        // The Line Break rules for Regional Indicators make use of look-ahead rules, and
+        // show the bug. 1F1E6 = \uD83C\uDDE6 = REGIONAL INDICATOR SYMBOL LETTER A
+        // Regional indicators group into pairs, expect breaks after two code points, which
+        // is after four 16 bit code units.
+
+        final String dataToBreak = "\uD83C\uDDE6\uD83C\uDDE6\uD83C\uDDE6\uD83C\uDDE6\uD83C\uDDE6\uD83C\uDDE6";
+        final RuleBasedBreakIterator bi = (RuleBasedBreakIterator)BreakIterator.getLineInstance();
+        final AssertionError[] assertErr = new AssertionError[1];  // saves an error found from within a thread
+
+        class WorkerThread implements Runnable {
+            @Override
+            public void run() {
+                try {
+                    RuleBasedBreakIterator localBI = (RuleBasedBreakIterator)bi.clone();
+                    localBI.setText(dataToBreak);
+                    for (int loop=0; loop<100; loop++) {
+                        int nextExpectedBreak = 0;
+                        for (int actualBreak = localBI.first(); actualBreak != BreakIterator.DONE;
+                                actualBreak = localBI.next(), nextExpectedBreak+= 4) {
+                            assertEquals("", nextExpectedBreak, actualBreak);
+                        }
+                        assertEquals("", dataToBreak.length()+4, nextExpectedBreak);
+                    }
+                } catch (AssertionError e) {
+                    assertErr[0] = e;
+                }
+            }
+        }
+
+        List<Thread> threads = new ArrayList<Thread>();
+        for (int n = 0; n<4; ++n) {
+            threads.add(new Thread(new WorkerThread()));
+        }
+        for (Thread thread: threads) {
+            thread.start();
+        }
+        for (Thread thread: threads) {
+            try {
+                thread.join();
+            } catch (InterruptedException e) {
+                fail(e.toString());
+            }
+        }
+
+        // JUnit wont see failures from within the worker threads, so
+        // check again if one occurred.
+        if (assertErr[0] != null) {
+            throw assertErr[0];
+        }
+    }
+
+    @Test
+    public void TestBreakAllChars() {
+        // Make a "word" from each code point, separated by spaces.
+        // For dictionary based breaking, runs the start-of-range
+        // logic with all possible dictionary characters.
+        StringBuilder sb = new StringBuilder();
+        for (int c=0; c<0x110000; ++c) {
+            sb.appendCodePoint(c);
+            sb.appendCodePoint(c);
+            sb.appendCodePoint(c);
+            sb.appendCodePoint(c);
+            sb.append(' ');
+        }
+        String s = sb.toString();
+
+        for (int breakKind=BreakIterator.KIND_CHARACTER; breakKind<=BreakIterator.KIND_TITLE; ++breakKind) {
+            RuleBasedBreakIterator bi =
+                    (RuleBasedBreakIterator)BreakIterator.getBreakInstance(ULocale.ENGLISH, breakKind);
+            bi.setText(s);
+            int lastb = -1;
+            for (int b = bi.first(); b != BreakIterator.DONE; b = bi.next()) {
+                assertTrue("(lastb < b) : (" + lastb + " < " + b + ")", lastb < b);
+            }
+        }
+    }
+}
author	Andy Heninger <andy.heninger@gmail.com>
	Tue, 17 Jan 2017 23:13:25 +0000 (23:13 +0000)
committer	Andy Heninger <andy.heninger@gmail.com>
	Tue, 17 Jan 2017 23:13:25 +0000 (23:13 +0000)
icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/text/UnhandledBreakEngine.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java		patch \| blob \| history