ICU-9657 Thread safety fix in charset detector.

author Andy Heninger <andy.heninger@gmail.com>

Tue, 12 Feb 2013 23:11:58 +0000 (23:11 +0000)

committer Andy Heninger <andy.heninger@gmail.com>

Tue, 12 Feb 2013 23:11:58 +0000 (23:11 +0000)
author Andy Heninger <andy.heninger@gmail.com>
Tue, 12 Feb 2013 23:11:58 +0000 (23:11 +0000)
committer Andy Heninger <andy.heninger@gmail.com>
Tue, 12 Feb 2013 23:11:58 +0000 (23:11 +0000)
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetRecog_sbcs.java b/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetRecog_sbcs.java

index 1ea6d2fc726b1fad2ce84eb226f26e4863f2da7c..2d3b0c13be486152b378abf1f5236b92c7d79165 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetRecog_sbcs.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetRecog_sbcs.java
@@ -1,6 +1,6 @@
  /*
   ****************************************************************************
- * Copyright (C) 2005-2012, International Business Machines Corporation and *
+ * Copyright (C) 2005-2013, International Business Machines Corporation and *
   * others. All Rights Reserved.                                             *
   ************************************************************************** *
   *
@@ -1033,8 +1033,6 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
      {
          //arabic shaping class, method shape/unshape
          protected static ArabicShaping as = new ArabicShaping(ArabicShaping.LETTERS_UNSHAPE);
-        protected byte[] prev_fInputBytes = null;
-        protected int prev_fInputLen = 0;
  
          protected static byte[] byteMap = {
  /*                 -0           -1           -2           -3           -4           -5           -6           -7           -8           -9           -A           -B           -C           -D           -E           -F   */
@@ -1080,15 +1078,6 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
          {
              return "ar";
          }
-        protected void matchInit(CharsetDetector det) 
-        {
-            assert prev_fInputBytes == null;
-            prev_fInputBytes = det.fInputBytes;
-            prev_fInputLen = det.fInputLen;
-            det.fInputBytes = unshape(prev_fInputBytes, prev_fInputLen);
-            det.fInputLen = det.fInputBytes.length;
-        }
-        
          /*
           * Arabic shaping needs to be done manually. Cannot call ArabicShaping class
           * because CharsetDetector is dealing with bytes not Unicode code points. We could
@@ -1096,7 +1085,7 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
           * on CharsetICU which we try to avoid. IBM420 converter amongst different versions
           * of JDK can produce different results and therefore is also avoided.
           */
-        private byte[] unshape(byte[] inputBytes, int inputLen) {
+        byte[] unshape(byte[] inputBytes, int inputLen) {
              byte resultByteArr[] = unshapeLamAlef(inputBytes, inputLen);
              
              for (int i=0; i<resultByteArr.length; i++){
@@ -1128,15 +1117,7 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
                      return true;
              return false;
          }
-        
-        protected void matchFinish(CharsetDetector det) {
-            if (prev_fInputBytes != null) {
-                det.fInputBytes = prev_fInputBytes;
-                det.fInputLen = prev_fInputLen;
-                prev_fInputBytes = null;
-            }
-        }
-        
+                
      }
      static class CharsetRecog_IBM420_ar_rtl extends CharsetRecog_IBM420_ar 
      {
@@ -1153,9 +1134,15 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
          }
          public CharsetMatch match(CharsetDetector det)
          {
-            matchInit(det);
+            byte[] prev_fInputBytes = det.fInputBytes;
+            int prev_fInputLen = det.fInputLen;
+            det.fInputBytes = unshape(prev_fInputBytes, prev_fInputLen);
+            det.fInputLen = det.fInputBytes.length;
+
              int confidence =  match(det, ngrams, byteMap, (byte)0x40);
-            matchFinish(det);
+            
+            det.fInputBytes = prev_fInputBytes;
+            det.fInputLen = prev_fInputLen;
              return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
          }
          
@@ -1175,9 +1162,15 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
          }
          public CharsetMatch match(CharsetDetector det)
          {
-            matchInit(det);
+            byte[] prev_fInputBytes = det.fInputBytes;
+            int prev_fInputLen = det.fInputLen;
+            det.fInputBytes = unshape(prev_fInputBytes, prev_fInputLen);
+            det.fInputLen = det.fInputBytes.length;
+            
              int confidence = match(det, ngrams, byteMap, (byte)0x40);
-            matchFinish(det);
+            
+            det.fInputBytes = prev_fInputBytes;
+            det.fInputLen = prev_fInputLen;
              return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
          }
          
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java

index 8f33cb3ca8b635e307900d54b471a6b039f54b06..fb6f7b843df25dbcaed5c762adfbd169a55869fc 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java
@@ -1,6 +1,6 @@
  /**
   *******************************************************************************
- * Copyright (C) 2005-2012, International Business Machines Corporation and    *
+ * Copyright (C) 2005-2013, International Business Machines Corporation and    *
   * others. All Rights Reserved.                                                *
   *******************************************************************************
   */
@@ -1126,6 +1126,51 @@ public class TestCharsetDetector extends TestFmwk
                          detectedEncodings.add(m.getName()));
          }   
      }
+    
+    public void TestMultithreaded() {
+        String  s = "This is some random plain text to run charset detection on.";
+        final byte [] bytes;
+        try {
+            bytes = s.getBytes("ISO-8859-1");
+        }
+        catch (Exception e) {
+            fail("Unexpected exception " + e.toString());
+            return;
+        }
+        
+        class WorkerThread extends Thread {
+            WorkerThread(int num) {
+                n = num;
+            }           
+            private int n;            
+            public void run() {
+                // System.out.println("Thread " + n + " is running.");
+                CharsetDetector det = new CharsetDetector();
+                det.setText(bytes);                
+                for (int i=0; i<10000; i++) {
+                    CharsetMatch matches[] = det.detectAll();
+                    for (CharsetMatch m: matches) {
+                        assertNotNull("Failure in thread " + n, m);
+                    }
+                }
+                // System.out.println("Thread " + n + " is finished.");
+            }
+        }
+        
+        Thread threads[] = new Thread[10];
+        for (int i=0; i<10; i++) {
+            threads[i] = new WorkerThread(i);
+            threads[i].start();
+        }
+        for (Thread thread: threads) {
+            try {
+                thread.join();
+            } catch(Exception e) {
+                fail("Unexpected exception " +  e.toString());
+                return;
+            }
+        }
+    }
  
        
  }
author	Andy Heninger <andy.heninger@gmail.com>
	Tue, 12 Feb 2013 23:11:58 +0000 (23:11 +0000)
committer	Andy Heninger <andy.heninger@gmail.com>
	Tue, 12 Feb 2013 23:11:58 +0000 (23:11 +0000)
icu4j/main/classes/core/src/com/ibm/icu/text/CharsetRecog_sbcs.java		patch | blob | history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java		patch | blob | history