ICU-9267 ICU4J Charset Detector Crash Fix

author Andy Heninger <andy.heninger@gmail.com>

Wed, 18 Apr 2012 00:01:23 +0000 (00:01 +0000)

committer Andy Heninger <andy.heninger@gmail.com>

Wed, 18 Apr 2012 00:01:23 +0000 (00:01 +0000)
author Andy Heninger <andy.heninger@gmail.com>
Wed, 18 Apr 2012 00:01:23 +0000 (00:01 +0000)
committer Andy Heninger <andy.heninger@gmail.com>
Wed, 18 Apr 2012 00:01:23 +0000 (00:01 +0000)
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetDetector.java b/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetDetector.java

index 7fc8553f4308d30f7612cb132a998c54f8b7276b..16bbf22521b156bc58e1cab34bb6b5ec4bba7ddd 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetDetector.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetDetector.java
@@ -1,6 +1,6 @@
  /**
  *******************************************************************************
-* Copyright (C) 2005-2011, International Business Machines Corporation and    *
+* Copyright (C) 2005-2012, International Business Machines Corporation and    *
  * others. All Rights Reserved.                                                *
  *******************************************************************************
  */
@@ -417,7 +417,7 @@ public class CharsetDetector {
      byte[]      fInputBytes =       // The text to be checked.  Markup will have been
                     new byte[kBufSize];  //   removed if appropriate.
      
-    int         fInputLen;          // Length of the byte data in fInputText.
+    int         fInputLen;          // Length of the byte data in fInputBytes.
      
      short       fByteStats[] =      // byte frequency statistics for the input text.
                     new short[256];  //   Value is percent, not absolute.
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetRecog_sbcs.java b/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetRecog_sbcs.java

index c8e752c028f0646f747cde1eef437856ba9ce061..033612044bb504b673da157cdc620f7acb0df8b2 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetRecog_sbcs.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetRecog_sbcs.java
@@ -1131,6 +1131,7 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
          //arabic shaping class, method shape/unshape
          protected static ArabicShaping as = new ArabicShaping(ArabicShaping.LETTERS_UNSHAPE);
          protected byte[] prev_fInputBytes = null;
+        protected int prev_fInputLen = 0;
  
          protected static byte[] byteMap = {
  /*                 -0           -1           -2           -3           -4           -5           -6           -7           -8           -9           -A           -B           -C           -D           -E           -F   */
@@ -1179,11 +1180,10 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
          protected void matchInit(CharsetDetector det) 
          {
              assert prev_fInputBytes == null;
-            prev_fInputBytes = new byte[det.fInputLen];
-            System.arraycopy(det.fInputBytes, 0, prev_fInputBytes, 0, det.fInputLen);
-            byte bb[] = unshape(prev_fInputBytes);
-            System.arraycopy(bb, 0, det.fInputBytes, 0, bb.length);
-            det.fInputLen = bb.length;
+            prev_fInputBytes = det.fInputBytes;
+            prev_fInputLen = det.fInputLen;
+            det.fInputBytes = unshape(prev_fInputBytes, prev_fInputLen);
+            det.fInputLen = det.fInputBytes.length;
          }
          
          /*
@@ -1193,22 +1193,22 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
           * on CharsetICU which we try to avoid. IBM420 converter amongst different versions
           * of JDK can produce different results and therefore is also avoided.
           */
-        private byte[] unshape(byte[] inputBytes) {
-            byte resultByteArr[] = unshapeLamAlef(inputBytes);
+        private byte[] unshape(byte[] inputBytes, int inputLen) {
+            byte resultByteArr[] = unshapeLamAlef(inputBytes, inputLen);
              
-            for (int i=0; i<inputBytes.length; i++){
+            for (int i=0; i<resultByteArr.length; i++){
                  resultByteArr[i] = unshapeMap[resultByteArr[i]& 0xFF];
              }
              return resultByteArr;
          }
  
-        private byte[] unshapeLamAlef(byte[] inputBytes) {
-            ByteBuffer resultBigBuffer =  ByteBuffer.allocate(inputBytes.length*2);
+        private byte[] unshapeLamAlef(byte[] inputBytes, int inputLen) {
+            ByteBuffer resultBigBuffer =  ByteBuffer.allocate(inputLen*2);
              ByteBuffer resultBuffer;
              byte unshapedLamAlef[] = {(byte)0xb1, (byte)0x56};
  
             
-            for (int i=0; i<inputBytes.length; i++){
+            for (int i=0; i<inputLen; i++){
                  if (isLamAlef(inputBytes[i]))
                      resultBigBuffer.put(unshapedLamAlef);
                  else
@@ -1229,8 +1229,8 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
          
          protected void matchFinish(CharsetDetector det) {
              if (prev_fInputBytes != null) {
-                System.arraycopy(prev_fInputBytes, 0, det.fInputBytes, 0, prev_fInputBytes.length);
-                det.fInputLen = prev_fInputBytes.length;
+                det.fInputBytes = prev_fInputBytes;
+                det.fInputLen = prev_fInputLen;
                  prev_fInputBytes = null;
              }
          }
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java

index 6409195e2df42a6cae1ecdd298f12003ccb8fb22..6218e0633b234e9075bc56d617ffe88f620c81ba 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java
@@ -1057,5 +1057,20 @@ public class TestCharsetDetector extends TestFmwk
        //
        // End of Bug #8309 Test Case
        //
+
+
+    public void TestBut9267() {
+        // Test a long input of Lam Alef characters for CharsetRecog_IBM424_he.
+        // Bug 9267 was an array out of bounds problem in the unshaping code for these.
+        byte [] input = new byte [7700]; 
+        int i;
+        for (i=0; i<input.length; i++) {
+          input[i] = (byte)0xb2;
+        }
+        CharsetDetector det = new CharsetDetector();
+        det.setText(input);
+        det.detect();
+    }    
+
        
-  }
+}
author	Andy Heninger <andy.heninger@gmail.com>
	Wed, 18 Apr 2012 00:01:23 +0000 (00:01 +0000)
committer	Andy Heninger <andy.heninger@gmail.com>
	Wed, 18 Apr 2012 00:01:23 +0000 (00:01 +0000)
icu4j/main/classes/core/src/com/ibm/icu/text/CharsetDetector.java		patch | blob | history
icu4j/main/classes/core/src/com/ibm/icu/text/CharsetRecog_sbcs.java		patch | blob | history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java		patch | blob | history