granicus.if.org Git - icu/commitdiff
ICU-9267 ICU4J Charset Detector Crash Fix
author Andy Heninger <andy.heninger@gmail.com>
Wed, 18 Apr 2012 00:01:23 +0000 (00:01 +0000)
committer Andy Heninger <andy.heninger@gmail.com>
Wed, 18 Apr 2012 00:01:23 +0000 (00:01 +0000)
X-SVN-Rev: 31724

icu4j/main/classes/core/src/com/ibm/icu/text/CharsetDetector.java
icu4j/main/classes/core/src/com/ibm/icu/text/CharsetRecog_sbcs.java
icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java

index 7fc8553f4308d30f7612cb132a998c54f8b7276b..16bbf22521b156bc58e1cab34bb6b5ec4bba7ddd 100644 (file)
@@ -1,6 +1,6 @@
 /**
 *******************************************************************************
-* Copyright (C) 2005-2011, International Business Machines Corporation and    *
+* Copyright (C) 2005-2012, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
@@ -417,7 +417,7 @@ public class CharsetDetector {
     byte[]      fInputBytes =       // The text to be checked.  Markup will have been
                    new byte[kBufSize];  //   removed if appropriate.
     
-    int         fInputLen;          // Length of the byte data in fInputText.
+    int         fInputLen;          // Length of the byte data in fInputBytes.
     
     short       fByteStats[] =      // byte frequency statistics for the input text.
                    new short[256];  //   Value is percent, not absolute.
index c8e752c028f0646f747cde1eef437856ba9ce061..033612044bb504b673da157cdc620f7acb0df8b2 100644 (file)
@@ -1131,6 +1131,7 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
         //arabic shaping class, method shape/unshape
         protected static ArabicShaping as = new ArabicShaping(ArabicShaping.LETTERS_UNSHAPE);
         protected byte[] prev_fInputBytes = null;
+        protected int prev_fInputLen = 0;
 
         protected static byte[] byteMap = {
 /*                 -0           -1           -2           -3           -4           -5           -6           -7           -8           -9           -A           -B           -C           -D           -E           -F   */
@@ -1179,11 +1180,10 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
         protected void matchInit(CharsetDetector det) 
         {
             assert prev_fInputBytes == null;
-            prev_fInputBytes = new byte[det.fInputLen];
-            System.arraycopy(det.fInputBytes, 0, prev_fInputBytes, 0, det.fInputLen);
-            byte bb[] = unshape(prev_fInputBytes);
-            System.arraycopy(bb, 0, det.fInputBytes, 0, bb.length);
-            det.fInputLen = bb.length;
+            prev_fInputBytes = det.fInputBytes;  // remember the detector's own buffer (a reference, not a copy)
+            prev_fInputLen = det.fInputLen;  // and its valid length; matchFinish() puts both back
+            det.fInputBytes = unshape(prev_fInputBytes, prev_fInputLen);  // detector now reads the array returned by unshape()
+            det.fInputLen = det.fInputBytes.length;  // length is taken from the returned array, not from the saved buffer
         }
         
         /*
@@ -1193,22 +1193,22 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
          * on CharsetICU which we try to avoid. IBM420 converter amongst different versions
          * of JDK can produce different results and therefore is also avoided.
          */
-        private byte[] unshape(byte[] inputBytes) {
-            byte resultByteArr[] = unshapeLamAlef(inputBytes);
+        private byte[] unshape(byte[] inputBytes, int inputLen) {  // inputLen: how many leading bytes of inputBytes to process
+            byte resultByteArr[] = unshapeLamAlef(inputBytes, inputLen);  // each Lam Alef byte is written out as two bytes
             
-            for (int i=0; i<inputBytes.length; i++){
+            for (int i=0; i<resultByteArr.length; i++){  // bounded by the result array itself, not by the input buffer's length
                 resultByteArr[i] = unshapeMap[resultByteArr[i]& 0xFF];
             }
             return resultByteArr;
         }
 
-        private byte[] unshapeLamAlef(byte[] inputBytes) {
-            ByteBuffer resultBigBuffer =  ByteBuffer.allocate(inputBytes.length*2);
+        private byte[] unshapeLamAlef(byte[] inputBytes, int inputLen) {
+            ByteBuffer resultBigBuffer =  ByteBuffer.allocate(inputLen*2);
             ByteBuffer resultBuffer;
             byte unshapedLamAlef[] = {(byte)0xb1, (byte)0x56};
 
            
-            for (int i=0; i<inputBytes.length; i++){
+            for (int i=0; i<inputLen; i++){
                 if (isLamAlef(inputBytes[i]))
                     resultBigBuffer.put(unshapedLamAlef);
                 else
@@ -1229,8 +1229,8 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
         
         protected void matchFinish(CharsetDetector det) {
             if (prev_fInputBytes != null) {
-                System.arraycopy(prev_fInputBytes, 0, det.fInputBytes, 0, prev_fInputBytes.length);
-                det.fInputLen = prev_fInputBytes.length;
+                det.fInputBytes = prev_fInputBytes;  // give the detector back the buffer saved by matchInit()
+                det.fInputLen = prev_fInputLen;  // together with the length that was saved alongside it
                 prev_fInputBytes = null;
             }
         }
index 6409195e2df42a6cae1ecdd298f12003ccb8fb22..6218e0633b234e9075bc56d617ffe88f620c81ba 100644 (file)
@@ -1057,5 +1057,20 @@ public class TestCharsetDetector extends TestFmwk
       //
       // End of Bug #8309 Test Case
       //
+
+
+    public void TestBut9267() {  // NOTE(review): name looks like a typo for "TestBug9267" - confirm before renaming
+        // Test a long input of Lam Alef characters for CharsetRecog_IBM424_he. NOTE(review): the unshaping code sits beside ArabicShaping - confirm the recognizer named here.
+        // Bug 9267 was an array out of bounds problem in the unshaping code for these.
+        byte [] input = new byte [7700];  // NOTE(review): presumably large enough that two bytes per Lam Alef overflow the detector buffer - confirm against kBufSize
+        int i;
+        for (i=0; i<input.length; i++) {
+          input[i] = (byte)0xb2;  // every byte is the value this test uses as a Lam Alef character
+        }
+        CharsetDetector det = new CharsetDetector();
+        det.setText(input);
+        det.detect();  // no assertions: the result is not inspected, the call only has to complete without throwing
+    }    
+
       
-  }
+}