From fdea4100328ccd78f4504401145574b91ac6575a Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Wed, 18 Apr 2012 00:01:23 +0000 Subject: [PATCH] ICU-9267 ICU4J Charset Detector Crash Fix X-SVN-Rev: 31724 --- .../src/com/ibm/icu/text/CharsetDetector.java | 4 +-- .../com/ibm/icu/text/CharsetRecog_sbcs.java | 26 +++++++++---------- .../test/charsetdet/TestCharsetDetector.java | 17 +++++++++++- 3 files changed, 31 insertions(+), 16 deletions(-) diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetDetector.java b/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetDetector.java index 7fc8553f430..16bbf22521b 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetDetector.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetDetector.java @@ -1,6 +1,6 @@ /** ******************************************************************************* -* Copyright (C) 2005-2011, International Business Machines Corporation and * +* Copyright (C) 2005-2012, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ @@ -417,7 +417,7 @@ public class CharsetDetector { byte[] fInputBytes = // The text to be checked. Markup will have been new byte[kBufSize]; // removed if appropriate. - int fInputLen; // Length of the byte data in fInputText. + int fInputLen; // Length of the byte data in fInputBytes. short fByteStats[] = // byte frequency statistics for the input text. new short[256]; // Value is percent, not absolute. diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetRecog_sbcs.java b/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetRecog_sbcs.java index c8e752c028f..033612044bb 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetRecog_sbcs.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetRecog_sbcs.java @@ -1131,6 +1131,7 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer { //arabic shaping class, method shape/unshape protected static ArabicShaping as = new ArabicShaping(ArabicShaping.LETTERS_UNSHAPE); protected byte[] prev_fInputBytes = null; + protected int prev_fInputLen = 0; protected static byte[] byteMap = { /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */ @@ -1179,11 +1180,10 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer { protected void matchInit(CharsetDetector det) { assert prev_fInputBytes == null; - prev_fInputBytes = new byte[det.fInputLen]; - System.arraycopy(det.fInputBytes, 0, prev_fInputBytes, 0, det.fInputLen); - byte bb[] = unshape(prev_fInputBytes); - System.arraycopy(bb, 0, det.fInputBytes, 0, bb.length); - det.fInputLen = bb.length; + prev_fInputBytes = det.fInputBytes; + prev_fInputLen = det.fInputLen; + det.fInputBytes = unshape(prev_fInputBytes, prev_fInputLen); + det.fInputLen = det.fInputBytes.length; } /* @@ -1193,22 +1193,22 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer { * on CharsetICU which we try to avoid. IBM420 converter amongst different versions * of JDK can produce different results and therefore is also avoided. */ - private byte[] unshape(byte[] inputBytes) { - byte resultByteArr[] = unshapeLamAlef(inputBytes); + private byte[] unshape(byte[] inputBytes, int inputLen) { + byte resultByteArr[] = unshapeLamAlef(inputBytes, inputLen); - for (int i=0; i