ICU-8549 UTF-7 error consumption fix to ICU4J

author Michael Ow <mow@svn.icu-project.org>

Fri, 27 May 2011 17:08:14 +0000 (17:08 +0000)

committer Michael Ow <mow@svn.icu-project.org>

Fri, 27 May 2011 17:08:14 +0000 (17:08 +0000)
author Michael Ow <mow@svn.icu-project.org>
Fri, 27 May 2011 17:08:14 +0000 (17:08 +0000)
committer Michael Ow <mow@svn.icu-project.org>
Fri, 27 May 2011 17:08:14 +0000 (17:08 +0000)
diff --git a/icu4j/main/classes/charset/src/com/ibm/icu/charset/CharsetUTF7.java b/icu4j/main/classes/charset/src/com/ibm/icu/charset/CharsetUTF7.java

index 2652c769a171bcb27e459f0fb61526cb505f5e4c..65e48dd7bd9204f5160002be6c96cd554d2ed469 100644 (file)
--- a/icu4j/main/classes/charset/src/com/ibm/icu/charset/CharsetUTF7.java
+++ b/icu4j/main/classes/charset/src/com/ibm/icu/charset/CharsetUTF7.java
@@ -1,6 +1,6 @@
  /*
   *******************************************************************************
- * Copyright (C) 2007-2010, International Business Machines Corporation and         *
+ * Copyright (C) 2007-2011, International Business Machines Corporation and         *
   * others. All Rights Reserved.                                                *
   *******************************************************************************
   */
@@ -278,12 +278,52 @@ class CharsetUTF7 extends CharsetICU {
                              b=(char)source.get();
                              sourceArrayIndex++;
                              toUBytesArray[byteIndex++]=(byte)b;
-                            if ((!useIMAP && b>=126) || (useIMAP && b>0x7e)) {
-                                /* illegal - test other illegal US-ASCII values by base64Value==-3 */
+                            base64Value = -3; /* initialize as illegal */
+                            if ((!useIMAP && (b>=126 || (base64Value=FROM_BASE_64[b])==-3 || base64Value==-1)) || (useIMAP && b>0x7e)) {
+                                /* either
+                                 * base64Value==-1 for any legal character except base64 and minus sign, or
+                                 * base64Value==-3 for illegal characters:
+                                 * 1. In either case, leave Unicode mode.
+                                 * 2.1. If we ended with an incomplete UChar or none after the +, then
+                                 *      generate an error for the preceding erroneous sequence and deal with
+                                 *      the current (possibly illegal) character next time through.
+                                 * 2.2. Else the current char comes after a complete UChar, which was already
+                                 *      pushed to the output buf, so:
+                                 * 2.2.1. If the current char is legal, just save it for processing next time.
+                                 *        It may be, for example, a plus sign, which we need to deal with in direct mode.
+                                 * 2.2.2. Else if the current char is illegal, we might as well deal with it here.
+                                 */
                                  inDirectMode=1;
-                                cr=CoderResult.malformedForLength(sourceArrayIndex);
-                                break directMode;
-                            } else if (((base64Value=FROM_BASE_64[b])>=0 && !useIMAP) || ((base64Value=FROM_BASE64_IMAP(b))>=0) && useIMAP) {
+                                
+                                if(base64Counter==-1) {
+                                    /* illegal: + immediately followed by something other than base64 or minus sign */
+                                    /* include the plus sign in the reported sequence, but not the subsequent char */
+                                    source.position(source.position() -1);
+                                    toUBytesArray[0]=PLUS;
+                                    byteIndex=1;
+                                    cr=CoderResult.malformedForLength(sourceArrayIndex);
+                                    break directMode;
+                                } else if(bits!=0) {
+                                    /* bits are illegally left over, a UChar is incomplete */
+                                    /* don't include current char (legal or illegal) in error seq */
+                                    source.position(source.position() -1);
+                                    --byteIndex;
+                                    cr=CoderResult.malformedForLength(sourceArrayIndex);
+                                    break directMode;
+                                } else {
+                                    /* previous UChar was complete */
+                                    if(base64Value==-3) {
+                                        /* current character is illegal, deal with it here */
+                                        cr=CoderResult.malformedForLength(sourceArrayIndex);
+                                        break directMode;
+                                    } else {
+                                        /* un-read the current character in case it is a plus sign */
+                                        source.position(source.position() -1);
+                                        sourceIndex=nextSourceIndex-1;
+                                        continue directMode;
+                                    }
+                                }
+                            } else if ((!useIMAP && (base64Value=FROM_BASE_64[b])>=0) || (useIMAP && (base64Value=FROM_BASE64_IMAP(b))>=0)) {
                                  /* collect base64 bytes */
                                  switch (base64Counter) {
                                  case -1: /* -1 is immediately after the + */
@@ -358,7 +398,7 @@ class CharsetUTF7 extends CharsetICU {
                                      /* will never occur */
                                      //break;                                                           
                                  }//end of switch
-                            } else if (base64Value==-2) {
+                            } else if (!useIMAP || (useIMAP && base64Value==-2)) {
                                  /* minus sign terminates the base64 sequence */
                                  inDirectMode=1;
                                  if (base64Counter==-1) {
@@ -377,30 +417,8 @@ class CharsetUTF7 extends CharsetICU {
                                  }
                                  sourceIndex=nextSourceIndex;
                                  continue directMode;
-                            } else if (!useIMAP && base64Value==-1) { /* for any legal character except base64 and minus sign */
-                                /* leave the Unicode Mode */
-                                inDirectMode=1;
+                            } else if (useIMAP) { 
                                  if (base64Counter==-1) {
-                                    /* illegal:  + immediately followed by something other than base64 minus sign */
-                                    /* include the plus sign in the reported sequence */
-                                    --sourceIndex;
-                                    toUBytesArray[0]=PLUS;
-                                    toUBytesArray[1]=(byte)b;
-                                    byteIndex=2;
-                                    cr=CoderResult.malformedForLength(sourceArrayIndex);
-                                    break;
-                                } else if (bits==0) {
-                                    /* un-read the character in case it is a plus sign */
-                                    source.position(--sourceArrayIndex);
-                                    sourceIndex=nextSourceIndex - 1;
-                                    continue directMode;
-                                } else {
-                                    /* bits are illegally left over, a unicode character is incomplete */
-                                    cr=CoderResult.malformedForLength(sourceArrayIndex);
-                                    break;
-                                }
-                            } else { 
-                                if (useIMAP && base64Counter==-1) {
                                      // illegal: & immediately followed by something other than base64 or minus sign
                                      // include the ampersand in the reported sequence
                                      --sourceIndex;
diff --git a/icu4j/main/tests/charset/src/com/ibm/icu/dev/test/charset/TestCharset.java b/icu4j/main/tests/charset/src/com/ibm/icu/dev/test/charset/TestCharset.java

index 702555fbd61d4ad030c5f637ae36f1032651cfdf..79b32df452de90de1482f8c8d9c6a449fbb8b1f8 100644 (file)
--- a/icu4j/main/tests/charset/src/com/ibm/icu/dev/test/charset/TestCharset.java
+++ b/icu4j/main/tests/charset/src/com/ibm/icu/dev/test/charset/TestCharset.java
@@ -2787,6 +2787,27 @@ public class TestCharset extends TestFmwk {
          smBufDecode(decoder, "UTF-7", bs, us);
          smBufEncode(encoder, "UTF-7", us, bs);
          
+        /* Testing UTF-7 toUnicode with substitute callbacks */
+        {
+            byte [] bytesTestErrorConsumption = {
+                    /* a~       a+AB~                         a+AB\x0c                      a+AB-                         a+AB.                         a+. */
+                    0x61, 0x7e, 0x61, 0x2b, 0x41, 0x42, 0x7e, 0x61, 0x2b, 0x41, 0x42, 0x0c, 0x61, 0x2b, 0x41, 0x42, 0x2d, 0x61, 0x2b, 0x41, 0x42, 0x2e, 0x61, 0x2b, 0x2e
+    
+            };
+            char [] unicodeTestErrorConsumption = {
+                    0x61, 0xfffd, 0x61, 0xfffd, 0xfffd, 0x61, 0xfffd, 0xfffd, 0x61, 0xfffd, 0x61, 0xfffd, 0x2e, 0x61, 0xfffd, 0x2e
+            };
+            bs = ByteBuffer.wrap(bytesTestErrorConsumption);
+            us = CharBuffer.wrap(unicodeTestErrorConsumption);
+    
+            CodingErrorAction savedMal = decoder.malformedInputAction();
+            CodingErrorAction savedUMap = decoder.unmappableCharacterAction();
+            decoder.onMalformedInput(CodingErrorAction.REPLACE);
+            decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
+            smBufDecode(decoder, "UTF-7 DE Error Consumption", bs, us);
+            decoder.onMalformedInput(savedMal);
+            decoder.onUnmappableCharacter(savedUMap);
+        }
          /* ticket 6151 */
          CharBuffer smallus = CharBuffer.allocate(1);
          ByteBuffer bigbs = ByteBuffer.allocate(3);
author	Michael Ow <mow@svn.icu-project.org>
	Fri, 27 May 2011 17:08:14 +0000 (17:08 +0000)
committer	Michael Ow <mow@svn.icu-project.org>
	Fri, 27 May 2011 17:08:14 +0000 (17:08 +0000)
icu4j/main/classes/charset/src/com/ibm/icu/charset/CharsetUTF7.java		patch \| blob \| history
icu4j/main/tests/charset/src/com/ibm/icu/dev/test/charset/TestCharset.java		patch \| blob \| history