ICU-8762 Merging #8549 UTF-7 error handling consumes too many valid subsequent chars...

author Yoshito Umaoka <y.umaoka@gmail.com>
Tue, 16 Aug 2011 22:24:34 +0000 (22:24 +0000)
committer Yoshito Umaoka <y.umaoka@gmail.com>
Tue, 16 Aug 2011 22:24:34 +0000 (22:24 +0000)
diff --git a/main/classes/charset/src/com/ibm/icu/charset/CharsetUTF7.java b/main/classes/charset/src/com/ibm/icu/charset/CharsetUTF7.java

index 9447bdb8b03ffb2c6f473684edb76cd12410c7b3..2fc0fe7dfef42bf6fa2b19e332c8bf7c7d8d20f9 100644 (file)
--- a/main/classes/charset/src/com/ibm/icu/charset/CharsetUTF7.java
+++ b/main/classes/charset/src/com/ibm/icu/charset/CharsetUTF7.java
@@ -1,6 +1,6 @@
  /*
   *******************************************************************************
- * Copyright (C) 2007-2009, International Business Machines Corporation and         *
+ * Copyright (C) 2007-2011, International Business Machines Corporation and         *
   * others. All Rights Reserved.                                                *
   *******************************************************************************
   */
@@ -276,12 +276,52 @@ class CharsetUTF7 extends CharsetICU {
                              b=(char)source.get();
                              sourceArrayIndex++;
                              toUBytesArray[byteIndex++]=(byte)b;
-                            if ((!useIMAP && b>=126) || (useIMAP && b>0x7e)) {
-                                /* illegal - test other illegal US-ASCII values by base64Value==-3 */
+                            base64Value = -3; /* initialize as illegal */
+                            if ((!useIMAP && (b>=126 || (base64Value=FROM_BASE_64[b])==-3 || base64Value==-1)) || (useIMAP && b>0x7e)) {
+                                /* either
+                                 * base64Value==-1 for any legal character except base64 and minus sign, or
+                                 * base64Value==-3 for illegal characters:
+                                 * 1. In either case, leave Unicode mode.
+                                 * 2.1. If we ended with an incomplete UChar or none after the +, then
+                                 *      generate an error for the preceding erroneous sequence and deal with
+                                 *      the current (possibly illegal) character next time through.
+                                 * 2.2. Else the current char comes after a complete UChar, which was already
+                                 *      pushed to the output buf, so:
+                                 * 2.2.1. If the current char is legal, just save it for processing next time.
+                                 *        It may be for example, a plus which we need to deal with in direct mode.
+                                 * 2.2.2. Else if the current char is illegal, we might as well deal with it here.
+                                 */
                                  inDirectMode=1;
-                                cr=CoderResult.malformedForLength(sourceArrayIndex);
-                                break directMode;
-                            } else if (((base64Value=FROM_BASE_64[b])>=0 && !useIMAP) || ((base64Value=FROM_BASE64_IMAP(b))>=0) && useIMAP) {
+                                
+                                if(base64Counter==-1) {
+                                    /* illegal: + immediately followed by something other than base64 or minus sign */
+                                    /* include the plus sign in the reported sequence, but not the subsequent char */
+                                    source.position(source.position() -1);
+                                    toUBytesArray[0]=PLUS;
+                                    byteIndex=1;
+                                    cr=CoderResult.malformedForLength(sourceArrayIndex);
+                                    break directMode;
+                                } else if(bits!=0) {
+                                    /* bits are illegally left over, a UChar is incomplete */
+                                    /* don't include current char (legal or illegal) in error seq */
+                                    source.position(source.position() -1);
+                                    --byteIndex;
+                                    cr=CoderResult.malformedForLength(sourceArrayIndex);
+                                    break directMode;
+                                } else {
+                                    /* previous UChar was complete */
+                                    if(base64Value==-3) {
+                                        /* current character is illegal, deal with it here */
+                                        cr=CoderResult.malformedForLength(sourceArrayIndex);
+                                        break directMode;
+                                    } else {
+                                        /* un-read the current character in case it is a plus sign */
+                                        source.position(source.position() -1);
+                                        sourceIndex=nextSourceIndex-1;
+                                        continue directMode;
+                                    }
+                                }
+                            } else if ((!useIMAP && (base64Value=FROM_BASE_64[b])>=0) || (useIMAP && (base64Value=FROM_BASE64_IMAP(b))>=0)) {
                                  /* collect base64 bytes */
                                  switch (base64Counter) {
                                  case -1: /* -1 is immediately after the + */
@@ -356,7 +396,7 @@ class CharsetUTF7 extends CharsetICU {
                                      /* will never occur */
                                      //break;                                                           
                                  }//end of switch
-                            } else if (base64Value==-2) {
+                            } else if (!useIMAP || (useIMAP && base64Value==-2)) {
                                  /* minus sign terminates the base64 sequence */
                                  inDirectMode=1;
                                  if (base64Counter==-1) {
@@ -375,30 +415,8 @@ class CharsetUTF7 extends CharsetICU {
                                  }
                                  sourceIndex=nextSourceIndex;
                                  continue directMode;
-                            } else if (!useIMAP && base64Value==-1) { /* for any legal character except base64 and minus sign */
-                                /* leave the Unicode Mode */
-                                inDirectMode=1;
+                            } else if (useIMAP) { 
                                  if (base64Counter==-1) {
-                                    /* illegal:  + immediately followed by something other than base64 minus sign */
-                                    /* include the plus sign in the reported sequence */
-                                    --sourceIndex;
-                                    toUBytesArray[0]=PLUS;
-                                    toUBytesArray[1]=(byte)b;
-                                    byteIndex=2;
-                                    cr=CoderResult.malformedForLength(sourceArrayIndex);
-                                    break;
-                                } else if (bits==0) {
-                                    /* un-read the character in case it is a plus sign */
-                                    source.position(--sourceArrayIndex);
-                                    sourceIndex=nextSourceIndex - 1;
-                                    continue directMode;
-                                } else {
-                                    /* bits are illegally left over, a unicode character is incomplete */
-                                    cr=CoderResult.malformedForLength(sourceArrayIndex);
-                                    break;
-                                }
-                            } else { 
-                                if (useIMAP && base64Counter==-1) {
                                      // illegal: & immediately followed by something other than base64 or minus sign
                                      // include the ampersand in the reported sequence
                                      --sourceIndex;
diff --git a/main/tests/charset/src/com/ibm/icu/dev/test/charset/TestCharset.java b/main/tests/charset/src/com/ibm/icu/dev/test/charset/TestCharset.java

index 68c9d4ce3a17ab336f730fc44b453ab79dbc7c3b..d0c434f18bf7e02a5e34812a2d38ba84f26b35de 100644 (file)
--- a/main/tests/charset/src/com/ibm/icu/dev/test/charset/TestCharset.java
+++ b/main/tests/charset/src/com/ibm/icu/dev/test/charset/TestCharset.java
@@ -1,6 +1,6 @@
  /**
  *******************************************************************************
-* Copyright (C) 2006-2010, International Business Machines Corporation and    *
+* Copyright (C) 2006-2011, International Business Machines Corporation and    *
  * others. All Rights Reserved.                                                *
  *******************************************************************************
  *
@@ -2787,6 +2787,27 @@ public class TestCharset extends TestFmwk {
          smBufDecode(decoder, "UTF-7", bs, us);
          smBufEncode(encoder, "UTF-7", us, bs);
          
+        /* Testing UTF-7 toUnicode with substitute callbacks */
+        {
+            byte [] bytesTestErrorConsumption = {
+                    /* a~       a+AB~                         a+AB\x0c                      a+AB-                         a+AB.                         a+. */
+                    0x61, 0x7e, 0x61, 0x2b, 0x41, 0x42, 0x7e, 0x61, 0x2b, 0x41, 0x42, 0x0c, 0x61, 0x2b, 0x41, 0x42, 0x2d, 0x61, 0x2b, 0x41, 0x42, 0x2e, 0x61, 0x2b, 0x2e
+    
+            };
+            char [] unicodeTestErrorConsumption = {
+                    0x61, 0xfffd, 0x61, 0xfffd, 0xfffd, 0x61, 0xfffd, 0xfffd, 0x61, 0xfffd, 0x61, 0xfffd, 0x2e, 0x61, 0xfffd, 0x2e
+            };
+            bs = ByteBuffer.wrap(bytesTestErrorConsumption);
+            us = CharBuffer.wrap(unicodeTestErrorConsumption);
+    
+            CodingErrorAction savedMal = decoder.malformedInputAction();
+            CodingErrorAction savedUMap = decoder.unmappableCharacterAction();
+            decoder.onMalformedInput(CodingErrorAction.REPLACE);
+            decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
+            smBufDecode(decoder, "UTF-7 DE Error Consumption", bs, us);
+            decoder.onMalformedInput(savedMal);
+            decoder.onUnmappableCharacter(savedUMap);
+        }
          /* ticket 6151 */
          CharBuffer smallus = CharBuffer.allocate(1);
          ByteBuffer bigbs = ByteBuffer.allocate(3);
author	Yoshito Umaoka <y.umaoka@gmail.com>
	Tue, 16 Aug 2011 22:24:34 +0000 (22:24 +0000)
committer	Yoshito Umaoka <y.umaoka@gmail.com>
	Tue, 16 Aug 2011 22:24:34 +0000 (22:24 +0000)
main/classes/charset/src/com/ibm/icu/charset/CharsetUTF7.java		patch | blob | history
main/tests/charset/src/com/ibm/icu/dev/test/charset/TestCharset.java		patch | blob | history