From: Markus Scherer <markus.icu@gmail.com>
Date: Fri, 9 Feb 2018 21:01:56 +0000 (+0000)
Subject: ICU-13515 UTF-8 macro: reduce length of string if it ends with an incomplete sequence
X-Git-Tag: release-61-rc~110
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=19b494f953df2c5a4ffc7743d4e1e2bafdbde62e;p=icu

ICU-13515 UTF-8 macro: reduce length of string if it ends with an incomplete sequence

X-SVN-Rev: 40883
---

diff --git a/icu4c/source/common/ucnv_u8.cpp b/icu4c/source/common/ucnv_u8.cpp
index 094e2dfb6f4..7089d9400c6 100644
--- a/icu4c/source/common/ucnv_u8.cpp
+++ b/icu4c/source/common/ucnv_u8.cpp
@@ -696,36 +696,20 @@ ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
         // Use a single counter for source and target, counting the minimum of
         // the source length and the target capacity.
         // Let the standard converter handle edge cases.
-        const uint8_t *limit=sourceLimit;
         if(count>targetCapacity) {
-            limit-=(count-targetCapacity);
             count=targetCapacity;
         }
 
-        // The conversion loop checks count>0 only once per 1/2/3-byte character.
-        // If the buffer ends with a truncated 2- or 3-byte sequence,
+        // The conversion loop checks count>0 only once per character.
+        // If the buffer ends with a truncated sequence,
         // then we reduce the count to stop before that,
         // and collect the remaining bytes after the conversion loop.
-        {
-            // Do not go back into the bytes that will be read for finishing a partial
-            // sequence from the previous buffer.
-            int32_t length=count-toULimit;
-            if(length>0) {
-                uint8_t b1=*(limit-1);
-                if(U8_IS_SINGLE(b1)) {
-                    // common ASCII character
-                } else if(U8_IS_TRAIL(b1) && length>=2) {
-                    uint8_t b2=*(limit-2);
-                    if(0xe0<=b2 && b2<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
-                        // truncated 3-byte sequence
-                        count-=2;
-                    }
-                } else if(0xc2<=b1 && b1<0xf0) {
-                    // truncated 2- or 3-byte sequence
-                    --count;
-                }
-            }
-        }
+
+        // Do not go back into the bytes that will be read for finishing a partial
+        // sequence from the previous buffer.
+        int32_t length=count-toULimit;
+        U8_TRUNCATE_IF_INCOMPLETE(source, 0, length);
+        count=toULimit+length;
     }
 
     if(c!=0) {
@@ -815,7 +799,7 @@ moreBytes:
             }
 
             /* copy the legal byte sequence to the target */
-            if(count>=toULength) {
+            {
                 int8_t i;
 
                 for(i=0; i<oldToULength; ++i) {
@@ -826,14 +810,6 @@ moreBytes:
                     *target++=*source++;
                 }
                 count-=toULength;
-            } else {
-                // A supplementary character that does not fit into the target.
-                // Let the standard converter handle this.
-                source-=(toULength-oldToULength);
-                pToUArgs->source=(char *)source;
-                pFromUArgs->target=(char *)target;
-                *pErrorCode=U_USING_DEFAULT_WARNING;
-                return;
             }
         }
     }
@@ -857,8 +833,7 @@ moreBytes:
                         utf8->toULength=toULength;
                         utf8->mode=toULimit;
                         break;
-                    } else if(!U8_IS_TRAIL(b=*source)) {
-                        /* lead byte in trail byte position */
+                    } else if(!icu::UTF8::isValidTrail(c, b=*source, toULength, toULimit)) {
                         utf8->toULength=toULength;
                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                         break;
diff --git a/icu4c/source/common/unicode/utf8.h b/icu4c/source/common/unicode/utf8.h
index 021fdcf1f24..555e8bd4e28 100644
--- a/icu4c/source/common/unicode/utf8.h
+++ b/icu4c/source/common/unicode/utf8.h
@@ -592,12 +592,15 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
  * If the offset points to a UTF-8 trail byte,
  * then the offset is moved backward to the corresponding lead byte.
  * Otherwise, it is not modified.
+ *
  * "Safe" macro, checks for illegal sequences and for string boundaries.
+ * Unlike U8_TRUNCATE_IF_INCOMPLETE(), this macro always reads s[i].
  *
  * @param s const uint8_t * string
  * @param start int32_t starting string offset (usually 0)
  * @param i int32_t string offset, must be start<=i
  * @see U8_SET_CP_START_UNSAFE
+ * @see U8_TRUNCATE_IF_INCOMPLETE
  * @stable ICU 2.4
  */
 #define U8_SET_CP_START(s, start, i) { \
@@ -606,6 +609,51 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
     } \
 }
 
+/**
+ * If the string ends with a UTF-8 byte sequence that is valid so far
+ * but incomplete, then reduce the length of the string to end before
+ * the lead byte of that incomplete sequence.
+ * For example, if the string ends with E1 80, the length is reduced by 2.
+ *
+ * Useful for processing text split across multiple buffers
+ * (save the incomplete sequence for later)
+ * and for optimizing iteration
+ * (check for string length only once per character).
+ *
+ * "Safe" macro, checks for illegal sequences and for string boundaries.
+ * Unlike U8_SET_CP_START(), this macro never reads s[length].
+ *
+ * (In UTF-16, simply check for U16_IS_LEAD(last code unit).)
+ *
+ * @param s const uint8_t * string
+ * @param start int32_t starting string offset (usually 0)
+ * @param length int32_t string length, must be start<=length
+ * @see U8_SET_CP_START
+ * @draft ICU 61
+ */
+#define U8_TRUNCATE_IF_INCOMPLETE(s, start, length) \
+    if((length)>(start)) { \
+        uint8_t __b1=s[(length)-1]; \
+        if(U8_IS_SINGLE(__b1)) { \
+            /* common ASCII character */ \
+        } else if(U8_IS_LEAD(__b1)) { \
+            --(length); \
+        } else if(U8_IS_TRAIL(__b1) && ((length)-2)>=(start)) { \
+            uint8_t __b2=s[(length)-2]; \
+            if(0xe0<=__b2 && __b2<=0xf4) { \
+                if(__b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(__b2, __b1) : \
+                        U8_IS_VALID_LEAD4_AND_T1(__b2, __b1)) { \
+                    (length)-=2; \
+                } \
+            } else if(U8_IS_TRAIL(__b2) && ((length)-3)>=(start)) { \
+                uint8_t __b3=s[(length)-3]; \
+                if(0xf0<=__b3 && __b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(__b3, __b2)) { \
+                    (length)-=3; \
+                } \
+            } \
+        } \
+    }
+
 /* definitions with backward iteration -------------------------------------- */
 
 /**
diff --git a/icu4c/source/common/utf_impl.cpp b/icu4c/source/common/utf_impl.cpp
index f78c566e098..9dd241a12bf 100644
--- a/icu4c/source/common/utf_impl.cpp
+++ b/icu4c/source/common/utf_impl.cpp
@@ -238,33 +238,45 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U
     int32_t i=*pi;
     if(U8_IS_TRAIL(c) && i>start) {
         uint8_t b1=s[--i];
-        if(0xc2<=b1 && b1<0xe0) {
-            *pi=i;
-            return ((b1-0xc0)<<6)|(c&0x3f);
+        if(U8_IS_LEAD(b1)) {
+            if(b1<0xe0) {
+                *pi=i;
+                return ((b1-0xc0)<<6)|(c&0x3f);
+            } else if(b1<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c)) {
+                // Truncated 3- or 4-byte sequence.
+                *pi=i;
+                return errorValue(1, strict);
+            }
         } else if(U8_IS_TRAIL(b1) && i>start) {
             // Extract the value bits from the last trail byte.
             c&=0x3f;
             uint8_t b2=s[--i];
-            if(0xe0<=b2 && b2<0xf0) {
-                b2&=0xf;
-                if(strict!=-2) {
-                    if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
-                        *pi=i;
-                        c=(b2<<12)|((b1&0x3f)<<6)|c;
-                        if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
-                            return c;
-                        } else {
-                            // strict: forbid non-characters like U+fffe
-                            return errorValue(2, strict);
+            if(0xe0<=b2 && b2<=0xf4) {
+                if(b2<0xf0) {
+                    b2&=0xf;
+                    if(strict!=-2) {
+                        if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
+                            *pi=i;
+                            c=(b2<<12)|((b1&0x3f)<<6)|c;
+                            if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
+                                return c;
+                            } else {
+                                // strict: forbid non-characters like U+fffe
+                                return errorValue(2, strict);
+                            }
+                        }
+                    } else {
+                        // strict=-2 -> lenient: allow surrogates
+                        b1-=0x80;
+                        if((b2>0 || b1>=0x20)) {
+                            *pi=i;
+                            return (b2<<12)|(b1<<6)|c;
                         }
                     }
-                } else {
-                    // strict=-2 -> lenient: allow surrogates
-                    b1-=0x80;
-                    if((b2>0 || b1>=0x20)) {
-                        *pi=i;
-                        return (b2<<12)|(b1<<6)|c;
-                    }
+                } else if(U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
+                    // Truncated 4-byte sequence.
+                    *pi=i;
+                    return errorValue(2, strict);
                 }
             } else if(U8_IS_TRAIL(b2) && i>start) {
                 uint8_t b3=s[--i];
@@ -281,16 +293,7 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U
                         }
                     }
                 }
-            } else if(0xf0<=b2 && b2<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
-                // Truncated 4-byte sequence.
-                *pi=i;
-                return errorValue(2, strict);
             }
-        } else if((0xe0<=b1 && b1<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b1, c)) ||
-                (0xf0<=b1 && b1<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
-            // Truncated 3- or 4-byte sequence.
-            *pi=i;
-            return errorValue(1, strict);
         }
     }
     return errorValue(0, strict);
@@ -303,29 +306,23 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) {
     uint8_t c=s[i];
     if(U8_IS_TRAIL(c) && i>start) {
         uint8_t b1=s[--i];
-        if(0xc2<=b1 && b1<0xe0) {
-            return i;
+        if(U8_IS_LEAD(b1)) {
+            if(b1<0xe0 ||
+                    (b1<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
+                return i;
+            }
         } else if(U8_IS_TRAIL(b1) && i>start) {
             uint8_t b2=s[--i];
-            if(0xe0<=b2 && b2<0xf0) {
-                if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
+            if(0xe0<=b2 && b2<=0xf4) {
+                if(b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b2, b1) : U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
                     return i;
                 }
             } else if(U8_IS_TRAIL(b2) && i>start) {
                 uint8_t b3=s[--i];
-                if(0xf0<=b3 && b3<=0xf4) {
-                    if(U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
-                        return i;
-                    }
+                if(0xf0<=b3 && b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
+                    return i;
                 }
-            } else if(0xf0<=b2 && b2<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
-                // Truncated 4-byte sequence.
-                return i;
             }
-        } else if((0xe0<=b1 && b1<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b1, c)) ||
-                (0xf0<=b1 && b1<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
-            // Truncated 3- or 4-byte sequence.
-            return i;
         }
     }
     return orig_i;
diff --git a/icu4c/source/test/cintltst/utf8tst.c b/icu4c/source/test/cintltst/utf8tst.c
index 0bbb5e5413d..b7062e3b82f 100644
--- a/icu4c/source/test/cintltst/utf8tst.c
+++ b/icu4c/source/test/cintltst/utf8tst.c
@@ -94,6 +94,7 @@ static void TestFwdBack(void);
 static void TestFwdBackUnsafe(void);
 static void TestSetChar(void);
 static void TestSetCharUnsafe(void);
+static void TestTruncateIfIncomplete(void);
 static void TestAppendChar(void);
 static void TestAppend(void);
 static void TestSurrogates(void);
@@ -114,6 +115,7 @@ addUTF8Test(TestNode** root)
     addTest(root, &TestFwdBackUnsafe,           "utf8tst/TestFwdBackUnsafe");
     addTest(root, &TestSetChar,                 "utf8tst/TestSetChar");
     addTest(root, &TestSetCharUnsafe,           "utf8tst/TestSetCharUnsafe");
+    addTest(root, &TestTruncateIfIncomplete,    "utf8tst/TestTruncateIfIncomplete");
     addTest(root, &TestAppendChar,              "utf8tst/TestAppendChar");
     addTest(root, &TestAppend,                  "utf8tst/TestAppend");
     addTest(root, &TestSurrogates,              "utf8tst/TestSurrogates");
@@ -927,6 +929,64 @@ static void TestSetCharUnsafe() {
     }
 }
 
+static void TestTruncateIfIncomplete() {
+    // Difference from U8_SET_CP_START():
+    // U8_TRUNCATE_IF_INCOMPLETE() does not look at s[length].
+    // Therefore, if the last byte is a lead byte, then this macro truncates
+    // even if the byte at the input index cannot continue a valid sequence
+    // (including when that is not a trail byte).
+    // On the other hand, if the last byte is a trail byte, then the two macros behave the same.
+    static const struct {
+        const char *s;
+        int32_t expected;
+    } cases[] = {
+        { "", 0 },
+        { "a", 1 },
+        { "\x80", 1 },
+        { "\xC1", 1 },
+        { "\xC2", 0 },
+        { "\xE0", 0 },
+        { "\xF4", 0 },
+        { "\xF5", 1 },
+        { "\x80\x80", 2 },
+        { "\xC2\xA0", 2 },
+        { "\xE0\x9F", 2 },
+        { "\xE0\xA0", 0 },
+        { "\xED\x9F", 0 },
+        { "\xED\xA0", 2 },
+        { "\xF0\x8F", 2 },
+        { "\xF0\x90", 0 },
+        { "\xF4\x8F", 0 },
+        { "\xF4\x90", 2 },
+        { "\xF5\x80", 2 },
+        { "\x80\x80\x80", 3 },
+        { "\xC2\xA0\x80", 3 },
+        { "\xE0\xA0\x80", 3 },
+        { "\xF0\x8F\x80", 3 },
+        { "\xF0\x90\x80", 0 },
+        { "\xF4\x8F\x80", 0 },
+        { "\xF4\x90\x80", 3 },
+        { "\xF5\x80\x80", 3 },
+        { "\x80\x80\x80\x80", 4 },
+        { "\xC2\xA0\x80\x80", 4 },
+        { "\xE0\xA0\x80\x80", 4 },
+        { "\xF0\x90\x80\x80", 4 },
+        { "\xF5\x80\x80\x80", 4 }
+    };
+    int32_t i;
+    for (i = 0; i < UPRV_LENGTHOF(cases); ++i) {
+        const char *s = cases[i].s;
+        int32_t expected = cases[i].expected;
+        int32_t length = (int32_t)strlen(s);
+        int32_t adjusted = length;
+        U8_TRUNCATE_IF_INCOMPLETE(s, 0, adjusted);
+        if (adjusted != expected) {
+            log_err("ERROR: U8_TRUNCATE_IF_INCOMPLETE failed for i=%d, length=%d. Expected:%d Got:%d\n",
+                    (int)i, (int)length, (int)expected, (int)adjusted);
+        }
+    }
+}
+
 static void TestAppendChar(){
 #if !U_HIDE_OBSOLETE_UTF_OLD_H
     static const uint8_t s[11]={0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00};