ICU-13515 UTF-8 macro: reduce length of string if it ends with an incomplete sequence

author Markus Scherer <markus.icu@gmail.com>

Fri, 9 Feb 2018 21:01:56 +0000 (21:01 +0000)

committer Markus Scherer <markus.icu@gmail.com>

Fri, 9 Feb 2018 21:01:56 +0000 (21:01 +0000)
author Markus Scherer <markus.icu@gmail.com>
Fri, 9 Feb 2018 21:01:56 +0000 (21:01 +0000)
committer Markus Scherer <markus.icu@gmail.com>
Fri, 9 Feb 2018 21:01:56 +0000 (21:01 +0000)
diff --git a/icu4c/source/common/ucnv_u8.cpp b/icu4c/source/common/ucnv_u8.cpp

index 094e2dfb6f43279885dde600400da5cc2af10dfe..7089d9400c6e95a496162d4bbfcd7d6ab50a9b89 100644 (file)
--- a/icu4c/source/common/ucnv_u8.cpp
+++ b/icu4c/source/common/ucnv_u8.cpp
@@ -696,36 +696,20 @@ ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
          // Use a single counter for source and target, counting the minimum of
          // the source length and the target capacity.
          // Let the standard converter handle edge cases.
-        const uint8_t *limit=sourceLimit;
          if(count>targetCapacity) {
-            limit-=(count-targetCapacity);
              count=targetCapacity;
          }
  
-        // The conversion loop checks count>0 only once per 1/2/3-byte character.
-        // If the buffer ends with a truncated 2- or 3-byte sequence,
+        // The conversion loop checks count>0 only once per character.
+        // If the buffer ends with a truncated sequence,
          // then we reduce the count to stop before that,
          // and collect the remaining bytes after the conversion loop.
-        {
-            // Do not go back into the bytes that will be read for finishing a partial
-            // sequence from the previous buffer.
-            int32_t length=count-toULimit;
-            if(length>0) {
-                uint8_t b1=*(limit-1);
-                if(U8_IS_SINGLE(b1)) {
-                    // common ASCII character
-                } else if(U8_IS_TRAIL(b1) && length>=2) {
-                    uint8_t b2=*(limit-2);
-                    if(0xe0<=b2 && b2<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
-                        // truncated 3-byte sequence
-                        count-=2;
-                    }
-                } else if(0xc2<=b1 && b1<0xf0) {
-                    // truncated 2- or 3-byte sequence
-                    --count;
-                }
-            }
-        }
+
+        // Do not go back into the bytes that will be read for finishing a partial
+        // sequence from the previous buffer.
+        int32_t length=count-toULimit;
+        U8_TRUNCATE_IF_INCOMPLETE(source, 0, length);
+        count=toULimit+length;
      }
  
      if(c!=0) {
@@ -815,7 +799,7 @@ moreBytes:
              }
  
              /* copy the legal byte sequence to the target */
-            if(count>=toULength) {
+            {
                  int8_t i;
  
                  for(i=0; i<oldToULength; ++i) {
@@ -826,14 +810,6 @@ moreBytes:
                      *target++=*source++;
                  }
                  count-=toULength;
-            } else {
-                // A supplementary character that does not fit into the target.
-                // Let the standard converter handle this.
-                source-=(toULength-oldToULength);
-                pToUArgs->source=(char *)source;
-                pFromUArgs->target=(char *)target;
-                *pErrorCode=U_USING_DEFAULT_WARNING;
-                return;
              }
          }
      }
@@ -857,8 +833,7 @@ moreBytes:
                          utf8->toULength=toULength;
                          utf8->mode=toULimit;
                          break;
-                    } else if(!U8_IS_TRAIL(b=*source)) {
-                        /* lead byte in trail byte position */
+                    } else if(!icu::UTF8::isValidTrail(c, b=*source, toULength, toULimit)) {
                          utf8->toULength=toULength;
                          *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                          break;
diff --git a/icu4c/source/common/unicode/utf8.h b/icu4c/source/common/unicode/utf8.h

index 021fdcf1f244e4fdaf9f392c6cc43dbd9166925d..555e8bd4e28e2ccff2de885515f7da6730d8cfcc 100644 (file)
--- a/icu4c/source/common/unicode/utf8.h
+++ b/icu4c/source/common/unicode/utf8.h
@@ -592,12 +592,15 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
   * If the offset points to a UTF-8 trail byte,
   * then the offset is moved backward to the corresponding lead byte.
   * Otherwise, it is not modified.
+ *
   * "Safe" macro, checks for illegal sequences and for string boundaries.
+ * Unlike U8_TRUNCATE_IF_INCOMPLETE(), this macro always reads s[i].
   *
   * @param s const uint8_t * string
   * @param start int32_t starting string offset (usually 0)
   * @param i int32_t string offset, must be start<=i
   * @see U8_SET_CP_START_UNSAFE
+ * @see U8_TRUNCATE_IF_INCOMPLETE
   * @stable ICU 2.4
   */
  #define U8_SET_CP_START(s, start, i) { \
@@ -606,6 +609,51 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
      } \
  }
  
+/**
+ * If the string ends with a UTF-8 byte sequence that is valid so far
+ * but incomplete, then reduce the length of the string to end before
+ * the lead byte of that incomplete sequence.
+ * For example, if the string ends with E1 80, the length is reduced by 2.
+ *
+ * Useful for processing text split across multiple buffers
+ * (save the incomplete sequence for later) and for optimizing iteration
+ * (check for string length only once per character).
+ *
+ * "Safe" macro, checks for illegal sequences and for string boundaries.
+ * Unlike U8_SET_CP_START(), this macro never reads s[length].
+ * Expands to a single statement; follow the invocation with a semicolon.
+ *
+ * (In UTF-16, simply check for U16_IS_LEAD(last code unit).)
+ *
+ * @param s const uint8_t * string
+ * @param start int32_t starting string offset (usually 0)
+ * @param length int32_t string length (modifiable lvalue, reduced in place), must be start<=length
+ * @see U8_SET_CP_START
+ * @draft ICU 61
+ */
+#define U8_TRUNCATE_IF_INCOMPLETE(s, start, length) do { \
+    if((length)>(start)) { \
+        uint8_t __b1=(s)[(length)-1]; \
+        if(U8_IS_SINGLE(__b1)) { \
+            /* common ASCII character */ \
+        } else if(U8_IS_LEAD(__b1)) { \
+            --(length); /* lead byte with no trail bytes yet */ \
+        } else if(U8_IS_TRAIL(__b1) && ((length)-2)>=(start)) { \
+            uint8_t __b2=(s)[(length)-2]; \
+            if(0xe0<=__b2 && __b2<=0xf4) { \
+                if(__b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(__b2, __b1) : U8_IS_VALID_LEAD4_AND_T1(__b2, __b1)) { \
+                    (length)-=2; /* 3- or 4-byte lead plus one valid trail byte */ \
+                } \
+            } else if(U8_IS_TRAIL(__b2) && ((length)-3)>=(start)) { \
+                uint8_t __b3=(s)[(length)-3]; \
+                if(0xf0<=__b3 && __b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(__b3, __b2)) { \
+                    (length)-=3; /* 4-byte lead plus two trail bytes */ \
+                } \
+            } \
+        } \
+    } \
+} while(0)
+
  /* definitions with backward iteration -------------------------------------- */
  
  /**
diff --git a/icu4c/source/common/utf_impl.cpp b/icu4c/source/common/utf_impl.cpp

index f78c566e0988843b715965ee81232cb7dffd6651..9dd241a12bfa16788e4a6aeb06488c4df9a12a13 100644 (file)
--- a/icu4c/source/common/utf_impl.cpp
+++ b/icu4c/source/common/utf_impl.cpp
@@ -238,33 +238,45 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U
      int32_t i=*pi;
      if(U8_IS_TRAIL(c) && i>start) {
          uint8_t b1=s[--i];
-        if(0xc2<=b1 && b1<0xe0) {
-            *pi=i;
-            return ((b1-0xc0)<<6)|(c&0x3f);
+        if(U8_IS_LEAD(b1)) {
+            if(b1<0xe0) {
+                *pi=i;
+                return ((b1-0xc0)<<6)|(c&0x3f);
+            } else if(b1<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c)) {
+                // Truncated 3- or 4-byte sequence.
+                *pi=i;
+                return errorValue(1, strict);
+            }
          } else if(U8_IS_TRAIL(b1) && i>start) {
              // Extract the value bits from the last trail byte.
              c&=0x3f;
              uint8_t b2=s[--i];
-            if(0xe0<=b2 && b2<0xf0) {
-                b2&=0xf;
-                if(strict!=-2) {
-                    if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
-                        *pi=i;
-                        c=(b2<<12)|((b1&0x3f)<<6)|c;
-                        if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
-                            return c;
-                        } else {
-                            // strict: forbid non-characters like U+fffe
-                            return errorValue(2, strict);
+            if(0xe0<=b2 && b2<=0xf4) {
+                if(b2<0xf0) {
+                    b2&=0xf;
+                    if(strict!=-2) {
+                        if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
+                            *pi=i;
+                            c=(b2<<12)|((b1&0x3f)<<6)|c;
+                            if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
+                                return c;
+                            } else {
+                                // strict: forbid non-characters like U+fffe
+                                return errorValue(2, strict);
+                            }
+                        }
+                    } else {
+                        // strict=-2 -> lenient: allow surrogates
+                        b1-=0x80;
+                        if((b2>0 || b1>=0x20)) {
+                            *pi=i;
+                            return (b2<<12)|(b1<<6)|c;
                          }
                      }
-                } else {
-                    // strict=-2 -> lenient: allow surrogates
-                    b1-=0x80;
-                    if((b2>0 || b1>=0x20)) {
-                        *pi=i;
-                        return (b2<<12)|(b1<<6)|c;
-                    }
+                } else if(U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
+                    // Truncated 4-byte sequence.
+                    *pi=i;
+                    return errorValue(2, strict);
                  }
              } else if(U8_IS_TRAIL(b2) && i>start) {
                  uint8_t b3=s[--i];
@@ -281,16 +293,7 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U
                          }
                      }
                  }
-            } else if(0xf0<=b2 && b2<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
-                // Truncated 4-byte sequence.
-                *pi=i;
-                return errorValue(2, strict);
              }
-        } else if((0xe0<=b1 && b1<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b1, c)) ||
-                (0xf0<=b1 && b1<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
-            // Truncated 3- or 4-byte sequence.
-            *pi=i;
-            return errorValue(1, strict);
          }
      }
      return errorValue(0, strict);
@@ -303,29 +306,23 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) {
      uint8_t c=s[i];
      if(U8_IS_TRAIL(c) && i>start) {
          uint8_t b1=s[--i];
-        if(0xc2<=b1 && b1<0xe0) {
-            return i;
+        if(U8_IS_LEAD(b1)) {
+            if(b1<0xe0 ||
+                    (b1<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
+                return i;
+            }
          } else if(U8_IS_TRAIL(b1) && i>start) {
              uint8_t b2=s[--i];
-            if(0xe0<=b2 && b2<0xf0) {
-                if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
+            if(0xe0<=b2 && b2<=0xf4) {
+                if(b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b2, b1) : U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
                      return i;
                  }
              } else if(U8_IS_TRAIL(b2) && i>start) {
                  uint8_t b3=s[--i];
-                if(0xf0<=b3 && b3<=0xf4) {
-                    if(U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
-                        return i;
-                    }
+                if(0xf0<=b3 && b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
+                    return i;
                  }
-            } else if(0xf0<=b2 && b2<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
-                // Truncated 4-byte sequence.
-                return i;
              }
-        } else if((0xe0<=b1 && b1<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b1, c)) ||
-                (0xf0<=b1 && b1<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
-            // Truncated 3- or 4-byte sequence.
-            return i;
          }
      }
      return orig_i;
diff --git a/icu4c/source/test/cintltst/utf8tst.c b/icu4c/source/test/cintltst/utf8tst.c

index 0bbb5e5413dc76e516338f98d4f977149c80109f..b7062e3b82f5030b27b291d9b51544e8c53c8f3c 100644 (file)
--- a/icu4c/source/test/cintltst/utf8tst.c
+++ b/icu4c/source/test/cintltst/utf8tst.c
@@ -94,6 +94,7 @@ static void TestFwdBack(void);
  static void TestFwdBackUnsafe(void);
  static void TestSetChar(void);
  static void TestSetCharUnsafe(void);
+static void TestTruncateIfIncomplete(void);
  static void TestAppendChar(void);
  static void TestAppend(void);
  static void TestSurrogates(void);
@@ -114,6 +115,7 @@ addUTF8Test(TestNode** root)
      addTest(root, &TestFwdBackUnsafe,           "utf8tst/TestFwdBackUnsafe");
      addTest(root, &TestSetChar,                 "utf8tst/TestSetChar");
      addTest(root, &TestSetCharUnsafe,           "utf8tst/TestSetCharUnsafe");
+    addTest(root, &TestTruncateIfIncomplete,    "utf8tst/TestTruncateIfIncomplete");
      addTest(root, &TestAppendChar,              "utf8tst/TestAppendChar");
      addTest(root, &TestAppend,                  "utf8tst/TestAppend");
      addTest(root, &TestSurrogates,              "utf8tst/TestSurrogates");
@@ -927,6 +929,64 @@ static void TestSetCharUnsafe() {
      }
  }
  
+static void TestTruncateIfIncomplete() {
+    // U8_TRUNCATE_IF_INCOMPLETE() differs from U8_SET_CP_START() in that it
+    // never looks at s[length]. A trailing lead byte is therefore always cut off,
+    // even when the byte at the input index could not continue a valid sequence
+    // (for example because it is not a trail byte).
+    // When the string ends with a trail byte, both macros behave the same.
+    // Each entry pairs an input string with the length expected after truncation.
+    static const struct {
+        const char *s;
+        int32_t expected;
+    } cases[] = {
+        { "", 0 },
+        { "a", 1 },
+        { "\x80", 1 },
+        { "\xC1", 1 },
+        { "\xC2", 0 },
+        { "\xE0", 0 },
+        { "\xF4", 0 },
+        { "\xF5", 1 },
+        { "\x80\x80", 2 },
+        { "\xC2\xA0", 2 },
+        { "\xE0\x9F", 2 },
+        { "\xE0\xA0", 0 },
+        { "\xED\x9F", 0 },
+        { "\xED\xA0", 2 },
+        { "\xF0\x8F", 2 },
+        { "\xF0\x90", 0 },
+        { "\xF4\x8F", 0 },
+        { "\xF4\x90", 2 },
+        { "\xF5\x80", 2 },
+        { "\x80\x80\x80", 3 },
+        { "\xC2\xA0\x80", 3 },
+        { "\xE0\xA0\x80", 3 },
+        { "\xF0\x8F\x80", 3 },
+        { "\xF0\x90\x80", 0 },
+        { "\xF4\x8F\x80", 0 },
+        { "\xF4\x90\x80", 3 },
+        { "\xF5\x80\x80", 3 },
+        { "\x80\x80\x80\x80", 4 },
+        { "\xC2\xA0\x80\x80", 4 },
+        { "\xE0\xA0\x80\x80", 4 },
+        { "\xF0\x90\x80\x80", 4 },
+        { "\xF5\x80\x80\x80", 4 }
+    };
+    int32_t idx = 0;
+    while (idx < UPRV_LENGTHOF(cases)) {
+        const char *text = cases[idx].s;
+        int32_t fullLength = (int32_t)strlen(text);
+        int32_t truncated = fullLength;
+        U8_TRUNCATE_IF_INCOMPLETE(text, 0, truncated);
+        if (truncated != cases[idx].expected) {
+            log_err("ERROR: U8_TRUNCATE_IF_INCOMPLETE failed for i=%d, length=%d. Expected:%d Got:%d\n",
+                    (int)idx, (int)fullLength, (int)cases[idx].expected, (int)truncated);
+        }
+        ++idx;
+    }
+}
+
  static void TestAppendChar(){
  #if !U_HIDE_OBSOLETE_UTF_OLD_H
      static const uint8_t s[11]={0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00};
author	Markus Scherer <markus.icu@gmail.com>
	Fri, 9 Feb 2018 21:01:56 +0000 (21:01 +0000)
committer	Markus Scherer <markus.icu@gmail.com>
	Fri, 9 Feb 2018 21:01:56 +0000 (21:01 +0000)
icu4c/source/common/ucnv_u8.cpp		patch \| blob \| history
icu4c/source/common/unicode/utf8.h		patch \| blob \| history
icu4c/source/common/utf_impl.cpp		patch \| blob \| history
icu4c/source/test/cintltst/utf8tst.c		patch \| blob \| history