ICU-13311 change illegal-UTF-8 handling in non-converter code

author Markus Scherer <markus.icu@gmail.com>

Thu, 21 Sep 2017 23:45:08 +0000 (23:45 +0000)

committer Markus Scherer <markus.icu@gmail.com>

Thu, 21 Sep 2017 23:45:08 +0000 (23:45 +0000)
author Markus Scherer <markus.icu@gmail.com>
Thu, 21 Sep 2017 23:45:08 +0000 (23:45 +0000)
committer Markus Scherer <markus.icu@gmail.com>
Thu, 21 Sep 2017 23:45:08 +0000 (23:45 +0000)
diff --git a/icu4c/source/common/bmpset.cpp b/icu4c/source/common/bmpset.cpp

index 08f9bed0664bb5f2059fbe8bc0d764b585834bbe..f84bfd7f5bfcf1044aff8e349226cd04c24f0f59 100644 (file)
--- a/icu4c/source/common/bmpset.cpp
+++ b/icu4c/source/common/bmpset.cpp
@@ -28,7 +28,7 @@ U_NAMESPACE_BEGIN
  
  BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
          list(parentList), listLength(parentListLength) {
-    uprv_memset(asciiBytes, 0, sizeof(asciiBytes));
+    uprv_memset(latin1Contains, 0, sizeof(latin1Contains));
      uprv_memset(table7FF, 0, sizeof(table7FF));
      uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));
  
@@ -45,14 +45,16 @@ BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
          list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1);
      }
      list4kStarts[0x11]=listLength-1;
+    containsFFFD=containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10]);
  
      initBits();
      overrideIllegal();
  }
  
  BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
+        containsFFFD(otherBMPSet.containsFFFD),
          list(newParentList), listLength(newParentListLength) {
-    uprv_memcpy(asciiBytes, otherBMPSet.asciiBytes, sizeof(asciiBytes));
+    uprv_memcpy(latin1Contains, otherBMPSet.latin1Contains, sizeof(latin1Contains));
      uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
      uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
      uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
@@ -120,7 +122,7 @@ void BMPSet::initBits() {
      UChar32 start, limit;
      int32_t listIndex=0;
  
-    // Set asciiBytes[].
+    // Set latin1Contains[].
      do {
          start=list[listIndex++];
          if(listIndex<listLength) {
@@ -128,13 +130,30 @@ void BMPSet::initBits() {
          } else {
              limit=0x110000;
          }
-        if(start>=0x80) {
+        if(start>=0x100) {
              break;
          }
          do {
-            asciiBytes[start++]=1;
-        } while(start<limit && start<0x80);
-    } while(limit<=0x80);
+            latin1Contains[start++]=1;
+        } while(start<limit && start<0x100);
+    } while(limit<=0x100);
+
+    // Find the first range overlapping with (or after) 80..FF again,
+    // to include them in table7FF as well.
+    for(listIndex=0;;) {
+        start=list[listIndex++];
+        if(listIndex<listLength) {
+            limit=list[listIndex++];
+        } else {
+            limit=0x110000;
+        }
+        if(limit>0x80) {
+            if(start<0x80) {
+                start=0x80;
+            }
+            break;
+        }
+    }
  
      // Set table7FF[].
      while(start<0x800) {
@@ -204,19 +223,14 @@ void BMPSet::initBits() {
   * for faster validity checking at runtime.
   * No need to set 0 values where they were reset to 0 in the constructor
   * and not modified by initBits().
- * (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
+ * (table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
   * Need to set 0 values for surrogates D800..DFFF.
   */
  void BMPSet::overrideIllegal() {
      uint32_t bits, mask;
      int32_t i;
  
-    if(containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10])) {
-        // contains(FFFD)==TRUE
-        for(i=0x80; i<0xc0; ++i) {
-            asciiBytes[i]=1;
-        }
-
+    if(containsFFFD) {
          bits=3;                 // Lead bytes 0xC0 and 0xC1.
          for(i=0; i<64; ++i) {
              table7FF[i]|=bits;
@@ -233,7 +247,6 @@ void BMPSet::overrideIllegal() {
              bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
          }
      } else {
-        // contains(FFFD)==FALSE
          mask=~(0x10001<<0xd);   // Lead byte 0xED.
          for(i=32; i<64; ++i) {  // Second half of 4k block.
              bmpBlockBits[i]&=mask;
@@ -277,8 +290,8 @@ int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {
  
  UBool
  BMPSet::contains(UChar32 c) const {
-    if((uint32_t)c<=0x7f) {
-        return (UBool)asciiBytes[c];
+    if((uint32_t)c<=0xff) {
+        return (UBool)latin1Contains[c];
      } else if((uint32_t)c<=0x7ff) {
          return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0);
      } else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) {
@@ -314,8 +327,8 @@ BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition
          // span
          do {
              c=*s;
-            if(c<=0x7f) {
-                if(!asciiBytes[c]) {
+            if(c<=0xff) {
+                if(!latin1Contains[c]) {
                      break;
                  }
              } else if(c<=0x7ff) {
@@ -354,8 +367,8 @@ BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition
          // span not
          do {
              c=*s;
-            if(c<=0x7f) {
-                if(asciiBytes[c]) {
+            if(c<=0xff) {
+                if(latin1Contains[c]) {
                      break;
                  }
              } else if(c<=0x7ff) {
@@ -403,8 +416,8 @@ BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondi
          // span
          for(;;) {
              c=*(--limit);
-            if(c<=0x7f) {
-                if(!asciiBytes[c]) {
+            if(c<=0xff) {
+                if(!latin1Contains[c]) {
                      break;
                  }
              } else if(c<=0x7ff) {
@@ -446,8 +459,8 @@ BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondi
          // span not
          for(;;) {
              c=*(--limit);
-            if(c<=0x7f) {
-                if(asciiBytes[c]) {
+            if(c<=0xff) {
+                if(latin1Contains[c]) {
                      break;
                  }
              } else if(c<=0x7ff) {
@@ -497,22 +510,22 @@ const uint8_t *
  BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
      const uint8_t *limit=s+length;
      uint8_t b=*s;
-    if((int8_t)b>=0) {
+    if(U8_IS_SINGLE(b)) {
          // Initial all-ASCII span.
          if(spanCondition) {
              do {
-                if(!asciiBytes[b] || ++s==limit) {
+                if(!latin1Contains[b] || ++s==limit) {
                      return s;
                  }
                  b=*s;
-            } while((int8_t)b>=0);
+            } while(U8_IS_SINGLE(b));
          } else {
              do {
-                if(asciiBytes[b] || ++s==limit) {
+                if(latin1Contains[b] || ++s==limit) {
                      return s;
                  }
                  b=*s;
-            } while((int8_t)b>=0);
+            } while(U8_IS_SINGLE(b));
          }
          length=(int32_t)(limit-s);
      }
@@ -540,20 +553,20 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
              // single trail byte, check for preceding 3- or 4-byte lead byte
              if(length>=2 && (b=*(limit-2))>=0xe0) {
                  limit-=2;
-                if(asciiBytes[0x80]!=spanCondition) {
+                if(containsFFFD!=spanCondition) {
                      limit0=limit;
                  }
              } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
                  // 4-byte lead byte with only two trail bytes
                  limit-=3;
-                if(asciiBytes[0x80]!=spanCondition) {
+                if(containsFFFD!=spanCondition) {
                      limit0=limit;
                  }
              }
          } else {
              // lead byte with no trail bytes
              --limit;
-            if(asciiBytes[0x80]!=spanCondition) {
+            if(containsFFFD!=spanCondition) {
                  limit0=limit;
              }
          }
@@ -563,26 +576,26 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
  
      while(s<limit) {
          b=*s;
-        if(b<0xc0) {
-            // ASCII; or trail bytes with the result of contains(FFFD).
+        if(U8_IS_SINGLE(b)) {
+            // ASCII
              if(spanCondition) {
                  do {
-                    if(!asciiBytes[b]) {
+                    if(!latin1Contains[b]) {
                          return s;
                      } else if(++s==limit) {
                          return limit0;
                      }
                      b=*s;
-                } while(b<0xc0);
+                } while(U8_IS_SINGLE(b));
              } else {
                  do {
-                    if(asciiBytes[b]) {
+                    if(latin1Contains[b]) {
                          return s;
                      } else if(++s==limit) {
                          return limit0;
                      }
                      b=*s;
-                } while(b<0xc0);
+                } while(U8_IS_SINGLE(b));
              }
          }
          ++s;  // Advance past the lead byte.
@@ -619,7 +632,7 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
                  UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;
                  if( (   (0x10000<=c && c<=0x10ffff) ?
                              containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
-                            asciiBytes[0x80]
+                            containsFFFD
                      ) != spanCondition
                  ) {
                      return s-1;
@@ -627,8 +640,9 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
                  s+=3;
                  continue;
              }
-        } else /* 0xc0<=b<0xe0 */ {
+        } else {
              if( /* handle U+0000..U+07FF inline */
+                b>=0xc0 &&
                  (t1=(uint8_t)(*s-0x80)) <= 0x3f
              ) {
                  if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) {
@@ -642,7 +656,7 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
          // Give an illegal sequence the same value as the result of contains(FFFD).
          // Handle each byte of an illegal sequence separately to simplify the code;
          // no need to optimize error handling.
-        if(asciiBytes[0x80]!=spanCondition) {
+        if(containsFFFD!=spanCondition) {
              return s-1;
          }
      }
@@ -667,26 +681,26 @@ BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCon
  
      do {
          b=s[--length];
-        if((int8_t)b>=0) {
+        if(U8_IS_SINGLE(b)) {
              // ASCII sub-span
              if(spanCondition) {
                  do {
-                    if(!asciiBytes[b]) {
+                    if(!latin1Contains[b]) {
                          return length+1;
                      } else if(length==0) {
                          return 0;
                      }
                      b=s[--length];
-                } while((int8_t)b>=0);
+                } while(U8_IS_SINGLE(b));
              } else {
                  do {
-                    if(asciiBytes[b]) {
+                    if(latin1Contains[b]) {
                          return length+1;
                      } else if(length==0) {
                          return 0;
                      }
                      b=s[--length];
-                } while((int8_t)b>=0);
+                } while(U8_IS_SINGLE(b));
              }
          }
  
diff --git a/icu4c/source/common/bmpset.h b/icu4c/source/common/bmpset.h

index 87375d2cace07032525626bf3ce2e4cd70b02b46..018aeb7f95b078733512cd59c857c29d87959ce8 100644 (file)
--- a/icu4c/source/common/bmpset.h
+++ b/icu4c/source/common/bmpset.h
@@ -28,11 +28,12 @@ U_NAMESPACE_BEGIN
   * Helper class for frozen UnicodeSets, implements contains() and span()
   * optimized for BMP code points. Structured to be UTF-8-friendly.
   *
- * ASCII: Look up bytes.
+ * Latin-1: Look up bytes.
   * 2-byte characters: Bits organized vertically.
   * 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF,
   *                    with mixed for illegal ranges.
- * Supplementary characters: Call contains() on the parent set.
+ * Supplementary characters: Binary search over
+ * the supplementary part of the parent set's inversion list.
   */
  class BMPSet : public UMemory {
  public:
@@ -96,12 +97,12 @@ private:
      inline UBool containsSlow(UChar32 c, int32_t lo, int32_t hi) const;
  
      /*
-     * One byte per ASCII character, or trail byte in lead position.
-     * 0 or 1 for ASCII characters.
-     * The value for trail bytes is the result of contains(FFFD)
-     * for faster validity checking at runtime.
+     * One byte 0 or 1 per Latin-1 character.
       */
-    UBool asciiBytes[0xc0];
+    UBool latin1Contains[0x100];
+
+    /* TRUE if contains(U+FFFD). */
+    UBool containsFFFD;
  
      /*
       * One bit per code point from U+0000..U+07FF.
diff --git a/icu4c/source/common/unicode/utf.h b/icu4c/source/common/unicode/utf.h

index dff4286815defb54b84db5d1966afa36428e84d3..ef512997f05a15aca1b6a4ac36755de1d2b71088 100644 (file)
--- a/icu4c/source/common/unicode/utf.h
+++ b/icu4c/source/common/unicode/utf.h
@@ -23,9 +23,6 @@
   * This file defines macros for checking whether a code point is
   * a surrogate or a non-character etc.
   *
- * The UChar and UChar32 data types for Unicode code units and code points
- * are defined in umachine.h because they can be machine-dependent.
- *
   * If U_NO_DEFAULT_INCLUDE_UTF_HEADERS is 0 then utf.h is included by utypes.h
   * and itself includes utf8.h and utf16.h after some
   * common definitions.
@@ -50,11 +47,11 @@
   * but are optimized for the much more frequently occurring BMP code points.
   *
   * umachine.h defines UChar to be an unsigned 16-bit integer.
- * Where available, UChar is defined to be a char16_t
- * or a wchar_t (if that is an unsigned 16-bit type), otherwise uint16_t.
+ * Since ICU 59, ICU uses char16_t in C++, UChar only in C,
+ * and defines UChar=char16_t by default. See the UChar API docs for details.
   *
   * UChar32 is defined to be a signed 32-bit integer (int32_t), large enough for a 21-bit
- * Unicode code point (Unicode scalar value, 0..0x10ffff).
+ * Unicode code point (Unicode scalar value, 0..0x10ffff) and U_SENTINEL (-1).
   * Before ICU 2.4, the definition of UChar32 was similarly platform-dependent as
   * the definition of UChar. For details see the documentation for UChar32 itself.
   *
@@ -63,11 +60,20 @@
   * For actual Unicode character properties see uchar.h.
   *
   * By default, string operations must be done with error checking in case
- * a string is not well-formed UTF-16.
- * The macros will detect if a surrogate code unit is unpaired
+ * a string is not well-formed UTF-16 or UTF-8.
+ *
+ * The U16_ macros detect if a surrogate code unit is unpaired
   * (lead unit without trail unit or vice versa) and just return the unit itself
   * as the code point.
   *
+ * The U8_ macros detect illegal byte sequences and return a negative value.
+ * Starting with ICU 60, the observable length of a single illegal byte sequence
+ * skipped by one of these macros follows the Unicode 6+ recommendation
+ * which is consistent with the W3C Encoding Standard.
+ *
+ * There are ..._OR_FFFD versions of both U16_ and U8_ macros
+ * that return U+FFFD for illegal code unit sequences.
+ *
   * The regular "safe" macros require that the initial, passed-in string index
   * is within bounds. They only check the index when they read more than one
   * code unit. This is usually done with code similar to the following loop:
@@ -91,10 +97,7 @@
   * The performance differences are much larger here because UTF-8 provides so
   * many opportunities for malformed sequences.
   * The unsafe UTF-8 macros are entirely implemented inside the macro definitions
- * and are fast, while the safe UTF-8 macros call functions for all but the
- * trivial (ASCII) cases.
- * (ICU 3.6 optimizes U8_NEXT() and U8_APPEND() to handle most other common
- * characters inline as well.)
+ * and are fast, while the safe UTF-8 macros call functions for some complicated cases.
   *
   * Unlike with UTF-16, malformed sequences cannot be expressed with distinct
   * code point values (0..U+10ffff). They are indicated with negative values instead.
@@ -126,8 +129,7 @@
   */
  #define U_IS_UNICODE_NONCHAR(c) \
      ((c)>=0xfdd0 && \
-     ((uint32_t)(c)<=0xfdef || ((c)&0xfffe)==0xfffe) && \
-     (uint32_t)(c)<=0x10ffff)
+     ((c)<=0xfdef || ((c)&0xfffe)==0xfffe) && (c)<=0x10ffff)
  
  /**
   * Is c a Unicode code point value (0..U+10ffff)
@@ -148,9 +150,7 @@
   */
  #define U_IS_UNICODE_CHAR(c) \
      ((uint32_t)(c)<0xd800 || \
-        ((uint32_t)(c)>0xdfff && \
-         (uint32_t)(c)<=0x10ffff && \
-         !U_IS_UNICODE_NONCHAR(c)))
+        (0xdfff<(c) && (c)<=0x10ffff && !U_IS_UNICODE_NONCHAR(c)))
  
  /**
   * Is this code point a BMP code point (U+0000..U+ffff)?
diff --git a/icu4c/source/common/unicode/utf8.h b/icu4c/source/common/unicode/utf8.h

index 18e7e9455c97f9fe74b64891ea1efc7c4a75a00d..55024fdbfe5883f98cc839fe526b72bb8323d3c2 100644 (file)
--- a/icu4c/source/common/unicode/utf8.h
+++ b/icu4c/source/common/unicode/utf8.h
@@ -41,34 +41,24 @@
  
  /* internal definitions ----------------------------------------------------- */
  
-
-
  /**
   * Counts the trail bytes for a UTF-8 lead byte.
- * Returns 0 for 0..0xbf as well as for 0xfe and 0xff.
+ * Returns 0 for 0..0xc1 as well as for 0xf5..0xff.
+ * leadByte might be evaluated multiple times.
   *
   * This is internal since it is not meant to be called directly by external clients;
   * however it is called by public macros in this file and thus must remain stable.
   *
- * Note: Beginning with ICU 50, the implementation uses a multi-condition expression
- * which was shown in 2012 (on x86-64) to compile to fast, branch-free code.
- * leadByte is evaluated multiple times.
- *
- * The pre-ICU 50 implementation used the exported array utf8_countTrailBytes:
- * #define U8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[leadByte])
- * leadByte was evaluated exactly once.
- *
   * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
   * @internal
   */
  #define U8_COUNT_TRAIL_BYTES(leadByte) \
-    ((uint8_t)(leadByte)<0xf0 ? \
-        ((uint8_t)(leadByte)>=0xc0)+((uint8_t)(leadByte)>=0xe0) : \
-        (uint8_t)(leadByte)<0xfe ? 3+((uint8_t)(leadByte)>=0xf8)+((uint8_t)(leadByte)>=0xfc) : 0)
+    ((uint8_t)(leadByte)<=0xf4 ? \
+        ((uint8_t)(leadByte)>=0xc2)+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0) : 0)
  
  /**
   * Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence.
- * The maximum supported lead byte is 0xf4 corresponding to U+10FFFF.
+ * Returns 0 for 0..0xc1. Undefined for 0xf5..0xff.
   * leadByte might be evaluated multiple times.
   *
   * This is internal since it is not meant to be called directly by external clients;
@@ -78,7 +68,7 @@
   * @internal
   */
  #define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) \
-    (((leadByte)>=0xc0)+((leadByte)>=0xe0)+((leadByte)>=0xf0))
+    (((uint8_t)(leadByte)>=0xc2)+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0))
  
  /**
   * Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value.
@@ -89,6 +79,34 @@
   */
  #define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1)
  
+/**
+ * Internal bit vector for 3-byte UTF-8 validity check.
+ * Lead byte E0..EF bits 3..0 as byte index,
+ * first trail byte bits 7..5 as bit index into that byte.
+ * @internal
+ */
+#define U8_LEAD3_T1_BITS "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30"
+
+/**
+ * Internal 3-byte UTF-8 validity check.
+ * @internal
+ */
+#define U8_IS_VALID_LEAD3_AND_T1(lead, t1) (U8_LEAD3_T1_BITS[(lead)&0xf]&(1<<((uint8_t)(t1)>>5)))
+
+/**
+ * Internal bit vector for 4-byte UTF-8 validity check.
+ * First trail byte bits 7..4 as byte index,
+ * lead byte F0..F4 bits 2..0 as bit index into that byte.
+ * @internal
+ */
+#define U8_LEAD4_T1_BITS "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00"
+
+/**
+ * Internal 4-byte UTF-8 validity check.
+ * @internal
+ */
+#define U8_IS_VALID_LEAD4_AND_T1(lead, t1) (U8_LEAD4_T1_BITS[(uint8_t)(t1)>>4]&(1<<((lead)&7)))
+
  /**
   * Function for handling "next code point" with error-checking.
   *
@@ -153,7 +171,8 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
   * @return TRUE or FALSE
   * @stable ICU 2.4
   */
-#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc0)<0x3e)
+#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc2)<=0x32)
+// 0x32=0xf4-0xc2
  
  /**
   * Is this code unit (byte) a UTF-8 trail byte?
@@ -161,7 +180,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
   * @return TRUE or FALSE
   * @stable ICU 2.4
   */
-#define U8_IS_TRAIL(c) (((c)&0xc0)==0x80)
+#define U8_IS_TRAIL(c) ((int8_t)(c)<-0x40)
  
  /**
   * How many code units (bytes) are used for the UTF-8 encoding
@@ -289,7 +308,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
   */
  #define U8_NEXT_UNSAFE(s, i, c) { \
      (c)=(uint8_t)(s)[(i)++]; \
-    if((c)>=0x80) { \
+    if(!U8_IS_SINGLE(c)) { \
          if((c)<0xe0) { \
              (c)=(((c)&0x1f)<<6)|((s)[(i)++]&0x3f); \
          } else if((c)<0xf0) { \
@@ -325,22 +344,19 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
   */
  #define U8_NEXT(s, i, length, c) { \
      (c)=(uint8_t)(s)[(i)++]; \
-    if((c)>=0x80) { \
+    if(!U8_IS_SINGLE(c)) { \
          uint8_t __t1, __t2; \
-        if( /* handle U+1000..U+CFFF inline */ \
-            (0xe0<(c) && (c)<=0xec) && \
-            (((i)+1)<(length) || (length)<0) && \
-            (__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \
-            (__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \
-        ) { \
-            /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
-            (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \
+        if( /* handle U+0800..U+FFFF inline */ \
+                (0xe0<=(c) && (c)<0xf0) && \
+                (((i)+1)<(length) || (length)<0) && \
+                U8_IS_VALID_LEAD3_AND_T1((c), __t1=(s)[i]) && \
+                (__t2=(s)[(i)+1]-0x80)<=0x3f) { \
+            (c)=(((c)&0xf)<<12)|((__t1&0x3f)<<6)|__t2; \
              (i)+=2; \
          } else if( /* handle U+0080..U+07FF inline */ \
-            ((c)<0xe0 && (c)>=0xc2) && \
-            ((i)!=(length)) && \
-            (__t1=(uint8_t)((s)[i]-0x80))<=0x3f \
-        ) { \
+                ((c)<0xe0 && (c)>=0xc2) && \
+                ((i)!=(length)) && \
+                (__t1=(s)[i]-0x80)<=0x3f) { \
              (c)=(((c)&0x1f)<<6)|__t1; \
              ++(i); \
          } else { \
@@ -376,22 +392,19 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
   */
  #define U8_NEXT_OR_FFFD(s, i, length, c) { \
      (c)=(uint8_t)(s)[(i)++]; \
-    if((c)>=0x80) { \
+    if(!U8_IS_SINGLE(c)) { \
          uint8_t __t1, __t2; \
-        if( /* handle U+1000..U+CFFF inline */ \
-            (0xe0<(c) && (c)<=0xec) && \
-            (((i)+1)<(length) || (length)<0) && \
-            (__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \
-            (__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \
-        ) { \
-            /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
-            (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \
+        if( /* handle U+0800..U+FFFF inline */ \
+                (0xe0<=(c) && (c)<0xf0) && \
+                (((i)+1)<(length) || (length)<0) && \
+                U8_IS_VALID_LEAD3_AND_T1((c), __t1=(s)[i]) && \
+                (__t2=(s)[(i)+1]-0x80)<=0x3f) { \
+            (c)=(((c)&0xf)<<12)|((__t1&0x3f)<<6)|__t2; \
              (i)+=2; \
          } else if( /* handle U+0080..U+07FF inline */ \
-            ((c)<0xe0 && (c)>=0xc2) && \
-            ((i)!=(length)) && \
-            (__t1=(uint8_t)((s)[i]-0x80))<=0x3f \
-        ) { \
+                ((c)<0xe0 && (c)>=0xc2) && \
+                ((i)!=(length)) && \
+                (__t1=(s)[i]-0x80)<=0x3f) { \
              (c)=(((c)&0x1f)<<6)|__t1; \
              ++(i); \
          } else { \
@@ -476,7 +489,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
   * @stable ICU 2.4
   */
  #define U8_FWD_1_UNSAFE(s, i) { \
-    (i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE((uint8_t)(s)[i]); \
+    (i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE((s)[i]); \
  }
  
  /**
@@ -493,15 +506,24 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
   * @stable ICU 2.4
   */
  #define U8_FWD_1(s, i, length) { \
-    uint8_t __b=(uint8_t)(s)[(i)++]; \
-    if(U8_IS_LEAD(__b)) { \
-        uint8_t __count=U8_COUNT_TRAIL_BYTES(__b); \
-        if((i)+__count>(length) && (length)>=0) { \
-            __count=(uint8_t)((length)-(i)); \
-        } \
-        while(__count>0 && U8_IS_TRAIL((s)[i])) { \
-            ++(i); \
-            --__count; \
+    uint8_t __b=(s)[(i)++]; \
+    if(U8_IS_LEAD(__b) && (i)!=(length)) { \
+        uint8_t __t1=(s)[i]; \
+        if((0xe0<=__b && __b<0xf0)) { \
+            if(U8_IS_VALID_LEAD3_AND_T1(__b, __t1) && \
+                    ++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \
+                ++(i); \
+            } \
+        } else if(__b<0xe0) { \
+            if(U8_IS_TRAIL(__t1)) { \
+                ++(i); \
+            } \
+        } else /* __b>=0xf0 */ { \
+            if(U8_IS_VALID_LEAD4_AND_T1(__b, __t1) && \
+                    ++(i)!=(length) && U8_IS_TRAIL((s)[i]) && \
+                    ++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \
+                ++(i); \
+            } \
          } \
      } \
  }
@@ -615,7 +637,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
          /* c is a trail byte */ \
          (c)&=0x3f; \
          for(;;) { \
-            __b=(uint8_t)(s)[--(i)]; \
+            __b=(s)[--(i)]; \
              if(__b>=0xc0) { \
                  U8_MASK_LEAD_BYTE(__b, __count); \
                  (c)|=(UChar32)__b<<__shift; \
@@ -651,7 +673,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
   */
  #define U8_PREV(s, start, i, c) { \
      (c)=(uint8_t)(s)[--(i)]; \
-    if((c)>=0x80) { \
+    if(!U8_IS_SINGLE(c)) { \
          (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -1); \
      } \
  }
@@ -682,7 +704,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
   */
  #define U8_PREV_OR_FFFD(s, start, i, c) { \
      (c)=(uint8_t)(s)[--(i)]; \
-    if((c)>=0x80) { \
+    if(!U8_IS_SINGLE(c)) { \
          (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -3); \
      } \
  }
diff --git a/icu4c/source/common/unisetspan.cpp b/icu4c/source/common/unisetspan.cpp

index 83bc7945faa26ff857b186c91c3ac0c09f18ac33..0a8893472f958b3fbed6f969e735508e097c829b 100644 (file)
--- a/icu4c/source/common/unisetspan.cpp
+++ b/icu4c/source/common/unisetspan.cpp
@@ -502,7 +502,7 @@ spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
  static inline int32_t
  spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
      UChar32 c=*s;
-    if((int8_t)c>=0) {
+    if(U8_IS_SINGLE(c)) {
          return set.contains(c) ? 1 : -1;
      }
      // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
@@ -514,7 +514,7 @@ spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
  static inline int32_t
  spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
      UChar32 c=s[length-1];
-    if((int8_t)c>=0) {
+    if(U8_IS_SINGLE(c)) {
          return set.contains(c) ? 1 : -1;
      }
      int32_t i=length-1;
@@ -1006,11 +1006,9 @@ int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpa
                      // Try to match if the increment is not listed already.
                      // Match at code point boundaries. (The UTF-8 strings were converted
                      // from UTF-16 and are guaranteed to be well-formed.)
-                    if( !U8_IS_TRAIL(s[pos-overlap]) &&
-                        !offsets.containsOffset(inc) &&
-                        matches8(s+pos-overlap, s8, length8)
-                        
-                    ) {
+                    if(!U8_IS_TRAIL(s[pos-overlap]) &&
+                            !offsets.containsOffset(inc) &&
+                            matches8(s+pos-overlap, s8, length8)) {
                          if(inc==rest) {
                              return length;  // Reached the end of the string.
                          }
@@ -1052,11 +1050,10 @@ int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpa
                      // Try to match if the string is longer or starts earlier.
                      // Match at code point boundaries. (The UTF-8 strings were converted
                      // from UTF-16 and are guaranteed to be well-formed.)
-                    if( !U8_IS_TRAIL(s[pos-overlap]) &&
-                        (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
-                        matches8(s+pos-overlap, s8, length8)
-                        
-                    ) {
+                    if(!U8_IS_TRAIL(s[pos-overlap]) &&
+                            (overlap>maxOverlap ||
+                                /* redundant overlap==maxOverlap && */ inc>maxInc) &&
+                            matches8(s+pos-overlap, s8, length8)) {
                          maxInc=inc;  // Longest match from earliest start.
                          maxOverlap=overlap;
                          break;
diff --git a/icu4c/source/common/ustrtrns.cpp b/icu4c/source/common/ustrtrns.cpp

index 09eca22fda31828946e83c1d2968854ac86c5fc2..5dc032c02fb807413495d7daf4db668cfd16518b 100644 (file)
--- a/icu4c/source/common/ustrtrns.cpp
+++ b/icu4c/source/common/ustrtrns.cpp
@@ -256,152 +256,6 @@ u_strToUTF32(UChar32 *dest,
              pErrorCode);
  }
  
-/* for utf8_nextCharSafeBodyTerminated() */
-static const UChar32
-utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
-
-/*
- * Version of utf8_nextCharSafeBody() with the following differences:
- * - checks for NUL termination instead of length
- * - works with pointers instead of indexes
- * - always strict (strict==-1)
- *
- * *ps points to after the lead byte and will be moved to after the last trail byte.
- * c is the lead byte.
- * @return the code point, or U_SENTINEL
- */
-static UChar32
-utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
-    const uint8_t *s=*ps;
-    uint8_t trail, illegal=0;
-    uint8_t count=U8_COUNT_TRAIL_BYTES(c);
-    U_ASSERT(count<6);
-    U8_MASK_LEAD_BYTE((c), count);
-    /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
-    switch(count) {
-    /* each branch falls through to the next one */
-    case 5:
-    case 4:
-        /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
-        illegal=1;
-        break;
-    case 3:
-        trail=(uint8_t)(*s++ - 0x80);
-        c=(c<<6)|trail;
-        if(trail>0x3f || c>=0x110) {
-            /* not a trail byte, or code point>0x10ffff (outside Unicode) */
-            illegal=1;
-            break;
-        }
-        U_FALLTHROUGH;
-    case 2:
-        trail=(uint8_t)(*s++ - 0x80);
-        if(trail>0x3f) {
-            /* not a trail byte */
-            illegal=1;
-            break;
-        }
-        c=(c<<6)|trail;
-        U_FALLTHROUGH;
-    case 1:
-        trail=(uint8_t)(*s++ - 0x80);
-        if(trail>0x3f) {
-            /* not a trail byte */
-            illegal=1;
-        }
-        c=(c<<6)|trail;
-        break;
-    case 0:
-        return U_SENTINEL;
-    /* no default branch to optimize switch()  - all values are covered */
-    }
-
-    /* correct sequence - all trail bytes have (b7..b6)==(10)? */
-    /* illegal is also set if count>=4 */
-    if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
-        /* error handling */
-        /* don't go beyond this sequence */
-        s=*ps;
-        while(count>0 && U8_IS_TRAIL(*s)) {
-            ++s;
-            --count;
-        }
-        c=U_SENTINEL;
-    }
-    *ps=s;
-    return c;
-}
-
-/*
- * Version of utf8_nextCharSafeBody() with the following differences:
- * - works with pointers instead of indexes
- * - always strict (strict==-1)
- *
- * *ps points to after the lead byte and will be moved to after the last trail byte.
- * c is the lead byte.
- * @return the code point, or U_SENTINEL
- */
-static UChar32
-utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
-    const uint8_t *s=*ps;
-    uint8_t trail, illegal=0;
-    uint8_t count=U8_COUNT_TRAIL_BYTES(c);
-    if((limit-s)>=count) {
-        U8_MASK_LEAD_BYTE((c), count);
-        /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
-        switch(count) {
-        /* each branch falls through to the next one */
-        case 5:
-        case 4:
-            /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
-            illegal=1;
-            break;
-        case 3:
-            trail=*s++;
-            c=(c<<6)|(trail&0x3f);
-            if(c<0x110) {
-                illegal|=(trail&0xc0)^0x80;
-            } else {
-                /* code point>0x10ffff, outside Unicode */
-                illegal=1;
-                break;
-            }
-            U_FALLTHROUGH;
-        case 2:
-            trail=*s++;
-            c=(c<<6)|(trail&0x3f);
-            illegal|=(trail&0xc0)^0x80;
-            U_FALLTHROUGH;
-        case 1:
-            trail=*s++;
-            c=(c<<6)|(trail&0x3f);
-            illegal|=(trail&0xc0)^0x80;
-            break;
-        case 0:
-            return U_SENTINEL;
-        /* no default branch to optimize switch()  - all values are covered */
-        }
-    } else {
-        illegal=1; /* too few bytes left */
-    }
-
-    /* correct sequence - all trail bytes have (b7..b6)==(10)? */
-    /* illegal is also set if count>=4 */
-    U_ASSERT(illegal || count<UPRV_LENGTHOF(utf8_minLegal));
-    if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
-        /* error handling */
-        /* don't go beyond this sequence */
-        s=*ps;
-        while(count>0 && s<limit && U8_IS_TRAIL(*s)) {
-            ++s;
-            --count;
-        }
-        c=U_SENTINEL;
-    }
-    *ps=s;
-    return c;
-}
-
  U_CAPI UChar* U_EXPORT2
  u_strFromUTF8WithSub(UChar *dest,
                int32_t destCapacity,
@@ -410,19 +264,10 @@ u_strFromUTF8WithSub(UChar *dest,
                int32_t srcLength,
                UChar32 subchar, int32_t *pNumSubstitutions,
                UErrorCode *pErrorCode){
-    UChar *pDest = dest;
-    UChar *pDestLimit = dest+destCapacity;
-    UChar32 ch;
-    int32_t reqLength = 0;
-    const uint8_t* pSrc = (const uint8_t*) src;
-    uint8_t t1, t2; /* trail bytes */
-    int32_t numSubstitutions;
-
      /* args check */
-    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
+    if(U_FAILURE(*pErrorCode)) {
          return NULL;
      }
-        
      if( (src==NULL && srcLength!=0) || srcLength < -1 ||
          (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
          subchar > 0x10ffff || U_IS_SURROGATE(subchar)
@@ -434,7 +279,10 @@ u_strFromUTF8WithSub(UChar *dest,
      if(pNumSubstitutions!=NULL) {
          *pNumSubstitutions=0;
      }
-    numSubstitutions=0;
+    UChar *pDest = dest;
+    UChar *pDestLimit = dest+destCapacity;
+    int32_t reqLength = 0;
+    int32_t numSubstitutions=0;
  
      /*
       * Inline processing of UTF-8 byte sequences:
@@ -455,95 +303,81 @@ u_strFromUTF8WithSub(UChar *dest,
           * The code explicitly checks for NULs only in the lead byte position.
           * A NUL byte in the trail byte position fails the trail byte range check anyway.
           */
-        while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
-            if(ch <= 0x7f){
-                *pDest++=(UChar)ch;
-                ++pSrc;
+        int32_t i;
+        UChar32 c;
+        for(i = 0; (c = (uint8_t)src[i]) != 0 && (pDest < pDestLimit);) {
+            // modified copy of U8_NEXT()
+            ++i;
+            if(U8_IS_SINGLE(c)) {
+                *pDest++=(UChar)c;
              } else {
-                if(ch > 0xe0) {
-                    if( /* handle U+1000..U+CFFF inline */
-                        ch <= 0xec &&
-                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
-                        (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
-                    ) {
-                        /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
-                        *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
-                        pSrc += 3;
-                        continue;
-                    }
-                } else if(ch < 0xe0) {
-                    if( /* handle U+0080..U+07FF inline */
-                        ch >= 0xc2 &&
-                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
-                    ) {
-                        *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
-                        pSrc += 2;
-                        continue;
-                    }
-                }
-
-                /* function call for "complicated" and error cases */
-                ++pSrc; /* continue after the lead byte */
-                ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
-                if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
-                    *pErrorCode = U_INVALID_CHAR_FOUND;
-                    return NULL;
-                } else if(ch<=0xFFFF) {
-                    *(pDest++)=(UChar)ch;
+                uint8_t __t1, __t2;
+                if( /* handle U+0800..U+FFFF inline */
+                        (0xe0<=(c) && (c)<0xf0) &&
+                        U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
+                        (__t2=src[(i)+1]-0x80)<=0x3f) {
+                    *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
+                    i+=2;
+                } else if( /* handle U+0080..U+07FF inline */
+                        ((c)<0xe0 && (c)>=0xc2) &&
+                        (__t1=src[i]-0x80)<=0x3f) {
+                    *pDest++ = (((c)&0x1f)<<6)|__t1;
+                    ++(i);
                  } else {
-                    *(pDest++)=U16_LEAD(ch);
-                    if(pDest<pDestLimit) {
-                        *(pDest++)=U16_TRAIL(ch);
+                    /* function call for "complicated" and error cases */
+                    (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1);
+                    if(c<0 && (++numSubstitutions, c = subchar) < 0) {
+                        *pErrorCode = U_INVALID_CHAR_FOUND;
+                        return NULL;
+                    } else if(c<=0xFFFF) {
+                        *(pDest++)=(UChar)c;
                      } else {
-                        reqLength++;
-                        break;
+                        *(pDest++)=U16_LEAD(c);
+                        if(pDest<pDestLimit) {
+                            *(pDest++)=U16_TRAIL(c);
+                        } else {
+                            reqLength++;
+                            break;
+                        }
                      }
                  }
              }
          }
  
          /* Pre-flight the rest of the string. */
-        while((ch = *pSrc) != 0) {
-            if(ch <= 0x7f){
+        while((c = (uint8_t)src[i]) != 0) {
+            // modified copy of U8_NEXT()
+            ++i;
+            if(U8_IS_SINGLE(c)) {
                  ++reqLength;
-                ++pSrc;
              } else {
-                if(ch > 0xe0) {
-                    if( /* handle U+1000..U+CFFF inline */
-                        ch <= 0xec &&
-                        (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
-                        (uint8_t)(pSrc[2] - 0x80) <= 0x3f
-                    ) {
-                        ++reqLength;
-                        pSrc += 3;
-                        continue;
-                    }
-                } else if(ch < 0xe0) {
-                    if( /* handle U+0080..U+07FF inline */
-                        ch >= 0xc2 &&
-                        (uint8_t)(pSrc[1] - 0x80) <= 0x3f
-                    ) {
-                        ++reqLength;
-                        pSrc += 2;
-                        continue;
+                uint8_t __t1, __t2;
+                if( /* handle U+0800..U+FFFF inline */
+                        (0xe0<=(c) && (c)<0xf0) &&
+                        U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
+                        (__t2=src[(i)+1]-0x80)<=0x3f) {
+                    ++reqLength;
+                    i+=2;
+                } else if( /* handle U+0080..U+07FF inline */
+                        ((c)<0xe0 && (c)>=0xc2) &&
+                        (__t1=src[i]-0x80)<=0x3f) {
+                    ++reqLength;
+                    ++(i);
+                } else {
+                    /* function call for "complicated" and error cases */
+                    (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1);
+                    if(c<0 && (++numSubstitutions, c = subchar) < 0) {
+                        *pErrorCode = U_INVALID_CHAR_FOUND;
+                        return NULL;
                      }
+                    reqLength += U16_LENGTH(c);
                  }
-
-                /* function call for "complicated" and error cases */
-                ++pSrc; /* continue after the lead byte */
-                ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
-                if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
-                    *pErrorCode = U_INVALID_CHAR_FOUND;
-                    return NULL;
-                }
-                reqLength += U16_LENGTH(ch);
              }
          }
      } else /* srcLength >= 0 */ {
-        const uint8_t *pSrcLimit = pSrc + srcLength;
-        int32_t count;
-
-        /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
+        /* Faster loop without ongoing checking for srcLength and pDestLimit. */
+        int32_t i = 0;
+        UChar32 c;
          for(;;) {
              /*
               * Each iteration of the inner loop progresses by at most 3 UTF-8
@@ -551,10 +385,10 @@ u_strFromUTF8WithSub(UChar *dest,
               * For supplementary code points (4 & 2), which are rare,
               * there is an additional adjustment.
               */
-            count = (int32_t)(pDestLimit - pDest);
-            srcLength = (int32_t)((pSrcLimit - pSrc) / 3);
-            if(count > srcLength) {
-                count = srcLength; /* min(remaining dest, remaining src/3) */
+            int32_t count = (int32_t)(pDestLimit - pDest);
+            int32_t count2 = (srcLength - i) / 3;
+            if(count > count2) {
+                count = count2; /* min(remaining dest, remaining src/3) */
              }
              if(count < 3) {
                  /*
@@ -565,147 +399,123 @@ u_strFromUTF8WithSub(UChar *dest,
              }
  
              do {
-                ch = *pSrc;
-                if(ch <= 0x7f){
-                    *pDest++=(UChar)ch;
-                    ++pSrc;
+                // modified copy of U8_NEXT()
+                c = (uint8_t)src[i++];
+                if(U8_IS_SINGLE(c)) {
+                    *pDest++=(UChar)c;
                  } else {
-                    if(ch > 0xe0) {
-                        if( /* handle U+1000..U+CFFF inline */
-                            ch <= 0xec &&
-                            (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
-                            (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
-                        ) {
-                            /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
-                            *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
-                            pSrc += 3;
-                            continue;
-                        }
-                    } else if(ch < 0xe0) {
-                        if( /* handle U+0080..U+07FF inline */
-                            ch >= 0xc2 &&
-                            (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
-                        ) {
-                            *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
-                            pSrc += 2;
-                            continue;
+                    uint8_t __t1, __t2;
+                    if( /* handle U+0800..U+FFFF inline */
+                            (0xe0<=(c) && (c)<0xf0) &&
+                            ((i)+1)<srcLength &&
+                            U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
+                            (__t2=src[(i)+1]-0x80)<=0x3f) {
+                        *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
+                        i+=2;
+                    } else if( /* handle U+0080..U+07FF inline */
+                            ((c)<0xe0 && (c)>=0xc2) &&
+                            ((i)!=srcLength) &&
+                            (__t1=src[i]-0x80)<=0x3f) {
+                        *pDest++ = (((c)&0x1f)<<6)|__t1;
+                        ++(i);
+                    } else {
+                        if(c >= 0xf0 || subchar > 0xffff) {
+                            // We may read up to four bytes and write up to two UChars,
+                            // which we didn't account for with computing count,
+                            // so we adjust it here.
+                            if(--count == 0) {
+                                --i;  // back out byte c
+                                break;
+                            }
                          }
-                    }
  
-                    if(ch >= 0xf0 || subchar > 0xffff) {
-                        /*
-                         * We may read up to six bytes and write up to two UChars,
-                         * which we didn't account for with computing count,
-                         * so we adjust it here.
-                         */
-                        if(--count == 0) {
-                            break;
+                        /* function call for "complicated" and error cases */
+                        (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
+                        if(c<0 && (++numSubstitutions, c = subchar) < 0) {
+                            *pErrorCode = U_INVALID_CHAR_FOUND;
+                            return NULL;
+                        } else if(c<=0xFFFF) {
+                            *(pDest++)=(UChar)c;
+                        } else {
+                            *(pDest++)=U16_LEAD(c);
+                            *(pDest++)=U16_TRAIL(c);
                          }
                      }
-
-                    /* function call for "complicated" and error cases */
-                    ++pSrc; /* continue after the lead byte */
-                    ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
-                    if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
-                        *pErrorCode = U_INVALID_CHAR_FOUND;
-                        return NULL;
-                    }else if(ch<=0xFFFF){
-                        *(pDest++)=(UChar)ch;
-                    }else{
-                        *(pDest++)=U16_LEAD(ch);
-                        *(pDest++)=U16_TRAIL(ch);
-                    }
                  }
              } while(--count > 0);
          }
  
-        while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
-            ch = *pSrc;
-            if(ch <= 0x7f){
-                *pDest++=(UChar)ch;
-                ++pSrc;
+        while(i < srcLength && (pDest < pDestLimit)) {
+            // modified copy of U8_NEXT()
+            c = (uint8_t)src[i++];
+            if(U8_IS_SINGLE(c)) {
+                *pDest++=(UChar)c;
              } else {
-                if(ch > 0xe0) {
-                    if( /* handle U+1000..U+CFFF inline */
-                        ch <= 0xec &&
-                        ((pSrcLimit - pSrc) >= 3) &&
-                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
-                        (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
-                    ) {
-                        /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
-                        *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
-                        pSrc += 3;
-                        continue;
-                    }
-                } else if(ch < 0xe0) {
-                    if( /* handle U+0080..U+07FF inline */
-                        ch >= 0xc2 &&
-                        ((pSrcLimit - pSrc) >= 2) &&
-                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
-                    ) {
-                        *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
-                        pSrc += 2;
-                        continue;
-                    }
-                }
-
-                /* function call for "complicated" and error cases */
-                ++pSrc; /* continue after the lead byte */
-                ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
-                if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
-                    *pErrorCode = U_INVALID_CHAR_FOUND;
-                    return NULL;
-                }else if(ch<=0xFFFF){
-                    *(pDest++)=(UChar)ch;
-                }else{
-                    *(pDest++)=U16_LEAD(ch);
-                    if(pDest<pDestLimit){
-                        *(pDest++)=U16_TRAIL(ch);
-                    }else{
-                        reqLength++;
-                        break;
+                uint8_t __t1, __t2;
+                if( /* handle U+0800..U+FFFF inline */
+                        (0xe0<=(c) && (c)<0xf0) &&
+                        ((i)+1)<srcLength &&
+                        U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
+                        (__t2=src[(i)+1]-0x80)<=0x3f) {
+                    *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
+                    i+=2;
+                } else if( /* handle U+0080..U+07FF inline */
+                        ((c)<0xe0 && (c)>=0xc2) &&
+                        ((i)!=srcLength) &&
+                        (__t1=src[i]-0x80)<=0x3f) {
+                    *pDest++ = (((c)&0x1f)<<6)|__t1;
+                    ++(i);
+                } else {
+                    /* function call for "complicated" and error cases */
+                    (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
+                    if(c<0 && (++numSubstitutions, c = subchar) < 0) {
+                        *pErrorCode = U_INVALID_CHAR_FOUND;
+                        return NULL;
+                    } else if(c<=0xFFFF) {
+                        *(pDest++)=(UChar)c;
+                    } else {
+                        *(pDest++)=U16_LEAD(c);
+                        if(pDest<pDestLimit) {
+                            *(pDest++)=U16_TRAIL(c);
+                        } else {
+                            reqLength++;
+                            break;
+                        }
                      }
                  }
              }
          }
-        /* do not fill the dest buffer just count the UChars needed */
-        while(pSrc < pSrcLimit){
-            ch = *pSrc;
-            if(ch <= 0x7f){
-                reqLength++;
-                ++pSrc;
+
+        /* Pre-flight the rest of the string. */
+        while(i < srcLength) {
+            // modified copy of U8_NEXT()
+            c = (uint8_t)src[i++];
+            if(U8_IS_SINGLE(c)) {
+                ++reqLength;
              } else {
-                if(ch > 0xe0) {
-                    if( /* handle U+1000..U+CFFF inline */
-                        ch <= 0xec &&
-                        ((pSrcLimit - pSrc) >= 3) &&
-                        (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
-                        (uint8_t)(pSrc[2] - 0x80) <= 0x3f
-                    ) {
-                        reqLength++;
-                        pSrc += 3;
-                        continue;
-                    }
-                } else if(ch < 0xe0) {
-                    if( /* handle U+0080..U+07FF inline */
-                        ch >= 0xc2 &&
-                        ((pSrcLimit - pSrc) >= 2) &&
-                        (uint8_t)(pSrc[1] - 0x80) <= 0x3f
-                    ) {
-                        reqLength++;
-                        pSrc += 2;
-                        continue;
+                uint8_t __t1, __t2;
+                if( /* handle U+0800..U+FFFF inline */
+                        (0xe0<=(c) && (c)<0xf0) &&
+                        ((i)+1)<srcLength &&
+                        U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
+                        (__t2=src[(i)+1]-0x80)<=0x3f) {
+                    ++reqLength;
+                    i+=2;
+                } else if( /* handle U+0080..U+07FF inline */
+                        ((c)<0xe0 && (c)>=0xc2) &&
+                        ((i)!=srcLength) &&
+                        (__t1=src[i]-0x80)<=0x3f) {
+                    ++reqLength;
+                    ++(i);
+                } else {
+                    /* function call for "complicated" and error cases */
+                    (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
+                    if(c<0 && (++numSubstitutions, c = subchar) < 0) {
+                        *pErrorCode = U_INVALID_CHAR_FOUND;
+                        return NULL;
                      }
+                    reqLength += U16_LENGTH(c);
                  }
-
-                /* function call for "complicated" and error cases */
-                ++pSrc; /* continue after the lead byte */
-                ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
-                if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
-                    *pErrorCode = U_INVALID_CHAR_FOUND;
-                    return NULL;
-                }
-                reqLength+=U16_LENGTH(ch);
              }
          }
      }
@@ -753,7 +563,7 @@ u_strFromUTF8Lenient(UChar *dest,
      uint8_t* pSrc = (uint8_t*) src;
  
      /* args check */
-    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
+    if(U_FAILURE(*pErrorCode)){
          return NULL;
      }
          
@@ -994,7 +804,7 @@ u_strToUTF8WithSub(char *dest,
      int32_t numSubstitutions;
  
      /* args check */
-    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
+    if(U_FAILURE(*pErrorCode)){
          return NULL;
      }
          
@@ -1266,18 +1076,8 @@ u_strFromJavaModifiedUTF8WithSub(
          int32_t srcLength,
          UChar32 subchar, int32_t *pNumSubstitutions,
          UErrorCode *pErrorCode) {
-    UChar *pDest = dest;
-    UChar *pDestLimit = dest+destCapacity;
-    UChar32 ch;
-    int32_t reqLength = 0;
-    const uint8_t* pSrc = (const uint8_t*) src;
-    const uint8_t *pSrcLimit;
-    int32_t count;
-    uint8_t t1, t2; /* trail bytes */
-    int32_t numSubstitutions;
-
      /* args check */
-    if(U_FAILURE(*pErrorCode)){
+    if(U_FAILURE(*pErrorCode)) {
          return NULL;
      }
      if( (src==NULL && srcLength!=0) || srcLength < -1 ||
@@ -1291,18 +1091,22 @@ u_strFromJavaModifiedUTF8WithSub(
      if(pNumSubstitutions!=NULL) {
          *pNumSubstitutions=0;
      }
-    numSubstitutions=0;
+    UChar *pDest = dest;
+    UChar *pDestLimit = dest+destCapacity;
+    int32_t reqLength = 0;
+    int32_t numSubstitutions=0;
  
      if(srcLength < 0) {
          /*
           * Transform a NUL-terminated ASCII string.
           * Handle non-ASCII strings with slower code.
           */
-        while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {
-            *pDest++=(UChar)ch;
-            ++pSrc;
+        UChar32 c;
+        while(((c = (uint8_t)*src) != 0) && c <= 0x7f && (pDest < pDestLimit)) {
+            *pDest++=(UChar)c;
+            ++src;
          }
-        if(ch == 0) {
+        if(c == 0) {
              reqLength=(int32_t)(pDest - dest);
              if(pDestLength) {
                  *pDestLength = reqLength;
@@ -1312,33 +1116,38 @@ u_strFromJavaModifiedUTF8WithSub(
              u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
              return dest;
          }
-        srcLength = static_cast<int32_t>(uprv_strlen((const char *)pSrc));
+        srcLength = static_cast<int32_t>(uprv_strlen(src));
      }
  
-    /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
-    pSrcLimit = (pSrc == NULL) ? NULL : pSrc + srcLength;
+    /* Faster loop without ongoing checking for srcLength and pDestLimit. */
+    UChar32 ch;
+    uint8_t t1, t2;
+    int32_t i = 0;
      for(;;) {
-        count = (int32_t)(pDestLimit - pDest);
-        srcLength = (int32_t)(pSrcLimit - pSrc);
-        if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {
+        int32_t count = (int32_t)(pDestLimit - pDest);
+        int32_t count2 = srcLength - i;
+        if(count >= count2 && srcLength > 0 && U8_IS_SINGLE(*src)) {
              /* fast ASCII loop */
-            const uint8_t *prevSrc = pSrc;
-            int32_t delta;
-            while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {
-                *pDest++=(UChar)ch;
-                ++pSrc;
+            int32_t start = i;
+            uint8_t b;
+            while(i < srcLength && U8_IS_SINGLE(b = src[i])) {
+                *pDest++=b;
+                ++i;
              }
-            delta = (int32_t)(pSrc - prevSrc);
+            int32_t delta = i - start;
              count -= delta;
-            srcLength -= delta;
+            count2 -= delta;
          }
          /*
           * Each iteration of the inner loop progresses by at most 3 UTF-8
           * bytes and one UChar.
           */
-        srcLength /= 3;
-        if(count > srcLength) {
-            count = srcLength; /* min(remaining dest, remaining src/3) */
+        if(subchar > 0xFFFF) {
+            break;
+        }
+        count2 /= 3;
+        if(count > count2) {
+            count = count2; /* min(remaining dest, remaining src/3) */
          }
          if(count < 3) {
              /*
@@ -1348,29 +1157,28 @@ u_strFromJavaModifiedUTF8WithSub(
              break;
          }
          do {
-            ch = *pSrc;
-            if(ch <= 0x7f){
+            ch = (uint8_t)src[i++];
+            if(U8_IS_SINGLE(ch)) {
                  *pDest++=(UChar)ch;
-                ++pSrc;
              } else {
                  if(ch >= 0xe0) {
                      if( /* handle U+0000..U+FFFF inline */
                          ch <= 0xef &&
-                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
-                        (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
+                        (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f &&
+                        (t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f
                      ) {
                          /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
                          *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
-                        pSrc += 3;
+                        i += 2;
                          continue;
                      }
                  } else {
                      if( /* handle U+0000..U+07FF inline */
                          ch >= 0xc0 &&
-                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
+                        (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f
                      ) {
                          *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
-                        pSrc += 2;
+                        ++i;
                          continue;
                      }
                  }
@@ -1383,49 +1191,43 @@ u_strFromJavaModifiedUTF8WithSub(
                       * We need to write two UChars, adjusted count for that,
                       * and ran out of space.
                       */
+                    --i;  // back out byte ch
                      break;
                  } else {
                      /* function call for error cases */
-                    ++pSrc; /* continue after the lead byte */
-                    utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
+                    utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
                      ++numSubstitutions;
-                    if(subchar<=0xFFFF) {
-                        *(pDest++)=(UChar)subchar;
-                    } else {
-                        *(pDest++)=U16_LEAD(subchar);
-                        *(pDest++)=U16_TRAIL(subchar);
-                    }
+                    *(pDest++)=(UChar)subchar;
                  }
              }
          } while(--count > 0);
      }
  
-    while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
-        ch = *pSrc;
-        if(ch <= 0x7f){
+    while(i < srcLength && (pDest < pDestLimit)) {
+        ch = (uint8_t)src[i++];
+        if(U8_IS_SINGLE(ch)){
              *pDest++=(UChar)ch;
-            ++pSrc;
          } else {
              if(ch >= 0xe0) {
                  if( /* handle U+0000..U+FFFF inline */
                      ch <= 0xef &&
-                    ((pSrcLimit - pSrc) >= 3) &&
-                    (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
-                    (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
+                    (i+1) < srcLength &&
+                    (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f &&
+                    (t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f
                  ) {
                      /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
                      *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
-                    pSrc += 3;
+                    i += 2;
                      continue;
                  }
              } else {
                  if( /* handle U+0000..U+07FF inline */
                      ch >= 0xc0 &&
-                    ((pSrcLimit - pSrc) >= 2) &&
-                    (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
+                    i < srcLength &&
+                    (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f
                  ) {
                      *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
-                    pSrc += 2;
+                    ++i;
                      continue;
                  }
              }
@@ -1435,8 +1237,7 @@ u_strFromJavaModifiedUTF8WithSub(
                  return NULL;
              } else {
                  /* function call for error cases */
-                ++pSrc; /* continue after the lead byte */
-                utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
+                utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
                  ++numSubstitutions;
                  if(subchar<=0xFFFF) {
                      *(pDest++)=(UChar)subchar;
@@ -1453,32 +1254,31 @@ u_strFromJavaModifiedUTF8WithSub(
          }
      }
  
-    /* do not fill the dest buffer just count the UChars needed */
-    while(pSrc < pSrcLimit){
-        ch = *pSrc;
-        if(ch <= 0x7f) {
+    /* Pre-flight the rest of the string. */
+    while(i < srcLength) {
+        ch = (uint8_t)src[i++];
+        if(U8_IS_SINGLE(ch)) {
              reqLength++;
-            ++pSrc;
          } else {
              if(ch >= 0xe0) {
                  if( /* handle U+0000..U+FFFF inline */
                      ch <= 0xef &&
-                    ((pSrcLimit - pSrc) >= 3) &&
-                    (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
-                    (uint8_t)(pSrc[2] - 0x80) <= 0x3f
+                    (i+1) < srcLength &&
+                    (uint8_t)(src[i] - 0x80) <= 0x3f &&
+                    (uint8_t)(src[i+1] - 0x80) <= 0x3f
                  ) {
                      reqLength++;
-                    pSrc += 3;
+                    i += 2;
                      continue;
                  }
              } else {
                  if( /* handle U+0000..U+07FF inline */
                      ch >= 0xc0 &&
-                    ((pSrcLimit - pSrc) >= 2) &&
-                    (uint8_t)(pSrc[1] - 0x80) <= 0x3f
+                    i < srcLength &&
+                    (uint8_t)(src[i] - 0x80) <= 0x3f
                  ) {
                      reqLength++;
-                    pSrc += 2;
+                    ++i;
                      continue;
                  }
              }
@@ -1488,8 +1288,7 @@ u_strFromJavaModifiedUTF8WithSub(
                  return NULL;
              } else {
                  /* function call for error cases */
-                ++pSrc; /* continue after the lead byte */
-                utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
+                utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
                  ++numSubstitutions;
                  reqLength+=U16_LENGTH(ch);
              }
diff --git a/icu4c/source/common/utext.cpp b/icu4c/source/common/utext.cpp

index 52ae7ff9787dbf1bb45aba092d21128f14a444eb..a2c9008abf668d56c9bb29582ae79192a3e8c6ea 100644 (file)
--- a/icu4c/source/common/utext.cpp
+++ b/icu4c/source/common/utext.cpp
@@ -847,15 +847,11 @@ U_CDECL_END
  //------------------------------------------------------------------------------
  
  // Chunk size.
-//     Must be less than 42  (256/6), because of byte mapping from UChar indexes to native indexes.
-//     Worst case there are six UTF-8 bytes per UChar.
-//         obsolete 6 byte form fd + 5 trails maps to fffd
-//         obsolete 5 byte form fc + 4 trails maps to fffd
-//         non-shortest 4 byte forms maps to fffd
-//         normal supplementaries map to a pair of utf-16, two utf8 bytes per utf-16 unit
-//     mapToUChars array size must allow for the worst case, 6.
-//     This could be brought down to 4, by treating fd and fc as pure illegal,
-//     rather than obsolete lead bytes. But that is not compatible with the utf-8 access macros.
+//     Must be less than 85 (256/3), because of byte mapping from UChar indexes to native indexes.
+//     Worst case is three native bytes to one UChar.  (Supplementaries are 4 native bytes
+//     to two UChars.)
+//     The longest illegal byte sequence treated as a single error (and converted to U+FFFD)
+//     is a three-byte sequence (truncated four-byte sequence).
  //
  enum { UTF8_TEXT_CHUNK_SIZE=32 };
  
@@ -895,7 +891,7 @@ struct UTF8Buf {
                                                       //  Requires two extra slots,
                                                       //    one for a supplementary starting in the last normal position,
                                                       //    and one for an entry for the buffer limit position.
-    uint8_t   mapToUChars[UTF8_TEXT_CHUNK_SIZE*6+6]; // Map native offset from bufNativeStart to
+    uint8_t   mapToUChars[UTF8_TEXT_CHUNK_SIZE*3+6]; // Map native offset from bufNativeStart to
                                                       //   correspoding offset in filled part of buf.
      int32_t   align;
  };
diff --git a/icu4c/source/common/utf_impl.cpp b/icu4c/source/common/utf_impl.cpp

index 856072cb767fab7ef2dffab97877de476c511f67..b560a4f7aa40146ab4e3edea9e0ae8cfd470caea 100644 (file)
--- a/icu4c/source/common/utf_impl.cpp
+++ b/icu4c/source/common/utf_impl.cpp
@@ -7,7 +7,7 @@
  *   Corporation and others.  All Rights Reserved.
  *
  ******************************************************************************
-*   file name:  utf_impl.c
+*   file name:  utf_impl.cpp
  *   encoding:   UTF-8
  *   tab size:   8 (not used)
  *   indentation:4
@@ -54,10 +54,6 @@
   * - SUB AX, BX (result)
   * -finish:
   * (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB)
- *
- * In Unicode, all UTF-8 byte sequences with more than 4 bytes are illegal;
- * lead bytes above 0xf4 are illegal.
- * We keep them in this table for skipping long ISO 10646-UTF-8 sequences.
   */
  extern "C" U_EXPORT const uint8_t
  utf8_countTrailBytes[256]={
@@ -76,27 +72,24 @@ utf8_countTrailBytes[256]={
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    // illegal C0 & C1
+    // 2-byte lead bytes C2..DF
+    0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  
+    // 3-byte lead bytes E0..EF
      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    3, 3, 3, 3, 3,
-    3, 3, 3,    /* illegal in Unicode */
-    4, 4, 4, 4, /* illegal in Unicode */
-    5, 5,       /* illegal in Unicode */
-    0, 0        /* illegal bytes 0xfe and 0xff */
+    // 4-byte lead bytes F0..F4
+    // illegal F5..FF
+    3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  };
  
-static const UChar32
-utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
-
  static const UChar32
  utf8_errorValue[6]={
      // Same values as UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE,
      // but without relying on the obsolete unicode/utf_old.h.
      0x15, 0x9f, 0xffff,
-    0x10ffff,
-    0x3ffffff, 0x7fffffff
+    0x10ffff
  };
  
  static UChar32
@@ -136,61 +129,59 @@ errorValue(int32_t count, int8_t strict) {
   */
  U_CAPI UChar32 U_EXPORT2
  utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) {
+    // *pi is one after byte c.
      int32_t i=*pi;
-    uint8_t count=U8_COUNT_TRAIL_BYTES(c);
-    U_ASSERT(count <= 5); /* U8_COUNT_TRAIL_BYTES returns value 0...5 */
-    if(i+count<=length || length<0) {
-        uint8_t trail;
-
-        U8_MASK_LEAD_BYTE(c, count);
-        /* support NUL-terminated strings: do not read beyond the first non-trail byte */
-        switch(count) {
-        /* each branch falls through to the next one */
-        case 0:
-            /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
-        case 5:
-        case 4:
-            /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
-            break;
-        case 3:
-            trail=s[i++]-0x80;
-            c=(c<<6)|trail;
-            /* c>=0x110 would result in code point>0x10ffff, outside Unicode */
-            if(c>=0x110 || trail>0x3f) { break; }
-            U_FALLTHROUGH;
-        case 2:
-            trail=s[i++]-0x80;
-            c=(c<<6)|trail;
-            /*
-             * test for a surrogate d800..dfff unless we are lenient:
-             * before the last (c<<6), a surrogate is c=360..37f
-             */
-            if(((c&0xffe0)==0x360 && strict!=-2) || trail>0x3f) { break; }
-            U_FALLTHROUGH;
-        case 1:
-            trail=s[i++]-0x80;
-            c=(c<<6)|trail;
-            if(trail>0x3f) { break; }
-            /* correct sequence - all trail bytes have (b7..b6)==(10) */
-            if(c>=utf8_minLegal[count] &&
-                    /* strict: forbid non-characters like U+fffe */
-                    (strict<=0 || !U_IS_UNICODE_NONCHAR(c))) {
+    // length can be negative for NUL-terminated strings: Read and validate one byte at a time.
+    if(i==length || c>0xf4) {
+        // end of string, or not a lead byte
+    } else if(c>=0xf0) {
+        // Test for 4-byte sequences first because
+        // U8_NEXT() handles shorter valid sequences inline.
+        uint8_t t1=s[i], t2, t3;
+        c&=7;
+        if(U8_IS_VALID_LEAD4_AND_T1(c, t1) &&
+                ++i!=length && (t2=s[i]-0x80)<=0x3f &&
+                ++i!=length && (t3=s[i]-0x80)<=0x3f) {
+            ++i;
+            c=(c<<18)|((t1&0x3f)<<12)|(t2<<6)|t3;
+            // strict: forbid non-characters like U+fffe
+            if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
                  *pi=i;
                  return c;
              }
-        /* no default branch to optimize switch()  - all values are covered */
          }
-    } else {
-        /* too few bytes left */
-        count=length-i;
-    }
+    } else if(c>=0xe0) {
+        c&=0xf;
+        if(strict!=-2) {
+            uint8_t t1=s[i], t2;
+            if(U8_IS_VALID_LEAD3_AND_T1(c, t1) &&
+                    ++i!=length && (t2=s[i]-0x80)<=0x3f) {
+                ++i;
+                c=(c<<12)|((t1&0x3f)<<6)|t2;
+                // strict: forbid non-characters like U+fffe
+                if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
+                    *pi=i;
+                    return c;
+                }
+            }
+        } else {
+            // strict=-2 -> lenient: allow surrogates
+            uint8_t t1=s[i]-0x80, t2;
+            if(t1<=0x3f && (c>0 || t1>=0x20) &&
+                    ++i!=length && (t2=s[i]-0x80)<=0x3f) {
+                *pi=i+1;
+                return (c<<12)|(t1<<6)|t2;
+            }
+        }
+    } else if(c>=0xc2) {
+        uint8_t t1=s[i]-0x80;
+        if(t1<=0x3f) {
+            *pi=i+1;
+            return ((c-0xc0)<<6)|t1;
+        }
+    }  // else 0x80<=c<0xc2 is not a lead byte
  
      /* error handling */
-    i=*pi;
-    while(count>0 && U8_IS_TRAIL(s[i])) {
-        ++i;
-        --count;
-    }
      c=errorValue(i-*pi, strict);
      *pi=i;
      return c;
@@ -243,99 +234,99 @@ utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool
  
  U_CAPI UChar32 U_EXPORT2
  utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict) {
+    // *pi is the index of byte c.
      int32_t i=*pi;
-    uint8_t b, count=1, shift=6;
-
-    if(!U8_IS_TRAIL(c)) { return errorValue(0, strict); }
-
-    /* extract value bits from the last trail byte */
-    c&=0x3f;
-
-    for(;;) {
-        if(i<=start) {
-            /* no lead byte at all */
-            return errorValue(0, strict);
-        }
-
-        /* read another previous byte */
-        b=s[--i];
-        if((uint8_t)(b-0x80)<0x7e) { /* 0x80<=b<0xfe */
-            if(b&0x40) {
-                /* lead byte, this will always end the loop */
-                uint8_t shouldCount=U8_COUNT_TRAIL_BYTES(b);
-
-                if(count==shouldCount) {
-                    /* set the new position */
-                    *pi=i;
-                    U8_MASK_LEAD_BYTE(b, count);
-                    c|=(UChar32)b<<shift;
-                    if(count>=4 || c>0x10ffff || c<utf8_minLegal[count] || (U_IS_SURROGATE(c) && strict!=-2) || (strict>0 && U_IS_UNICODE_NONCHAR(c))) {
-                        /* illegal sequence or (strict and non-character) */
-                        if(count>=4) {
-                            count=3;
+    if(U8_IS_TRAIL(c) && i>start) {
+        uint8_t b1=s[--i];
+        if(0xc2<=b1 && b1<0xe0) {
+            *pi=i;
+            return ((b1-0xc0)<<6)|(c&0x3f);
+        } else if(U8_IS_TRAIL(b1) && i>start) {
+            // Extract the value bits from the last trail byte.
+            c&=0x3f;
+            uint8_t b2=s[--i];
+            if(0xe0<=b2 && b2<0xf0) {
+                b2&=0xf;
+                if(strict!=-2) {
+                    if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
+                        *pi=i;
+                        c=(b2<<12)|((b1&0x3f)<<6)|c;
+                        if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
+                            return c;
+                        } else {
+                            // strict: forbid non-characters like U+fffe
+                            return errorValue(2, strict);
                          }
-                        c=errorValue(count, strict);
-                    } else {
-                        /* exit with correct c */
                      }
                  } else {
-                    /* the lead byte does not match the number of trail bytes */
-                    /* only set the position to the lead byte if it would
-                       include the trail byte that we started with */
-                    if(count<shouldCount) {
+                    // strict=-2 -> lenient: allow surrogates
+                    b1-=0x80;
+                    if((b2>0 || b1>=0x20)) {
+                        *pi=i;
+                        return (b2<<12)|(b1<<6)|c;
+                    }
+                }
+            } else if(U8_IS_TRAIL(b2) && i>start) {
+                uint8_t b3=s[--i];
+                if(0xf0<=b3 && b3<=0xf4) {
+                    b3&=7;
+                    if(U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
                          *pi=i;
-                        c=errorValue(count, strict);
-                    } else {
-                        c=errorValue(0, strict);
+                        c=(b3<<18)|((b2&0x3f)<<12)|((b1&0x3f)<<6)|c;
+                        if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
+                            return c;
+                        } else {
+                            // strict: forbid non-characters like U+fffe
+                            return errorValue(3, strict);
+                        }
                      }
                  }
-                break;
-            } else if(count<5) {
-                /* trail byte */
-                c|=(UChar32)(b&0x3f)<<shift;
-                ++count;
-                shift+=6;
-            } else {
-                /* more than 5 trail bytes is illegal */
-                c=errorValue(0, strict);
-                break;
+            } else if((0xf0<=b2 && b2<=0xf4) && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
+                // Truncated 4-byte sequence.
+                *pi=i;
+                return errorValue(2, strict);
              }
-        } else {
-            /* single-byte character precedes trailing bytes */
-            c=errorValue(0, strict);
-            break;
+        } else if(((0xe0<=b1 && b1<0xf0) && U8_IS_VALID_LEAD3_AND_T1(b1, c)) ||
+                ((0xf0<=b1 && b1<=0xf4) && U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
+            // Truncated 3- or 4-byte sequence.
+            *pi=i;
+            return errorValue(1, strict);
          }
      }
-    return c;
+    return errorValue(0, strict);
  }
  
  U_CAPI int32_t U_EXPORT2
  utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) {
-    /* i had been decremented once before the function call */
-    int32_t I=i, Z;
-    uint8_t b;
-
-    /* read at most the 6 bytes s[Z] to s[i], inclusively */
-    if(I-5>start) {
-        Z=I-5;
-    } else {
-        Z=start;
-    }
-
-    /* return I if the sequence starting there is long enough to include i */
-    do {
-        b=s[I];
-        if((uint8_t)(b-0x80)>=0x7e) { /* not 0x80<=b<0xfe */
-            break;
-        } else if(b>=0xc0) {
-            if(U8_COUNT_TRAIL_BYTES(b)>=(i-I)) {
-                return I;
-            } else {
-                break;
+    // Same as utf8_prevCharSafeBody(..., strict=-1) minus assembling code points.
+    int32_t orig_i=i;
+    uint8_t c=s[i];
+    if(U8_IS_TRAIL(c) && i>start) {
+        uint8_t b1=s[--i];
+        if(0xc2<=b1 && b1<0xe0) {
+            return i;
+        } else if(U8_IS_TRAIL(b1) && i>start) {
+            uint8_t b2=s[--i];
+            if(0xe0<=b2 && b2<0xf0) {
+                if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
+                    return i;
+                }
+            } else if(U8_IS_TRAIL(b2) && i>start) {
+                uint8_t b3=s[--i];
+                if(0xf0<=b3 && b3<=0xf4) {
+                    if(U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
+                        return i;
+                    }
+                }
+            } else if((0xf0<=b2 && b2<=0xf4) && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
+                // Truncated 4-byte sequence.
+                return i;
              }
+        } else if(((0xe0<=b1 && b1<0xf0) && U8_IS_VALID_LEAD3_AND_T1(b1, c)) ||
+                ((0xf0<=b1 && b1<=0xf4) && U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
+            // Truncated 3- or 4-byte sequence.
+            return i;
          }
-    } while(Z<=--I);
-
-    /* return i itself to be consistent with the FWD_1 macro */
-    return i;
+    }
+    return orig_i;
  }
diff --git a/icu4c/source/common/utrie2.h b/icu4c/source/common/utrie2.h

index b33e3305f93b1bd07569bd9130df964d56f20b50..8e1caa5e90bde2a5fff8a6bcbac75dc0b1b86bba 100644 (file)
--- a/icu4c/source/common/utrie2.h
+++ b/icu4c/source/common/utrie2.h
@@ -20,6 +20,7 @@
  #define __UTRIE2_H__
  
  #include "unicode/utypes.h"
+#include "unicode/utf8.h"
  #include "putilimp.h"
  #include "udataswp.h"
  
@@ -54,6 +55,8 @@ typedef struct UTrie UTrie;
   *   is truncated, omitting both the BMP portion and the high range.
   * - There is a special small index for 2-byte UTF-8, and the initial data
   *   entries are designed for fast 1/2-byte UTF-8 lookup.
+ *   Starting with ICU 60, C0 and C1 are not recognized as UTF-8 lead bytes any more at all,
+ *   and the associated 2-byte indexes are unused.
   */
  
  /**
@@ -933,29 +936,29 @@ utrie2_internalU8PrevIndex(const UTrie2 *trie, UChar32 c,
  /** Internal UTF-8 next-post-increment: get the next code point's data. */
  #define _UTRIE2_U8_NEXT(trie, ascii, data, src, limit, result) { \
      uint8_t __lead=(uint8_t)*(src)++; \
-    if(__lead<0xc0) { \
+    if(U8_IS_SINGLE(__lead)) { \
          (result)=(trie)->ascii[__lead]; \
      } else { \
          uint8_t __t1, __t2; \
-        if( /* handle U+0000..U+07FF inline */ \
-            __lead<0xe0 && (src)<(limit) && \
-            (__t1=(uint8_t)(*(src)-0x80))<=0x3f \
-        ) { \
-            ++(src); \
-            (result)=(trie)->data[ \
-                (trie)->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET-0xc0)+__lead]+ \
-                __t1]; \
-        } else if( /* handle U+0000..U+CFFF inline */ \
-            __lead<0xed && ((src)+1)<(limit) && \
-            (__t1=(uint8_t)(*(src)-0x80))<=0x3f && (__lead>0xe0 || __t1>=0x20) && \
+        if( /* handle U+0800..U+FFFF inline */ \
+            0xe0<=__lead && __lead<0xf0 && ((src)+1)<(limit) && \
+            U8_IS_VALID_LEAD3_AND_T1(__lead, __t1=(uint8_t)*(src)) && \
              (__t2=(uint8_t)(*((src)+1)-0x80))<= 0x3f \
          ) { \
              (src)+=2; \
              (result)=(trie)->data[ \
                  ((int32_t)((trie)->index[((__lead-0xe0)<<(12-UTRIE2_SHIFT_2))+ \
-                                         (__t1<<(6-UTRIE2_SHIFT_2))+(__t2>>UTRIE2_SHIFT_2)]) \
+                                         ((__t1&0x3f)<<(6-UTRIE2_SHIFT_2))+(__t2>>UTRIE2_SHIFT_2)]) \
                  <<UTRIE2_INDEX_SHIFT)+ \
                  (__t2&UTRIE2_DATA_MASK)]; \
+        } else if( /* handle U+0080..U+07FF inline */ \
+            __lead<0xe0 && __lead>=0xc2 && (src)<(limit) && \
+            (__t1=(uint8_t)(*(src)-0x80))<=0x3f \
+        ) { \
+            ++(src); \
+            (result)=(trie)->data[ \
+                (trie)->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET-0xc0)+__lead]+ \
+                __t1]; \
          } else { \
              int32_t __index=utrie2_internalU8NextIndex((trie), __lead, (const uint8_t *)(src), \
                                                                         (const uint8_t *)(limit)); \
@@ -968,7 +971,7 @@ utrie2_internalU8PrevIndex(const UTrie2 *trie, UChar32 c,
  /** Internal UTF-8 pre-decrement-previous: get the previous code point's data. */
  #define _UTRIE2_U8_PREV(trie, ascii, data, start, src, result) { \
      uint8_t __b=(uint8_t)*--(src); \
-    if(__b<0x80) { \
+    if(U8_IS_SINGLE(__b)) { \
          (result)=(trie)->ascii[__b]; \
      } else { \
          int32_t __index=utrie2_internalU8PrevIndex((trie), __b, (const uint8_t *)(start), \
diff --git a/icu4c/source/i18n/utf8collationiterator.cpp b/icu4c/source/i18n/utf8collationiterator.cpp

index 85d4b76b08e00bd0babd4473cc18cd84f6b10788..345b1994ef0e77f1d9ffaab13dcb0a1bdfbd59a8 100644 (file)
--- a/icu4c/source/i18n/utf8collationiterator.cpp
+++ b/icu4c/source/i18n/utf8collationiterator.cpp
@@ -49,26 +49,25 @@ UTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) {
      }
      // Optimized combination of U8_NEXT_OR_FFFD() and UTRIE2_U8_NEXT32().
      c = u8[pos++];
-    if(c < 0xc0) {
-        // ASCII 00..7F; trail bytes 80..BF map to error values.
+    if(U8_IS_SINGLE(c)) {
+        // ASCII 00..7F
          return trie->data32[c];
      }
      uint8_t t1, t2;
-    if(c < 0xe0 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) {
-        // U+0080..U+07FF; 00..7F map to error values.
+    if(0xe0 <= c && c < 0xf0 &&
+            ((pos + 1) < length || length < 0) &&
+            U8_IS_VALID_LEAD3_AND_T1(c, t1 = u8[pos]) &&
+            (t2 = (u8[pos + 1] - 0x80)) <= 0x3f) {
+        // U+0800..U+FFFF except surrogates
+        c = (((c & 0xf) << 12) | ((t1 & 0x3f) << 6) | t2);
+        pos += 2;
+        return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
+    } else if(c < 0xe0 && c >= 0xc2 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) {
+        // U+0080..U+07FF
          uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1];
          c = ((c & 0x1f) << 6) | t1;
          ++pos;
          return ce32;
-    } else if(c <= 0xef &&
-              ((pos + 1) < length || length < 0) &&
-              (t1 = (u8[pos] - 0x80)) <= 0x3f && (c != 0xe0 || t1 >= 0x20) &&
-              (t2 = (u8[pos + 1] - 0x80)) <= 0x3f
-    ) {
-        // U+0800..U+FFFF; caller maps surrogates to error values.
-        c = (UChar)((c << 12) | (t1 << 6) | t2);
-        pos += 2;
-        return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
      } else {
          // Function call for supplementary code points and error cases.
          // Illegal byte sequences yield U+FFFD.
@@ -158,28 +157,17 @@ FCDUTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
                  return Collation::FALLBACK_CE32;
              }
              c = u8[pos++];
-            if(c < 0xc0) {
-                // ASCII 00..7F; trail bytes 80..BF map to error values.
+            if(U8_IS_SINGLE(c)) {
+                // ASCII 00..7F
                  return trie->data32[c];
              }
              uint8_t t1, t2;
-            if(c < 0xe0 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) {
-                // U+0080..U+07FF; 00..7F map to error values.
-                uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1];
-                c = ((c & 0x1f) << 6) | t1;
-                ++pos;
-                if(CollationFCD::hasTccc(c) && pos != length && nextHasLccc()) {
-                    pos -= 2;
-                } else {
-                    return ce32;
-                }
-            } else if(c <= 0xef &&
-                      ((pos + 1) < length || length < 0) &&
-                      (t1 = (u8[pos] - 0x80)) <= 0x3f && (c != 0xe0 || t1 >= 0x20) &&
-                      (t2 = (u8[pos + 1] - 0x80)) <= 0x3f
-            ) {
-                // U+0800..U+FFFF; caller maps surrogates to error values.
-                c = (UChar)((c << 12) | (t1 << 6) | t2);
+            if(0xe0 <= c && c < 0xf0 &&
+                    ((pos + 1) < length || length < 0) &&
+                    U8_IS_VALID_LEAD3_AND_T1(c, t1 = u8[pos]) &&
+                    (t2 = (u8[pos + 1] - 0x80)) <= 0x3f) {
+                // U+0800..U+FFFF except surrogates
+                c = (((c & 0xf) << 12) | ((t1 & 0x3f) << 6) | t2);
                  pos += 2;
                  if(CollationFCD::hasTccc(c) &&
                          (CollationFCD::maybeTibetanCompositeVowel(c) ||
@@ -188,6 +176,16 @@ FCDUTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
                  } else {
                      break;  // return CE32(BMP)
                  }
+            } else if(c < 0xe0 && c >= 0xc2 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) {
+                // U+0080..U+07FF
+                uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1];
+                c = ((c & 0x1f) << 6) | t1;
+                ++pos;
+                if(CollationFCD::hasTccc(c) && pos != length && nextHasLccc()) {
+                    pos -= 2;
+                } else {
+                    return ce32;
+                }
              } else {
                  // Function call for supplementary code points and error cases.
                  // Illegal byte sequences yield U+FFFD.
@@ -237,7 +235,7 @@ UBool
  FCDUTF8CollationIterator::previousHasTccc() const {
      U_ASSERT(state == CHECK_BWD && pos != 0);
      UChar32 c = u8[pos - 1];
-    if(c < 0x80) { return FALSE; }
+    if(U8_IS_SINGLE(c)) { return FALSE; }
      int32_t i = pos;
      U8_PREV_OR_FFFD(u8, 0, i, c);
      if(c > 0xffff) { c = U16_LEAD(c); }
@@ -271,7 +269,7 @@ FCDUTF8CollationIterator::nextCodePoint(UErrorCode &errorCode) {
              if(pos == length || ((c = u8[pos]) == 0 && length < 0)) {
                  return U_SENTINEL;
              }
-            if(c < 0x80) {
+            if(U8_IS_SINGLE(c)) {
                  ++pos;
                  return c;
              }
@@ -309,7 +307,7 @@ FCDUTF8CollationIterator::previousCodePoint(UErrorCode &errorCode) {
              if(pos == 0) {
                  return U_SENTINEL;
              }
-            if((c = u8[pos - 1]) < 0x80) {
+            if(U8_IS_SINGLE(c = u8[pos - 1])) {
                  --pos;
                  return c;
              }
diff --git a/icu4c/source/test/cintltst/custrtrn.c b/icu4c/source/test/cintltst/custrtrn.c

index bf1068e3482b52c4d73d8a62d39d7bc03af47fbf..087da834ce819e01787b358bdf49fd77b173e159 100644 (file)
--- a/icu4c/source/test/cintltst/custrtrn.c
+++ b/icu4c/source/test/cintltst/custrtrn.c
@@ -670,12 +670,13 @@ static void Test_UChar_UTF8_API(void){
      }
  
      /* test UTF-8 with single surrogates - illegal in Unicode 3.2 */
+    // Since ICU 60, each surrogate byte sequence is treated as 3 single-byte errors.
      {
          static const UChar
              withLead16[]={ 0x1800, 0xd89a, 0x0061 },
              withTrail16[]={ 0x1800, 0xdcba, 0x0061, 0 },
-            withTrail16SubFFFD[]={ 0x1800, 0xfffd, 0x0061, 0 }, /* sub==U+FFFD */
-            withTrail16Sub50005[]={ 0x1800, 0xd900, 0xdc05, 0x0061, 0 }; /* sub==U+50005 */
+            withTrail16SubFFFD[]={ 0x1800, 0xfffd, 0xfffd, 0xfffd, 0x0061, 0 }, /* sub==U+FFFD */
+            withTrail16Sub50005[]={ 0x1800, 0xd900, 0xdc05, 0xd900, 0xdc05, 0xd900, 0xdc05, 0x0061, 0 }; /* sub==U+50005 */
          static const uint8_t
              withLead8[]={ 0xe1, 0xa0, 0x80, 0xed, 0xa2, 0x9a, 0x61 },
              withTrail8[]={ 0xe1, 0xa0, 0x80, 0xed, 0xb2, 0xba, 0x61, 0 },
@@ -706,7 +707,7 @@ static void Test_UChar_UTF8_API(void){
                               &err);
          if(U_FAILURE(err) || uDestLen!=u_strlen(withTrail16Sub50005) ||
                               0!=u_memcmp(withTrail16Sub50005, out16, uDestLen+1) ||
-                             numSubstitutions!=1) {
+                             numSubstitutions!=3) {
              log_err("error: u_strFromUTF8WithSub(length) failed\n");
          }
  
@@ -721,7 +722,7 @@ static void Test_UChar_UTF8_API(void){
                               &err);
          if(U_FAILURE(err) || uDestLen!=u_strlen(withTrail16SubFFFD) ||
                               0!=u_memcmp(withTrail16SubFFFD, out16, uDestLen+1) ||
-                             numSubstitutions!=1) {
+                             numSubstitutions!=3) {
              log_err("error: u_strFromUTF8WithSub(NUL termination) failed\n");
          }
  
@@ -734,7 +735,7 @@ static void Test_UChar_UTF8_API(void){
                               (const char *)withTrail8, -1,
                               0x50005, &numSubstitutions,
                               &err);
-        if(err!=U_BUFFER_OVERFLOW_ERROR || uDestLen!=u_strlen(withTrail16Sub50005) || numSubstitutions!=1) {
+        if(err!=U_BUFFER_OVERFLOW_ERROR || uDestLen!=u_strlen(withTrail16Sub50005) || numSubstitutions!=3) {
              log_err("error: u_strFromUTF8WithSub(preflight/NUL termination) failed\n");
          }
  
@@ -1015,14 +1016,6 @@ Test_FromUTF8Lenient(void) {
          log_err("u_strFromUTF8Lenient(U_MEMORY_ALLOCATION_ERROR) failed\n");
      }
  
-    dest[0]=0x1234;
-    destLength=-1;
-    errorCode=U_MEMORY_ALLOCATION_ERROR;
-    pDest=u_strFromUTF8Lenient(dest, 1, &destLength, (const char *)bytes, -1, NULL);
-    if(dest[0]!=0x1234) {
-        log_err("u_strFromUTF8Lenient(pErrorCode=NULL) failed\n");
-    }
-
      /* test normal behavior */
      number=0; /* string number for log_err() */
  
diff --git a/icu4c/source/test/cintltst/trie2test.c b/icu4c/source/test/cintltst/trie2test.c

index 5d11733e85088f91b35f326ad1131a0e9e0894ac..9444159bedab7d35c180af79cbb809f319efe13e 100644 (file)
--- a/icu4c/source/test/cintltst/trie2test.c
+++ b/icu4c/source/test/cintltst/trie2test.c
@@ -350,6 +350,11 @@ static void
  testTrieUTF8(const char *testName,
               const UTrie2 *trie, UTrie2ValueBits valueBits,
               const CheckRange checkRanges[], int32_t countCheckRanges) {
+    // Note: The byte sequence comments refer to the original UTF-8 definition.
+    // Starting with ICU 60, any sequence that is not a prefix of a valid one
+    // is treated as multiple single-byte errors.
+    // For testing, we only rely on U8_... and UTrie2 UTF-8 macros
+    // iterating consistently.
      static const uint8_t illegal[]={
          0xc0, 0x80,                         /* non-shortest U+0000 */
          0xc1, 0xbf,                         /* non-shortest U+007f */
@@ -394,15 +399,36 @@ testTrieUTF8(const char *testName,
          value=checkRanges[i].value;
          /* write three legal (or surrogate) code points */
          U8_APPEND_UNSAFE(s, length, prevCP);    /* start of the range */
-        values[countValues++]=U_IS_SURROGATE(prevCP) ? errorValue : value;
+        if(U_IS_SURROGATE(prevCP)) {
+            // A surrogate byte sequence counts as 3 single-byte errors.
+            values[countValues++]=errorValue;
+            values[countValues++]=errorValue;
+            values[countValues++]=errorValue;
+        } else {
+            values[countValues++]=value;
+        }
          c=checkRanges[i].limit;
          prevCP=(prevCP+c)/2;                    /* middle of the range */
          U8_APPEND_UNSAFE(s, length, prevCP);
-        values[countValues++]=U_IS_SURROGATE(prevCP) ? errorValue : value;
+        if(U_IS_SURROGATE(prevCP)) {
+            // A surrogate byte sequence counts as 3 single-byte errors.
+            values[countValues++]=errorValue;
+            values[countValues++]=errorValue;
+            values[countValues++]=errorValue;
+        } else {
+            values[countValues++]=value;
+        }
          prevCP=c;
          --c;                                    /* end of the range */
          U8_APPEND_UNSAFE(s, length, c);
-        values[countValues++]=U_IS_SURROGATE(c) ? errorValue : value;
+        if(U_IS_SURROGATE(c)) {
+            // c (end of the range) was just appended; a surrogate byte sequence counts as 3 single-byte errors.
+            values[countValues++]=errorValue;
+            values[countValues++]=errorValue;
+            values[countValues++]=errorValue;
+        } else {
+            values[countValues++]=value;
+        }
          /* write an illegal byte sequence */
          if(i8<sizeof(illegal)) {
              U8_FWD_1(illegal, i8, sizeof(illegal));
@@ -435,17 +461,20 @@ testTrieUTF8(const char *testName,
          }
          bytes=0;
          if(value!=values[i] || i8!=(p-s)) {
-            while(prev8<i8) {
-                bytes=(bytes<<8)|s[prev8++];
+            int32_t k=prev8;
+            while(k<i8) {
+                bytes=(bytes<<8)|s[k++];
              }
          }
          if(value!=values[i]) {
-            log_err("error: wrong value from UTRIE2_U8_NEXT(%s)(%lx->U+%04lx): 0x%lx instead of 0x%lx\n",
-                    testName, (unsigned long)bytes, (long)c, (long)value, (long)values[i]);
+            log_err("error: wrong value from UTRIE2_U8_NEXT(%s)(from %d %lx->U+%04lx) (read %d bytes): "
+                    "0x%lx instead of 0x%lx\n",
+                    testName, (int)prev8, (unsigned long)bytes, (long)c, (int)((p-s)-prev8),
+                    (long)value, (long)values[i]);
          }
          if(i8!=(p-s)) {
-            log_err("error: wrong end index from UTRIE2_U8_NEXT(%s)(%lx->U+%04lx): %ld != %ld\n",
-                    testName, (unsigned long)bytes, (long)c, (long)(p-s), (long)i8);
+            log_err("error: wrong end index from UTRIE2_U8_NEXT(%s)(from %d %lx->U+%04lx): %ld != %ld\n",
+                    testName, (int)prev8, (unsigned long)bytes, (long)c, (long)(p-s), (long)i8);
              continue;
          }
          ++i;
@@ -471,12 +500,14 @@ testTrieUTF8(const char *testName,
              }
          }
          if(value!=values[i]) {
-            log_err("error: wrong value from UTRIE2_U8_PREV(%s)(%lx->U+%04lx): 0x%lx instead of 0x%lx\n",
-                    testName, (unsigned long)bytes, (long)c, (long)value, (long)values[i]);
+            log_err("error: wrong value from UTRIE2_U8_PREV(%s)(from %d %lx->U+%04lx) (read %d bytes): "
+                    "0x%lx instead of 0x%lx\n",  /* no second ": " here; the first literal already ends with one */
+                    testName, (int)prev8, (unsigned long)bytes, (long)c, (int)(prev8-(p-s)),
+                    (long)value, (long)values[i]);
          }
          if(i8!=(p-s)) {
-            log_err("error: wrong end index from UTRIE2_U8_PREV(%s)(%lx->U+%04lx): %ld != %ld\n",
-                    testName, (unsigned long)bytes, (long)c, (long)(p-s), (long)i8);
+            log_err("error: wrong end index from UTRIE2_U8_PREV(%s)(from %d %lx->U+%04lx): %ld != %ld\n",
+                    testName, (int)prev8, (unsigned long)bytes, (long)c, (long)(p-s), (long)i8);
              continue;
          }
      }
diff --git a/icu4c/source/test/cintltst/utf8tst.c b/icu4c/source/test/cintltst/utf8tst.c

index bbc67f4bb7305c6fbe5901f28763ca26e8d10ba5..0bbb5e5413dc76e516338f98d4f977149c80109f 100644 (file)
--- a/icu4c/source/test/cintltst/utf8tst.c
+++ b/icu4c/source/test/cintltst/utf8tst.c
@@ -121,7 +121,7 @@ addUTF8Test(TestNode** root)
  
  static void TestCodeUnitValues()
  {
-    static const uint8_t codeunit[]={0x00, 0x65, 0x7e, 0x7f, 0xc0, 0xc4, 0xf0, 0xfd, 0x80, 0x81, 0xbc, 0xbe,};
+    static const uint8_t codeunit[]={0x00, 0x65, 0x7e, 0x7f, 0xc2, 0xc4, 0xf0, 0xf4, 0x80, 0x81, 0xbc, 0xbe,};
  
      int16_t i;
      for(i=0; i<UPRV_LENGTHOF(codeunit); i++){
@@ -231,28 +231,31 @@ static void TestGetChar()
          0x10401,          0x10401,                    0x10401 ,
          0x10401,          0x10401,                    0x10401 ,
          0x10401,          0x10401,                    0x10401,
-        0x25,             UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1,
+        -1,               UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1,
          0x65,             0x65,                       0x65,
          0x31,             0x31,                       0x31,
-        0x31,             UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1,
-        0x240,            UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1
+        -1,               UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1,
+        -1,               UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1
      };
      uint16_t i=0;
      UChar32 c, expected;
      uint32_t offset=0;
  
      for(offset=0; offset<sizeof(input); offset++) {
-        if (offset < sizeof(input) - 1) {
+        expected = result[i];
+        if (expected >= 0 && offset < sizeof(input) - 1) {
  #if !U_HIDE_OBSOLETE_UTF_OLD_H
              UTF8_GET_CHAR_UNSAFE(input, offset, c);
-            if(c != result[i]){
-                log_err("ERROR: UTF8_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
+            if(c != expected) {
+                log_err("ERROR: UTF8_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
+                        offset, expected, c);
  
              }
  #endif
              U8_GET_UNSAFE(input, offset, c);
-            if(c != result[i]){
-                log_err("ERROR: U8_GET_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
+            if(c != expected) {
+                log_err("ERROR: U8_GET_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
+                        offset, expected, c);
  
              }
          }
@@ -285,146 +288,160 @@ static void TestGetChar()
  }
  
  static void TestNextPrevChar() {
-    static const uint8_t input[]={0x61, 0xf0, 0x90, 0x90, 0x81, 0xc0, 0x80, 0xfd, 0xbe, 0xc2, 0x61, 0x81, 0x90, 0x90, 0xf0, 0x00};
+    static const uint8_t input[]={
+        0x61,
+        0xf0, 0x90, 0x90, 0x81,
+        0xc0, 0x80,  // non-shortest form
+        0xf3, 0xbe,  // truncated
+        0xc2,  // truncated
+        0x61,
+        0x81, 0x90, 0x90, 0xf0,  // "backwards" sequence
+        0x00
+    };
      static const UChar32 result[]={
-    /*  next_unsafe    next_safe_ns        next_safe_s          prev_unsafe   prev_safe_ns        prev_safe_s */
-        0x0061,        0x0061,             0x0061,              0x0000,       0x0000,             0x0000,
-        0x10401,       0x10401,            0x10401,             0xf0,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
-        0x90,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x2841410,    UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
-        0x90,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0xa1050,      UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
-        0x81,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x2841,       UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
-        0x00,          UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,  0x61,         0x61,               0x61,
-        0x80,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0xc2,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
-        0xfd,          UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,  0x77e,        UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
-        0xbe,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0xfd,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
-        0xa1,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x00,         UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
-        0x61,          0x61,               0x61,                0xc0,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
-        0x81,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x10401,      0x10401,            0x10401,
-        0x90,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x410,        UTF_ERROR_VALUE,    UTF_ERROR_VALUE,
-        0x90,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x410,        UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
-        0x0840,        UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0xf0,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
-        0x0000,        0x0000,             0x0000,              0x0061,       0x0061,             0x0061
+    /*  next_safe_ns        next_safe_s          prev_safe_ns        prev_safe_s */
+        0x0061,             0x0061,              0x0000,             0x0000,
+        0x10401,            0x10401,             UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
+        UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
+        UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
+        UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
+        UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x61,               0x61,
+        UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
+        UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,  UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
+        UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
+        UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
+        0x61,               0x61,                UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
+        UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x10401,            0x10401,
+        UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF_ERROR_VALUE,    UTF_ERROR_VALUE,
+        UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
+        UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
+        0x0000,             0x0000,              0x0061,             0x0061
      };
      static const int32_t movedOffset[]={
-    /*  next_unsafe   next_safe_ns next_safe_s       prev_unsafe   prev_safe_ns      prev_safe_s */
-        1,            1,           1,                15,           15,               15,
-        5,            5,           5,                14,           14 ,              14,
-        3,            3,           3,                9,            13,               13,
-        4,            4,           4,                9,            12,               12,
-        5,            5,           5,                9,            11,               11,
-        7,            7,           7,                10,           10,               10,
-        7,            7,           7,                9,            9,                9,
-        8,            9,           9,                7,            7,                7,
-        9,            9,           9,                7,            7,                7,
-        11,           10,          10,               5,            5,                5,
-        11,           11,          11,               5,            5,                5,
-        12,           12,          12,               1,            1,                1,
-        13,           13,          13,               1,            1,                1,
-        14,           14,          14,               1,            1,                1,
-        14,           15,          15,               1,            1,                1,
-        14,           16,          16,               0,            0,                0,
+    /*  next_safe    prev_safe */
+        1,           15,
+        5,           14,
+        3,           13,
+        4,           12,
+        5,           11,
+        6,           10,
+        7,           9,
+        9,           7,
+        9,           7,
+        10,          6,
+        11,          5,
+        12,          1,
+        13,          1,
+        14,          1,
+        15,          1,
+        16,          0,
      };
-    /* TODO: remove unused columns for next_unsafe & prev_unsafe, and adjust the test code */
  
      UChar32 c, expected;
-    uint32_t i=0;
+    uint32_t i=0, j=0;
      uint32_t offset=0;
      int32_t setOffset=0;
      for(offset=0; offset<sizeof(input); offset++){
-        expected=result[i+1];
+        expected=result[i];  // next_safe_ns
  #if !U_HIDE_OBSOLETE_UTF_OLD_H
-         setOffset=offset;
-         UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, FALSE);
-         if(setOffset != movedOffset[i+1]){
-             log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
-                 offset, movedOffset[i+1], setOffset);
-         }
-        if(c != expected){
-            log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
+        setOffset=offset;
+        UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, FALSE);
+        if(setOffset != movedOffset[j]) {
+            log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
+                offset, movedOffset[j], setOffset);
+        }
+        if(c != expected) {
+            log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
          }
  #endif
-         setOffset=offset;
-         U8_NEXT(input, setOffset, sizeof(input), c);
-         if(setOffset != movedOffset[i+1]){
-             log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
-                 offset, movedOffset[i+1], setOffset);
-         }
+        setOffset=offset;
+        U8_NEXT(input, setOffset, sizeof(input), c);
+        if(setOffset != movedOffset[j]) {
+            log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
+                offset, movedOffset[j], setOffset);
+        }
          if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
-        if(c != expected){
-            log_err("ERROR: U8_NEXT failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
+        if(c != expected) {
+            log_err("ERROR: U8_NEXT failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
          }
  
          setOffset=offset;
          U8_NEXT_OR_FFFD(input, setOffset, sizeof(input), c);
-        if(setOffset != movedOffset[i+1]){
+        if(setOffset != movedOffset[j]) {
              log_err("ERROR: U8_NEXT_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
-                offset, movedOffset[i+1], setOffset);
+                offset, movedOffset[j], setOffset);
          }
          if(expected<0) { expected=0xfffd; }
-        if(c != expected){
-            log_err("ERROR: U8_NEXT_OR_FFFD failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
+        if(c != expected) {
+            log_err("ERROR: U8_NEXT_OR_FFFD failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
          }
  #if !U_HIDE_OBSOLETE_UTF_OLD_H
-         setOffset=offset;
-         UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, TRUE);
-         if(setOffset != movedOffset[i+1]){
-             log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
-                 offset, movedOffset[i+2], setOffset);
-         }
-         if(c != result[i+2]){
-             log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
-         }
+        setOffset=offset;
+        UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, TRUE);
+        if(setOffset != movedOffset[j]) {
+            log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
+                offset, movedOffset[j], setOffset);
+        }
+        expected=result[i+1];  // next_safe_s
+        if(c != expected) {
+            log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed at offset=%ld. Expected:%lx Got:%lx\n",
+                    offset, expected, c);
+        }
  #endif
-         i=i+6;
+        i=i+4;
+        j=j+2;
      }
  
-    i=0;
+    i=j=0;
      for(offset=sizeof(input); offset > 0; --offset){
-        expected=result[i+4];
+        expected=result[i+2];  // prev_safe_ns
  #if !U_HIDE_OBSOLETE_UTF_OLD_H
-         setOffset=offset;
-         UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE);
-         if(setOffset != movedOffset[i+4]){
-             log_err("ERROR: UTF8_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
-                 offset, movedOffset[i+4], setOffset);
-         }
-        if(c != expected){
-            log_err("ERROR: UTF8_PREV_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
+        setOffset=offset;
+        UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE);
+        if(setOffset != movedOffset[j+1]) {
+            log_err("ERROR: UTF8_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
+                offset, movedOffset[j+1], setOffset);
+        }
+        if(c != expected) {
+            log_err("ERROR: UTF8_PREV_CHAR_SAFE failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
          }
  #endif
-         setOffset=offset;
-         U8_PREV(input, 0, setOffset, c);
-         if(setOffset != movedOffset[i+4]){
-             log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
-                 offset, movedOffset[i+4], setOffset);
-         }
+        setOffset=offset;
+        U8_PREV(input, 0, setOffset, c);
+        if(setOffset != movedOffset[j+1]) {
+            log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
+                offset, movedOffset[j+1], setOffset);
+        }
          if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
-        if(c != expected){
-            log_err("ERROR: U8_PREV failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
+        if(c != expected) {
+            log_err("ERROR: U8_PREV failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
          }
  
          setOffset=offset;
          U8_PREV_OR_FFFD(input, 0, setOffset, c);
-        if(setOffset != movedOffset[i+4]){
+        if(setOffset != movedOffset[j+1]) {
              log_err("ERROR: U8_PREV_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
-                offset, movedOffset[i+4], setOffset);
+                offset, movedOffset[j+1], setOffset);
          }
          if(expected<0) { expected=0xfffd; }
-        if(c != expected){
-            log_err("ERROR: U8_PREV_OR_FFFD failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
+        if(c != expected) {
+            log_err("ERROR: U8_PREV_OR_FFFD failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
          }
  #if !U_HIDE_OBSOLETE_UTF_OLD_H
-         setOffset=offset;
-         UTF8_PREV_CHAR_SAFE(input, 0,  setOffset, c, TRUE);
-         if(setOffset != movedOffset[i+5]){
-             log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
-                 offset, movedOffset[i+5], setOffset);
-         }
-         if(c != result[i+5]){
-             log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+5], c);
-         }
+        setOffset=offset;
+        UTF8_PREV_CHAR_SAFE(input, 0,  setOffset, c, TRUE);
+        if(setOffset != movedOffset[j+1]) {
+            log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
+                offset, movedOffset[j+1], setOffset);
+        }
+        expected=result[i+3];  // prev_safe_s
+        if(c != expected) {
+            log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed at offset=%ld. Expected:%lx Got:%lx\n",
+                    offset, expected, c);
+        }
  #endif
-         i=i+6;
+        i=i+4;
+        j=j+2;
      }
  }
  
@@ -433,11 +450,13 @@ static void TestNulTerminated() {
      static const uint8_t input[]={
          /*  0 */  0x61,
          /*  1 */  0xf0, 0x90, 0x90, 0x81,
-        /*  5 */  0xc0, 0x80,
+        /*  5 */  0xc0,
+        /*  6 */  0x80,
          /*  7 */  0xdf, 0x80,
          /*  9 */  0xc2,
          /* 10 */  0x62,
-        /* 11 */  0xfd, 0xbe,
+        /* 11 */  0xfd,
+        /* 12 */  0xbe,
          /* 13 */  0xe0, 0xa0, 0x80,
          /* 16 */  0xe2, 0x82, 0xac,
          /* 19 */  0xf0, 0x90, 0x90,
@@ -447,14 +466,16 @@ static void TestNulTerminated() {
      static const UChar32 result[]={
          0x61,
          0x10401,
-        U_SENTINEL,
+        U_SENTINEL,  // C0 not a lead byte
+        U_SENTINEL,  // 80
          0x7c0,
-        U_SENTINEL,
+        U_SENTINEL,  // C2
          0x62,
-        U_SENTINEL,
+        U_SENTINEL,  // FD not a lead byte
+        U_SENTINEL,  // BE
          0x800,
          0x20ac,
-        U_SENTINEL,
+        U_SENTINEL,  // truncated F0 90 90
          0
      };
  
@@ -544,6 +565,22 @@ static void TestNextPrevNonCharacters() {
              log_err("U8_PREV(at %d) failed to read a non-character\n", idx);
          }
      }
+#if !U_HIDE_OBSOLETE_UTF_OLD_H
+    for(idx=0; idx<(int32_t)sizeof(nonChars);) {
+        UChar32 expected= nonChars[idx]<0xf0 ? 0xffff : 0x10ffff;
+        UTF8_NEXT_CHAR_SAFE(nonChars, idx, sizeof(nonChars), ch, TRUE);
+        if(ch!=expected) {
+            log_err("UTF8_NEXT_CHAR_SAFE(strict, before %d) failed to read a non-character\n", idx);
+        }
+    }
+    for(idx=(int32_t)sizeof(nonChars); idx>0;) {
+        UTF8_PREV_CHAR_SAFE(nonChars, 0, idx, ch, TRUE);
+        UChar32 expected= nonChars[idx]<0xf0 ? 0xffff : 0x10ffff;
+        if(ch!=expected) {
+            log_err("UTF8_PREV_CHAR_SAFE(strict, at %d) failed to read a non-character\n", idx);
+        }
+    }
+#endif
  }
  
  static void TestNextPrevCharUnsafe() {
@@ -563,58 +600,83 @@ static void TestNextPrevCharUnsafe() {
      static const UChar32 codePoints[]={
          0x61,
          0x10401,
-        0,
+        -1,
          0x20ac,
          0xa1,
          0x10ffff,
          0
      };
  
-    UChar32 c;
+    UChar32 c, expected;
      int32_t i;
      uint32_t offset;
  #if !U_HIDE_OBSOLETE_UTF_OLD_H
      for(i=0, offset=0; offset<sizeof(input); ++i) {
          UTF8_NEXT_CHAR_UNSAFE(input, offset, c);
-        if(c != codePoints[i]){
+        expected = codePoints[i];
+        if(expected >= 0 && c != expected) {
              log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
-                    offset, codePoints[i], c);
+                    offset, expected, c);
+        }
+        if(offset==6) {
+            // The obsolete UTF8_NEXT_CHAR_UNSAFE() skips 1+UTF8_COUNT_TRAIL_BYTES(lead) bytes
+            // while the new one skips C0 80 together.
+            ++offset;
          }
      }
  #endif
      for(i=0, offset=0; offset<sizeof(input); ++i) {
          U8_NEXT_UNSAFE(input, offset, c);
-        if(c != codePoints[i]){
+        expected = codePoints[i];
+        if(expected >= 0 && c != expected) {
              log_err("ERROR: U8_NEXT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
-                    offset, codePoints[i], c);
+                    offset, expected, c);
          }
      }
  #if !U_HIDE_OBSOLETE_UTF_OLD_H
      for(i=UPRV_LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){
-         UTF8_PREV_CHAR_UNSAFE(input, offset, c);
-         if(c != codePoints[i]){
-             log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
-                     offset, codePoints[i], c);
-         }
+        UTF8_PREV_CHAR_UNSAFE(input, offset, c);
+        expected = codePoints[i];
+        if(expected >= 0 && c != expected) {
+            log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
+                    offset, expected, c);
+        }
      }
  #endif
      for(i=UPRV_LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){
-         U8_PREV_UNSAFE(input, offset, c);
-         if(c != codePoints[i]){
-             log_err("ERROR: U8_PREV_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
-                     offset, codePoints[i], c);
-         }
+        U8_PREV_UNSAFE(input, offset, c);
+        expected = codePoints[i];
+        if(expected >= 0 && c != expected) {
+            log_err("ERROR: U8_PREV_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
+                    offset, expected, c);
+        }
      }
  }
  
  static void TestFwdBack() {
-    static const uint8_t input[]={0x61, 0xF0, 0x90, 0x90, 0x81, 0xff, 0x62, 0xc0, 0x80, 0x7f, 0x8f, 0xc0, 0x63, 0x81, 0x90, 0x90, 0xF0, 0x00};
-    static const uint16_t fwd_safe[]   ={1, 5, 6, 7, 9, 10, 11,  12, 13, 14, 15, 16, 17, 18};
-    static const uint16_t back_safe[]  ={17, 16, 15, 14, 13, 12, 11, 10, 9, 7, 6, 5, 1, 0};
+    static const uint8_t input[]={
+        0x61,
+        0xF0, 0x90, 0x90, 0x81,
+        0xff,
+        0x62,
+        0xc0,
+        0x80,
+        0x7f,
+        0x8f,
+        0xc0,
+        0x63,
+        0x81,
+        0x90,
+        0x90,
+        0xF0,
+        0x00
+    };
+    static const uint16_t fwd_safe[]   ={1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18};
+    static const uint16_t back_safe[]  ={17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 1, 0};
  
-    static const uint16_t Nvalue[]= {0, 1, 2, 3, 1, 2, 1, 5};
+    static const uint16_t Nvalue[]= {0, 1, 2, 4, 1, 2, 1, 5};
      static const uint16_t fwd_N_safe[]   ={0, 1, 6, 10, 11, 13, 14, 18}; /*safe macro keeps it at the end of the string */
-    static const uint16_t back_N_safe[]  ={18, 17, 15, 12, 11, 9, 7, 0};
+    static const uint16_t back_N_safe[]  ={18, 17, 15, 11, 10, 8, 7, 0};
  
      uint32_t offsafe=0;
  
@@ -707,7 +769,10 @@ static void TestFwdBackUnsafe() {
          0xf4, 0x8f, 0xbf, 0xbf,
          0x00
      };
-    static const int8_t boundaries[]={ 0, 1, 5, 7, 10, 12, 16, 17 };
+    // forward unsafe skips only C0
+    static const int8_t boundaries[]={ 0, 1, 5, 6, 7, 10, 12, 16, 17 };
+    // backward unsafe skips C0 80 together
+    static const int8_t backBoundaries[]={ 0, 1, 5, 7, 10, 12, 16, 17 };
  
      int32_t offset;
      int32_t i;
@@ -726,17 +791,17 @@ static void TestFwdBackUnsafe() {
          }
      }
  #if !U_HIDE_OBSOLETE_UTF_OLD_H
-    for(i=UPRV_LENGTHOF(boundaries)-2, offset=UPRV_LENGTHOF(input); offset>0; --i) {
+    for(i=UPRV_LENGTHOF(backBoundaries)-2, offset=UPRV_LENGTHOF(input); offset>0; --i) {
          UTF8_BACK_1_UNSAFE(input, offset);
-        if(offset != boundaries[i]){
-            log_err("ERROR: UTF8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[i], offset);
+        if(offset != backBoundaries[i]){
+            log_err("ERROR: UTF8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[i], offset);
          }
      }
  #endif
-    for(i=UPRV_LENGTHOF(boundaries)-2, offset=UPRV_LENGTHOF(input); offset>0; --i) {
+    for(i=UPRV_LENGTHOF(backBoundaries)-2, offset=UPRV_LENGTHOF(input); offset>0; --i) {
          U8_BACK_1_UNSAFE(input, offset);
-        if(offset != boundaries[i]){
-            log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[i], offset);
+        if(offset != backBoundaries[i]){
+            log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[i], offset);
          }
      }
  #if !U_HIDE_OBSOLETE_UTF_OLD_H
@@ -756,21 +821,21 @@ static void TestFwdBackUnsafe() {
          }
      }
  #if !U_HIDE_OBSOLETE_UTF_OLD_H
-    for(i=0; i<UPRV_LENGTHOF(boundaries); ++i) {
-        int32_t j=UPRV_LENGTHOF(boundaries)-1-i;
+    for(i=0; i<UPRV_LENGTHOF(backBoundaries); ++i) {
+        int32_t j=UPRV_LENGTHOF(backBoundaries)-1-i;
          offset=UPRV_LENGTHOF(input);
          UTF8_BACK_N_UNSAFE(input, offset, i);
-        if(offset != boundaries[j]) {
-            log_err("ERROR: UTF8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[j], offset);
+        if(offset != backBoundaries[j]) {
+            log_err("ERROR: UTF8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[j], offset);
          }
      }
  #endif
-    for(i=0; i<UPRV_LENGTHOF(boundaries); ++i) {
-        int32_t j=UPRV_LENGTHOF(boundaries)-1-i;
+    for(i=0; i<UPRV_LENGTHOF(backBoundaries); ++i) {
+        int32_t j=UPRV_LENGTHOF(backBoundaries)-1-i;
          offset=UPRV_LENGTHOF(input);
          U8_BACK_N_UNSAFE(input, offset, i);
-        if(offset != boundaries[j]) {
-            log_err("ERROR: U8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[j], offset);
+        if(offset != backBoundaries[j]) {
+            log_err("ERROR: U8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[j], offset);
          }
      }
  }
@@ -1138,8 +1203,12 @@ TestSurrogates() {
              log_err("L8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);
          }
  
-        if(is!=iu || il!=iu) {
-            log_err("U8_NEXT(b[%ld]) or L8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
+        // U8_NEXT() skips only the first byte of a surrogate byte sequence.
+        if(U_IS_SURROGATE(cu) ? is!=(i+1) : is!=iu) {
+            log_err("U8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i);
+        }
+        if(il!=iu) {
+            log_err("L8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i);
+        }
  
          ++k;    /* next code point */
@@ -1175,8 +1244,12 @@ TestSurrogates() {
              log_err("L8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);
          }
  
-        if(is!=iu || il !=iu) {
-            log_err("U8_PREV(b[%ld]) or L8_PREV(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
+        // U8_PREV() skips only the last byte of a surrogate byte sequence.
+        if(U_IS_SURROGATE(cu) ? is!=(i-1) : is!=iu) {
+            log_err("U8_PREV(b[%ld]) did not advance the index correctly\n", (long)i);
+        }
+        if(il !=iu) {
+            log_err("L8_PREV(b[%ld]) did not advance the index correctly\n", (long)i);
+        }
  
          i=iu;   /* go back by one UTF-8 sequence */
diff --git a/icu4c/source/test/intltest/collationtest.cpp b/icu4c/source/test/intltest/collationtest.cpp

index 94d5055f3d9249accda59749433a2533f8cba89a..70895cc56fe94282442adfc9b9b1d7ca2a7598f1 100644 (file)
--- a/icu4c/source/test/intltest/collationtest.cpp
+++ b/icu4c/source/test/intltest/collationtest.cpp
@@ -294,24 +294,22 @@ void CollationTest::TestIllegalUTF8() {
      coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode);
  
      static const char *strings[] = {
-        // U+FFFD
-        "a\xef\xbf\xbdz",
-        // illegal byte sequences
-        "a\x80z",  // trail byte
-        "a\xc1\x81z",  // non-shortest form
-        "a\xe0\x82\x83z",  // non-shortest form
-        "a\xed\xa0\x80z",  // lead surrogate: would be U+D800
-        "a\xed\xbf\xbfz",  // trail surrogate: would be U+DFFF
-        "a\xf0\x8f\xbf\xbfz",  // non-shortest form
-        "a\xf4\x90\x80\x80z"  // out of range: would be U+110000
+        // string with U+FFFD == illegal byte sequence
+        u8"a\uFFFDz", "a\x80z",  // trail byte
+        u8"a\uFFFD\uFFFDz", "a\xc1\x81z",  // non-shortest form
+        u8"a\uFFFD\uFFFD\uFFFDz", "a\xe0\x82\x83z",  // non-shortest form
+        u8"a\uFFFD\uFFFD\uFFFDz", "a\xed\xa0\x80z",  // lead surrogate: would be U+D800
+        u8"a\uFFFD\uFFFD\uFFFDz", "a\xed\xbf\xbfz",  // trail surrogate: would be U+DFFF
+        u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf0\x8f\xbf\xbfz",  // non-shortest form
+        u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf4\x90\x80\x80z"  // out of range: would be U+110000
      };
  
-    StringPiece fffd(strings[0]);
-    for(int32_t i = 1; i < UPRV_LENGTHOF(strings); ++i) {
-        StringPiece illegal(strings[i]);
+    for(int32_t i = 0; i < UPRV_LENGTHOF(strings); i += 2) {
+        StringPiece fffd(strings[i]);
+        StringPiece illegal(strings[i + 1]);
          UCollationResult order = coll->compareUTF8(fffd, illegal, errorCode);
          if(order != UCOL_EQUAL) {
-            errln("compareUTF8(U+FFFD, string %d with illegal UTF-8)=%d != UCOL_EQUAL",
+            errln("compareUTF8(pair %d: U+FFFD, illegal UTF-8)=%d != UCOL_EQUAL",
                    (int)i, order);
          }
      }
diff --git a/icu4c/source/test/intltest/strtest.cpp b/icu4c/source/test/intltest/strtest.cpp

index 9f542e811af5b92a031d24a99bc23e749c878ff4..d8fd7a0042a7158778d0ff55f0c36c0eb62c0384 100644 (file)
--- a/icu4c/source/test/intltest/strtest.cpp
+++ b/icu4c/source/test/intltest/strtest.cpp
@@ -146,7 +146,7 @@ void
  StringTest::Test_UTF8_COUNT_TRAIL_BYTES() {
  #if !U_HIDE_OBSOLETE_UTF_OLD_H
      if(UTF8_COUNT_TRAIL_BYTES(0x7F) != 0
-            || UTF8_COUNT_TRAIL_BYTES(0xC0) != 1
+            || UTF8_COUNT_TRAIL_BYTES(0xC2) != 1
              || UTF8_COUNT_TRAIL_BYTES(0xE0) != 2
              || UTF8_COUNT_TRAIL_BYTES(0xF0) != 3) {
          errln("UTF8_COUNT_TRAIL_BYTES does not work right! See utf_old.h.");
@@ -155,7 +155,7 @@ StringTest::Test_UTF8_COUNT_TRAIL_BYTES() {
      // Note: U8_COUNT_TRAIL_BYTES (current) and UTF8_COUNT_TRAIL_BYTES (deprecated)
      //       have completely different implementations.
      if (U8_COUNT_TRAIL_BYTES(0x7F) != 0
-            || U8_COUNT_TRAIL_BYTES(0xC0) != 1
+            || U8_COUNT_TRAIL_BYTES(0xC2) != 1
              || U8_COUNT_TRAIL_BYTES(0xE0) != 2
              || U8_COUNT_TRAIL_BYTES(0xF0) != 3) {
          errln("U8_COUNT_TRAIL_BYTES does not work right! See utf8.h.");
diff --git a/icu4c/source/test/intltest/ustrtest.cpp b/icu4c/source/test/intltest/ustrtest.cpp

index a222e2a2905d5c0c6267e9844309f62bcd466938..4b7cb7ae7c7f800c3bb3a3fe5111ebf583cc82b6 100644 (file)
--- a/icu4c/source/test/intltest/ustrtest.cpp
+++ b/icu4c/source/test/intltest/ustrtest.cpp
@@ -1881,9 +1881,9 @@ UnicodeStringTest::TestUTF8() {
          0xf3, 0xa0, 0x80, 0x80, 0xf4, 0x8f, 0xbf, 0xbf
      };
      static const UChar expected_utf16[] = {
-        0x41, 0xfffd,
-        0x61, 0xfffd,
-        0xfffd, 0x5a,
+        0x41, 0xfffd, 0xfffd, 0xfffd,
+        0x61, 0xfffd, 0xfffd, 0xfffd,
+        0xfffd,  0xfffd, 0xfffd, 0xfffd,0x5a,
          0xd900, 0xdc00, 0x7a,
          0xd800, 0xdc00, 0xd840, 0xdc00,
          0xdb40, 0xdc00, 0xdbff, 0xdfff
diff --git a/icu4c/source/test/intltest/utxttest.cpp b/icu4c/source/test/intltest/utxttest.cpp

index f2298eaa2e9c40c25bc4be2d7ab5c44855293443..2a779b212139a759d0dd7cb9d4a5af6df888a752 100644 (file)
--- a/icu4c/source/test/intltest/utxttest.cpp
+++ b/icu4c/source/test/intltest/utxttest.cpp
@@ -60,7 +60,6 @@ UTextTest::runIndexedTest(int32_t index, UBool exec,
      TESTCASE_AUTO(Ticket10562);
      TESTCASE_AUTO(Ticket10983);
      TESTCASE_AUTO(Ticket12130);
-    TESTCASE_AUTO(Ticket12888);
      TESTCASE_AUTO(Ticket13344);
      TESTCASE_AUTO_END;
  }
@@ -951,10 +950,14 @@ void UTextTest::ErrorTest()
          UChar buf[10];
          int n = utext_extract(ut, 0, 9, buf, 10, &status);
          TEST_SUCCESS(status);
-        TEST_ASSERT(n==5);
+        TEST_ASSERT(n==7);
+        TEST_ASSERT(buf[0] == 0x41);
          TEST_ASSERT(buf[1] == 0xfffd);
-        TEST_ASSERT(buf[3] == 0xfffd);
          TEST_ASSERT(buf[2] == 0x42);
+        TEST_ASSERT(buf[3] == 0xfffd);
+        TEST_ASSERT(buf[4] == 0xfffd);
+        TEST_ASSERT(buf[5] == 0xfffd);
+        TEST_ASSERT(buf[6] == 0x43);
          utext_close(ut);
      }
  
@@ -1578,66 +1581,6 @@ void UTextTest::Ticket12130() {
      utext_close(&ut);
  }
  
-// Ticket 12888: bad handling of illegal utf-8 containing many instances of the archaic, now illegal,
-//               six byte utf-8 forms. Original implementation had an assumption that
-//               there would be at most three utf-8 bytes per UTF-16 code unit.
-//               The five and six byte sequences map to a single replacement character.
-
-void UTextTest::Ticket12888() {
-    const char *badString = 
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80";
-
-    UErrorCode status = U_ZERO_ERROR;
-    LocalUTextPointer ut(utext_openUTF8(NULL, badString, -1, &status));
-    TEST_SUCCESS(status);
-    for (;;) {
-        UChar32 c = utext_next32(ut.getAlias());
-        if (c == U_SENTINEL) {
-            break;
-        }
-    }
-    int32_t endIdx = utext_getNativeIndex(ut.getAlias());
-    if (endIdx != (int32_t)strlen(badString)) {
-        errln("%s:%d expected=%d, actual=%d", __FILE__, __LINE__, strlen(badString), endIdx);
-        return;
-    }
-
-    for (int32_t prevIndex = endIdx; prevIndex>0;) {
-        UChar32 c = utext_previous32(ut.getAlias());
-        int32_t currentIndex = utext_getNativeIndex(ut.getAlias());
-        if (c != 0xfffd) {
-            errln("%s:%d (expected, actual, index) = (%d, %d, %d)\n",
-                    __FILE__, __LINE__, 0xfffd, c, currentIndex);
-            break;
-        }
-        if (currentIndex != prevIndex - 6) {
-            errln("%s:%d: wrong index. Expected, actual = %d, %d",
-                    __FILE__, __LINE__, prevIndex - 6, currentIndex);
-            break;
-        }
-        prevIndex = currentIndex;
-    }
-}
-
  // Ticket 13344 The macro form of UTEXT_SETNATIVEINDEX failed when target was a trail surrogate
  //              of a supplementary character.
  
diff --git a/icu4c/source/test/intltest/utxttest.h b/icu4c/source/test/intltest/utxttest.h

index c0b3145351c24e2abaa99be4639a65abbc4b351b..4ed39861160fd3074813cd3c41973f5cc8b3d25d 100644 (file)
--- a/icu4c/source/test/intltest/utxttest.h
+++ b/icu4c/source/test/intltest/utxttest.h
@@ -38,7 +38,6 @@ public:
      void Ticket10562();
      void Ticket10983();
      void Ticket12130();
-    void Ticket12888();
      void Ticket13344();
  
  private:
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/BMPSet.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/BMPSet.java

index 879d230ce2d09188b4aaa493f3859fd1d5f8efd3..038477da5e82b307b62709cf4035b0d09cacd218 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/BMPSet.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/BMPSet.java
@@ -16,11 +16,12 @@ import com.ibm.icu.util.OutputInt;
  
  /**
   * Helper class for frozen UnicodeSets, implements contains() and span() optimized for BMP code points.
- * 
+ *
   * Latin-1: Look up bytes.
   * 2-byte characters: Bits organized vertically.
   * 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF, with mixed for illegal ranges.
- * Supplementary characters: Call contains() on the parent set.
+ * Supplementary characters: Binary search over
+ * the supplementary part of the parent set's inversion list.
   */
  public final class BMPSet {
      public static int U16_SURROGATE_OFFSET = ((0xd800 << 10) + 0xdc00 - 0x10000);
@@ -34,9 +35,8 @@ public final class BMPSet {
       * One bit per code point from U+0000..U+07FF. The bits are organized vertically; consecutive code points
       * correspond to the same bit positions in consecutive table words. With code point parts lead=c{10..6}
       * trail=c{5..0} it is set.contains(c)==(table7FF[trail] bit lead)
-     * 
-     * Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD) for faster validity checking at
-     * runtime.
+     *
+     * Bits for 0..FF are unused (0).
       */
      private int[] table7FF;
  
@@ -46,9 +46,8 @@ public final class BMPSet {
       * t1=c{11..6} test bits (lead+16) and lead in bmpBlockBits[t1]. If the upper bit is 0, then the lower bit
       * indicates if contains(c) for all code points in the 64-block. If the upper bit is 1, then the block is mixed
       * and set.contains(c) must be called.
-     * 
-     * Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to the result of contains(FFFD) for faster
-     * validity checking at runtime.
+     *
+     * Bits for 0..7FF are unused (0).
       */
      private int[] bmpBlockBits;
  
@@ -127,7 +126,7 @@ public final class BMPSet {
      /**
       * Span the initial substring for which each character c has spanCondition==contains(c). It must be
       * spanCondition==0 or 1.
-     * 
+     *
       * @param start The start index
       * @param outCount If not null: Receives the number of code points in the span.
       * @return the limit (exclusive end) of the span
@@ -232,7 +231,7 @@ public final class BMPSet {
       * Symmetrical with span().
       * Span the trailing substring for which each character c has spanCondition==contains(c). It must be s.length >=
       * limit and spanCondition==0 or 1.
-     * 
+     *
       * @return The string index which starts the span (i.e. inclusive).
       */
      public final int spanBack(CharSequence s, int limit, SpanCondition spanCondition) {
@@ -462,10 +461,10 @@ public final class BMPSet {
      /**
       * Same as UnicodeSet.findCodePoint(int c) except that the binary search is restricted for finding code
       * points in a certain range.
-     * 
+     *
       * For restricting the search for finding in the range start..end, pass in lo=findCodePoint(start) and
       * hi=findCodePoint(end) with 0<=lo<=hi<len. findCodePoint(c) defaults to lo=0 and hi=len-1.
-     * 
+     *
       * @param c
       *            a character in a subrange of MIN_VALUE..MAX_VALUE
       * @param lo
@@ -512,4 +511,3 @@ public final class BMPSet {
          return (0 != (findCodePoint(c, lo, hi) & 1));
      }
  }
-
author	Markus Scherer <markus.icu@gmail.com>
	Thu, 21 Sep 2017 23:45:08 +0000 (23:45 +0000)
committer	Markus Scherer <markus.icu@gmail.com>
	Thu, 21 Sep 2017 23:45:08 +0000 (23:45 +0000)
icu4c/source/common/bmpset.cpp		patch \| blob \| history
icu4c/source/common/bmpset.h		patch \| blob \| history
icu4c/source/common/unicode/utf.h		patch \| blob \| history
icu4c/source/common/unicode/utf8.h		patch \| blob \| history
icu4c/source/common/unisetspan.cpp		patch \| blob \| history
icu4c/source/common/ustrtrns.cpp		patch \| blob \| history
icu4c/source/common/utext.cpp		patch \| blob \| history
icu4c/source/common/utf_impl.cpp		patch \| blob \| history
icu4c/source/common/utrie2.h		patch \| blob \| history
icu4c/source/i18n/utf8collationiterator.cpp		patch \| blob \| history
icu4c/source/test/cintltst/custrtrn.c		patch \| blob \| history
icu4c/source/test/cintltst/trie2test.c		patch \| blob \| history
icu4c/source/test/cintltst/utf8tst.c		patch \| blob \| history
icu4c/source/test/intltest/collationtest.cpp		patch \| blob \| history
icu4c/source/test/intltest/strtest.cpp		patch \| blob \| history
icu4c/source/test/intltest/ustrtest.cpp		patch \| blob \| history
icu4c/source/test/intltest/utxttest.cpp		patch \| blob \| history
icu4c/source/test/intltest/utxttest.h		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/BMPSet.java		patch \| blob \| history