granicus.if.org Git - icu/commitdiff
ICU-12639 detect & handle malformed UTF-8, never call the low-level full case mapping...
author Markus Scherer <markus.icu@gmail.com>
Tue, 20 Sep 2016 20:32:12 +0000 (20:32 +0000)
committer Markus Scherer <markus.icu@gmail.com>
Tue, 20 Sep 2016 20:32:12 +0000 (20:32 +0000)
X-SVN-Rev: 39295

icu4c/source/common/ucase.cpp
icu4c/source/common/ucasemap.cpp
icu4c/source/common/ustrcase.cpp
icu4c/source/test/intltest/strcase.cpp
icu4c/source/test/intltest/ustrtest.h

index fe4335ea459773e076b6e44df65dee879db141fd..97ded9ee2d15f6750664517cc96ca73d2c18dc55 100644 (file)
@@ -815,8 +815,9 @@ U_CAPI int32_t U_EXPORT2
 ucase_toFullLower(const UCaseProps *csp, UChar32 c,
                   UCaseContextIterator *iter, void *context,
                   const UChar **pString,
-                  const char *locale, int32_t *locCache)
-{
+                  const char *locale, int32_t *locCache) {
+    // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
+    U_ASSERT(c >= 0);
     UChar32 result=c;
     uint16_t props=UTRIE2_GET16(&csp->trie, c);
     if(!PROPS_HAS_EXCEPTION(props)) {
@@ -961,6 +962,8 @@ toUpperOrTitle(const UCaseProps *csp, UChar32 c,
                const UChar **pString,
                const char *locale, int32_t *locCache,
                UBool upperNotTitle) {
+    // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
+    U_ASSERT(c >= 0);
     UChar32 result=c;
     uint16_t props=UTRIE2_GET16(&csp->trie, c);
     if(!PROPS_HAS_EXCEPTION(props)) {
@@ -1169,8 +1172,9 @@ ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options) {
 U_CAPI int32_t U_EXPORT2
 ucase_toFullFolding(const UCaseProps *csp, UChar32 c,
                     const UChar **pString,
-                    uint32_t options)
-{
+                    uint32_t options) {
+    // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
+    U_ASSERT(c >= 0);
     UChar32 result=c;
     uint16_t props=UTRIE2_GET16(&csp->trie, c);
     if(!PROPS_HAS_EXCEPTION(props)) {
index e8807dd9a56e69e9ddf215250dcfc5ca9be56e29..c0d56c28731d1f4825bcb06ebe250dcba53294a7 100644 (file)
@@ -206,6 +206,21 @@ appendUChar(uint8_t *dest, int32_t destIndex, int32_t destCapacity, UChar c) {
     return limit;
 }
 
+static inline int32_t
+appendString(uint8_t *dest, int32_t destIndex, int32_t destCapacity,
+             const uint8_t *s, int32_t length) {
+    if(length>0) {
+        if(length>(INT32_MAX-destIndex)) {
+            return -1;  // integer overflow
+        }
+        if((destIndex+length)<=destCapacity) {
+            uprv_memcpy(dest+destIndex, s, length);
+        }
+        destIndex+=length;
+    }
+    return destIndex;
+}
+
 static UChar32 U_CALLCONV
 utf8_caseContextIterator(void *context, int8_t dir) {
     UCaseContext *csc=(UCaseContext *)context;
@@ -263,9 +278,11 @@ _caseMap(const UCaseMap *csm, UCaseMapFull *map,
         U8_NEXT(src, srcIndex, srcLimit, c);
         csc->cpLimit=srcIndex;
         if(c<0) {
-            int32_t i=csc->cpStart;
-            while(destIndex<destCapacity && i<srcIndex) {
-                dest[destIndex++]=src[i++];
+            // Malformed UTF-8.
+            destIndex=appendString(dest, destIndex, destCapacity, src+csc->cpStart, srcIndex-csc->cpStart);
+            if(destIndex<0) {
+                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+                return 0;
             }
             continue;
         }
@@ -297,7 +314,7 @@ ucasemap_internalUTF8ToTitle(const UCaseMap *csm,
          UErrorCode *pErrorCode) {
     const UChar *s;
     UChar32 c;
-    int32_t prev, titleStart, titleLimit, idx, destIndex, length;
+    int32_t prev, titleStart, titleLimit, idx, destIndex;
     UBool isFirstIndex;
 
     if(U_FAILURE(*pErrorCode)) {
@@ -363,21 +380,24 @@ ucasemap_internalUTF8ToTitle(const UCaseMap *csm,
                         break; /* cased letter at [titleStart..titleLimit[ */
                     }
                 }
-                length=titleStart-prev;
-                if(length>0) {
-                    if((destIndex+length)<=destCapacity) {
-                        uprv_memcpy(dest+destIndex, src+prev, length);
-                    }
-                    destIndex+=length;
+                destIndex=appendString(dest, destIndex, destCapacity, src+prev, titleStart-prev);
+                if(destIndex<0) {
+                    *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+                    return 0;
                 }
             }
 
             if(titleStart<titleLimit) {
                 /* titlecase c which is from [titleStart..titleLimit[ */
-                csc.cpStart=titleStart;
-                csc.cpLimit=titleLimit;
-                c=ucase_toFullTitle(csm->csp, c, utf8_caseContextIterator, &csc, &s, csm->locale, &locCache);
-                destIndex=appendResult(dest, destIndex, destCapacity, c, s);
+                if(c>=0) {
+                    csc.cpStart=titleStart;
+                    csc.cpLimit=titleLimit;
+                    c=ucase_toFullTitle(csm->csp, c, utf8_caseContextIterator, &csc, &s, csm->locale, &locCache);
+                    destIndex=appendResult(dest, destIndex, destCapacity, c, s);
+                } else {
+                    // Malformed UTF-8.
+                    destIndex=appendString(dest, destIndex, destCapacity, src+titleStart, titleLimit-titleStart);
+                }
                 if(destIndex<0) {
                     *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                     return 0;
@@ -407,15 +427,11 @@ ucasemap_internalUTF8ToTitle(const UCaseMap *csm,
                         }
                     } else {
                         /* Optionally just copy the rest of the word unchanged. */
-                        length=idx-titleLimit;
-                        if(length>(INT32_MAX-destIndex)) {
+                        destIndex=appendString(dest, destIndex, destCapacity, src+titleLimit, idx-titleLimit);
+                        if(destIndex<0) {
                             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                             return 0;
                         }
-                        if((destIndex+length)<=destCapacity) {
-                            uprv_memcpy(dest+destIndex, src+titleLimit, length);
-                        }
-                        destIndex+=length;
                     }
                 }
             }
@@ -547,7 +563,7 @@ int32_t toUpper(const UCaseMap *csm,
                 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                 return 0;
             }
-        } else {
+        } else if(c>=0) {
             const UChar *s;
             UChar32 c2 = 0;
             c=ucase_toFullUpper(csm->csp, c, NULL, NULL, &s, csm->locale, &locCache);
@@ -561,6 +577,13 @@ int32_t toUpper(const UCaseMap *csm,
                     return 0;
                 }
             }
+        } else {
+            // Malformed UTF-8.
+            destIndex=appendString(dest, destIndex, destCapacity, src+i, nextIndex-i);
+            if(destIndex<0) {
+                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+                return 0;
+            }
         }
         i = nextIndex;
         state = nextState;
@@ -627,8 +650,11 @@ utf8_foldCase(const UCaseProps *csp,
         start=srcIndex;
         U8_NEXT(src, srcIndex, srcLength, c);
         if(c<0) {
-            while(destIndex<destCapacity && start<srcIndex) {
-                dest[destIndex++]=src[start++];
+            // Malformed UTF-8.
+            destIndex=appendString(dest, destIndex, destCapacity, src+start, srcIndex-start);
+            if(destIndex<0) {
+                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+                return 0;
             }
             continue;
         }
index 0c184e487377d096c6ea043cf7316adcc3034abb..aee35361cb79ac0acab9b14fe1d69564757ace78 100644 (file)
@@ -98,6 +98,21 @@ appendUChar(UChar *dest, int32_t destIndex, int32_t destCapacity, UChar c) {
     return destIndex+1;
 }
 
+static inline int32_t
+appendString(UChar *dest, int32_t destIndex, int32_t destCapacity,
+             const UChar *s, int32_t length) {
+    if(length>0) {
+        if(length>(INT32_MAX-destIndex)) {
+            return -1;  // integer overflow
+        }
+        if((destIndex+length)<=destCapacity) {
+            u_memcpy(dest+destIndex, s, length);
+        }
+        destIndex+=length;
+    }
+    return destIndex;
+}
+
 static UChar32 U_CALLCONV
 utf16_caseContextIterator(void *context, int8_t dir) {
     UCaseContext *csc=(UCaseContext *)context;
@@ -182,7 +197,7 @@ ustrcase_internalToTitle(const UCaseMap *csm,
                          UErrorCode *pErrorCode) {
     const UChar *s;
     UChar32 c;
-    int32_t prev, titleStart, titleLimit, idx, destIndex, length;
+    int32_t prev, titleStart, titleLimit, idx, destIndex;
     UBool isFirstIndex;
 
     if(U_FAILURE(*pErrorCode)) {
@@ -248,12 +263,10 @@ ustrcase_internalToTitle(const UCaseMap *csm,
                         break; /* cased letter at [titleStart..titleLimit[ */
                     }
                 }
-                length=titleStart-prev;
-                if(length>0) {
-                    if((destIndex+length)<=destCapacity) {
-                        u_memcpy(dest+destIndex, src+prev, length);
-                    }
-                    destIndex+=length;
+                destIndex=appendString(dest, destIndex, destCapacity, src+prev, titleStart-prev);
+                if(destIndex<0) {
+                    *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+                    return 0;
                 }
             }
 
@@ -297,15 +310,11 @@ ustrcase_internalToTitle(const UCaseMap *csm,
                         }
                     } else {
                         /* Optionally just copy the rest of the word unchanged. */
-                        length=idx-titleLimit;
-                        if(length>(INT32_MAX-destIndex)) {
+                        destIndex=appendString(dest, destIndex, destCapacity, src+titleLimit, idx-titleLimit);
+                        if(destIndex<0) {
                             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                             return 0;
                         }
-                        if((destIndex+length)<=destCapacity) {
-                            u_memcpy(dest+destIndex, src+titleLimit, length);
-                        }
-                        destIndex+=length;
                     }
                 }
             }
index af79f68e49a6f03ab33be9a60a84ecd5c34e3797..e5304d4fcced788d5526c37674c43098f347b9f0 100644 (file)
@@ -48,6 +48,7 @@ StringCaseTest::runIndexedTest(int32_t index, UBool exec, const char *&name, cha
     TESTCASE_AUTO(TestFullCaseFoldingIterator);
     TESTCASE_AUTO(TestGreekUpper);
     TESTCASE_AUTO(TestLongUpper);
+    TESTCASE_AUTO(TestMalformedUTF8);
     TESTCASE_AUTO_END;
 }
 
@@ -707,3 +708,44 @@ StringCaseTest::TestLongUpper() {
               errorCode.errorName(), (long)destLength);
     }
 }
+
+void StringCaseTest::TestMalformedUTF8() {
+    // ticket #12639
+    IcuTestErrorCode errorCode(*this, "TestTitleMalformedUTF8");
+    LocalUCaseMapPointer csm(ucasemap_open("en", U_TITLECASE_NO_BREAK_ADJUSTMENT, errorCode));
+    if (errorCode.isFailure()) {
+        errln("ucasemap_open(English) failed - %s", errorCode.errorName());
+        return;
+    }
+    char src[1] = { (char)0x85 };  // malformed UTF-8
+    char dest[3] = { 0, 0, 0 };
+    int32_t destLength = ucasemap_utf8ToTitle(csm.getAlias(), dest, 3, src, 1, errorCode);
+    if (errorCode.isFailure() || destLength != 1 || dest[0] != src[0]) {
+        errln("ucasemap_utf8ToTitle(\\x85) failed: %s destLength=%d dest[0]=0x%02x",
+              errorCode.errorName(), (int)destLength, dest[0]);
+    }
+
+    errorCode.reset();
+    dest[0] = 0;
+    destLength = ucasemap_utf8ToLower(csm.getAlias(), dest, 3, src, 1, errorCode);
+    if (errorCode.isFailure() || destLength != 1 || dest[0] != src[0]) {
+        errln("ucasemap_utf8ToLower(\\x85) failed: %s destLength=%d dest[0]=0x%02x",
+              errorCode.errorName(), (int)destLength, dest[0]);
+    }
+
+    errorCode.reset();
+    dest[0] = 0;
+    destLength = ucasemap_utf8ToUpper(csm.getAlias(), dest, 3, src, 1, errorCode);
+    if (errorCode.isFailure() || destLength != 1 || dest[0] != src[0]) {
+        errln("ucasemap_utf8ToUpper(\\x85) failed: %s destLength=%d dest[0]=0x%02x",
+              errorCode.errorName(), (int)destLength, dest[0]);
+    }
+
+    errorCode.reset();
+    dest[0] = 0;
+    destLength = ucasemap_utf8FoldCase(csm.getAlias(), dest, 3, src, 1, errorCode);
+    if (errorCode.isFailure() || destLength != 1 || dest[0] != src[0]) {
+        errln("ucasemap_utf8FoldCase(\\x85) failed: %s destLength=%d dest[0]=0x%02x",
+              errorCode.errorName(), (int)destLength, dest[0]);
+    }
+}
index 8dfa750ac2218a5cd905804c4d09b0d872b2dfca..ef3f6cff8ac8acfed4c44fae798749362b503363 100644 (file)
@@ -111,6 +111,7 @@ public:
     void TestFullCaseFoldingIterator();
     void TestGreekUpper();
     void TestLongUpper();
+    void TestMalformedUTF8();
 
 private:
     void assertGreekUpper(const char *s, const char *expected);