]> granicus.if.org Git - icu/commitdiff
ICU-13417 Replace fixed buffers in uloc_tag.cpp with CharString.
authorFredrik Roubert <fredrik@roubert.name>
Thu, 20 Sep 2018 00:52:37 +0000 (17:52 -0700)
committerShane Carr <shane@unicode.org>
Thu, 27 Sep 2018 21:27:41 +0000 (14:27 -0700)
This gets rid of those fixed buffers that caused ICU-13417 to be filed
in the first place, those that prevent handling language tags with very
large amounts of keywords.

A number of fixed buffers will still remain in uloc_tag.cpp (and
elsewhere in the locale handling code) for the time being, but this
change is a necessary first step in cleaning up this code and will
alleviate the most pressing problem encountered by ICU4C users.

An off-by-one error in _getKeywords() caused uloc_canonicalize() to not
write out the final keyword value in case the result would fill up the
buffer exactly, resulting in U_STRING_NOT_TERMINATED_WARNING.

icu4c/source/common/uloc.cpp
icu4c/source/common/uloc_tag.cpp
icu4c/source/test/cintltst/cloctst.c
icu4c/source/test/cintltst/cloctst.h
icu4c/source/test/intltest/loctest.cpp
icu4c/source/test/intltest/loctest.h

index f7073fec31fb5f2dc9136a7287879ad143404a16..81b6e0f68ab88be97d59dcec345e8d612ebd45e4 100644 (file)
@@ -798,7 +798,7 @@ _getKeywords(const char *localeID,
             }
             keywordsLen += keywordList[i].keywordLen + 1;
             if(valuesToo) {
-                if(keywordsLen + keywordList[i].valueLen < keywordCapacity) {
+                if(keywordsLen + keywordList[i].valueLen <= keywordCapacity) {
                     uprv_strncpy(keywords+keywordsLen, keywordList[i].valueStart, keywordList[i].valueLen);
                 }
                 keywordsLen += keywordList[i].valueLen;
index b0647e97a2ab7623c284339047cd5bb5a540ad42..2d6a9213c3debfd55bc15be10ae30a4b71843bf6 100644 (file)
 #include "unicode/putil.h"
 #include "unicode/uloc.h"
 #include "ustr_imp.h"
+#include "charstr.h"
 #include "cmemory.h"
 #include "cstring.h"
 #include "putilimp.h"
 #include "uinvchar.h"
 #include "ulocimp.h"
+#include "uvector.h"
 #include "uassert.h"
 
 
@@ -172,6 +174,46 @@ static const char*
 ultag_getGrandfathered(const ULanguageTag* langtag);
 #endif
 
+namespace {
+
+// Helper class to memory manage CharString objects.
+// Only ever stack-allocated, does not need to inherit UMemory.
+class CharStringPool {
+public:
+    CharStringPool() : status(U_ZERO_ERROR), pool(&deleter, nullptr, status) {}
+    ~CharStringPool() = default;
+
+    CharStringPool(const CharStringPool&) = delete;
+    CharStringPool& operator=(const CharStringPool&) = delete;
+
+    icu::CharString* create() {
+        if (U_FAILURE(status)) {
+            return nullptr;
+        }
+        icu::CharString* const obj = new icu::CharString;
+        if (obj == nullptr) {
+            status = U_MEMORY_ALLOCATION_ERROR;
+            return nullptr;
+        }
+        pool.addElement(obj, status);
+        if (U_FAILURE(status)) {
+            delete obj;
+            return nullptr;
+        }
+        return obj;
+    }
+
+private:
+    static void U_CALLCONV deleter(void* obj) {
+        delete static_cast<icu::CharString*>(obj);
+    }
+
+    UErrorCode status;
+    icu::UVector pool;
+};
+
+}  // namespace
+
 /*
 * -------------------------------------------------
 *
@@ -900,7 +942,6 @@ _appendVariantsToLanguageTag(const char* localeID, char* appendAt, int32_t capac
 
 static int32_t
 _appendKeywordsToLanguageTag(const char* localeID, char* appendAt, int32_t capacity, UBool strict, UBool hadPosix, UErrorCode* status) {
-    char buf[ULOC_KEYWORD_AND_VALUES_CAPACITY];
     char attrBuf[ULOC_KEYWORD_AND_VALUES_CAPACITY] = { 0 };
     int32_t attrBufLength = 0;
     UEnumeration *keywordEnum = NULL;
@@ -920,22 +961,48 @@ _appendKeywordsToLanguageTag(const char* localeID, char* appendAt, int32_t capac
         AttributeListEntry *firstAttr = NULL;
         AttributeListEntry *attr;
         char *attrValue;
-        char extBuf[ULOC_KEYWORD_AND_VALUES_CAPACITY];
-        char *pExtBuf = extBuf;
-        int32_t extBufCapacity = sizeof(extBuf);
+        CharStringPool extBufPool;
         const char *bcpKey=nullptr, *bcpValue=nullptr;
         UErrorCode tmpStatus = U_ZERO_ERROR;
         int32_t keylen;
         UBool isBcpUExt;
 
         while (TRUE) {
+            icu::CharString buf;
             key = uenum_next(keywordEnum, NULL, status);
             if (key == NULL) {
                 break;
             }
-            len = uloc_getKeywordValue(localeID, key, buf, sizeof(buf), &tmpStatus);
-            /* buf must be null-terminated */
-            if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
+            char* buffer;
+            int32_t resultCapacity = ULOC_KEYWORD_AND_VALUES_CAPACITY;
+
+            for (;;) {
+                buffer = buf.getAppendBuffer(
+                        /*minCapacity=*/resultCapacity,
+                        /*desiredCapacityHint=*/resultCapacity,
+                        resultCapacity,
+                        tmpStatus);
+
+                if (U_FAILURE(tmpStatus)) {
+                    break;
+                }
+
+                len = uloc_getKeywordValue(
+                        localeID, key, buffer, resultCapacity, &tmpStatus);
+
+                if (tmpStatus != U_BUFFER_OVERFLOW_ERROR) {
+                    break;
+                }
+
+                resultCapacity = len;
+                tmpStatus = U_ZERO_ERROR;
+            }
+
+            if (U_FAILURE(tmpStatus)) {
+                if (tmpStatus == U_MEMORY_ALLOCATION_ERROR) {
+                    *status = U_MEMORY_ALLOCATION_ERROR;
+                    break;
+                }
                 if (strict) {
                     *status = U_ILLEGAL_ARGUMENT_ERROR;
                     break;
@@ -945,6 +1012,11 @@ _appendKeywordsToLanguageTag(const char* localeID, char* appendAt, int32_t capac
                 continue;
             }
 
+            buf.append(buffer, len, tmpStatus);
+            if (tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
+                tmpStatus = U_ZERO_ERROR;  // Terminators provided by CharString.
+            }
+
             keylen = (int32_t)uprv_strlen(key);
             isBcpUExt = (keylen > 1);
 
@@ -1007,7 +1079,7 @@ _appendKeywordsToLanguageTag(const char* localeID, char* appendAt, int32_t capac
                 }
 
                 /* we've checked buf is null-terminated above */
-                bcpValue = uloc_toUnicodeLocaleType(key, buf);
+                bcpValue = uloc_toUnicodeLocaleType(key, buf.data());
                 if (bcpValue == NULL) {
                     if (strict) {
                         *status = U_ILLEGAL_ARGUMENT_ERROR;
@@ -1015,33 +1087,44 @@ _appendKeywordsToLanguageTag(const char* localeID, char* appendAt, int32_t capac
                     }
                     continue;
                 }
-                if (bcpValue == buf) {
-                    /* 
+                if (bcpValue == buf.data()) {
+                    /*
                     When uloc_toUnicodeLocaleType(key, buf) returns the
                     input value as is, the value is well-formed, but has
                     no known mapping. This implementation normalizes the
-                    the value to lower case
+                    value to lower case
                     */
+                    icu::CharString* extBuf = extBufPool.create();
+                    if (extBuf == nullptr) {
+                        *status = U_MEMORY_ALLOCATION_ERROR;
+                        break;
+                    }
                     int32_t bcpValueLen = static_cast<int32_t>(uprv_strlen(bcpValue));
-                    if (bcpValueLen < extBufCapacity) {
-                        uprv_strcpy(pExtBuf, bcpValue);
-                        T_CString_toLowerCase(pExtBuf);
+                    int32_t resultCapacity;
+                    char* pExtBuf = extBuf->getAppendBuffer(
+                            /*minCapacity=*/bcpValueLen,
+                            /*desiredCapacityHint=*/bcpValueLen,
+                            resultCapacity,
+                            tmpStatus);
+                    if (U_FAILURE(tmpStatus)) {
+                        *status = tmpStatus;
+                        break;
+                    }
 
-                        bcpValue = pExtBuf;
+                    uprv_strcpy(pExtBuf, bcpValue);
+                    T_CString_toLowerCase(pExtBuf);
 
-                        pExtBuf += (bcpValueLen + 1);
-                        extBufCapacity -= (bcpValueLen + 1);
-                    } else {
-                        if (strict) {
-                            *status = U_ILLEGAL_ARGUMENT_ERROR;
-                            break;
-                        }
-                        continue;
+                    extBuf->append(pExtBuf, bcpValueLen, tmpStatus);
+                    if (U_FAILURE(tmpStatus)) {
+                        *status = tmpStatus;
+                        break;
                     }
+
+                    bcpValue = extBuf->data();
                 }
             } else {
                 if (*key == PRIVATEUSE) {
-                    if (!_isPrivateuseValueSubtags(buf, len)) {
+                    if (!_isPrivateuseValueSubtags(buf.data(), len)) {
                         if (strict) {
                             *status = U_ILLEGAL_ARGUMENT_ERROR;
                             break;
@@ -1049,7 +1132,7 @@ _appendKeywordsToLanguageTag(const char* localeID, char* appendAt, int32_t capac
                         continue;
                     }
                 } else {
-                    if (!_isExtensionSingleton(key, keylen) || !_isExtensionSubtags(buf, len)) {
+                    if (!_isExtensionSingleton(key, keylen) || !_isExtensionSubtags(buf.data(), len)) {
                         if (strict) {
                             *status = U_ILLEGAL_ARGUMENT_ERROR;
                             break;
@@ -1058,20 +1141,17 @@ _appendKeywordsToLanguageTag(const char* localeID, char* appendAt, int32_t capac
                     }
                 }
                 bcpKey = key;
-                if ((len + 1) < extBufCapacity) {
-                    uprv_memcpy(pExtBuf, buf, len);
-                    bcpValue = pExtBuf;
-
-                    pExtBuf += len;
-
-                    *pExtBuf = 0;
-                    pExtBuf++;
-
-                    extBufCapacity -= (len + 1);
-                } else {
-                    *status = U_ILLEGAL_ARGUMENT_ERROR;
+                icu::CharString* extBuf = extBufPool.create();
+                if (extBuf == nullptr) {
+                    *status = U_MEMORY_ALLOCATION_ERROR;
+                    break;
+                }
+                extBuf->append(buf.data(), len, tmpStatus);
+                if (U_FAILURE(tmpStatus)) {
+                    *status = tmpStatus;
                     break;
                 }
+                bcpValue = extBuf->data();
             }
 
             /* create ExtensionListEntry */
@@ -2337,31 +2417,66 @@ uloc_toLanguageTag(const char* localeID,
                    int32_t langtagCapacity,
                    UBool strict,
                    UErrorCode* status) {
-    /* char canonical[ULOC_FULLNAME_CAPACITY]; */ /* See #6822 */
-    char canonical[256];
-    int32_t reslen = 0;
+    icu::CharString canonical;
+    int32_t reslen;
     UErrorCode tmpStatus = U_ZERO_ERROR;
     UBool hadPosix = FALSE;
     const char* pKeywordStart;
 
     /* Note: uloc_canonicalize returns "en_US_POSIX" for input locale ID "".  See #6835 */
-    canonical[0] = 0;
-    if (uprv_strlen(localeID) > 0) {
-        uloc_canonicalize(localeID, canonical, sizeof(canonical), &tmpStatus);
-        if (tmpStatus != U_ZERO_ERROR) {
+    int32_t resultCapacity = uprv_strlen(localeID);
+    if (resultCapacity > 0) {
+        char* buffer;
+
+        for (;;) {
+            buffer = canonical.getAppendBuffer(
+                    /*minCapacity=*/resultCapacity,
+                    /*desiredCapacityHint=*/resultCapacity,
+                    resultCapacity,
+                    tmpStatus);
+
+            if (U_FAILURE(tmpStatus)) {
+                *status = tmpStatus;
+                return 0;
+            }
+
+            reslen =
+                uloc_canonicalize(localeID, buffer, resultCapacity, &tmpStatus);
+
+            if (tmpStatus != U_BUFFER_OVERFLOW_ERROR) {
+                break;
+            }
+
+            resultCapacity = reslen;
+            tmpStatus = U_ZERO_ERROR;
+        }
+
+        if (U_FAILURE(tmpStatus)) {
             *status = U_ILLEGAL_ARGUMENT_ERROR;
             return 0;
         }
+
+        canonical.append(buffer, reslen, tmpStatus);
+        if (tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
+            tmpStatus = U_ZERO_ERROR;  // Terminators provided by CharString.
+        }
+
+        if (U_FAILURE(tmpStatus)) {
+            *status = tmpStatus;
+            return 0;
+        }
     }
 
+    reslen = 0;
+
     /* For handling special case - private use only tag */
-    pKeywordStart = locale_getKeywordsStart(canonical);
-    if (pKeywordStart == canonical) {
+    pKeywordStart = locale_getKeywordsStart(canonical.data());
+    if (pKeywordStart == canonical.data()) {
         UEnumeration *kwdEnum;
         int kwdCnt = 0;
         UBool done = FALSE;
 
-        kwdEnum = uloc_openKeywords((const char*)canonical, &tmpStatus);
+        kwdEnum = uloc_openKeywords(canonical.data(), &tmpStatus);
         if (kwdEnum != NULL) {
             kwdCnt = uenum_count(kwdEnum, &tmpStatus);
             if (kwdCnt == 1) {
@@ -2399,12 +2514,12 @@ uloc_toLanguageTag(const char* localeID,
         }
     }
 
-    reslen += _appendLanguageToLanguageTag(canonical, langtag, langtagCapacity, strict, status);
-    reslen += _appendScriptToLanguageTag(canonical, langtag + reslen, langtagCapacity - reslen, strict, status);
-    reslen += _appendRegionToLanguageTag(canonical, langtag + reslen, langtagCapacity - reslen, strict, status);
-    reslen += _appendVariantsToLanguageTag(canonical, langtag + reslen, langtagCapacity - reslen, strict, &hadPosix, status);
-    reslen += _appendKeywordsToLanguageTag(canonical, langtag + reslen, langtagCapacity - reslen, strict, hadPosix, status);
-    reslen += _appendPrivateuseToLanguageTag(canonical, langtag + reslen, langtagCapacity - reslen, strict, hadPosix, status);
+    reslen += _appendLanguageToLanguageTag(canonical.data(), langtag, langtagCapacity, strict, status);
+    reslen += _appendScriptToLanguageTag(canonical.data(), langtag + reslen, langtagCapacity - reslen, strict, status);
+    reslen += _appendRegionToLanguageTag(canonical.data(), langtag + reslen, langtagCapacity - reslen, strict, status);
+    reslen += _appendVariantsToLanguageTag(canonical.data(), langtag + reslen, langtagCapacity - reslen, strict, &hadPosix, status);
+    reslen += _appendKeywordsToLanguageTag(canonical.data(), langtag + reslen, langtagCapacity - reslen, strict, hadPosix, status);
+    reslen += _appendPrivateuseToLanguageTag(canonical.data(), langtag + reslen, langtagCapacity - reslen, strict, hadPosix, status);
 
     return reslen;
 }
index 4454c67274d6d579ef9c26ad83bcf53fab7be082..1d1805196f2f320cd754c8dccb449796420f5353 100644 (file)
@@ -226,6 +226,7 @@ void addLocaleTest(TestNode** root)
     TESTCASE(TestKeywordVariants);
     TESTCASE(TestKeywordVariantParsing);
     TESTCASE(TestCanonicalization);
+    TESTCASE(TestCanonicalizationBuffer);
     TESTCASE(TestKeywordSet);
     TESTCASE(TestKeywordSetError);
     TESTCASE(TestDisplayKeywords);
@@ -2251,6 +2252,42 @@ static void TestCanonicalization(void)
     }
 }
 
+static void TestCanonicalizationBuffer(void)
+{
+    UErrorCode status = U_ZERO_ERROR;
+    char buffer[256];
+
+    // ULOC_FULLNAME_CAPACITY == 157 (uloc.h)
+    static const char name[] =
+        "zh@x"
+        "=foo-bar-baz-foo-bar-baz-foo-bar-baz-foo-bar-baz"
+        "-foo-bar-baz-foo-bar-baz-foo-bar-baz-foo-bar-baz"
+        "-foo-bar-baz-foo-bar-baz-foo-bar-baz-foo-bar-baz"
+        "-foo-barz"
+    ;
+    static const size_t len = sizeof name - 1;  // Without NUL terminator.
+
+    int32_t reslen = uloc_canonicalize(name, buffer, len, &status);
+
+    if (U_FAILURE(status)) {
+        log_err("FAIL: uloc_canonicalize(%s) => %s, expected !U_FAILURE()\n",
+                name, u_errorName(status));
+        return;
+    }
+
+    if (reslen != len) {
+        log_err("FAIL: uloc_canonicalize(%s) => \"%i\", expected \"%u\"\n",
+                name, reslen, len);
+        return;
+    }
+
+    if (uprv_strncmp(name, buffer, len) != 0) {
+        log_err("FAIL: uloc_canonicalize(%s) => \"%.*s\", expected \"%s\"\n",
+                name, reslen, buffer, name);
+        return;
+    }
+}
+
 static void TestDisplayKeywords(void)
 {
     int32_t i;
index be1896a0c3ffcde0f881257a59c6c4857bb3b18b..a2ce892ec234d1e418e7bb9c556d3e91774e8b4c 100644 (file)
@@ -84,6 +84,7 @@ static  void TestDisplayNames(void);
  static void doTestDisplayNames(const char* inLocale, int32_t compareIndex);
 
  static void TestCanonicalization(void);
+ static void TestCanonicalizationBuffer(void);
 
  static void TestDisplayKeywords(void);
 
index d3fc4e286c09c19ab075a0d6f79b906be0b8662b..e375c0c5a550a66d21a87621e735bf3a57fe4bc1 100644 (file)
@@ -252,6 +252,7 @@ void LocaleTest::runIndexedTest( int32_t index, UBool exec, const char* &name, c
     TESTCASE_AUTO(TestToLanguageTag);
     TESTCASE_AUTO(TestMoveAssign);
     TESTCASE_AUTO(TestMoveCtor);
+    TESTCASE_AUTO(TestBug13417VeryLongLanguageTag);
     TESTCASE_AUTO_END;
 }
 
@@ -3125,3 +3126,23 @@ void LocaleTest::TestMoveCtor() {
     assertEquals("variant", l7.getVariant(), l8.getVariant());
     assertEquals("bogus", l7.isBogus(), l8.isBogus());
 }
+
+void LocaleTest::TestBug13417VeryLongLanguageTag() {
+    IcuTestErrorCode status(*this, "TestBug13417VeryLongLanguageTag()");
+
+    static const char tag[] =
+        "zh-x"
+        "-foo-bar-baz-foo-bar-baz-foo-bar-baz-foo-bar-baz"
+        "-foo-bar-baz-foo-bar-baz-foo-bar-baz-foo-bar-baz"
+        "-foo-bar-baz-foo-bar-baz-foo-bar-baz-foo-bar-baz"
+        "-foo-bar-baz-fxx"
+    ;
+
+    Locale l = Locale::forLanguageTag(tag, status);
+    status.errIfFailureAndReset("\"%s\"", tag);
+    assertTrue("!l.isBogus()", !l.isBogus());
+
+    std::string result = l.toLanguageTag<std::string>(status);
+    status.errIfFailureAndReset("\"%s\"", l.getName());
+    assertEquals("equals", tag, result.c_str());
+}
index d165cae8932c0b21d8940f5448534d76d1b74c9d..2a83be51a05e96b999d4c4d40513c231b74e88d6 100644 (file)
@@ -124,6 +124,8 @@ public:
     void TestMoveAssign();
     void TestMoveCtor();
 
+    void TestBug13417VeryLongLanguageTag();
+
 private:
     void _checklocs(const char* label,
                     const char* req,