ICU-13018 uloc_get/setKeywordValue have stricter input conditions; setKeywordValue...

author Peter Edberg <pedberg@unicode.org>

Tue, 21 Mar 2017 23:08:37 +0000 (23:08 +0000)

committer Peter Edberg <pedberg@unicode.org>

Tue, 21 Mar 2017 23:08:37 +0000 (23:08 +0000)
author Peter Edberg <pedberg@unicode.org>
Tue, 21 Mar 2017 23:08:37 +0000 (23:08 +0000)
committer Peter Edberg <pedberg@unicode.org>
Tue, 21 Mar 2017 23:08:37 +0000 (23:08 +0000)
diff --git a/icu4c/source/common/uloc.cpp b/icu4c/source/common/uloc.cpp

index 4d98def1977fcc96d51deea8000e78b576f0e6be..4d854bbcca320e185f989b6310e1c22ec732cfe0 100644 (file)
--- a/icu4c/source/common/uloc.cpp
+++ b/icu4c/source/common/uloc.cpp
@@ -44,10 +44,11 @@
  #include "uarrsort.h"
  #include "uenumimp.h"
  #include "uassert.h"
+#include "charstr.h"
  
  #include <stdio.h> /* for sprintf */
  
-using namespace icu;
+U_NAMESPACE_USE
  
  /* ### Declarations **************************************************/
  
@@ -560,6 +561,10 @@ static int32_t getShortestSubtagLength(const char *localeID) {
  }
  
  /* ### Keywords **************************************************/
+#define UPRV_ISDIGIT(c) (((c) >= '0') && ((c) <= '9'))
+#define UPRV_ISALPHANUM(c) (uprv_isASCIILetter(c) || UPRV_ISDIGIT(c) )
+/* Punctuation/symbols allowed in legacy key values */
+#define UPRV_OK_VALUE_PUNCTUATION(c) ((c) == '_' || (c) == '-' || (c) == '+' || (c) == '/')
  
  #define ULOC_KEYWORD_BUFFER_LEN 25
  #define ULOC_MAX_NO_KEYWORDS 25
@@ -596,20 +601,26 @@ locale_getKeywordsStart(const char *localeID) {
   */
  static int32_t locale_canonKeywordName(char *buf, const char *keywordName, UErrorCode *status)
  {
-  int32_t i;
-  int32_t keywordNameLen = (int32_t)uprv_strlen(keywordName);
+  int32_t keywordNameLen = 0;
  
-  if(keywordNameLen >= ULOC_KEYWORD_BUFFER_LEN) {
-    /* keyword name too long for internal buffer */
-    *status = U_INTERNAL_PROGRAM_ERROR;
-          return 0;
+  for (; *keywordName != 0; keywordName++) {
+    if (!UPRV_ISALPHANUM(*keywordName)) {
+      *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
+      return 0;
+    }
+    if (keywordNameLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
+      buf[keywordNameLen++] = uprv_tolower(*keywordName);
+    } else {
+      /* keyword name too long for internal buffer */
+      *status = U_INTERNAL_PROGRAM_ERROR;
+      return 0;
+    }
    }
-
-  /* normalize the keyword name */
-  for(i = 0; i < keywordNameLen; i++) {
-    buf[i] = uprv_tolower(keywordName[i]);
+  if (keywordNameLen == 0) {
+    *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name */
+    return 0;
    }
-  buf[i] = 0;
+  buf[keywordNameLen] = 0; /* terminate */
  
    return keywordNameLen;
  }
@@ -837,87 +848,108 @@ uloc_getKeywordValue(const char* localeID,
      const char* nextSeparator = NULL;
      char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
      char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
-    int32_t i = 0;
      int32_t result = 0;
  
      if(status && U_SUCCESS(*status) && localeID) {
        char tempBuffer[ULOC_FULLNAME_CAPACITY];
        const char* tmpLocaleID;
  
+      if (keywordName == NULL || keywordName[0] == 0) {
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+      }
+
+      locale_canonKeywordName(keywordNameBuffer, keywordName, status);
+      if(U_FAILURE(*status)) {
+        return 0;
+      }
+
        if (_hasBCP47Extension(localeID)) {
            _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
        } else {
            tmpLocaleID=localeID;
        }
  
-      startSearchHere = uprv_strchr(tmpLocaleID, '@'); /* TODO: REVISIT: shouldn't this be locale_getKeywordsStart ? */
+      startSearchHere = locale_getKeywordsStart(tmpLocaleID);
        if(startSearchHere == NULL) {
            /* no keywords, return at once */
            return 0;
        }
  
-      locale_canonKeywordName(keywordNameBuffer, keywordName, status);
-      if(U_FAILURE(*status)) {
-        return 0;
-      }
-
        /* find the first keyword */
        while(startSearchHere) {
-          startSearchHere++;
-          /* skip leading spaces (allowed?) */
+          const char* keyValueTail;
+          int32_t keyValueLen;
+
+          startSearchHere++; /* skip @ or ; */
+          nextSeparator = uprv_strchr(startSearchHere, '=');
+          if(!nextSeparator) {
+              *status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
+              return 0;
+          }
+          /* strip leading & trailing spaces (TC decided to tolerate these) */
            while(*startSearchHere == ' ') {
                startSearchHere++;
            }
-          nextSeparator = uprv_strchr(startSearchHere, '=');
-          /* need to normalize both keyword and keyword name */
-          if(!nextSeparator) {
-              break;
+          keyValueTail = nextSeparator;
+          while (keyValueTail > startSearchHere && *(keyValueTail-1) == ' ') {
+              keyValueTail--;
            }
-          if(nextSeparator - startSearchHere >= ULOC_KEYWORD_BUFFER_LEN) {
+          /* now keyValueTail points to first char after the keyName */
+          /* copy & normalize keyName from locale */
+          if (startSearchHere == keyValueTail) {
+              *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
+              return 0;
+          }
+          keyValueLen = 0;
+          while (startSearchHere < keyValueTail) {
+            if (!UPRV_ISALPHANUM(*startSearchHere)) {
+              *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
+              return 0;
+            }
+            if (keyValueLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
+              localeKeywordNameBuffer[keyValueLen++] = uprv_tolower(*startSearchHere++);
+            } else {
                /* keyword name too long for internal buffer */
                *status = U_INTERNAL_PROGRAM_ERROR;
                return 0;
+            }
            }
-          for(i = 0; i < nextSeparator - startSearchHere; i++) {
-              localeKeywordNameBuffer[i] = uprv_tolower(startSearchHere[i]);
-          }
-          /* trim trailing spaces */
-          while(startSearchHere[i-1] == ' ') {
-              i--;
-              U_ASSERT(i>=0);
-          }
-          localeKeywordNameBuffer[i] = 0;
+          localeKeywordNameBuffer[keyValueLen] = 0; /* terminate */
  
            startSearchHere = uprv_strchr(nextSeparator, ';');
  
            if(uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer) == 0) {
-              nextSeparator++;
+               /* current entry matches the keyword. */
+             nextSeparator++; /* skip '=' */
+              /* First strip leading & trailing spaces (TC decided to tolerate these) */
                while(*nextSeparator == ' ') {
-                  nextSeparator++;
+                nextSeparator++;
                }
-              /* we actually found the keyword. Copy the value */
-              if(startSearchHere && startSearchHere - nextSeparator < bufferCapacity) {
-                  while(*(startSearchHere-1) == ' ') {
-                      startSearchHere--;
-                  }
-                  uprv_strncpy(buffer, nextSeparator, startSearchHere - nextSeparator);
-                  result = u_terminateChars(buffer, bufferCapacity, (int32_t)(startSearchHere - nextSeparator), status);
-              } else if(!startSearchHere && (int32_t)uprv_strlen(nextSeparator) < bufferCapacity) { /* last item in string */
-                  i = (int32_t)uprv_strlen(nextSeparator);
-                  while(nextSeparator[i - 1] == ' ') {
-                      i--;
-                  }
-                  uprv_strncpy(buffer, nextSeparator, i);
-                  result = u_terminateChars(buffer, bufferCapacity, i, status);
-              } else {
-                  /* give a bigger buffer, please */
-                  *status = U_BUFFER_OVERFLOW_ERROR;
-                  if(startSearchHere) {
-                      result = (int32_t)(startSearchHere - nextSeparator);
-                  } else {
-                      result = (int32_t)uprv_strlen(nextSeparator);
-                  }
+              keyValueTail = (startSearchHere)? startSearchHere: nextSeparator + uprv_strlen(nextSeparator);
+              while(keyValueTail > nextSeparator && *(keyValueTail-1) == ' ') {
+                keyValueTail--;
                }
+              /* Now copy the value, but check well-formedness */
+              if (nextSeparator == keyValueTail) {
+                *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value name in passed-in locale */
+                return 0;
+              }
+              keyValueLen = 0;
+              while (nextSeparator < keyValueTail) {
+                if (!UPRV_ISALPHANUM(*nextSeparator) && !UPRV_OK_VALUE_PUNCTUATION(*nextSeparator)) {
+                  *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
+                  return 0;
+                }
+                if (keyValueLen < bufferCapacity) {
+                  /* Should we lowercase value to return here? Tests expect as-is. */
+                  buffer[keyValueLen++] = *nextSeparator++;
+                } else { /* keep advancing so we return correct length in case of overflow */
+                  keyValueLen++;
+                  nextSeparator++;
+                }
+              }
+              result = u_terminateChars(buffer, bufferCapacity, keyValueLen, status);
                return result;
            }
        }
@@ -936,46 +968,59 @@ uloc_setKeywordValue(const char* keywordName,
      int32_t keywordValueLen;
      int32_t bufLen;
      int32_t needLen = 0;
-    int32_t foundValueLen;
-    int32_t keywordAtEnd = 0; /* is the keyword at the end of the string? */
      char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
+    char keywordValueBuffer[ULOC_KEYWORDS_CAPACITY+1];
      char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
-    int32_t i = 0;
      int32_t rc;
      char* nextSeparator = NULL;
      char* nextEqualsign = NULL;
      char* startSearchHere = NULL;
      char* keywordStart = NULL;
-    char *insertHere = NULL;
+    CharString updatedKeysAndValues;
+    int32_t updatedKeysAndValuesLen;
+    UBool handledInputKeyAndValue = FALSE;
+    char keyValuePrefix = '@';
+
      if(U_FAILURE(*status)) {
          return -1;
      }
-    if(bufferCapacity>1) {
-        bufLen = (int32_t)uprv_strlen(buffer);
-    } else {
+    if (keywordName == NULL || keywordName[0] == 0 || bufferCapacity <= 1) {
          *status = U_ILLEGAL_ARGUMENT_ERROR;
          return 0;
      }
+    bufLen = (int32_t)uprv_strlen(buffer);
      if(bufferCapacity<bufLen) {
          /* The capacity is less than the length?! Is this NULL terminated? */
          *status = U_ILLEGAL_ARGUMENT_ERROR;
          return 0;
      }
-    if(keywordValue && !*keywordValue) {
-        keywordValue = NULL;
-    }
-    if(keywordValue) {
-        keywordValueLen = (int32_t)uprv_strlen(keywordValue);
-    } else {
-        keywordValueLen = 0;
-    }
      keywordNameLen = locale_canonKeywordName(keywordNameBuffer, keywordName, status);
      if(U_FAILURE(*status)) {
          return 0;
      }
+
+    keywordValueLen = 0;
+    if(keywordValue) {
+        while (*keywordValue != 0) {
+            if (!UPRV_ISALPHANUM(*keywordValue) && !UPRV_OK_VALUE_PUNCTUATION(*keywordValue)) {
+                *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
+                return 0;
+            }
+            if (keywordValueLen < ULOC_KEYWORDS_CAPACITY) {
+                /* Should we force lowercase in value to set? */
+                keywordValueBuffer[keywordValueLen++] = *keywordValue++;
+            } else {
+                /* keywordValue too long for internal buffer */
+                *status = U_INTERNAL_PROGRAM_ERROR;
+                return 0;
+            }
+        }
+    }
+    keywordValueBuffer[keywordValueLen] = 0; /* terminate */
+
      startSearchHere = (char*)locale_getKeywordsStart(buffer);
      if(startSearchHere == NULL || (startSearchHere[1]==0)) {
-        if(!keywordValue) { /* no keywords = nothing to remove */
+        if(keywordValueLen == 0) { /* no keywords = nothing to remove */
              return bufLen;
          }
  
@@ -990,133 +1035,137 @@ uloc_setKeywordValue(const char* keywordName,
              *status = U_BUFFER_OVERFLOW_ERROR;
              return needLen; /* no change */
          }
-        *startSearchHere = '@';
-        startSearchHere++;
+        *startSearchHere++ = '@';
          uprv_strcpy(startSearchHere, keywordNameBuffer);
          startSearchHere += keywordNameLen;
-        *startSearchHere = '=';
-        startSearchHere++;
-        uprv_strcpy(startSearchHere, keywordValue);
-        startSearchHere+=keywordValueLen;
+        *startSearchHere++ = '=';
+        uprv_strcpy(startSearchHere, keywordValueBuffer);
          return needLen;
      } /* end shortcut - no @ */
  
      keywordStart = startSearchHere;
      /* search for keyword */
      while(keywordStart) {
-        keywordStart++;
-        /* skip leading spaces (allowed?) */
+        const char* keyValueTail;
+        int32_t keyValueLen;
+
+        keywordStart++; /* skip @ or ; */
+        nextEqualsign = uprv_strchr(keywordStart, '=');
+        if (!nextEqualsign) {
+            *status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
+            return 0;
+        }
+        /* strip leading & trailing spaces (TC decided to tolerate these) */
          while(*keywordStart == ' ') {
              keywordStart++;
          }
-        nextEqualsign = uprv_strchr(keywordStart, '=');
-        /* need to normalize both keyword and keyword name */
-        if(!nextEqualsign) {
-            break;
+        keyValueTail = nextEqualsign;
+        while (keyValueTail > keywordStart && *(keyValueTail-1) == ' ') {
+            keyValueTail--;
          }
-        if(nextEqualsign - keywordStart >= ULOC_KEYWORD_BUFFER_LEN) {
-            /* keyword name too long for internal buffer */
-            *status = U_INTERNAL_PROGRAM_ERROR;
+        /* now keyValueTail points to first char after the keyName */
+        /* copy & normalize keyName from locale */
+        if (keywordStart == keyValueTail) {
+            *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
              return 0;
          }
-        for(i = 0; i < nextEqualsign - keywordStart; i++) {
-            localeKeywordNameBuffer[i] = uprv_tolower(keywordStart[i]);
-        }
-        /* trim trailing spaces */
-        while(keywordStart[i-1] == ' ') {
-            i--;
+        keyValueLen = 0;
+        while (keywordStart < keyValueTail) {
+            if (!UPRV_ISALPHANUM(*keywordStart)) {
+                *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
+                return 0;
+            }
+            if (keyValueLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
+                localeKeywordNameBuffer[keyValueLen++] = uprv_tolower(*keywordStart++);
+            } else {
+                /* keyword name too long for internal buffer */
+                *status = U_INTERNAL_PROGRAM_ERROR;
+                return 0;
+            }
          }
-        U_ASSERT(i>=0 && i<ULOC_KEYWORD_BUFFER_LEN);
-        localeKeywordNameBuffer[i] = 0;
+        localeKeywordNameBuffer[keyValueLen] = 0; /* terminate */
  
          nextSeparator = uprv_strchr(nextEqualsign, ';');
+
+        /* start processing the value part */
+        nextEqualsign++; /* skip '=' */
+        /* First strip leading & trailing spaces (TC decided to tolerate these) */
+        while(*nextEqualsign == ' ') {
+            nextEqualsign++;
+        }
+        keyValueTail = (nextSeparator)? nextSeparator: nextEqualsign + uprv_strlen(nextEqualsign);
+        while(keyValueTail > nextEqualsign && *(keyValueTail-1) == ' ') {
+            keyValueTail--;
+        }
+        if (nextEqualsign == keyValueTail) {
+            *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value in passed-in locale */
+            return 0;
+        }
+
          rc = uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer);
          if(rc == 0) {
-            nextEqualsign++;
-            while(*nextEqualsign == ' ') {
-                nextEqualsign++;
-            }
-            /* we actually found the keyword. Change the value */
-            if (nextSeparator) {
-                keywordAtEnd = 0;
-                foundValueLen = (int32_t)(nextSeparator - nextEqualsign);
-            } else {
-                keywordAtEnd = 1;
-                foundValueLen = (int32_t)uprv_strlen(nextEqualsign);
-            }
-            if(keywordValue) { /* adding a value - not removing */
-              if(foundValueLen == keywordValueLen) {
-                uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
-                return bufLen; /* no change in size */
-              } else if(foundValueLen > keywordValueLen) {
-                int32_t delta = foundValueLen - keywordValueLen;
-                if(nextSeparator) { /* RH side */
-                  uprv_memmove(nextSeparator - delta, nextSeparator, bufLen-(nextSeparator-buffer));
-                }
-                uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
-                bufLen -= delta;
-                buffer[bufLen]=0;
-                return bufLen;
-              } else { /* FVL < KVL */
-                int32_t delta = keywordValueLen - foundValueLen;
-                if((bufLen+delta) >= bufferCapacity) {
-                  *status = U_BUFFER_OVERFLOW_ERROR;
-                  return bufLen+delta;
-                }
-                if(nextSeparator) { /* RH side */
-                  uprv_memmove(nextSeparator+delta,nextSeparator, bufLen-(nextSeparator-buffer));
-                }
-                uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
-                bufLen += delta;
-                buffer[bufLen]=0;
-                return bufLen;
-              }
-            } else { /* removing a keyword */
-              if(keywordAtEnd) {
-                /* zero out the ';' or '@' just before startSearchhere */
-                keywordStart[-1] = 0;
-                return (int32_t)((keywordStart-buffer)-1); /* (string length without keyword) minus separator */
-              } else {
-                uprv_memmove(keywordStart, nextSeparator+1, bufLen-((nextSeparator+1)-buffer));
-                keywordStart[bufLen-((nextSeparator+1)-buffer)]=0;
-                return (int32_t)(bufLen-((nextSeparator+1)-keywordStart));
-              }
+            /* Current entry matches the input keyword. Update the entry */
+            if(keywordValueLen > 0) { /* updating a value */
+                updatedKeysAndValues.append(keyValuePrefix, *status);
+                keyValuePrefix = ';'; /* for any subsequent key-value pair */
+                updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
+                updatedKeysAndValues.append('=', *status);
+                updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
+            } /* else removing this entry, don't emit anything */
+            handledInputKeyAndValue = TRUE;
+        } else {
+           /* input keyword sorts earlier than current entry, add before current entry */
+            if (rc < 0 && keywordValueLen > 0 && !handledInputKeyAndValue) {
+                /* insert new entry at this location */
+                updatedKeysAndValues.append(keyValuePrefix, *status);
+                keyValuePrefix = ';'; /* for any subsequent key-value pair */
+                updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
+                updatedKeysAndValues.append('=', *status);
+                updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
+                handledInputKeyAndValue = TRUE;
              }
-        } else if(rc<0){ /* end match keyword */
-          /* could insert at this location. */
-          insertHere = keywordStart;
+            /* copy the current entry */
+            updatedKeysAndValues.append(keyValuePrefix, *status);
+            keyValuePrefix = ';'; /* for any subsequent key-value pair */
+            updatedKeysAndValues.append(localeKeywordNameBuffer, keyValueLen, *status);
+            updatedKeysAndValues.append('=', *status);
+            updatedKeysAndValues.append(nextEqualsign, keyValueTail-nextEqualsign, *status);
+        }
+        if (!nextSeparator && keywordValueLen > 0 && !handledInputKeyAndValue) {
+            /* append new entry at the end, it sorts later than existing entries */
+            updatedKeysAndValues.append(keyValuePrefix, *status);
+            /* skip keyValuePrefix update, no subsequent key-value pair */
+            updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
+            updatedKeysAndValues.append('=', *status);
+            updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
+            handledInputKeyAndValue = TRUE;
          }
          keywordStart = nextSeparator;
      } /* end loop searching */
  
-    if(!keywordValue) {
-      return bufLen; /* removal of non-extant keyword - no change */
-    }
-
-    /* we know there is at least one keyword. */
-    needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
+    /* Any error from updatedKeysAndValues.append above would be internal and not due to
+     * problems with the passed-in locale. So if we did encounter problems with the
+     * passed-in locale above, those errors took precedence and overrode any error
+     * status from updatedKeysAndValues.append, and also caused a return of 0. If there
+     * are errors here they are from updatedKeysAndValues.append; they do cause an
+     * error return but the passed-in locale is unmodified and the original bufLen is
+     * returned.
+     */
+    if (!handledInputKeyAndValue || U_FAILURE(*status)) {
+        /* if input key/value specified removal of a keyword not present in locale, or
+         * there was an error in CharString.append, leave original locale alone. */
+        return bufLen;
+    }
+
+    updatedKeysAndValuesLen = updatedKeysAndValues.length();
+    /* needLen = length of the part before '@' + length of updated key-value part including '@' */
+    needLen = (int32_t)(startSearchHere - buffer) + updatedKeysAndValuesLen;
      if(needLen >= bufferCapacity) {
          *status = U_BUFFER_OVERFLOW_ERROR;
          return needLen; /* no change */
      }
-
-    if(insertHere) {
-      uprv_memmove(insertHere+(1+keywordNameLen+1+keywordValueLen), insertHere, bufLen-(insertHere-buffer));
-      keywordStart = insertHere;
-    } else {
-      keywordStart = buffer+bufLen;
-      *keywordStart = ';';
-      keywordStart++;
-    }
-    uprv_strncpy(keywordStart, keywordNameBuffer, keywordNameLen);
-    keywordStart += keywordNameLen;
-    *keywordStart = '=';
-    keywordStart++;
-    uprv_strncpy(keywordStart, keywordValue, keywordValueLen); /* terminates. */
-    keywordStart+=keywordValueLen;
-    if(insertHere) {
-      *keywordStart = ';';
-      keywordStart++;
+    if (updatedKeysAndValuesLen > 0) {
+        uprv_strncpy(startSearchHere, updatedKeysAndValues.data(), updatedKeysAndValuesLen);
      }
      buffer[needLen]=0;
      return needLen;
@@ -2543,9 +2592,6 @@ uloc_toUnicodeLocaleType(const char* keyword, const char* value)
      return bcpType;
  }
  
-#define UPRV_ISDIGIT(c) (((c) >= '0') && ((c) <= '9'))
-#define UPRV_ISALPHANUM(c) (uprv_isASCIILetter(c) || UPRV_ISDIGIT(c) )
-
  static UBool
  isWellFormedLegacyKey(const char* legacyKey)
  {
@@ -2588,11 +2634,10 @@ uloc_toLegacyKey(const char* keyword)
          // Checks if the specified locale key is well-formed with the legacy locale syntax.
          //
          // Note:
-        //  Neither ICU nor LDML/CLDR provides the definition of keyword syntax.
-        //  However, a key should not contain '=' obviously. For now, all existing
-        //  keys are using ASCII alphabetic letters only. We won't add any new key
-        //  that is not compatible with the BCP 47 syntax. Therefore, we assume
-        //  a valid key consist from [0-9a-zA-Z], no symbols.
+        //  LDML/CLDR provides some definition of keyword syntax in
+        //  * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
+        //  * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
+        //  Keys can only consist of [0-9a-zA-Z].
          if (isWellFormedLegacyKey(keyword)) {
              return keyword;
          }
@@ -2608,12 +2653,11 @@ uloc_toLegacyType(const char* keyword, const char* value)
          // Checks if the specified locale type is well-formed with the legacy locale syntax.
          //
          // Note:
-        //  Neither ICU nor LDML/CLDR provides the definition of keyword syntax.
-        //  However, a type should not contain '=' obviously. For now, all existing
-        //  types are using ASCII alphabetic letters with a few symbol letters. We won't
-        //  add any new type that is not compatible with the BCP 47 syntax except timezone
-        //  IDs. For now, we assume a valid type start with [0-9a-zA-Z], but may contain
-        //  '-' '_' '/' in the middle.
+        //  LDML/CLDR provides some definition of keyword syntax in
+        //  * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
+        //  * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
+        //  Values (types) can only consist of [0-9a-zA-Z], plus for legacy values
+        //  we allow [/_-+] in the middle (e.g. "Etc/GMT+1", "Asia/Tel_Aviv")
          if (isWellFormedLegacyType(value)) {
              return value;
          }
diff --git a/icu4c/source/common/unicode/uloc.h b/icu4c/source/common/unicode/uloc.h

index f330aa4d56c94f22174768e2ddf1c02a79372dfa..6f94920d3732f306d1d1b6e793aca097c8dc2b65 100644 (file)
--- a/icu4c/source/common/unicode/uloc.h
+++ b/icu4c/source/common/unicode/uloc.h
@@ -855,10 +855,12 @@ uloc_openKeywords(const char* localeID,
   * Get the value for a keyword. Locale name does not need to be normalized.
   * 
   * @param localeID locale name containing the keyword ("de_DE@currency=EURO;collation=PHONEBOOK")
- * @param keywordName name of the keyword for which we want the value. Case insensitive.
+ * @param keywordName name of the keyword for which we want the value; must not be
+ *  NULL or empty, and must consist only of [A-Za-z0-9]. Case insensitive.
   * @param buffer receiving buffer
   * @param bufferCapacity capacity of receiving buffer
- * @param status containing error code - buffer not big enough.
+ * @param status containing error code: e.g. buffer not big enough or ill-formed localeID
+ *  or keywordName parameters.
   * @return the length of keyword value
   * @stable ICU 2.8
   */
@@ -875,18 +877,26 @@ uloc_getKeywordValue(const char* localeID,
   * For removing all keywords, use uloc_getBaseName().
   *
   * NOTE: Unlike almost every other ICU function which takes a
- * buffer, this function will NOT truncate the output text. If a
- * BUFFER_OVERFLOW_ERROR is received, it means that the original
- * buffer is untouched. This is done to prevent incorrect or possibly
- * even malformed locales from being generated and used.
+ * buffer, this function will NOT truncate the output text, and will
+ * not update the buffer with unterminated text setting a status of
+ * U_STRING_NOT_TERMINATED_WARNING. If a BUFFER_OVERFLOW_ERROR is received,
+ * it means a terminated version of the updated locale ID would not fit
+ * in the buffer, and the original buffer is untouched. This is done to
+ * prevent incorrect or possibly even malformed locales from being generated
+ * and used.
   *
- * @param keywordName name of the keyword to be set. Case insensitive.
+ * @param keywordName name of the keyword to be set; must not be
+ *  NULL or empty, and must consist only of [A-Za-z0-9]. Case insensitive.
   * @param keywordValue value of the keyword to be set. If 0-length or
- *  NULL, will result in the keyword being removed. No error is given if 
- *  that keyword does not exist.
- * @param buffer input buffer containing locale to be modified.
+ *  NULL, will result in the keyword being removed; no error is given if 
+ *  that keyword does not exist. Otherwise, must consist only of
+ *  [A-Za-z0-9] and [/_+-].
+ * @param buffer input buffer containing well-formed locale ID to be
+ *  modified.
   * @param bufferCapacity capacity of receiving buffer
- * @param status containing error code - buffer not big enough.
+ * @param status containing error code: e.g. buffer not big enough
+ *  or ill-formed keywordName or keywordValue parameters, or ill-formed
+ *  locale ID in buffer on input.
   * @return the length needed for the buffer
   * @see uloc_getKeywordValue
   * @stable ICU 3.2
diff --git a/icu4c/source/test/cintltst/cloctst.c b/icu4c/source/test/cintltst/cloctst.c

index 9263dfc1025aba9568da33d58f5ef6e0213be8d2..2e7de580ad644efe30e9acaceadc3e72aff45dd1 100644 (file)
--- a/icu4c/source/test/cintltst/cloctst.c
+++ b/icu4c/source/test/cintltst/cloctst.c
@@ -1828,27 +1828,38 @@ static void TestKeywordVariantParsing(void)
      static const struct {
          const char *localeID;
          const char *keyword;
-        const char *expectedValue;
+        const char *expectedValue; /* NULL if failure is expected */
      } testCases[] = {
-        { "de_DE@  C o ll A t i o n   = Phonebook   ", "c o ll a t i o n", "Phonebook" },
+        { "de_DE@  C o ll A t i o n   = Phonebook   ", "c o ll a t i o n", NULL }, /* malformed key name */
          { "de_DE", "collation", ""},
          { "de_DE@collation=PHONEBOOK", "collation", "PHONEBOOK" },
          { "de_DE@currency = euro; CoLLaTion   = PHONEBOOk", "collatiON", "PHONEBOOk" },
      };
      
-    UErrorCode status = U_ZERO_ERROR;
-    
+    UErrorCode status;
      int32_t i = 0;
      int32_t resultLen = 0;
      char buffer[256];
      
      for(i = 0; i < UPRV_LENGTHOF(testCases); i++) {
          *buffer = 0;
+        status = U_ZERO_ERROR;
          resultLen = uloc_getKeywordValue(testCases[i].localeID, testCases[i].keyword, buffer, 256, &status);
          (void)resultLen;    /* Suppress set but not used warning. */
-        if(uprv_strcmp(testCases[i].expectedValue, buffer) != 0) {
-            log_err("Expected to extract \"%s\" from \"%s\" for keyword \"%s\". Got \"%s\" instead\n",
-                testCases[i].expectedValue, testCases[i].localeID, testCases[i].keyword, buffer);
+        if (testCases[i].expectedValue) {
+            /* expect success */
+            if (U_FAILURE(status)) {
+                log_err("Expected to extract \"%s\" from \"%s\" for keyword \"%s\". Instead got status %s\n",
+                    testCases[i].expectedValue, testCases[i].localeID, testCases[i].keyword, u_errorName(status));
+            } else if (uprv_strcmp(testCases[i].expectedValue, buffer) != 0) {
+                log_err("Expected to extract \"%s\" from \"%s\" for keyword \"%s\". Instead got \"%s\"\n",
+                    testCases[i].expectedValue, testCases[i].localeID, testCases[i].keyword, buffer);
+            }
+        } else if (U_SUCCESS(status)) {
+            /* expect failure */
+            log_err("Expected failure but got success from \"%s\" for keyword \"%s\". Got \"%s\"\n",
+                testCases[i].localeID, testCases[i].keyword, buffer);
+            
          }
      }
  }
@@ -1901,7 +1912,40 @@ static const struct {
    /* 4. removal of only item */
    { "de@collation=phonebook", "collation", NULL, "de" },
  #endif
-  { "de@collation=phonebook", "Currency", "CHF", "de@collation=phonebook;currency=CHF" }
+  { "de@collation=phonebook", "Currency", "CHF", "de@collation=phonebook;currency=CHF" },
+  /* cases with legal extra spacing */
+  /*31*/{ "en_US@ calendar = islamic", "calendar", "japanese", "en_US@calendar=japanese" },
+  /*32*/{ "en_US@ calendar = gregorian ; collation = phonebook", "calendar", "japanese", "en_US@calendar=japanese;collation=phonebook" },
+  /*33*/{ "en_US@ calendar = islamic", "currency", "CHF", "en_US@calendar=islamic;currency=CHF" },
+  /*34*/{ "en_US@ currency = CHF", "calendar", "japanese", "en_US@calendar=japanese;currency=CHF" },
+  /* cases in which setKeywordValue expected to fail (implied by NULL for expected); locale need not be canonical */
+  /*35*/{ "en_US@calendar=gregorian;", "calendar", "japanese", NULL },
+  /*36*/{ "en_US@calendar=gregorian;=", "calendar", "japanese", NULL },
+  /*37*/{ "en_US@calendar=gregorian;currency=", "calendar", "japanese", NULL },
+  /*38*/{ "en_US@=", "calendar", "japanese", NULL },
+  /*39*/{ "en_US@=;", "calendar", "japanese", NULL },
+  /*40*/{ "en_US@= ", "calendar", "japanese", NULL },
+  /*41*/{ "en_US@ =", "calendar", "japanese", NULL },
+  /*42*/{ "en_US@ = ", "calendar", "japanese", NULL },
+  /*43*/{ "en_US@=;calendar=gregorian", "calendar", "japanese", NULL },
+  /*44*/{ "en_US@= calen dar = gregorian", "calendar", "japanese", NULL },
+  /*45*/{ "en_US@= calendar = greg orian", "calendar", "japanese", NULL },
+  /*46*/{ "en_US@=;cal...endar=gregorian", "calendar", "japanese", NULL },
+  /*47*/{ "en_US@=;calendar=greg...orian", "calendar", "japanese", NULL },
+  /*48*/{ "en_US@calendar=gregorian", "cale ndar", "japanese", NULL },
+  /*49*/{ "en_US@calendar=gregorian", "calendar", "japa..nese", NULL },
+  /* cases in which getKeywordValue and setKeyword expected to fail (implied by NULL for value and expected) */
+  /*50*/{ "en_US@=", "calendar", NULL, NULL },
+  /*51*/{ "en_US@=;", "calendar", NULL, NULL },
+  /*52*/{ "en_US@= ", "calendar", NULL, NULL },
+  /*53*/{ "en_US@ =", "calendar", NULL, NULL },
+  /*54*/{ "en_US@ = ", "calendar", NULL, NULL },
+  /*55*/{ "en_US@=;calendar=gregorian", "calendar", NULL, NULL },
+  /*56*/{ "en_US@= calen dar = gregorian", "calendar", NULL, NULL },
+  /*57*/{ "en_US@= calendar = greg orian", "calendar", NULL, NULL },
+  /*58*/{ "en_US@=;cal...endar=gregorian", "calendar", NULL, NULL },
+  /*59*/{ "en_US@=;calendar=greg...orian", "calendar", NULL, NULL },
+  /*60*/{ "en_US@calendar=gregorian", "cale ndar", NULL, NULL },
  };
  
  
@@ -1914,31 +1958,59 @@ static void TestKeywordSet(void)
      char cbuffer[1024];
  
      for(i = 0; i < UPRV_LENGTHOF(kwSetTestCases); i++) {
-        UErrorCode status = U_ZERO_ERROR;
-        memset(buffer,'%',1023);
-        strcpy(buffer, kwSetTestCases[i].l);
+      UErrorCode status = U_ZERO_ERROR;
+      memset(buffer,'%',1023);
+      strcpy(buffer, kwSetTestCases[i].l);
  
+      if (kwSetTestCases[i].x != NULL) {
          uloc_canonicalize(kwSetTestCases[i].l, cbuffer, 1023, &status);
          if(strcmp(buffer,cbuffer)) {
            log_verbose("note: [%d] wasn't canonical, should be: '%s' not '%s'. Won't check for canonicity in output.\n", i, cbuffer, buffer);
          }
-          /* sanity check test case results for canonicity */
+        /* sanity check test case results for canonicity */
          uloc_canonicalize(kwSetTestCases[i].x, cbuffer, 1023, &status);
          if(strcmp(kwSetTestCases[i].x,cbuffer)) {
            log_err("%s:%d: ERROR: kwSetTestCases[%d].x = '%s', should be %s (must be canonical)\n", __FILE__, __LINE__, i, kwSetTestCases[i].x, cbuffer);
          }
  
+        status = U_ZERO_ERROR;
          resultLen = uloc_setKeywordValue(kwSetTestCases[i].k, kwSetTestCases[i].v, buffer, 1023, &status);
          if(U_FAILURE(status)) {
-          log_err("Err on test case %d: got error %s\n", i, u_errorName(status));
-          continue;
-        }
-        if(strcmp(buffer,kwSetTestCases[i].x) || ((int32_t)strlen(buffer)!=resultLen)) {
-          log_err("FAIL: #%d: %s + [%s=%s] -> %s (%d) expected %s (%d)\n", i, kwSetTestCases[i].l, kwSetTestCases[i].k,
+          log_err("Err on test case %d for setKeywordValue: got error %s\n", i, u_errorName(status));
+        } else if(strcmp(buffer,kwSetTestCases[i].x) || ((int32_t)strlen(buffer)!=resultLen)) {
+          log_err("FAIL: #%d setKeywordValue: %s + [%s=%s] -> %s (%d) expected %s (%d)\n", i, kwSetTestCases[i].l, kwSetTestCases[i].k,
                    kwSetTestCases[i].v, buffer, resultLen, kwSetTestCases[i].x, strlen(buffer));
          } else {
            log_verbose("pass: #%d: %s + [%s=%s] -> %s\n", i, kwSetTestCases[i].l, kwSetTestCases[i].k, kwSetTestCases[i].v,buffer);
          }
+
+        if (kwSetTestCases[i].v != NULL && kwSetTestCases[i].v[0] != 0) {
+          status = U_ZERO_ERROR;
+          resultLen = uloc_getKeywordValue(kwSetTestCases[i].x, kwSetTestCases[i].k, buffer, 1023, &status);
+          if(U_FAILURE(status)) {
+            log_err("Err on test case %d for getKeywordValue: got error %s\n", i, u_errorName(status));
+          } else if (resultLen != uprv_strlen(kwSetTestCases[i].v) || uprv_strcmp(buffer, kwSetTestCases[i].v) != 0) {
+            log_err("FAIL: #%d getKeywordValue: got %s (%d) expected %s (%d)\n", i, buffer, resultLen,
+                    kwSetTestCases[i].v, uprv_strlen(kwSetTestCases[i].v));
+          }
+        }
+      } else {
+        /* test cases expected to result in error */
+        status = U_ZERO_ERROR;
+        resultLen = uloc_setKeywordValue(kwSetTestCases[i].k, kwSetTestCases[i].v, buffer, 1023, &status);
+        if(U_SUCCESS(status)) {
+          log_err("Err on test case %d for setKeywordValue: expected to fail but succeeded, got %s (%d)\n", i, buffer, resultLen);
+        }
+
+        if (kwSetTestCases[i].v == NULL) {
+          status = U_ZERO_ERROR;
+          strcpy(cbuffer, kwSetTestCases[i].l);
+          resultLen = uloc_getKeywordValue(cbuffer, kwSetTestCases[i].k, buffer, 1023, &status);
+          if(U_SUCCESS(status)) {
+            log_err("Err on test case %d for getKeywordValue: expected to fail but succeeded\n", i);
+          }
+        }
+      }
      }
  }
author	Peter Edberg <pedberg@unicode.org>
	Tue, 21 Mar 2017 23:08:37 +0000 (23:08 +0000)
committer	Peter Edberg <pedberg@unicode.org>
	Tue, 21 Mar 2017 23:08:37 +0000 (23:08 +0000)
icu4c/source/common/uloc.cpp		patch \| blob \| history
icu4c/source/common/unicode/uloc.h		patch \| blob \| history
icu4c/source/test/cintltst/cloctst.c		patch \| blob \| history