From e353b8e8977c32180073f5becdc0b3200defb5bd Mon Sep 17 00:00:00 2001
From: Markus Scherer
Date: Thu, 28 Aug 2014 23:51:38 +0000
Subject: [PATCH] ICU-10751 uscript_getCode(locale) use likely subtags not LocaleScript locale data

X-SVN-Rev: 36280
---
 icu4c/source/common/unicode/uscript.h |   9 +-
 icu4c/source/common/uscript.c         | 163 ++++++++++++++++----------
 icu4c/source/test/cintltst/cucdapi.c  |  76 ++++++++++++
 3 files changed, 183 insertions(+), 65 deletions(-)

diff --git a/icu4c/source/common/unicode/uscript.h b/icu4c/source/common/unicode/uscript.h
index 82b848663e3..b969f5e2eaa 100644
--- a/icu4c/source/common/unicode/uscript.h
+++ b/icu4c/source/common/unicode/uscript.h
@@ -423,15 +423,16 @@ typedef enum UScriptCode {
 } UScriptCode;
 
 /**
- * Gets script codes associated with the given locale or ISO 15924 abbreviation or name.
+ * Gets the script codes associated with the given locale or ISO 15924 abbreviation or name.
  * Fills in USCRIPT_MALAYALAM given "Malayam" OR "Mlym".
  * Fills in USCRIPT_LATIN given "en" OR "en_US"
- * If required capacity is greater than capacity of the destination buffer then the error code
- * is set to U_BUFFER_OVERFLOW_ERROR and the required capacity is returned
+ * If the required capacity is greater than the capacity of the destination buffer,
+ * then the error code is set to U_BUFFER_OVERFLOW_ERROR and the required capacity is returned.
  *
  *

Note: To search by short or long script alias only, use - * u_getPropertyValueEnum(UCHAR_SCRIPT, alias) instead. This does + * u_getPropertyValueEnum(UCHAR_SCRIPT, alias) instead. That does * a fast lookup with no access of the locale data. + * * @param nameOrAbbrOrLocale name of the script, as given in * PropertyValueAliases.txt, or ISO 15924 code or locale * @param fillIn the UScriptCode buffer to fill in the script code diff --git a/icu4c/source/common/uscript.c b/icu4c/source/common/uscript.c index 6193c80baff..2b1d190606e 100644 --- a/icu4c/source/common/uscript.c +++ b/icu4c/source/common/uscript.c @@ -1,6 +1,6 @@ /* ********************************************************************** -* Copyright (C) 1997-2011, International Business Machines +* Copyright (C) 1997-2014, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * @@ -13,85 +13,126 @@ ****************************************************************************** */ -#include "unicode/uscript.h" -#include "unicode/ures.h" #include "unicode/uchar.h" -#include "unicode/putil.h" -#include "uprops.h" +#include "unicode/uscript.h" +#include "unicode/uloc.h" #include "cmemory.h" #include "cstring.h" -static const char kLocaleScript[] = "LocaleScript"; +static const UScriptCode JAPANESE[3] = { USCRIPT_KATAKANA, USCRIPT_HIRAGANA, USCRIPT_HAN }; +static const UScriptCode KOREAN[2] = { USCRIPT_HANGUL, USCRIPT_HAN }; +static const UScriptCode HAN_BOPO[2] = { USCRIPT_HAN, USCRIPT_BOPOMOFO }; + +static int32_t +setCodes(const UScriptCode *src, int32_t length, + UScriptCode *dest, int32_t capacity, UErrorCode *err) { + int32_t i; + if(U_FAILURE(*err)) { return 0; } + if(length > capacity) { + *err = U_BUFFER_OVERFLOW_ERROR; + return length; + } + for(i = 0; i < length; ++i) { + dest[i] = src[i]; + } + return length; +} + +static int32_t +setOneCode(UScriptCode script, UScriptCode *scripts, int32_t capacity, 
UErrorCode *err) { + if(U_FAILURE(*err)) { return 0; } + if(1 > capacity) { + *err = U_BUFFER_OVERFLOW_ERROR; + return 1; + } + scripts[0] = script; + return 1; +} + +static int32_t +getCodesFromLocale(const char *locale, + UScriptCode *scripts, int32_t capacity, UErrorCode *err) { + UErrorCode internalErrorCode = U_ZERO_ERROR; + char lang[8]; + char script[8]; + int32_t langLength, scriptLength; + if(U_FAILURE(*err)) { return 0; } + // Multi-script languages, equivalent to the LocaleScript data + // that we used to load from locale resource bundles. + langLength = uloc_getLanguage(locale, lang, UPRV_LENGTHOF(lang), &internalErrorCode); + if(U_FAILURE(internalErrorCode) || internalErrorCode == U_STRING_NOT_TERMINATED_WARNING) { + return 0; + } + if(0 == uprv_strcmp(lang, "ja")) { + return setCodes(JAPANESE, UPRV_LENGTHOF(JAPANESE), scripts, capacity, err); + } + if(0 == uprv_strcmp(lang, "ko")) { + return setCodes(KOREAN, UPRV_LENGTHOF(KOREAN), scripts, capacity, err); + } + scriptLength = uloc_getScript(locale, script, UPRV_LENGTHOF(script), &internalErrorCode); + if(U_FAILURE(internalErrorCode) || internalErrorCode == U_STRING_NOT_TERMINATED_WARNING) { + return 0; + } + if(0 == uprv_strcmp(lang, "zh") && 0 == uprv_strcmp(script, "Hant")) { + return setCodes(HAN_BOPO, UPRV_LENGTHOF(HAN_BOPO), scripts, capacity, err); + } + // Explicit script code. 
+ if(scriptLength != 0) { + UScriptCode scriptCode = (UScriptCode)u_getPropertyValueEnum(UCHAR_SCRIPT, script); + if(scriptCode != USCRIPT_INVALID_CODE) { + if(scriptCode == USCRIPT_SIMPLIFIED_HAN || scriptCode == USCRIPT_TRADITIONAL_HAN) { + scriptCode = USCRIPT_HAN; + } + return setOneCode(scriptCode, scripts, capacity, err); + } + } + return 0; +} -/* TODO: this is a bad API should be deprecated */ +/* TODO: this is a bad API and should be deprecated, ticket #11141 */ U_CAPI int32_t U_EXPORT2 uscript_getCode(const char* nameOrAbbrOrLocale, UScriptCode* fillIn, int32_t capacity, UErrorCode* err){ - - UScriptCode code = USCRIPT_INVALID_CODE; - int32_t numFilled=0; - int32_t len=0; - /* check arguments */ - if(err==NULL ||U_FAILURE(*err)){ - return numFilled; + if(U_FAILURE(*err)) { + return 0; } - if(nameOrAbbrOrLocale==NULL || fillIn == NULL || capacity<0){ + if(nameOrAbbrOrLocale==NULL || + (fillIn == NULL ? capacity != 0 : capacity < 0)) { *err = U_ILLEGAL_ARGUMENT_ERROR; - return numFilled; + return 0; } + UBool triedCode = FALSE; if(uprv_strchr(nameOrAbbrOrLocale, '-')==NULL && uprv_strchr(nameOrAbbrOrLocale, '_')==NULL ){ /* try long and abbreviated script names first */ - code = (UScriptCode) u_getPropertyValueEnum(UCHAR_SCRIPT, nameOrAbbrOrLocale); - - } - if(code==(UScriptCode)UCHAR_INVALID_CODE){ - /* Do not propagate error codes from just not finding a locale bundle. 
*/ - UErrorCode localErrorCode = U_ZERO_ERROR; - UResourceBundle* resB = ures_open(NULL,nameOrAbbrOrLocale,&localErrorCode); - if(U_SUCCESS(localErrorCode)&& localErrorCode != U_USING_DEFAULT_WARNING){ - UResourceBundle* resD = ures_getByKey(resB,kLocaleScript,NULL,&localErrorCode); - if(U_SUCCESS(localErrorCode) ){ - len =0; - while(ures_hasNext(resD)){ - const UChar* name = ures_getNextString(resD,&len,NULL,&localErrorCode); - if(U_SUCCESS(localErrorCode)){ - char cName[50] = {'\0'}; - u_UCharsToChars(name,cName,len); - code = (UScriptCode) u_getPropertyValueEnum(UCHAR_SCRIPT, cName); - /* got the script code now fill in the buffer */ - if(numFilled 0) { + strcat(s, " "); + } + strcat(s, uscript_getShortName(scripts[i])); + } +} + +static void assertEqualScripts(const char *msg, + const UScriptCode scripts1[], int32_t length1, + const UScriptCode scripts2[], int32_t length2, + UErrorCode errorCode) { + char s1[80]; + char s2[80]; + if(U_FAILURE(errorCode)) { + log_err("Failed: %s - %s\n", msg, u_errorName(errorCode)); + return; + } + scriptsToString(scripts1, length1, s1); + scriptsToString(scripts2, length2, s2); + if(0!=strcmp(s1, s2)) { + log_err("Failed: %s: expected %s but got %s\n", msg, s1, s2); + } +} + void TestUScriptCodeAPI(){ int i =0; int numErrors =0; @@ -112,6 +144,50 @@ void TestUScriptCodeAPI(){ } } + { + static const UScriptCode LATIN[1] = { USCRIPT_LATIN }; + static const UScriptCode CYRILLIC[1] = { USCRIPT_CYRILLIC }; + static const UScriptCode DEVANAGARI[1] = { USCRIPT_DEVANAGARI }; + static const UScriptCode HAN[1] = { USCRIPT_HAN }; + static const UScriptCode JAPANESE[3] = { USCRIPT_KATAKANA, USCRIPT_HIRAGANA, USCRIPT_HAN }; + static const UScriptCode KOREAN[2] = { USCRIPT_HANGUL, USCRIPT_HAN }; + static const UScriptCode HAN_BOPO[2] = { USCRIPT_HAN, USCRIPT_BOPOMOFO }; + UScriptCode scripts[5]; + UErrorCode err; + int32_t num; + + // Should work regardless of whether we have locale data for the language. 
+ err = U_ZERO_ERROR; + num = uscript_getCode("tg", scripts, UPRV_LENGTHOF(scripts), &err); + assertEqualScripts("tg script: Cyrl", CYRILLIC, 1, scripts, num, err); // Tajik + err = U_ZERO_ERROR; + num = uscript_getCode("xsr", scripts, UPRV_LENGTHOF(scripts), &err); + assertEqualScripts("xsr script: Deva", DEVANAGARI, 1, scripts, num, err); // Sherpa + + // Multi-script languages. + err = U_ZERO_ERROR; + num = uscript_getCode("ja", scripts, UPRV_LENGTHOF(scripts), &err); + assertEqualScripts("ja scripts: Kana Hira Hani", + JAPANESE, UPRV_LENGTHOF(JAPANESE), scripts, num, err); + err = U_ZERO_ERROR; + num = uscript_getCode("ko", scripts, UPRV_LENGTHOF(scripts), &err); + assertEqualScripts("ko scripts: Hang Hani", + KOREAN, UPRV_LENGTHOF(KOREAN), scripts, num, err); + err = U_ZERO_ERROR; + num = uscript_getCode("zh", scripts, UPRV_LENGTHOF(scripts), &err); + assertEqualScripts("zh script: Hani", HAN, 1, scripts, num, err); + err = U_ZERO_ERROR; + num = uscript_getCode("zh-Hant", scripts, UPRV_LENGTHOF(scripts), &err); + assertEqualScripts("zh-Hant scripts: Hani Bopo", HAN_BOPO, 2, scripts, num, err); + err = U_ZERO_ERROR; + num = uscript_getCode("zh-TW", scripts, UPRV_LENGTHOF(scripts), &err); + assertEqualScripts("zh-TW scripts: Hani Bopo", HAN_BOPO, 2, scripts, num, err); + + // Ambiguous API, but this probably wants to return Latin rather than Rongorongo (Roro). + err = U_ZERO_ERROR; + num = uscript_getCode("ro-RO", scripts, UPRV_LENGTHOF(scripts), &err); + assertEqualScripts("ro-RO script: Latn", LATIN, 1, scripts, num, err); + } { UScriptCode testAbbr[]={ -- 2.40.0