From ebbc5423eff48423f8f0acd52f9d2df47693ff0f Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Sun, 17 Feb 2013 00:49:18 +0000 Subject: [PATCH] ICU-9538 add script metadata properties API X-SVN-Rev: 33255 --- icu4c/source/common/Makefile.in | 4 +- icu4c/source/common/common.vcxproj | 1 + icu4c/source/common/common.vcxproj.filters | 3 + icu4c/source/common/unicode/uscript.h | 109 ++++++++- icu4c/source/common/uscript_props.cpp | 267 +++++++++++++++++++++ icu4c/source/test/cintltst/cucdapi.c | 64 ++++- icu4c/source/test/cintltst/cucdapi.h | 3 +- icu4c/source/test/cintltst/cucdtst.c | 3 +- icu4c/source/test/intltest/ucdtest.cpp | 74 +++++- icu4c/source/test/intltest/ucdtest.h | 3 +- 10 files changed, 523 insertions(+), 8 deletions(-) create mode 100644 icu4c/source/common/uscript_props.cpp diff --git a/icu4c/source/common/Makefile.in b/icu4c/source/common/Makefile.in index f8efcf9d094..be6a2334e7a 100644 --- a/icu4c/source/common/Makefile.in +++ b/icu4c/source/common/Makefile.in @@ -1,6 +1,6 @@ #****************************************************************************** # -# Copyright (C) 1999-2012, International Business Machines +# Copyright (C) 1999-2013, International Business Machines # Corporation and others. All Rights Reserved. 
# #****************************************************************************** @@ -97,7 +97,7 @@ unistr_case_locale.o ustrcase_locale.o unistr_titlecase_brkiter.o ustr_titlecase normalizer2impl.o normalizer2.o filterednormalizer2.o normlzr.o unorm.o unormcmp.o unorm_it.o \ chariter.o schriter.o uchriter.o uiter.o \ patternprops.o uchar.o uprops.o ucase.o propname.o ubidi_props.o ubidi.o ubidiwrt.o ubidiln.o ushape.o \ -uscript.o usc_impl.o unames.o \ +uscript.o uscript_props.o usc_impl.o unames.o \ utrie.o utrie2.o utrie2_builder.o bmpset.o unisetspan.o uset_props.o uniset_props.o uniset_closure.o uset.o uniset.o usetiter.o ruleiter.o caniter.o unifilt.o unifunct.o \ uarrsort.o brkiter.o ubrk.o brkeng.o dictbe.o \ rbbi.o rbbidata.o rbbinode.o rbbirb.o rbbiscan.o rbbisetb.o rbbistbl.o rbbitblb.o \ diff --git a/icu4c/source/common/common.vcxproj b/icu4c/source/common/common.vcxproj index 6c8567e8848..05696e08651 100644 --- a/icu4c/source/common/common.vcxproj +++ b/icu4c/source/common/common.vcxproj @@ -389,6 +389,7 @@ + diff --git a/icu4c/source/common/common.vcxproj.filters b/icu4c/source/common/common.vcxproj.filters index 492b053300c..ff941aaa6ad 100644 --- a/icu4c/source/common/common.vcxproj.filters +++ b/icu4c/source/common/common.vcxproj.filters @@ -406,6 +406,9 @@ properties & sets + + properties & sets + properties & sets diff --git a/icu4c/source/common/unicode/uscript.h b/icu4c/source/common/unicode/uscript.h index b8e05c5c45a..f92ff0badb9 100644 --- a/icu4c/source/common/unicode/uscript.h +++ b/icu4c/source/common/unicode/uscript.h @@ -1,6 +1,6 @@ /* ********************************************************************** - * Copyright (C) 1997-2012, International Business Machines + * Copyright (C) 1997-2013, International Business Machines * Corporation and others. All Rights Reserved. 
********************************************************************** * @@ -512,4 +512,111 @@ uscript_getScriptExtensions(UChar32 c, UErrorCode *errorCode); #endif /* U_HIDE_DRAFT_API */ +#ifndef U_HIDE_DRAFT_API + +/** + * Script usage constants. + * See UAX #31 Unicode Identifier and Pattern Syntax. + * http://www.unicode.org/reports/tr31/#Table_Candidate_Characters_for_Exclusion_from_Identifiers + * + * @draft ICU 51 + */ +typedef enum UScriptUsage { + /** Not encoded in Unicode. @draft ICU 51 */ + USCRIPT_USAGE_NOT_ENCODED, + /** Unknown script usage. @draft ICU 51 */ + USCRIPT_USAGE_UNKNOWN, + /** Candidate for Exclusion from Identifiers. @draft ICU 51 */ + USCRIPT_USAGE_EXCLUDED, + /** Limited Use script. @draft ICU 51 */ + USCRIPT_USAGE_LIMITED_USE, + /** Aspirational Use script. @draft ICU 51 */ + USCRIPT_USAGE_ASPIRATIONAL, + /** Recommended script. @draft ICU 51 */ + USCRIPT_USAGE_RECOMMENDED +} UScriptUsage; + +/** + * Writes the script sample character string. + * This string normally consists of one code point but might be longer. + * The string is empty if the script is not encoded. + * + * @param script script code + * @param dest output string array + * @param capacity number of UChars in the dest array + * @param pErrorCode standard ICU in/out error code, must pass U_SUCCESS() on input + * @return the string length, even if U_BUFFER_OVERFLOW_ERROR + * @draft ICU 51 + */ +U_DRAFT int32_t U_EXPORT2 +uscript_getSampleString(UScriptCode script, UChar *dest, int32_t capacity, UErrorCode *pErrorCode); + +#if U_SHOW_CPLUSPLUS_API + +U_NAMESPACE_BEGIN +class UnicodeString; +U_NAMESPACE_END + +/** + * Returns the script sample character string. + * This string normally consists of one code point but might be longer. + * The string is empty if the script is not encoded. 
+ * + * @param script script code + * @return the sample character string + * @draft ICU 51 + */ +U_COMMON_API icu::UnicodeString U_EXPORT2 +uscript_getSampleUnicodeString(UScriptCode script); + +#endif + +/** + * Returns the script usage according to UAX #31 Unicode Identifier and Pattern Syntax. + * Returns USCRIPT_USAGE_NOT_ENCODED if the script is not encoded in Unicode. + * + * @param script script code + * @return script usage + * @see UScriptUsage + * @draft ICU 51 + */ +U_DRAFT UScriptUsage U_EXPORT2 +uscript_getUsage(UScriptCode script); + +/** + * Returns TRUE if the script is written right-to-left. + * For example, Arab and Hebr. + * + * @param script script code + * @return TRUE if the script is right-to-left + * @draft ICU 51 + */ +U_DRAFT UBool U_EXPORT2 +uscript_isRightToLeft(UScriptCode script); + +/** + * Returns TRUE if the script allows line breaks between letters (excluding hyphenation). + * Such a script typically requires dictionary-based line breaking. + * For example, Hani and Thai. + * + * @param script script code + * @return TRUE if the script allows line breaks between letters + * @draft ICU 51 + */ +U_DRAFT UBool U_EXPORT2 +uscript_breaksBetweenLetters(UScriptCode script); + +/** + * Returns TRUE if in modern (or most recent) usage of the script case distinctions are customary. + * For example, Latn and Cyrl. + * + * @param script script code + * @return TRUE if the script is cased + * @draft ICU 51 + */ +U_DRAFT UBool U_EXPORT2 +uscript_isCased(UScriptCode script); + +#endif /* U_HIDE_DRAFT_API */ + #endif diff --git a/icu4c/source/common/uscript_props.cpp b/icu4c/source/common/uscript_props.cpp new file mode 100644 index 00000000000..7cdde890f7f --- /dev/null +++ b/icu4c/source/common/uscript_props.cpp @@ -0,0 +1,267 @@ +/* +******************************************************************************* +* Copyright (C) 2013, International Business Machines +* Corporation and others. All Rights Reserved. 
+******************************************************************************* +* file name: uscript_props.cpp +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2013feb16 +* created by: Markus W. Scherer +*/ + +#include "unicode/utypes.h" +#include "unicode/unistr.h" +#include "unicode/uscript.h" +#include "unicode/utf16.h" +#include "ustr_imp.h" + +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) + +namespace { + +// Script metadata (script properties). +// See http://unicode.org/cldr/trac/browser/trunk/common/properties/scriptMetadata.txt + +// 0 = NOT_ENCODED, no sample character, default false script properties. +// Bits 20.. 0: sample character + +// Bits 23..21: usage +const int32_t UNKNOWN = 1 << 21; +const int32_t EXCLUSION = 2 << 21; +const int32_t LIMITED_USE = 3 << 21; +const int32_t ASPIRATIONAL = 4 << 21; +const int32_t RECOMMENDED = 5 << 21; + +// Bits 31..24: Single-bit flags +const int32_t RTL = 1 << 24; +const int32_t LB_LETTERS = 1 << 25; +const int32_t CASED = 1 << 26; + +const int32_t SCRIPT_PROPS[] = { + // Begin copy-paste output from + // icu/tools/trunk/unicode/py/parsescriptmetadata.py + 0x0040 | UNKNOWN, // Zyyy + 0x0308 | UNKNOWN, // Zinh + 0x0628 | RECOMMENDED | RTL, // Arab + 0x0531 | RECOMMENDED | CASED, // Armn + 0x0995 | RECOMMENDED, // Beng + 0x3105 | RECOMMENDED | LB_LETTERS, // Bopo + 0x13C4 | LIMITED_USE, // Cher + 0x03E2 | EXCLUSION | CASED, // Copt + 0x042F | RECOMMENDED | CASED, // Cyrl + 0x10414 | EXCLUSION | CASED, // Dsrt + 0x0905 | RECOMMENDED, // Deva + 0x12A0 | RECOMMENDED, // Ethi + 0x10D3 | RECOMMENDED, // Geor + 0x10330 | EXCLUSION, // Goth + 0x03A9 | RECOMMENDED | CASED, // Grek + 0x0A95 | RECOMMENDED, // Gujr + 0x0A15 | RECOMMENDED, // Guru + 0x5B57 | RECOMMENDED | LB_LETTERS, // Hani + 0xAC00 | RECOMMENDED, // Hang + 0x05D0 | RECOMMENDED | RTL, // Hebr + 0x304B | RECOMMENDED | LB_LETTERS, // Hira + 0x0C95 | RECOMMENDED, // Knda + 0x30AB | RECOMMENDED | 
LB_LETTERS, // Kana + 0x1780 | RECOMMENDED | LB_LETTERS, // Khmr + 0x0EA5 | RECOMMENDED | LB_LETTERS, // Laoo + 0x004C | RECOMMENDED | CASED, // Latn + 0x0D15 | RECOMMENDED, // Mlym + 0x1826 | ASPIRATIONAL, // Mong + 0x1000 | RECOMMENDED | LB_LETTERS, // Mymr + 0x168F | EXCLUSION, // Ogam + 0x10300 | EXCLUSION, // Ital + 0x0B15 | RECOMMENDED, // Orya + 0x16A0 | EXCLUSION, // Runr + 0x0D85 | RECOMMENDED, // Sinh + 0x0710 | LIMITED_USE | RTL, // Syrc + 0x0B95 | RECOMMENDED, // Taml + 0x0C15 | RECOMMENDED, // Telu + 0x078C | RECOMMENDED | RTL, // Thaa + 0x0E17 | RECOMMENDED | LB_LETTERS, // Thai + 0x0F40 | RECOMMENDED, // Tibt + 0x14C0 | ASPIRATIONAL, // Cans + 0xA288 | ASPIRATIONAL | LB_LETTERS, // Yiii + 0x1703 | EXCLUSION, // Tglg + 0x1723 | EXCLUSION, // Hano + 0x1743 | EXCLUSION, // Buhd + 0x1763 | EXCLUSION, // Tagb + 0x2800 | UNKNOWN, // Brai + 0x10800 | EXCLUSION | RTL, // Cprt + 0x1900 | LIMITED_USE, // Limb + 0x10000 | EXCLUSION, // Linb + 0x10480 | EXCLUSION, // Osma + 0x10450 | EXCLUSION, // Shaw + 0x1950 | LIMITED_USE | LB_LETTERS, // Tale + 0x10380 | EXCLUSION, // Ugar + 0, + 0x1A00 | EXCLUSION, // Bugi + 0x2C00 | EXCLUSION | CASED, // Glag + 0x10A00 | EXCLUSION | RTL, // Khar + 0xA800 | LIMITED_USE, // Sylo + 0x1980 | LIMITED_USE | LB_LETTERS, // Talu + 0x2D30 | ASPIRATIONAL, // Tfng + 0x103A0 | EXCLUSION, // Xpeo + 0x1B05 | LIMITED_USE | LB_LETTERS, // Bali + 0x1BC0 | LIMITED_USE, // Batk + 0, + 0x11005 | EXCLUSION, // Brah + 0xAA00 | LIMITED_USE, // Cham + 0, + 0, + 0, + 0, + 0x13153 | EXCLUSION, // Egyp + 0, + 0x5B57 | RECOMMENDED | LB_LETTERS, // Hans + 0x5B57 | RECOMMENDED | LB_LETTERS, // Hant + 0, + 0, + 0, + 0xA984 | LIMITED_USE | LB_LETTERS, // Java + 0xA90A | LIMITED_USE, // Kali + 0, + 0, + 0x1C00 | LIMITED_USE, // Lepc + 0, + 0x0840 | LIMITED_USE | RTL, // Mand + 0, + 0x10980 | EXCLUSION | RTL, // Mero + 0x07CA | LIMITED_USE | RTL, // Nkoo + 0x10C00 | EXCLUSION | RTL, // Orkh + 0, + 0xA840 | EXCLUSION, // Phag + 0x10900 | EXCLUSION | RTL, // 
Phnx + 0x16F00 | ASPIRATIONAL, // Plrd + 0, + 0, + 0, + 0, + 0, + 0, + 0xA549 | LIMITED_USE, // Vaii + 0, + 0x12000 | EXCLUSION, // Xsux + 0, + 0xFFFF | UNKNOWN, // Zzzz + 0x102A0 | EXCLUSION, // Cari + 0x304B | RECOMMENDED | LB_LETTERS, // Jpan + 0x1A20 | LIMITED_USE | LB_LETTERS, // Lana + 0x10280 | EXCLUSION, // Lyci + 0x10920 | EXCLUSION | RTL, // Lydi + 0x1C5A | LIMITED_USE, // Olck + 0xA930 | EXCLUSION, // Rjng + 0xA882 | LIMITED_USE, // Saur + 0, + 0x1B83 | LIMITED_USE, // Sund + 0, + 0xABC0 | LIMITED_USE, // Mtei + 0x10840 | EXCLUSION | RTL, // Armi + 0x10B00 | EXCLUSION | RTL, // Avst + 0x11103 | LIMITED_USE, // Cakm + 0xAC00 | RECOMMENDED, // Kore + 0x11083 | EXCLUSION, // Kthi + 0, + 0x10B60 | EXCLUSION | RTL, // Phli + 0, + 0, + 0x10B40 | EXCLUSION | RTL, // Prti + 0x0800 | EXCLUSION | RTL, // Samr + 0xAA80 | LIMITED_USE | LB_LETTERS, // Tavt + 0, + 0, + 0xA6A0 | LIMITED_USE, // Bamu + 0xA4D0 | LIMITED_USE, // Lisu + 0, + 0x10A60 | EXCLUSION | RTL, // Sarb + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0x109A0 | EXCLUSION | RTL, // Merc + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0x11183 | EXCLUSION, // Shrd + 0x110D0 | EXCLUSION, // Sora + 0x11680 | EXCLUSION, // Takr + 0, + 0, + 0, + 0, + 0, + // End copy-paste from parsescriptmetadata.py +}; + +int32_t getScriptProps(UScriptCode script) { + if (0 <= script && script < LENGTHOF(SCRIPT_PROPS)) { + return SCRIPT_PROPS[script]; + } else { + return 0; + } +} + +} // namespace + +U_CAPI int32_t U_EXPORT2 +uscript_getSampleString(UScriptCode script, UChar *dest, int32_t capacity, UErrorCode *pErrorCode) { + if(U_FAILURE(*pErrorCode)) { return 0; } + if(capacity < 0 || (capacity > 0 && dest == NULL)) { + *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + int32_t sampleChar = getScriptProps(script) & 0x1fffff; + int32_t length; + if(sampleChar == 0) { + length = 0; + } else { + length = U16_LENGTH(sampleChar); + if(length <= capacity) { + int32_t i = 0; + U16_APPEND_UNSAFE(dest, i, sampleChar); + } + } + 
return u_terminateUChars(dest, capacity, length, pErrorCode); +} + +U_COMMON_API icu::UnicodeString U_EXPORT2 +uscript_getSampleUnicodeString(UScriptCode script) { + icu::UnicodeString sample; + int32_t sampleChar = getScriptProps(script) & 0x1fffff; + if(sampleChar != 0) { + sample.append(sampleChar); + } + return sample; +} + +U_CAPI UScriptUsage U_EXPORT2 +uscript_getUsage(UScriptCode script) { + return (UScriptUsage)((getScriptProps(script) >> 21) & 7); +} + +U_CAPI UBool U_EXPORT2 +uscript_isRightToLeft(UScriptCode script) { + return (getScriptProps(script) & RTL) != 0; +} + +U_CAPI UBool U_EXPORT2 +uscript_breaksBetweenLetters(UScriptCode script) { + return (getScriptProps(script) & LB_LETTERS) != 0; +} + +U_CAPI UBool U_EXPORT2 +uscript_isCased(UScriptCode script) { + return (getScriptProps(script) & CASED) != 0; +} diff --git a/icu4c/source/test/cintltst/cucdapi.c b/icu4c/source/test/cintltst/cucdapi.c index ee8d203342d..91a12aa3cd5 100644 --- a/icu4c/source/test/cintltst/cucdapi.c +++ b/icu4c/source/test/cintltst/cucdapi.c @@ -1,5 +1,5 @@ /******************************************************************** - * Copyright (c) 1997-2012, International Business Machines + * Copyright (c) 1997-2013, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************/ @@ -527,6 +527,68 @@ void TestGetScriptExtensions() { } } +void TestScriptMetadataAPI() { + /* API & code coverage. More testing in intltest/ucdtest.cpp. 
*/
+    UErrorCode errorCode=U_ZERO_ERROR;
+    UChar sample[8];
+
+    if(uscript_getSampleString(USCRIPT_LATIN, sample, LENGTHOF(sample), &errorCode)!=1 ||
+            U_FAILURE(errorCode) ||
+            uscript_getScript(sample[0], &errorCode)!=USCRIPT_LATIN ||
+            sample[1]!=0) {
+        log_err("uscript_getSampleString(Latn) failed - %s\n", u_errorName(errorCode));
+    }
+    sample[0]=0xfffe;
+    if(uscript_getSampleString(USCRIPT_LATIN, sample, 0, &errorCode)!=1 ||
+            errorCode!=U_BUFFER_OVERFLOW_ERROR ||
+            sample[0]!=0xfffe) {
+        log_err("uscript_getSampleString(Latn, capacity=0) failed - %s\n", u_errorName(errorCode));
+    }
+    errorCode=U_ZERO_ERROR;
+    if(uscript_getSampleString(USCRIPT_INVALID_CODE, sample, LENGTHOF(sample), &errorCode)!=0 ||
+            U_FAILURE(errorCode) ||
+            sample[0]!=0) {
+        log_err("uscript_getSampleString(invalid) failed - %s\n", u_errorName(errorCode));
+    }
+    sample[0]=0xfffe;
+    if(uscript_getSampleString(USCRIPT_CODE_LIMIT, sample, 0, &errorCode)!=0 ||
+            errorCode!=U_STRING_NOT_TERMINATED_WARNING ||
+            sample[0]!=0xfffe) {
+        log_err("uscript_getSampleString(limit, capacity=0) failed - %s\n", u_errorName(errorCode));
+    }
+
+    if(uscript_getUsage(USCRIPT_LATIN)!=USCRIPT_USAGE_RECOMMENDED ||
+            uscript_getUsage(USCRIPT_YI)!=USCRIPT_USAGE_ASPIRATIONAL ||
+            uscript_getUsage(USCRIPT_CHEROKEE)!=USCRIPT_USAGE_LIMITED_USE ||
+            uscript_getUsage(USCRIPT_COPTIC)!=USCRIPT_USAGE_EXCLUDED ||
+            uscript_getUsage(USCRIPT_CIRTH)!=USCRIPT_USAGE_NOT_ENCODED ||
+            uscript_getUsage(USCRIPT_INVALID_CODE)!=USCRIPT_USAGE_NOT_ENCODED ||
+            uscript_getUsage(USCRIPT_CODE_LIMIT)!=USCRIPT_USAGE_NOT_ENCODED) {  /* INVALID_CODE and CODE_LIMIT, like the getSampleString checks above */
+        log_err("uscript_getUsage() failed\n");
+    }
+
+    if(uscript_isRightToLeft(USCRIPT_LATIN) ||
+            uscript_isRightToLeft(USCRIPT_CIRTH) ||
+            !uscript_isRightToLeft(USCRIPT_ARABIC) ||
+            !uscript_isRightToLeft(USCRIPT_HEBREW)) {
+        log_err("uscript_isRightToLeft() failed\n");
+    }
+
+    if(uscript_breaksBetweenLetters(USCRIPT_LATIN) ||
+            uscript_breaksBetweenLetters(USCRIPT_CIRTH) ||
+
!uscript_breaksBetweenLetters(USCRIPT_HAN) || + !uscript_breaksBetweenLetters(USCRIPT_THAI)) { + log_err("uscript_breaksBetweenLetters() failed\n"); + } + + if(uscript_isCased(USCRIPT_CIRTH) || + uscript_isCased(USCRIPT_HAN) || + !uscript_isCased(USCRIPT_LATIN) || + !uscript_isCased(USCRIPT_GREEK)) { + log_err("uscript_isCased() failed\n"); + } +} + void TestBinaryValues() { /* * Unicode 5.1 explicitly defines binary property value aliases. diff --git a/icu4c/source/test/cintltst/cucdapi.h b/icu4c/source/test/cintltst/cucdapi.h index 8ea981151fe..464c5b9e081 100644 --- a/icu4c/source/test/cintltst/cucdapi.h +++ b/icu4c/source/test/cintltst/cucdapi.h @@ -1,10 +1,11 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 2003-2010, International Business Machines Corporation and + * Copyright (c) 2003-2013, International Business Machines Corporation and * others. All Rights Reserved. ********************************************************************/ void TestUScriptCodeAPI(void); void TestHasScript(void); void TestGetScriptExtensions(void); +void TestScriptMetadataAPI(void); void TestBinaryValues(void); diff --git a/icu4c/source/test/cintltst/cucdtst.c b/icu4c/source/test/cintltst/cucdtst.c index 04b8e51eb33..b31338dd8fe 100644 --- a/icu4c/source/test/cintltst/cucdtst.c +++ b/icu4c/source/test/cintltst/cucdtst.c @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 1997-2012, International Business Machines Corporation and + * Copyright (c) 1997-2013, International Business Machines Corporation and * others. All Rights Reserved. 
********************************************************************/ /******************************************************************************* @@ -184,6 +184,7 @@ void addUnicodeTest(TestNode** root) addTest(root, &TestUScriptCodeAPI, "tsutil/cucdtst/TestUScriptCodeAPI"); addTest(root, &TestHasScript, "tsutil/cucdtst/TestHasScript"); addTest(root, &TestGetScriptExtensions, "tsutil/cucdtst/TestGetScriptExtensions"); + addTest(root, &TestScriptMetadataAPI, "tsutil/cucdtst/TestScriptMetadataAPI"); addTest(root, &TestUScriptRunAPI, "tsutil/cucdtst/TestUScriptRunAPI"); addTest(root, &TestPropertyNames, "tsutil/cucdtst/TestPropertyNames"); addTest(root, &TestPropertyValues, "tsutil/cucdtst/TestPropertyValues"); diff --git a/icu4c/source/test/intltest/ucdtest.cpp b/icu4c/source/test/intltest/ucdtest.cpp index c1b5f960c2d..4ae48b88d14 100644 --- a/icu4c/source/test/intltest/ucdtest.cpp +++ b/icu4c/source/test/intltest/ucdtest.cpp @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 1997-2011, International Business Machines Corporation and + * Copyright (c) 1997-2013, International Business Machines Corporation and * others. All Rights Reserved. ********************************************************************/ @@ -8,6 +8,7 @@ #include "unicode/uchar.h" #include "unicode/uniset.h" #include "unicode/putil.h" +#include "unicode/uscript.h" #include "cstring.h" #include "hash.h" #include "patternprops.h" @@ -59,6 +60,7 @@ void UnicodeTest::runIndexedTest( int32_t index, UBool exec, const char* &name, TESTCASE_AUTO(TestBinaryValues); TESTCASE_AUTO(TestConsistency); TESTCASE_AUTO(TestPatternProperties); + TESTCASE_AUTO(TestScriptMetadata); TESTCASE_AUTO_END; } @@ -426,3 +428,73 @@ UnicodeTest::compareUSets(const UnicodeSet &a, const UnicodeSet &b, } return same; } + +namespace { + +/** + * Maps a special script code to the most common script of its encoded characters. 
+ */ +UScriptCode getCharScript(UScriptCode script) { + switch(script) { + case USCRIPT_SIMPLIFIED_HAN: + case USCRIPT_TRADITIONAL_HAN: + return USCRIPT_HAN; + case USCRIPT_JAPANESE: + return USCRIPT_HIRAGANA; + case USCRIPT_KOREAN: + return USCRIPT_HANGUL; + default: + return script; + } +} + +} // namespace + +void UnicodeTest::TestScriptMetadata() { + IcuTestErrorCode errorCode(*this, "TestScriptMetadata()"); + UnicodeSet rtl("[[:bc=R:][:bc=AL:]-[:Cn:]-[:sc=Common:]]", errorCode); + // So far, sample characters are uppercase. + // Georgian is special. + UnicodeSet cased("[[:Lu:]-[:sc=Common:]-[:sc=Geor:]]", errorCode); + for(int32_t sci = 0; sci < USCRIPT_CODE_LIMIT; ++sci) { + UScriptCode sc = (UScriptCode)sci; + // Run the test with -v to see which script has failures: + // .../intltest$ make && ./intltest utility/UnicodeTest/TestScriptMetadata -v | grep -C 3 FAIL + logln(uscript_getShortName(sc)); + UScriptUsage usage = uscript_getUsage(sc); + UnicodeString sample = uscript_getSampleUnicodeString(sc); + UnicodeSet scriptSet; + scriptSet.applyIntPropertyValue(UCHAR_SCRIPT, sc, errorCode); + if(usage == USCRIPT_USAGE_NOT_ENCODED) { + assertTrue("not encoded, no sample", sample.isEmpty()); + assertFalse("not encoded, not RTL", uscript_isRightToLeft(sc)); + assertFalse("not encoded, not LB letters", uscript_breaksBetweenLetters(sc)); + assertFalse("not encoded, not cased", uscript_isCased(sc)); + assertTrue("not encoded, no characters", scriptSet.isEmpty()); + } else { + assertFalse("encoded, has a sample character", sample.isEmpty()); + UChar32 firstChar = sample.char32At(0); + UScriptCode charScript = getCharScript(sc); + assertEquals("script(sample(script))", + charScript, uscript_getScript(firstChar, errorCode)); + assertEquals("RTL vs. set", rtl.contains(firstChar), uscript_isRightToLeft(sc)); + assertEquals("cased vs. 
set", cased.contains(firstChar), uscript_isCased(sc)); + assertEquals("encoded, has characters", sc == charScript, !scriptSet.isEmpty()); + if(uscript_isRightToLeft(sc)) { + rtl.removeAll(scriptSet); + } + if(uscript_isCased(sc)) { + cased.removeAll(scriptSet); + } + } + } + UnicodeString pattern; + assertEquals("no remaining RTL characters", + UnicodeString("[]"), rtl.toPattern(pattern)); + assertEquals("no remaining cased characters", + UnicodeString("[]"), cased.toPattern(pattern)); + + assertTrue("Hani breaks between letters", uscript_breaksBetweenLetters(USCRIPT_HAN)); + assertTrue("Thai breaks between letters", uscript_breaksBetweenLetters(USCRIPT_THAI)); + assertFalse("Latn does not break between letters", uscript_breaksBetweenLetters(USCRIPT_LATIN)); +} diff --git a/icu4c/source/test/intltest/ucdtest.h b/icu4c/source/test/intltest/ucdtest.h index 38c7eca223d..74fdfc1c3f5 100644 --- a/icu4c/source/test/intltest/ucdtest.h +++ b/icu4c/source/test/intltest/ucdtest.h @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 1997-2011, International Business Machines Corporation and + * Copyright (c) 1997-2013, International Business Machines Corporation and * others. All Rights Reserved. ********************************************************************/ @@ -37,6 +37,7 @@ public: void TestBinaryValues(); void TestConsistency(); void TestPatternProperties(); + void TestScriptMetadata(); private: -- 2.40.0