From a2f3849b0d5c833b0905e64e87fb505da2bcfe97 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Fri, 17 Aug 2012 19:18:06 +0000 Subject: [PATCH] ICU-9437 update UCD tools for Unicode 6.2, especially new numeric values X-SVN-Rev: 32193 --- tools/unicode/c/genprops/corepropsbuilder.cpp | 57 ++++++++++++++----- tools/unicode/c/genprops/pnames_data.h | 37 ++++++------ 2 files changed, 63 insertions(+), 31 deletions(-) diff --git a/tools/unicode/c/genprops/corepropsbuilder.cpp b/tools/unicode/c/genprops/corepropsbuilder.cpp index 913a61e7f78..20d75c53ccf 100644 --- a/tools/unicode/c/genprops/corepropsbuilder.cpp +++ b/tools/unicode/c/genprops/corepropsbuilder.cpp @@ -45,7 +45,7 @@ the udata API for loading ICU data. Especially, a UDataInfo structure precedes the actual data. It contains platform properties values and the file format version. -The following is a description of format version 7 . +The following is a description of format version 7.1 . Data contents: @@ -155,6 +155,8 @@ Encoding of numeric type and value in the 10-bit ntv field: 0xb0..0x1df fraction ((ntv>>4)-12) / ((ntv&0xf)+1) = -1..17 / 1..16 0x1e0..0x2ff large int ((ntv>>5)-14) * 10^((ntv&0x1f)+2) = (1..9)*(10^2..10^33) (only one significant decimal digit) + 0x300..0x323 base-60 (sexagesimal) integer (new in format version 7.1) + ((ntv>>2)-0xbf) * 60^((ntv&3)+1) = (1..9)*(60^1..60^4) --- Additional properties (new in format version 2.1) --- @@ -234,6 +236,14 @@ than a script code. Change from UTrie to UTrie2. +--- Changes in format version 7.1 --- + +Unicode 6.2 adds sexagesimal (base-60) numeric values: + cp;12432;na=CUNEIFORM NUMERIC SIGN SHAR2 TIMES GAL PLUS DISH;nv=216000 + cp;12433;na=CUNEIFORM NUMERIC SIGN SHAR2 TIMES GAL PLUS MIN;nv=432000 + +The encoding of numeric values was extended to handle such values. + ----------------------------------------------------------------------------- */ U_NAMESPACE_USE @@ -249,8 +259,8 @@ static UDataInfo dataInfo={ 0, { 0x55, 0x50, 0x72, 0x6f }, /* dataFormat="UPro" */ - { 7, 0, 0, 0 }, /* formatVersion */ - { 6, 0, 0, 0 } /* dataVersion */ + { 7, 1, 0, 0 }, /* formatVersion */ + { 6, 2, 0, 0 } /* dataVersion */ }; class CorePropsBuilder : public PropsBuilder { @@ -301,6 +311,7 @@ CorePropsBuilder::setUnicodeVersion(const UVersionInfo version) { // For nt=U_NT_NUMERIC. static int32_t encodeNumericValue(UChar32 start, const char *s, UErrorCode &errorCode) { + const char *original; /* get a possible minus sign */ UBool isNegative; if(*s=='-') { @@ -322,15 +333,16 @@ encodeNumericValue(UChar32 start, const char *s, UErrorCode &errorCode) { } else { /* normal number parsing */ unsigned long ul=uprv_strtoul(s, &numberLimit, 10); - if(ul>0x7fffffff) { + if(s==numberLimit || (*numberLimit!=0 && *numberLimit!='/') || ul>0x7fffffff) { ntv=-1; } else { value=(int32_t)ul; } - if(s=0 && *numberLimit=='/') { /* fractional value, get the denominator */ - ul=uprv_strtoul(numberLimit+1, &numberLimit, 10); - if(ul==0 || ul>0x7fffffff) { + s=numberLimit+1; + ul=uprv_strtoul(s, &numberLimit, 10); + if(s==numberLimit || *numberLimit!=0 || ul==0 || ul>0x7fffffff) { ntv=-1; } else { den=(int32_t)ul; @@ -359,22 +371,41 @@ encodeNumericValue(UChar32 start, const char *s, UErrorCode &errorCode) { mant/=10; ++exp; } + // Note: value<=0x7fffffff guarantees exp<=33 if(mant<=9) { ntv=((mant+14)<<5)+(exp-2); + } else { + // Try sexagesimal (base 60) numbers. + mant=value; + exp=0; + while((mant%60)==0) { + mant/=60; + ++exp; + } + if(mant<=9 && exp<=4) { + ntv=((mant+0xbf)<<2)+(exp-1); + } else { + ntv=-1; + } } } } else if(2<=exp && exp<=33 && 1<=value && value<=9) { /* large, single-significant-digit integer */ ntv=((value+14)<<5)+(exp-2); + } else { + ntv=-1; } - } else if(exp==0) { - if(-1<=value && value<=17 && 1<=den && den<=16) { - /* fraction */ - ntv=((value+12)<<4)+(den-1); - } + } else if(exp==0 && -1<=value && value<=17 && 1<=den && den<=16) { + /* fraction */ + ntv=((value+12)<<4)+(den-1); + } else if(exp==0 && value==-1 && den==0) { + /* -1 parsed with den=0, encoded as pseudo-fraction -1/1 */ + ntv=((value+12)<<4); + } else { + ntv=-1; } if(ntv<0 || *numberLimit!=0) { - fprintf(stderr, "genprops error: unable to encode numeric value nv=%s\n", s); + fprintf(stderr, "genprops error: unable to encode numeric value nv=%s\n", original); errorCode=U_ILLEGAL_ARGUMENT_ERROR; } return ntv; diff --git a/tools/unicode/c/genprops/pnames_data.h b/tools/unicode/c/genprops/pnames_data.h index 0773a223292..24ba09f60c2 100644 --- a/tools/unicode/c/genprops/pnames_data.h +++ b/tools/unicode/c/genprops/pnames_data.h @@ -258,7 +258,7 @@ static const Value VALUES_blk[221] = { Value(UBLOCK_TAKRI, "Takri Takri"), }; -static const Value VALUES_ccc[56] = { +static const Value VALUES_ccc[57] = { Value(0, "NR Not_Reordered"), Value(1, "OV Overlay"), Value(7, "NK Nukta"), @@ -299,7 +299,8 @@ static const Value VALUES_ccc[56] = { Value(122, "CCC122 CCC122"), Value(129, "CCC129 CCC129"), Value(130, "CCC130 CCC130"), - Value(132, "CCC133 CCC133"), + Value(132, "CCC132 CCC132"), + Value(133, "CCC133 CCC133"), Value(200, "ATBL Attached_Below_Left"), Value(202, "ATB Attached_Below"), Value(214, "ATA Attached_Above"), @@ -490,7 +491,7 @@ static const Value VALUES_lb[40] = { Value(U_LB_CLOSE_PARENTHESIS, "CP Close_Parenthesis"), Value(U_LB_CONDITIONAL_JAPANESE_STARTER, "CJ Conditional_Japanese_Starter"), Value(U_LB_HEBREW_LETTER, "HL Hebrew_Letter"), - Value(U_LB_ZERO_WIDTH_JOINER, "ZJ Zero_Width_Joiner"), + Value(U_LB_REGIONAL_INDICATOR, "RI Regional_Indicator"), }; static const Value VALUES_nt[4] = { @@ -693,7 +694,7 @@ static const Value VALUES_NFKC_QC[3] = { Value(UNORM_MAYBE, "M Maybe"), }; -static const Value VALUES_lccc[56] = { +static const Value VALUES_lccc[57] = { Value(0, "NR Not_Reordered"), Value(1, "OV Overlay"), Value(7, "NK Nukta"), @@ -734,7 +735,8 @@ static const Value VALUES_lccc[56] = { Value(122, "CCC122 CCC122"), Value(129, "CCC129 CCC129"), Value(130, "CCC130 CCC130"), - Value(132, "CCC133 CCC133"), + Value(132, "CCC132 CCC132"), + Value(133, "CCC133 CCC133"), Value(200, "ATBL Attached_Below_Left"), Value(202, "ATB Attached_Below"), Value(214, "ATA Attached_Above"), @@ -752,7 +754,7 @@ static const Value VALUES_lccc[56] = { Value(240, "IS Iota_Subscript"), }; -static const Value VALUES_tccc[56] = { +static const Value VALUES_tccc[57] = { Value(0, "NR Not_Reordered"), Value(1, "OV Overlay"), Value(7, "NK Nukta"), @@ -793,7 +795,8 @@ static const Value VALUES_tccc[56] = { Value(122, "CCC122 CCC122"), Value(129, "CCC129 CCC129"), Value(130, "CCC130 CCC130"), - Value(132, "CCC133 CCC133"), + Value(132, "CCC132 CCC132"), + Value(133, "CCC133 CCC133"), Value(200, "ATBL Attached_Below_Left"), Value(202, "ATB Attached_Below"), Value(214, "ATA Attached_Above"), @@ -811,7 +814,7 @@ static const Value VALUES_tccc[56] = { Value(240, "IS Iota_Subscript"), }; -static const Value VALUES_GCB[14] = { +static const Value VALUES_GCB[13] = { Value(U_GCB_OTHER, "XX Other"), Value(U_GCB_CONTROL, "CN Control"), Value(U_GCB_CR, "CR CR"), @@ -824,8 +827,7 @@ static const Value VALUES_GCB[14] = { Value(U_GCB_V, "V V"), Value(U_GCB_SPACING_MARK, "SM SpacingMark"), Value(U_GCB_PREPEND, "PP Prepend"), - Value(U_GCB_AFTER_JOINER, "AJ After_Joiner"), - Value(U_GCB_JOINER, "J Joiner"), + Value(U_GCB_REGIONAL_INDICATOR, "RI Regional_Indicator"), }; static const Value VALUES_SB[15] = { @@ -846,7 +848,7 @@ static const Value VALUES_SB[15] = { Value(U_SB_SCONTINUE, "SC SContinue"), }; -static const Value VALUES_WB[15] = { +static const Value VALUES_WB[14] = { Value(U_WB_OTHER, "XX Other"), Value(U_WB_ALETTER, "LE ALetter"), Value(U_WB_FORMAT, "FO Format"), @@ -860,8 +862,7 @@ static const Value VALUES_WB[15] = { Value(U_WB_LF, "LF LF"), Value(U_WB_MIDNUMLET, "MB MidNumLet"), Value(U_WB_NEWLINE, "NL Newline"), - Value(U_WB_AFTER_JOINER, "AJ After_Joiner"), - Value(U_WB_JOINER, "J Joiner"), + Value(U_WB_REGIONAL_INDICATOR, "RI Regional_Indicator"), }; static const Value VALUES_gcm[38] = { @@ -965,7 +966,7 @@ static const Property PROPERTIES[94] = { Property(UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED, "CWKCF Changes_When_NFKC_Casefolded"), Property(UCHAR_BIDI_CLASS, "bc Bidi_Class", VALUES_bc, 19), Property(UCHAR_BLOCK, "blk Block", VALUES_blk, 221), - Property(UCHAR_CANONICAL_COMBINING_CLASS, "ccc Canonical_Combining_Class", VALUES_ccc, 56), + Property(UCHAR_CANONICAL_COMBINING_CLASS, "ccc Canonical_Combining_Class", VALUES_ccc, 57), Property(UCHAR_DECOMPOSITION_TYPE, "dt Decomposition_Type", VALUES_dt, 18), Property(UCHAR_EAST_ASIAN_WIDTH, "ea East_Asian_Width", VALUES_ea, 6), Property(UCHAR_GENERAL_CATEGORY, "gc General_Category", VALUES_gc, 30), @@ -979,11 +980,11 @@ static const Property PROPERTIES[94] = { Property(UCHAR_NFKD_QUICK_CHECK, "NFKD_QC NFKD_Quick_Check", VALUES_NFKD_QC, 2), Property(UCHAR_NFC_QUICK_CHECK, "NFC_QC NFC_Quick_Check", VALUES_NFC_QC, 3), Property(UCHAR_NFKC_QUICK_CHECK, "NFKC_QC NFKC_Quick_Check", VALUES_NFKC_QC, 3), - Property(UCHAR_LEAD_CANONICAL_COMBINING_CLASS, "lccc Lead_Canonical_Combining_Class", VALUES_lccc, 56), - Property(UCHAR_TRAIL_CANONICAL_COMBINING_CLASS, "tccc Trail_Canonical_Combining_Class", VALUES_tccc, 56), - Property(UCHAR_GRAPHEME_CLUSTER_BREAK, "GCB Grapheme_Cluster_Break", VALUES_GCB, 14), + Property(UCHAR_LEAD_CANONICAL_COMBINING_CLASS, "lccc Lead_Canonical_Combining_Class", VALUES_lccc, 57), + Property(UCHAR_TRAIL_CANONICAL_COMBINING_CLASS, "tccc Trail_Canonical_Combining_Class", VALUES_tccc, 57), + Property(UCHAR_GRAPHEME_CLUSTER_BREAK, "GCB Grapheme_Cluster_Break", VALUES_GCB, 13), Property(UCHAR_SENTENCE_BREAK, "SB Sentence_Break", VALUES_SB, 15), - Property(UCHAR_WORD_BREAK, "WB Word_Break", VALUES_WB, 15), + Property(UCHAR_WORD_BREAK, "WB Word_Break", VALUES_WB, 14), Property(UCHAR_GENERAL_CATEGORY_MASK, "gcm General_Category_Mask", VALUES_gcm, 38), Property(UCHAR_NUMERIC_VALUE, "nv Numeric_Value"), Property(UCHAR_AGE, "age Age"), -- 2.40.0