value;WB;NU;Numeric
value;WB;XX;Other
-defaults;0000..10FFFF;age=NA;bc=L;blk=NB;bmg=<code point>;cf=<code point>;dm=<code point>;dt=None;ea=N;FC_NFKC=<code point>;gc=Cn;GCB=XX;hst=NA;InMC=NA;InSC=Other;jg=No_Joining_Group;jt=U;lb=XX;lc=<slc>;NFC_QC=Y;NFD_QC=Y;NFKC_CF=<code point>;NFKC_QC=Y;NFKD_QC=Y;nt=None;SB=XX;sc=Zzzz;scf=<code point>;scx=Script_Extensions;slc=<code point>;stc=<code point>;suc=<code point>;tc=<stc>;uc=<suc>;WB=XX
+defaults;0000..10FFFF;age=NA;bc=L;blk=NB;bmg=<code point>;cf=<code point>;dm=<code point>;dt=None;ea=N;FC_NFKC=<code point>;gc=Cn;GCB=XX;hst=NA;InMC=NA;InSC=Other;jg=No_Joining_Group;jt=U;lb=XX;lc=<slc>;NFC_QC=Y;NFD_QC=Y;NFKC_CF=<code point>;NFKC_QC=Y;NFKD_QC=Y;nt=None;SB=XX;sc=Zzzz;scf=<code point>;scx=<script>;slc=<code point>;stc=<code point>;suc=<code point>;tc=<stc>;uc=<suc>;WB=XX
block;0000..007F;age=1.1;blk=ASCII;ea=Na;gc=Cc;Gr_Base;lb=AL;sc=Zyyy
# 0000..007F C0 Controls and Basic Latin (Basic Latin)
cp;31E1;na=CJK STROKE HZZZG
cp;31E2;na=CJK STROKE PG
cp;31E3;na=CJK STROKE Q
-cp;31E4..31EF;age=NA;bc=L;ea=N;gc=Cn;-Gr_Base;lb=XX;sc=Zzzz;scx=Script_Extensions
+cp;31E4..31EF;age=NA;bc=L;ea=N;gc=Cn;-Gr_Base;lb=XX;sc=Zzzz;scx=<script>
block;31F0..31FF;age=3.2;Alpha;blk=Katakana_Ext;ea=W;gc=Lo;Gr_Base;IDC;IDS;lb=CJ;SB=LE;sc=Kana;WB=KA;XIDC;XIDS
# 31F0..31FF Katakana Phonetic Extensions
cp;A838;bc=ET;gc=Sc;lb=PO;na=NORTH INDIC RUPEE MARK
# Miscellaneous sign
cp;A839;bc=ET;gc=So;na=NORTH INDIC QUANTITY MARK
-cp;A83A..A83F;age=NA;-Gr_Base;lb=XX;sc=Zzzz;scx=Script_Extensions
+cp;A83A..A83F;age=NA;-Gr_Base;lb=XX;sc=Zzzz;scx=<script>
block;A840..A87F;age=5.0;Alpha;blk=Phags_Pa;gc=Lo;Gr_Base;IDC;IDS;InSC=Consonant;lb=AL;SB=LE;sc=Phag;WB=LE;XIDC;XIDS
# A840..A87F Phags-pa
cp;10100;gc=Po;lb=BA;na=AEGEAN WORD SEPARATOR LINE;nt=None
cp;10101;bc=ON;gc=Po;lb=BA;na=AEGEAN WORD SEPARATOR DOT;nt=None
cp;10102;gc=Po;lb=BA;na=AEGEAN CHECK MARK;nt=None
-cp;10103..10106;age=NA;gc=Cn;-Gr_Base;lb=XX;nt=None;sc=Zzzz;scx=Script_Extensions
+cp;10103..10106;age=NA;gc=Cn;-Gr_Base;lb=XX;nt=None;sc=Zzzz;scx=<script>
# Numbers
cp;10107;na=AEGEAN NUMBER ONE;nv=1
cp;10108;na=AEGEAN NUMBER TWO;nv=2
cp;10131;na=AEGEAN NUMBER SEVENTY THOUSAND;nv=70000
cp;10132;na=AEGEAN NUMBER EIGHTY THOUSAND;nv=80000
cp;10133;na=AEGEAN NUMBER NINETY THOUSAND;nv=90000
-cp;10134..10136;age=NA;gc=Cn;-Gr_Base;lb=XX;nt=None;sc=Zzzz;scx=Script_Extensions
+cp;10134..10136;age=NA;gc=Cn;-Gr_Base;lb=XX;nt=None;sc=Zzzz;scx=<script>
# Measures
cp;10137;gc=So;na=AEGEAN WEIGHT BASE UNIT;nt=None
cp;10138;gc=So;na=AEGEAN WEIGHT FIRST SUBUNIT;nt=None
} else {
props.intProps[prop-UCHAR_INT_START]=value;
}
- } else if(*v=='<' && lineType==DEFAULTS_LINE) {
- // Ignore default values like <code point>.
- return TRUE;
+ } else if(*v=='<') {
+ // Do not parse default values like <code point>, just set null values.
+ switch(prop) {
+ case UCHAR_BIDI_MIRRORING_GLYPH:
+ props.bmg=U_SENTINEL;
+ break;
+ case UCHAR_CASE_FOLDING:
+ props.cf.remove();
+ break;
+ case UCHAR_SCRIPT_EXTENSIONS:
+ props.scx.clear();
+ break;
+ case UCHAR_LOWERCASE_MAPPING:
+ case UCHAR_SIMPLE_CASE_FOLDING:
+ case UCHAR_SIMPLE_LOWERCASE_MAPPING:
+ case UCHAR_SIMPLE_TITLECASE_MAPPING:
+ case UCHAR_SIMPLE_UPPERCASE_MAPPING:
+ case UCHAR_TITLECASE_MAPPING:
+ case UCHAR_UPPERCASE_MAPPING:
+ // Ignore unhandled properties.
+ break;
+ default:
+ fprintf(stderr,
+ "error in preparsed UCD: '%s' is not a valid default value on line %ld\n",
+ field, (long)lineNumber);
+ errorCode=U_PARSE_ERROR;
+ }
} else {
char c;
switch(prop) {
case PPUCD_NAME_ALIAS:
props.nameAlias=v;
break;
+ case UCHAR_SCRIPT_EXTENSIONS:
+ parseScriptExtensions(v, props.scx, errorCode);
+ break;
default:
// Ignore unhandled properties.
return TRUE;
}
}
+/**
+ * Parses the value of an scx= field into a set of script codes.
+ *
+ * The input is read as one or more script names/aliases separated by
+ * single space characters. Each one is looked up as a UCHAR_SCRIPT value
+ * and added to scx, which is cleared first.
+ *
+ * @param s         NUL-terminated field value (the text after "scx=").
+ * @param scx       Output set; receives one entry per script code.
+ * @param errorCode In/out ICU error code. Nothing is done if it already
+ *                  indicates a failure. Set to U_PARSE_ERROR (with a
+ *                  message on stderr) for an unknown script, a script
+ *                  listed twice, or a resulting empty set.
+ */
+void
+PreparsedUCD::parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode) {
+    if(U_FAILURE(errorCode)) { return; }
+    scx.clear();
+    // Scratch buffer for NUL-terminating every script code except the last.
+    CharString codeBuffer;
+    const char *next=s;
+    do {
+        const char *space=strchr(next, ' ');
+        const char *code;
+        if(space==NULL) {
+            // Last (or only) code: already terminated by the end of the string.
+            code=next;
+        } else {
+            code=codeBuffer.clear().append(next, (int32_t)(space-next), errorCode).data();
+            if(U_FAILURE(errorCode)) { return; }
+        }
+        int32_t script=pnames->getPropertyValueEnum(UCHAR_SCRIPT, code);
+        if(script==UCHAR_INVALID_CODE) {
+            fprintf(stderr,
+                    "error in preparsed UCD: '%s' is not a valid script code on line %ld\n",
+                    code, (long)lineNumber);
+            errorCode=U_PARSE_ERROR;
+            return;
+        }
+        if(scx.contains(script)) {
+            fprintf(stderr,
+                    "error in preparsed UCD: scx has duplicate '%s' codes on line %ld\n",
+                    code, (long)lineNumber);
+            errorCode=U_PARSE_ERROR;
+            return;
+        }
+        scx.add(script);
+        // Step past the separator, or stop after the final code.
+        next= space==NULL ? NULL : space+1;
+    } while(next!=NULL);
+    if(scx.isEmpty()) {
+        fprintf(stderr, "error in preparsed UCD: empty scx= on line %ld\n", (long)lineNumber);
+        errorCode=U_PARSE_ERROR;
+    }
+}
+
U_NAMESPACE_END
const char *name;
const char *nameAlias;
UnicodeString cf;
+ UnicodeSet scx;
};
class U_TOOLUTIL_API PreparsedUCD {
UChar32 parseCodePoint(const char *s, UErrorCode &errorCode);
UBool parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode);
void parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode);
+ void parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode);
static const int32_t kNumLineBuffers=3;