/*
*******************************************************************************
-* Copyright (C) 2012-2014, International Business Machines
+* Copyright (C) 2012-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collationbasedatabuilder.cpp
: CollationDataBuilder(errorCode),
numericPrimary(0x12000000),
firstHanPrimary(0), lastHanPrimary(0), hanStep(2),
- rootElements(errorCode) {
+ rootElements(errorCode),
+ scriptStartsLength(1) {
+ uprv_memset(scriptsIndex, 0, sizeof(scriptsIndex));
+ uprv_memset(scriptStarts, 0, sizeof(scriptStarts));
}
CollationBaseDataBuilder::~CollationBaseDataBuilder() {
}
void
-CollationBaseDataBuilder::addReorderingGroup(uint32_t firstByte, uint32_t lastByte,
- const UnicodeString &groupScripts,
- UErrorCode &errorCode) {
- if(U_FAILURE(errorCode)) { return; }
- if(groupScripts.isEmpty()) {
- errorCode = U_ILLEGAL_ARGUMENT_ERROR;
- return;
+CollationBaseDataBuilder::addScriptStart(int32_t script, uint32_t p) {
+ // The primary weight must be the lowest possible for a two-byte prefix.
+ // It could be 2, 3, or 4 bytes long. We round down to the two-byte boundary.
+ U_ASSERT((p & 0xff) == 0 || (p & 0xff) == 2);
+ p >>= 8;
+ U_ASSERT((p & 0xff) == 0 || (p & 0xff) == 2);
+ p >>= 8;
+ uint32_t lowestP2 = compressibleBytes[p >> 8] ? 4 : 2;
+ if((p & 0xff) == lowestP2) {
+ // The script really starts on a lead byte boundary. Round down to that.
+ p &= 0xff00;
}
- if(groupScripts.indexOf((UChar)USCRIPT_UNKNOWN) >= 0) {
- // Zzzz must not occur.
- // It is the code used in the API to separate low and high scripts.
- errorCode = U_ILLEGAL_ARGUMENT_ERROR;
- return;
+ // Script starts should be added in ascending order, otherwise we would need to sort them.
+ if(script < UCOL_REORDER_CODE_FIRST) {
+ U_ASSERT(0 <= script && script < USCRIPT_CODE_LIMIT);
+ } else {
+ U_ASSERT(script <= (UCOL_REORDER_CODE_FIRST + 15));
+ script = USCRIPT_CODE_LIMIT + script - UCOL_REORDER_CODE_FIRST;
+ }
+ if(scriptStartsLength != 0 && scriptStarts[scriptStartsLength - 1] == p) {
+ // Two scripts share a range (e.g., Hira & Kana).
+ scriptsIndex[script] = (uint16_t)(scriptStartsLength - 1);
+ } else {
+ U_ASSERT(scriptStartsLength == 0 || scriptStarts[scriptStartsLength - 1] <= p);
+ U_ASSERT(scriptStartsLength < UPRV_LENGTHOF(scriptStarts));
+ scriptsIndex[script] = (uint16_t)scriptStartsLength;
+ scriptStarts[scriptStartsLength++] = (uint16_t)p;
+ }
+ if(script == USCRIPT_UNKNOWN) {
+ // The last script start is for unassigned code points
+ // (with high implict primary weights).
+ // Add one more entry with the limit of this range,
+ // which is the start of the trailing-weights range.
+ U_ASSERT(scriptStartsLength < UPRV_LENGTHOF(scriptStarts));
+ scriptStarts[scriptStartsLength++] =
+ (uint16_t)((Collation::FIRST_TRAILING_PRIMARY >> 16) & 0xff00);
}
- // Note: We are mostly trusting the input data,
- // rather than verifying that reordering groups do not intersect
- // with their lead byte ranges nor their sets of scripts,
- // and that all script codes are valid.
- scripts.append((UChar)((firstByte << 8) | lastByte));
- scripts.append((UChar)groupScripts.length());
- scripts.append(groupScripts);
}
void
buildMappings(data, errorCode);
data.numericPrimary = numericPrimary;
data.compressibleBytes = compressibleBytes;
- data.scripts = reinterpret_cast<const uint16_t *>(scripts.getBuffer());
- data.scriptsLength = scripts.length();
+
+ int32_t numScripts = USCRIPT_CODE_LIMIT;
+ while(numScripts > 0 && scriptsIndex[numScripts - 1] == 0) { --numScripts; }
+ // Move the 16 special groups (not all used)
+ // down for contiguous storage of the script and special-group indexes.
+ for(int32_t i = 0; i < 16; ++i) {
+ scriptsIndex[numScripts + i] = scriptsIndex[USCRIPT_CODE_LIMIT + i];
+ }
+ data.numScripts = numScripts;
+ data.scriptsIndex = scriptsIndex;
+ data.scriptStarts = scriptStarts;
+ data.scriptStartsLength = scriptStartsLength;
buildFastLatinTable(data, errorCode);
}
/*
*******************************************************************************
*
-* Copyright (C) 2000-2014, International Business Machines
+* Copyright (C) 2000-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
#include "unicode/utypes.h"
#include "unicode/errorcode.h"
#include "unicode/localpointer.h"
+#include "unicode/ucol.h"
+#include "unicode/uscript.h"
#include "unicode/utf8.h"
#include "charstr.h"
#include "cmemory.h"
#include "uparse.h"
#include "writesrc.h"
-#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
-
#if UCONFIG_NO_COLLATION
extern "C" int
0,
{ 0x55, 0x43, 0x6f, 0x6c }, // dataFormat="UCol"
- { 4, 1, 0, 0 }, // formatVersion
+ { 5, 0, 0, 0 }, // formatVersion
{ 6, 3, 0, 0 } // dataVersion
};
}
}
+// Hardcoded mapping from script sample characters to script codes.
+// Pro: Available without complete and updated UCD scripts data,
+// easy to add non-script codes specific to collation.
+// Con: Needs manual update for each new script or change in sample character.
static const struct {
- const char *name;
- int32_t code;
-} specialReorderTokens[] = {
- { "TERMINATOR", -2 }, // -2 means "ignore"
- { "LEVEL-SEPARATOR", -2 },
- { "FIELD-SEPARATOR", -2 },
- { "COMPRESS", -3 },
- // The standard name is "PUNCT" but FractionalUCA.txt uses the long form.
- { "PUNCTUATION", UCOL_REORDER_CODE_PUNCTUATION },
- { "IMPLICIT", USCRIPT_HAN }, // Implicit weights are usually for Han characters. Han & unassigned share a lead byte.
- { "TRAILING", -2 }, // We do not reorder trailing weights (those after implicits).
- { "SPECIAL", -2 } // We must never reorder internal, special CE lead bytes.
+ UChar32 sampleChar;
+ int32_t script;
+} sampleCharsToScripts[] = {
+ { 0x00A0, UCOL_REORDER_CODE_SPACE },
+ { 0x201C, UCOL_REORDER_CODE_PUNCTUATION },
+ { 0x263A, UCOL_REORDER_CODE_SYMBOL },
+ { 0x20AC, UCOL_REORDER_CODE_CURRENCY },
+ { 0x0034, UCOL_REORDER_CODE_DIGIT },
+ { 0x004C, USCRIPT_LATIN },
+ { 0x03A9, USCRIPT_GREEK },
+ { 0x03E2, USCRIPT_COPTIC },
+ { 0x042F, USCRIPT_CYRILLIC },
+ { 0x2C00, USCRIPT_GLAGOLITIC },
+ { 0x1036B, USCRIPT_OLD_PERMIC },
+ { 0x10D3, USCRIPT_GEORGIAN },
+ { 0x0531, USCRIPT_ARMENIAN },
+ { 0x05D0, USCRIPT_HEBREW },
+ { 0x10900, USCRIPT_PHOENICIAN },
+ { 0x0800, USCRIPT_SAMARITAN },
+ { 0x0628, USCRIPT_ARABIC },
+ { 0x0710, USCRIPT_SYRIAC },
+ { 0x0840, USCRIPT_MANDAIC },
+ { 0x078C, USCRIPT_THAANA },
+ { 0x07CA, USCRIPT_NKO },
+ { 0x2D5E, USCRIPT_TIFINAGH },
+ { 0x12A0, USCRIPT_ETHIOPIC },
+ { 0x0905, USCRIPT_DEVANAGARI },
+ { 0x0995, USCRIPT_BENGALI },
+ { 0x0A15, USCRIPT_GURMUKHI },
+ { 0x0A95, USCRIPT_GUJARATI },
+ { 0x0B15, USCRIPT_ORIYA },
+ { 0x0B95, USCRIPT_TAMIL },
+ { 0x0C15, USCRIPT_TELUGU },
+ { 0x0C95, USCRIPT_KANNADA },
+ { 0x0D15, USCRIPT_MALAYALAM },
+ { 0x0D85, USCRIPT_SINHALA },
+ { 0xABC0, USCRIPT_MEITEI_MAYEK },
+ { 0xA800, USCRIPT_SYLOTI_NAGRI },
+ { 0xA882, USCRIPT_SAURASHTRA },
+ { 0x11083, USCRIPT_KAITHI },
+ { 0x11152, USCRIPT_MAHAJANI },
+ { 0x11183, USCRIPT_SHARADA },
+ { 0x11208, USCRIPT_KHOJKI },
+ { 0x112BE, USCRIPT_KHUDAWADI },
+ { 0x11315, USCRIPT_GRANTHA },
+ { 0x11484, USCRIPT_TIRHUTA },
+ { 0x1158E, USCRIPT_SIDDHAM },
+ { 0x1160E, USCRIPT_MODI },
+ { 0x11680, USCRIPT_TAKRI },
+ { 0x1B83, USCRIPT_SUNDANESE },
+ { 0x11005, USCRIPT_BRAHMI },
+ { 0x10A00, USCRIPT_KHAROSHTHI },
+ { 0x0E17, USCRIPT_THAI },
+ { 0x0EA5, USCRIPT_LAO },
+ { 0xAA80, USCRIPT_TAI_VIET },
+ { 0x0F40, USCRIPT_TIBETAN },
+ { 0x1C00, USCRIPT_LEPCHA },
+ { 0xA840, USCRIPT_PHAGS_PA },
+ { 0x1900, USCRIPT_LIMBU },
+ { 0x1703, USCRIPT_TAGALOG },
+ { 0x1723, USCRIPT_HANUNOO },
+ { 0x1743, USCRIPT_BUHID },
+ { 0x1763, USCRIPT_TAGBANWA },
+ { 0x1A00, USCRIPT_BUGINESE },
+ { 0x1BC0, USCRIPT_BATAK },
+ { 0xA930, USCRIPT_REJANG },
+ { 0xA90A, USCRIPT_KAYAH_LI },
+ { 0x1000, USCRIPT_MYANMAR },
+ { 0x11103, USCRIPT_CHAKMA },
+ { 0x1780, USCRIPT_KHMER },
+ { 0x1950, USCRIPT_TAI_LE },
+ { 0x1980, USCRIPT_NEW_TAI_LUE },
+ { 0x1A20, USCRIPT_LANNA },
+ { 0xAA00, USCRIPT_CHAM },
+ { 0x1B05, USCRIPT_BALINESE },
+ { 0xA984, USCRIPT_JAVANESE },
+ { 0x1826, USCRIPT_MONGOLIAN },
+ { 0x1C5A, USCRIPT_OL_CHIKI },
+ { 0x13C4, USCRIPT_CHEROKEE },
+ { 0x14C0, USCRIPT_CANADIAN_ABORIGINAL },
+ { 0x168F, USCRIPT_OGHAM },
+ { 0x16A0, USCRIPT_RUNIC },
+ { 0x10C00, USCRIPT_ORKHON },
+ { 0xA549, USCRIPT_VAI },
+ { 0xA6A0, USCRIPT_BAMUM },
+ { 0x16AE6, USCRIPT_BASSA_VAH },
+ { 0x1E802, USCRIPT_MENDE },
+ { 0xAC00, USCRIPT_HANGUL },
+ { 0x304B, USCRIPT_HIRAGANA },
+ { 0x30AB, USCRIPT_KATAKANA },
+ { 0x3105, USCRIPT_BOPOMOFO },
+ { 0xA288, USCRIPT_YI },
+ { 0xA4D0, USCRIPT_LISU },
+ { 0x16F00, USCRIPT_MIAO },
+ { 0x118B4, USCRIPT_WARANG_CITI },
+ { 0x11AC0, USCRIPT_PAU_CIN_HAU },
+ { 0x16B1C, USCRIPT_PAHAWH_HMONG },
+ { 0x10280, USCRIPT_LYCIAN },
+ { 0x102A0, USCRIPT_CARIAN },
+ { 0x10920, USCRIPT_LYDIAN },
+ { 0x10300, USCRIPT_OLD_ITALIC },
+ { 0x10330, USCRIPT_GOTHIC },
+ { 0x10414, USCRIPT_DESERET },
+ { 0x10450, USCRIPT_SHAVIAN },
+ { 0x1BC20, USCRIPT_DUPLOYAN },
+ { 0x10480, USCRIPT_OSMANYA },
+ { 0x10500, USCRIPT_ELBASAN },
+ { 0x10537, USCRIPT_CAUCASIAN_ALBANIAN },
+ { 0x110D0, USCRIPT_SORA_SOMPENG },
+ { 0x16A4F, USCRIPT_MRO },
+ { 0x10000, USCRIPT_LINEAR_B },
+ { 0x10647, USCRIPT_LINEAR_A },
+ { 0x10800, USCRIPT_CYPRIOT },
+ { 0x10A60, USCRIPT_OLD_SOUTH_ARABIAN },
+ { 0x10A95, USCRIPT_OLD_NORTH_ARABIAN },
+ { 0x10B00, USCRIPT_AVESTAN },
+ { 0x10873, USCRIPT_PALMYRENE },
+ { 0x10896, USCRIPT_NABATAEAN },
+ { 0x10840, USCRIPT_IMPERIAL_ARAMAIC },
+ { 0x10B40, USCRIPT_INSCRIPTIONAL_PARTHIAN },
+ { 0x10B60, USCRIPT_INSCRIPTIONAL_PAHLAVI },
+ { 0x10B8F, USCRIPT_PSALTER_PAHLAVI },
+ { 0x10AD8, USCRIPT_MANICHAEAN },
+ { 0x10380, USCRIPT_UGARITIC },
+ { 0x103A0, USCRIPT_OLD_PERSIAN },
+ { 0x12000, USCRIPT_CUNEIFORM },
+ { 0x13153, USCRIPT_EGYPTIAN_HIEROGLYPHS },
+ { 0x109A0, USCRIPT_MEROITIC_CURSIVE },
+ { 0x10980, USCRIPT_MEROITIC_HIEROGLYPHS },
+ { 0x5B57, USCRIPT_HAN },
+ { 0xFDD0, USCRIPT_UNKNOWN } // unassigned-implicit primary weights
};
-int32_t getReorderCode(const char* name) {
- int32_t code = CollationRuleParser::getReorderCode(name);
- if (code >= 0) {
- return code;
- }
- for (int32_t i = 0; i < LENGTHOF(specialReorderTokens); ++i) {
- if (0 == strcmp(name, specialReorderTokens[i].name)) {
- return specialReorderTokens[i].code;
+static int32_t getCharScript(UChar32 c) {
+ for(int32_t i = 0; i < UPRV_LENGTHOF(sampleCharsToScripts); ++i) {
+ if(c == sampleCharsToScripts[i].sampleChar) {
+ return sampleCharsToScripts[i].script;
}
}
- return -1; // Same as UCHAR_INVALID_CODE or USCRIPT_INVALID_CODE.
+ return USCRIPT_INVALID_CODE; // -1
}
/**
};
static int64_t getOptionValue(const char *name) {
- for (int32_t i = 0; i < LENGTHOF(vt); ++i) {
+ for (int32_t i = 0; i < UPRV_LENGTHOF(vt); ++i) {
if(uprv_strcmp(name, vt[i].name) == 0) {
return vt[i].value;
}
return 0;
}
-static UnicodeString *leadByteScripts = NULL;
-
static void readAnOption(
CollationBaseDataBuilder &builder, char *buffer, UErrorCode *status) {
- for (int32_t cnt = 0; cnt<LENGTHOF(vt); cnt++) {
+ for (int32_t cnt = 0; cnt<UPRV_LENGTHOF(vt); cnt++) {
int32_t vtLen = (int32_t)uprv_strlen(vt[cnt].name);
if(uprv_strncmp(buffer, vt[cnt].name, vtLen) == 0) {
ActionType what_to_do = vt[cnt].what_to_do;
fprintf(stderr, "warning: UCA version %s != UCD version %s\n", uca, ucd);
}
} else if (what_to_do == READLEADBYTETOSCRIPTS) {
- uint16_t leadByte = (hex2num(*pointer++) * 16);
- leadByte += hex2num(*pointer++);
-
- if(0xe0 <= leadByte && leadByte < Collation::UNASSIGNED_IMPLICIT_BYTE) {
- // Extend the Hani range to the end of what this implementation uses.
- // FractionalUCA.txt assumes a different algorithm for implicit primary weights,
- // and different high-lead byte ranges.
- leadByteScripts[leadByte] = leadByteScripts[0xdf];
- return;
- }
-
- UnicodeString scripts;
- for(;;) {
- pointer = skipWhiteSpace(pointer);
- if (*pointer == ']') {
- break;
- }
- const char *scriptName = pointer;
- char c;
- while((c = *pointer) != 0 && c != ' ' && c != '\t' && c != ']') { ++pointer; }
- if(c == 0) {
- fprintf(stderr, "Syntax error: unterminated list of scripts: '%s'\n", buffer);
- *status = U_INVALID_FORMAT_ERROR;
- return;
- }
- *pointer = 0;
- int32_t reorderCode = getReorderCode(scriptName);
- *pointer = c;
- if (reorderCode == -3) { // COMPRESS
- builder.setCompressibleLeadByte(leadByte);
- continue;
- }
- if (reorderCode == -2) {
- continue; // Ignore "TERMINATOR" etc.
- }
- if (reorderCode < 0 || 0xffff < reorderCode) {
- fprintf(stderr, "Syntax error: unable to parse reorder code from '%s'\n", scriptName);
- *status = U_INVALID_FORMAT_ERROR;
- return;
- }
- scripts.append((UChar)reorderCode);
- }
- if(!scripts.isEmpty()) {
- if(leadByteScripts == NULL) {
- leadByteScripts = new UnicodeString[256];
- }
- leadByteScripts[leadByte] = scripts;
+ if (strstr(pointer, "COMPRESS") != NULL) {
+ uint16_t leadByte = (hex2num(*pointer++) * 16);
+ leadByte += hex2num(*pointer++);
+ builder.setCompressibleLeadByte(leadByte);
}
+ // We do not need the list of scripts on this line.
}
return;
}
// are only entered into the inverse table,
// not into the normal collation data.
builder.addRootElements(ces, cesLength, *status);
- if(s.length() == 2 && s[1] == 0x34 && cesLength == 1) {
- // Lead byte for numeric sorting.
- builder.setNumericPrimary(p);
+ if(s.length() == 2 && cesLength == 1) {
+ switch(s[1]) {
+ case 0x34:
+ // Lead byte for numeric sorting.
+ builder.setNumericPrimary(p);
+ break;
+ case 0xFF21:
+ builder.addScriptStart(CollationData::REORDER_RESERVED_BEFORE_LATIN, p);
+ break;
+ case 0xFF3A:
+ builder.addScriptStart(CollationData::REORDER_RESERVED_AFTER_LATIN, p);
+ break;
+ default:
+ break;
+ }
}
} else {
UChar32 c = s.char32At(0);
// CollationBaseDataBuilder::init() maps them to special CEs.
// Except for U+FFFE, these have higher primaries in v2 than in FractionalUCA.txt.
if(0xfffd <= c && c <= 0xffff) { continue; }
- if(s.length() == 2 && s[0] == 0xFDD1 && s[1] == 0xFDD0) {
- continue;
+ if(s.length() >= 2 && c == 0xFDD1) {
+ UChar32 c2 = s.char32At(1);
+ int32_t script = getCharScript(c2);
+ if(script < 0) {
+ fprintf(stderr,
+ "Error: Unknown script for first-primary sample character "
+ "U+%04x on line %u of %s\n"
+ " (add the character to genuca.cpp sampleCharsToScripts[])\n",
+ c2, (int)line, filename);
+ exit(U_INVALID_FORMAT_ERROR);
+ }
+ if(script == USCRIPT_UNKNOWN) {
+ // FDD1 FDD0, first unassigned-implicit primary
+ builder.addScriptStart(script, Collation::FIRST_UNASSIGNED_PRIMARY);
+ continue;
+ }
+ builder.addScriptStart(script, p);
+ if(script == USCRIPT_HIRAGANA) {
+ builder.addScriptStart(USCRIPT_KATAKANA_OR_HIRAGANA, p);
+ } else if(script == USCRIPT_HAN) {
+ builder.addScriptStart(USCRIPT_SIMPLIFIED_HAN, p);
+ builder.addScriptStart(USCRIPT_TRADITIONAL_HAN, p);
+ }
}
if(0xe0000000 <= p && p < 0xf0000000) {
return;
}
- if(leadByteScripts != NULL) {
- uint32_t firstLead = Collation::MERGE_SEPARATOR_BYTE + 1;
- do {
- // Find the range of lead bytes with this set of scripts.
- const UnicodeString &firstScripts = leadByteScripts[firstLead];
- if(firstScripts.isEmpty()) {
- fprintf(stderr, "[top_byte 0x%02X] has no reorderable scripts\n", (int)firstLead);
- errorCode = U_INVALID_FORMAT_ERROR;
- return;
- }
- uint32_t lead = firstLead;
- for(;;) {
- ++lead;
- const UnicodeString &scripts = leadByteScripts[lead];
- // The scripts should either be the same or disjoint.
- // We do not test if all reordering groups have disjoint sets of scripts.
- if(scripts.isEmpty() || firstScripts.indexOf(scripts[0]) < 0) { break; }
- if(scripts != firstScripts) {
- fprintf(stderr,
- "[top_byte 0x%02X] includes script %d from [top_byte 0x%02X] "
- "but not all scripts match\n",
- (int)firstLead, scripts[0], (int)lead);
- errorCode = U_INVALID_FORMAT_ERROR;
- return;
- }
- }
- // lead is one greater than the last lead byte with the same set of scripts as firstLead.
- builder.addReorderingGroup(firstLead, lead - 1, firstScripts, errorCode);
- if(U_FAILURE(errorCode)) { return; }
- firstLead = lead;
- } while(firstLead < Collation::UNASSIGNED_IMPLICIT_BYTE);
- delete[] leadByteScripts;
- }
-
CollationData data(*Normalizer2Factory::getNFCImpl(errorCode));
builder.enableFastLatin();
builder.build(data, errorCode);
main(int argc, char* argv[]) {
U_MAIN_INIT_ARGS(argc, argv);
- argc=u_parseArgs(argc, argv, LENGTHOF(options), options);
+ argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
/* error handling, printing usage message */
if(argc<0) {