From: Markus Scherer Date: Mon, 24 Feb 2014 22:17:04 +0000 (+0000) Subject: ICU-9101 copy icu/branches/markus/collv2/source/tools/genuca2/genuca2.cpp to tools... X-Git-Tag: milestone-59-0-1~2149 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=f6de8d8a1cf59eb8837ac09472d797cb443b97ef;p=icu ICU-9101 copy icu/branches/markus/collv2/source/tools/genuca2/genuca2.cpp to tools/trunk/unicode/c/genuca/genuca.cpp X-SVN-Rev: 35220 --- diff --git a/tools/unicode/c/genuca/genuca.8.in b/tools/unicode/c/genuca/genuca.8.in index e8ab27d0175..80fad66f261 100644 --- a/tools/unicode/c/genuca/genuca.8.in +++ b/tools/unicode/c/genuca/genuca.8.in @@ -2,90 +2,54 @@ .\" .\" genuca.8: manual page for the genuca utility .\" -.\" Copyright (C) 2000-2001 IBM, Inc. and others. +.\" Copyright (C) 2000-2014 IBM, Inc. and others. .\" -.TH GENUCA 8 "22 February 2001" "ICU MANPAGE" "ICU @VERSION@ Manual" +.TH GENUCA 8 "2014-Feb-24" "ICU MANPAGE" "ICU @VERSION@ Manual" .SH NAME .B genuca -\- create the UCA data table +\- create the root collation data file for ICU .SH SYNOPSIS .B genuca [ .BR "\-h\fP, \fB\-?\fP, \fB\-\-help" ] [ -.BR "\-V\fP, \fB\-\-version" -] -[ .BR "\-v\fP, \fB\-\-verbose" ] [ .BI "\-c\fP, \fB\-\-copyright" ] [ -.BI "\-s\fP, \fB\-\-sourcedir" " source" -] -[ -.BI "\-d\fP, \fB\-\-destdir" " destination" -] -[ -.IR file +.IR path/to/ICU/src/root ] .SH DESCRIPTION .B genuca -compiles the Unicode Collation Algorithm (UCA) data from -.I file -(or from -.B FractionalUCA.txt -if -.I file -is omitted) into its binary form, the files +reads path/to/ICU/src/root/source/data/unidata/FractionalUCA.txt and +writes source and binary data files with the collation root data. + +The binary file .B ucadata.dat -and -.BR invuca.dat . -These binary files can then be read directly by ICU, or used by +can then be read directly by ICU, or used by +.BR icupkg (8) +or .BR pkgdata (8) for incorporation into a larger archive or library. 
+ +See http://www.unicode.org/reports/tr35/tr35-collation.html#Root_Data_Files .SH OPTIONS .TP .BR "\-h\fP, \fB\-?\fP, \fB\-\-help" Print help about usage and exit. .TP -.BR "\-V\fP, \fB\-\-version" -Print the version of -.B genuca -and exit. -.TP .BR "\-v\fP, \fB\-\-verbose" Display extra informative messages during execution. .TP .BI "\-c\fP, \fB\-\-copyright" Include a copyright notice into the binary data. -.TP -.BI "\-s\fP, \fB\-\-sourcedir" " source" -Set the source directory to -.IR source . -The default source directory is specified by the environment variable -.BR ICU_DATA . -.TP -.BI "\-d\fP, \fB\-\-destdir" " destination" -Set the destination directory to -.IR destination . -The default destination directory is specified by the environment variable -.BR ICU_DATA . -.SH ENVIRONMENT -.TP 10 -.B ICU_DATA -Specifies the directory containing ICU data. Defaults to -.BR @thepkgicudatadir@/@PACKAGE@/@VERSION@/ . -Some tools in ICU depend on the presence of the trailing slash. It is thus -important to make sure that it is present if -.B ICU_DATA -is set. .SH FILES .TP 15 .B FractionalUCA.txt -Machine-readable file containing data for the Unicode collation algorithm. +Machine-readable file containing data for the CLDR root collation order. .SH VERSION @VERSION@ .SH COPYRIGHT diff --git a/tools/unicode/c/genuca/genuca.cpp b/tools/unicode/c/genuca/genuca.cpp index c1f210c0861..b13193678df 100644 --- a/tools/unicode/c/genuca/genuca.cpp +++ b/tools/unicode/c/genuca/genuca.cpp @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 2000-2013, International Business Machines +* Copyright (C) 2000-2014, International Business Machines * Corporation and others. All Rights Reserved. 
* ******************************************************************************* @@ -11,502 +11,192 @@ * indentation:4 * * created at the end of XX century -* created by: Vladimir Weinstein +* created by: Vladimir Weinstein, +* modified in 2013-2014 by Markus Scherer * -* This program reads the Franctional UCA table and generates +* This program reads the Fractional UCA table and generates * internal format for UCA table as well as inverse UCA table. -* It then writes binary files containing the data: ucadata.dat -* & invuca.dat -* Change history: -* 02/23/2001 grhoten Made it into a tool -* 02/23/2001 weiv Moved element & table handling code to i18n -* 05/09/2001 weiv Case bits are now in the CEs, not in front -* 10/26/2010 sgill Support for reordering codes +* It then writes the ucadata.icu binary file containing the data. */ #define U_NO_DEFAULT_INCLUDE_UTF_HEADERS 1 +#include #include "unicode/utypes.h" -#include "unicode/putil.h" -#include "unicode/udata.h" -#include "unicode/uclean.h" -#include "unicode/uscript.h" -#include "unicode/ustring.h" -#include "unicode/utf16.h" +#include "unicode/errorcode.h" +#include "unicode/localpointer.h" #include "charstr.h" -#include "ucol_bld.h" -#include "ucol_imp.h" -#include "genuca.h" -#include "uoptions.h" -#include "uparse.h" +#include "cmemory.h" +#include "collation.h" +#include "collationbasedatabuilder.h" +#include "collationdata.h" +#include "collationdatabuilder.h" +#include "collationdatareader.h" +#include "collationdatawriter.h" +#include "collationinfo.h" +#include "collationrootelements.h" +#include "collationruleparser.h" +#include "collationtailoring.h" +#include "cstring.h" +#include "normalizer2impl.h" #include "toolutil.h" #include "unewdata.h" -#include "cstring.h" -#include "cmemory.h" - -#include +#include "uoptions.h" +#include "uparse.h" +#include "writesrc.h" #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) -/** The maximum UTF-16 length (number of UChars) in a UCA contraction. 
*/ -static const int32_t MAX_UCA_CONTRACTION_LENGTH=4; - -// script reordering structures -typedef struct { - uint16_t reorderCode; - uint16_t offset; -} ReorderIndex; - -typedef struct { - uint16_t LEAD_BYTE_TO_SCRIPTS_INDEX_LENGTH; - uint16_t* LEAD_BYTE_TO_SCRIPTS_INDEX; - uint16_t LEAD_BYTE_TO_SCRIPTS_DATA_LENGTH; - uint16_t* LEAD_BYTE_TO_SCRIPTS_DATA; - uint16_t LEAD_BYTE_TO_SCRIPTS_DATA_OFFSET; - - uint16_t SCRIPT_TO_LEAD_BYTES_INDEX_LENGTH; - ReorderIndex* SCRIPT_TO_LEAD_BYTES_INDEX; - uint16_t SCRIPT_TO_LEAD_BYTES_INDEX_COUNT; - uint16_t SCRIPT_TO_LEAD_BYTES_DATA_LENGTH; - uint16_t* SCRIPT_TO_LEAD_BYTES_DATA; - uint16_t SCRIPT_TO_LEAD_BYTES_DATA_OFFSET; -} LeadByteConstants; - -int ReorderIndexComparer(const void *a, const void *b) { - return reinterpret_cast(a)->reorderCode - reinterpret_cast(b)->reorderCode; -} - -/* - * Global - verbosity - */ -UBool beVerbose = FALSE; - -static UVersionInfo UCAVersion; - #if UCONFIG_NO_COLLATION -/* dummy UDataInfo cf. udata.h */ -static UDataInfo dummyDataInfo = { - sizeof(UDataInfo), - 0, - - U_IS_BIG_ENDIAN, - U_CHARSET_FAMILY, - U_SIZEOF_UCHAR, - 0, - - { 0, 0, 0, 0 }, /* dummy dataFormat */ - { 0, 0, 0, 0 }, /* dummy formatVersion */ - { 0, 0, 0, 0 } /* dummy dataVersion */ -}; +extern "C" int +main(int argc, char* argv[]) { + (void)argc; + (void)argv; + return 1; +} #else -static const UDataInfo ucaDataInfo={ - sizeof(UDataInfo), - 0, +U_NAMESPACE_USE - U_IS_BIG_ENDIAN, - U_CHARSET_FAMILY, - sizeof(UChar), - 0, +static UBool beVerbose=FALSE, withCopyright=TRUE; - {UCA_DATA_FORMAT_0, UCA_DATA_FORMAT_1, UCA_DATA_FORMAT_2, UCA_DATA_FORMAT_3}, /* dataFormat="UCol" */ - /* 03/26/2002 bumped up version since format has changed */ - /* 09/16/2002 bumped up version since we went from UColAttributeValue */ - /* to int32_t in UColOptionSet */ - /* 05/13/2003 This one also updated since we added UCA and UCD versions */ - /* to header */ - /* 09/11/2003 Adding information required by data swapper */ - {UCA_FORMAT_VERSION_0, 
UCA_FORMAT_VERSION_1, UCA_FORMAT_VERSION_2, UCA_FORMAT_VERSION_3}, /* formatVersion */ - {0, 0, 0, 0} /* dataVersion = Unicode Version*/ -}; +static UVersionInfo UCAVersion={ 0, 0, 0, 0 }; -static const UDataInfo invUcaDataInfo={ +static UDataInfo ucaDataInfo={ sizeof(UDataInfo), 0, U_IS_BIG_ENDIAN, U_CHARSET_FAMILY, - sizeof(UChar), + U_SIZEOF_UCHAR, 0, - {INVUCA_DATA_FORMAT_0, INVUCA_DATA_FORMAT_1, INVUCA_DATA_FORMAT_2, INVUCA_DATA_FORMAT_3}, /* dataFormat="InvC" */ - /* 03/26/2002 bumped up version since format has changed */ - /* 04/29/2003 2.1 format - we have added UCA version to header */ - {INVUCA_FORMAT_VERSION_0, INVUCA_FORMAT_VERSION_1, INVUCA_FORMAT_VERSION_2, INVUCA_FORMAT_VERSION_3}, /* formatVersion */ - {0, 0, 0, 0} /* dataVersion = Unicode Version*/ + { 0x55, 0x43, 0x6f, 0x6c }, // dataFormat="UCol" + { 4, 0, 0, 0 }, // formatVersion + { 6, 3, 0, 0 } // dataVersion }; -UCAElements le; +static char *skipWhiteSpace(char *s) { + while(*s == ' ' || *s == '\t') { ++s; } + return s; +} -// returns number of characters read -int32_t readElement(char **from, char *to, char separator, UErrorCode *status) { - if(U_FAILURE(*status)) { - return 0; +static int32_t hex2num(char hex) { + if(hex>='0' && hex <='9') { + return hex-'0'; + } else if(hex>='a' && hex<='f') { + return hex-'a'+10; + } else if(hex>='A' && hex<='F') { + return hex-'A'+10; + } else { + return -1; } - char buffer[1024]; - int32_t i = 0; +} + +static uint32_t parseWeight(char *&s, const char *separators, + int32_t maxBytes, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { return 0; } + uint32_t weight = 0; + int32_t numBytes = 0; for(;;) { - char c = **from; - if(c == separator || (separator == ' ' && c == '\t')) { + // Check one character after another, so that we don't just run over a 00. + int32_t nibble1, nibble2; + if((nibble1 = hex2num(s[0])) < 0 || (nibble2 = hex2num(s[1])) < 0) { + // Stop when we find something other than a pair of hex digits. 
break; } - if (c == '\0') { + if(numBytes == maxBytes || (numBytes != 0 && nibble1 == 0 && nibble2 <= 1)) { + // Too many bytes, or a 00 or 01 byte which is illegal inside a weight. + errorCode = U_INVALID_FORMAT_ERROR; return 0; } - if(c != ' ') { - *(buffer+i++) = c; + weight = (weight << 8) | ((uint32_t)nibble1 << 4) | (uint32_t)nibble2; + ++numBytes; + s += 2; + if(*s != ' ') { + break; } - (*from)++; + ++s; } - (*from)++; - *(buffer + i) = 0; - //*to = (char *)malloc(strlen(buffer)+1); - strcpy(to, buffer); - return i; -} - -int32_t skipUntilWhiteSpace(char **from, UErrorCode *status) { - if (U_FAILURE(*status)) { + char c = *s; + if(c == 0 || strchr(separators, c) == NULL) { + errorCode = U_INVALID_FORMAT_ERROR; return 0; } - int32_t count = 0; - while (**from != ' ' && **from != '\t' && **from != '\0') { - (*from)++; - count++; + // numBytes==0 is ok, for example in [,,] or [, 82, 05] + // Left-align the weight. + while(numBytes < 4) { + weight <<= 8; + ++numBytes; } - return count; + return weight; } -int32_t skipWhiteSpace(char **from, UErrorCode *status) { - if (U_FAILURE(*status)) { - return 0; - } - int32_t count = 0; - while (**from == ' ' || **from == '\t') { - (*from)++; - count++; - } - return count; -} - -uint32_t getSingleCEValue(char *primary, char *secondary, char *tertiary, UErrorCode *status) { - if(U_FAILURE(*status)) { - return 0; - } - uint32_t value = 0; - char primsave = '\0'; - char secsave = '\0'; - char tersave = '\0'; - char *primend = primary+4; - if(strlen(primary) > 4) { - primsave = *primend; - *primend = '\0'; - } - char *secend = secondary+2; - if(strlen(secondary) > 2) { - secsave = *secend; - *secend = '\0'; - } - char *terend = tertiary+2; - if(strlen(tertiary) > 2) { - tersave = *terend; - *terend = '\0'; - } - uint32_t primvalue = (uint32_t)((*primary!='\0')?strtoul(primary, &primend, 16):0); - uint32_t secvalue = (uint32_t)((*secondary!='\0')?strtoul(secondary, &secend, 16):0); - uint32_t tervalue = 
(uint32_t)((*tertiary!='\0')?strtoul(tertiary, &terend, 16):0); - if(primvalue <= 0xFF) { - primvalue <<= 8; - } - - value = ((primvalue<CEs[1])) { - //printf("+"); - } - inversePos++; - inverseTable[inversePos][0] = element->CEs[0]; - if(element->noOfCEs > 1 && isContinuation(element->CEs[1])) { - inverseTable[inversePos][1] = element->CEs[1]; - } else { - inverseTable[inversePos][1] = 0; - } - if(element->cSize < 2) { - inverseTable[inversePos][2] = element->cPoints[0]; - } else { /* add a new store of cruft */ - inverseTable[inversePos][2] = ((element->cSize+1) << UCOL_INV_SHIFTVALUE) | sContPos; - memcpy(stringContinue+sContPos, element->cPoints, element->cSize*sizeof(UChar)); - sContPos += element->cSize+1; - } -} - -static void insertInverse(UCAElements *element, uint32_t position, UErrorCode *status) { - if(U_FAILURE(*status)) { - return; - } - - if(beVerbose && isContinuation(element->CEs[1])) { - //printf("+"); - } - if(position <= inversePos) { - /*move stuff around */ - uint32_t amountToMove = (inversePos - position+1)*sizeof(inverseTable[0]); - uprv_memmove(inverseTable[position+1], inverseTable[position], amountToMove); - } - inverseTable[position][0] = element->CEs[0]; - if(element->noOfCEs > 1 && isContinuation(element->CEs[1])) { - inverseTable[position][1] = element->CEs[1]; - } else { - inverseTable[position][1] = 0; - } - if(element->cSize < 2) { - inverseTable[position][2] = element->cPoints[0]; - } else { /* add a new store of cruft */ - inverseTable[position][2] = ((element->cSize+1) << UCOL_INV_SHIFTVALUE) | sContPos; - memcpy(stringContinue+sContPos, element->cPoints, element->cSize*sizeof(UChar)); - sContPos += element->cSize+1; - } - inversePos++; -} - -static void addToExistingInverse(UCAElements *element, uint32_t position, UErrorCode *status) { - - if(U_FAILURE(*status)) { - return; - } - - if((inverseTable[position][2] & UCOL_INV_SIZEMASK) == 0) { /* single element, have to make new extension place and put both guys there */ - 
stringContinue[sContPos] = (UChar)inverseTable[position][2]; - inverseTable[position][2] = ((element->cSize+3) << UCOL_INV_SHIFTVALUE) | sContPos; - sContPos++; - stringContinue[sContPos++] = 0xFFFF; - memcpy(stringContinue+sContPos, element->cPoints, element->cSize*sizeof(UChar)); - sContPos += element->cSize; - stringContinue[sContPos++] = 0xFFFE; - } else { /* adding to the already existing continuing table */ - uint32_t contIndex = inverseTable[position][2] & UCOL_INV_OFFSETMASK; - uint32_t contSize = (inverseTable[position][2] & UCOL_INV_SIZEMASK) >> UCOL_INV_SHIFTVALUE; - - if(contIndex+contSize < sContPos) { - /*fprintf(stderr, ".", sContPos, contIndex+contSize);*/ - memcpy(stringContinue+contIndex+contSize+element->cSize+1, stringContinue+contIndex+contSize, (element->cSize+1)*sizeof(UChar)); - } - - stringContinue[contIndex+contSize-1] = 0xFFFF; - memcpy(stringContinue+contIndex+contSize, element->cPoints, element->cSize*sizeof(UChar)); - sContPos += element->cSize+1; - stringContinue[contIndex+contSize+element->cSize] = 0xFFFE; - - inverseTable[position][2] = ((contSize+element->cSize+1) << UCOL_INV_SHIFTVALUE) | contIndex; - } -} - -/* - * Takes two CEs (lead and continuation) and - * compares them as CEs should be compared: - * primary vs. primary, secondary vs. secondary - * tertiary vs. tertiary +/** + * Parse a CE like [0A 86, 05, 17] or [U+4E00, 10]. + * Stop with an error, or else with the pointer s after the closing bracket. 
*/ -static int32_t compareCEs(uint32_t *source, uint32_t *target) { - uint32_t s1 = source[0], s2, t1 = target[0], t2; - if(isContinuation(source[1])) { - s2 = source[1]; - } else { - s2 = 0; - } - if(isContinuation(target[1])) { - t2 = target[1]; - } else { - t2 = 0; - } - - uint32_t s = 0, t = 0; - if(s1 == t1 && s2 == t2) { - return 0; - } - s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16); - t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16); - if(s < t) { - return -1; - } else if(s > t) { - return 1; - } else { - s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8; - t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8; - if(s < t) { - return -1; - } else if(s > t) { - return 1; - } else { - s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF); - t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF); - if(s < t) { - return -1; - } else { - return 1; - } - } - } -} - -static uint32_t addToInverse(UCAElements *element, UErrorCode *status) { - uint32_t position = inversePos; - uint32_t saveElement = element->CEs[0]; - int32_t compResult = 0; - element->CEs[0] &= 0xFFFFFF3F; - if(element->noOfCEs == 1) { - element->CEs[1] = 0; - } - if(inversePos == 0) { - inverseTable[0][0] = inverseTable[0][1] = inverseTable[0][2] = 0; - addNewInverse(element, status); - } else if(compareCEs(inverseTable[inversePos], element->CEs) > 0) { - while((compResult = compareCEs(inverseTable[--position], element->CEs)) > 0); - if(beVerbose) { printf("p:%u ", (int)position); } - if(compResult == 0) { - addToExistingInverse(element, position, status); - } else { - insertInverse(element, position+1, status); - } - } else if(compareCEs(inverseTable[inversePos], element->CEs) == 0) { - addToExistingInverse(element, inversePos, status); - } else { - addNewInverse(element, status); - } - element->CEs[0] = saveElement; - if(beVerbose) { printf("+"); } - return inversePos; -} - -static InverseUCATableHeader *assembleInverseTable(UErrorCode *status) -{ - InverseUCATableHeader *result = NULL; - uint32_t headerByteSize = 
paddedsize(sizeof(InverseUCATableHeader)); - uint32_t inverseTableByteSize = (inversePos+2)*sizeof(uint32_t)*3; - uint32_t contsByteSize = sContPos * sizeof(UChar); - uint32_t i = 0; - - result = (InverseUCATableHeader *)uprv_malloc(headerByteSize + inverseTableByteSize + contsByteSize); - uprv_memset(result, 0, headerByteSize + inverseTableByteSize + contsByteSize); - if(result != NULL) { - result->byteSize = headerByteSize + inverseTableByteSize + contsByteSize; - - inversePos++; - inverseTable[inversePos][0] = 0xFFFFFFFF; - inverseTable[inversePos][1] = 0xFFFFFFFF; - inverseTable[inversePos][2] = 0x0000FFFF; - inversePos++; - - for(i = 2; i 0) { - fprintf(stderr, "Error at %i: %08X & %08X\n", (int)i, (int)inverseTable[i-1][0], (int)inverseTable[i][0]); - } else if(inverseTable[i-1][0] == inverseTable[i][0] && !(inverseTable[i-1][1] < inverseTable[i][1])) { - fprintf(stderr, "Continuation error at %i: %08X %08X & %08X %08X\n", (int)i, (int)inverseTable[i-1][0], (int)inverseTable[i-1][1], (int)inverseTable[i][0], (int)inverseTable[i][1]); - } - } - - result->tableSize = inversePos; - result->contsSize = sContPos; - - result->table = headerByteSize; - result->conts = headerByteSize + inverseTableByteSize; - - memcpy((uint8_t *)result + result->table, inverseTable, inverseTableByteSize); - memcpy((uint8_t *)result + result->conts, stringContinue, contsByteSize); - - } else { - *status = U_MEMORY_ALLOCATION_ERROR; - return NULL; - } - - return result; -} - - -static void writeOutInverseData(InverseUCATableHeader *data, - const char *outputDir, - const char *copyright, - UErrorCode *status) -{ - UNewDataMemory *pData; - - long dataLength; - - UDataInfo invUcaInfo; - uprv_memcpy(&invUcaInfo, &invUcaDataInfo, sizeof(UDataInfo)); - uprv_memcpy(invUcaInfo.dataVersion, UCAVersion, U_MAX_VERSION_LENGTH); - - pData=udata_create(outputDir, INVC_DATA_TYPE, INVC_DATA_NAME, &invUcaInfo, - copyright, status); - - if(U_FAILURE(*status)) { - fprintf(stderr, "Error: unable to create 
%s"INVC_DATA_NAME", error %s\n", outputDir, u_errorName(*status)); - return; - } - - /* write the data to the file */ - if (beVerbose) { - printf("Writing out inverse UCA table: %s%c%s.%s\n", outputDir, U_FILE_SEP_CHAR, - INVC_DATA_NAME, - INVC_DATA_TYPE); - } - udata_writeBlock(pData, data, data->byteSize); - - /* finish up */ - dataLength=udata_finish(pData, status); - if(U_FAILURE(*status)) { - fprintf(stderr, "Error: error %d writing the output file\n", *status); - return; - } -} - -static int32_t hex2num(char hex) { - if(hex>='0' && hex <='9') { - return hex-'0'; - } else if(hex>='a' && hex<='f') { - return hex-'a'+10; - } else if(hex>='A' && hex<='F') { - return hex-'A'+10; +static int64_t parseCE(const CollationDataBuilder &builder, char *&s, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { return 0; } + ++s; // skip over the '[' + if(s[0] == 'U' && s[1] == '+') { + // Read a code point and look up its CE. + // We use this especially for implicit primary weights, + // so that we can use different algorithms in the FractionalUCA.txt + // generator and the parser. + // The generator may not even need to compute any implicit primaries at all. + s += 2; + char *end; + unsigned long longCp = uprv_strtoul(s, &end, 16); + if(end == s || longCp > 0x10ffff) { + errorCode = U_INVALID_FORMAT_ERROR; + return 0; + } + UChar32 c = (UChar32)longCp; + int64_t ce = builder.getSingleCE(c, errorCode); + if(U_FAILURE(errorCode)) { return 0; } + s = end; + if(*s == ']') { // [U+4E00] + ++s; + return ce; + } + if(*s != ',') { + errorCode = U_INVALID_FORMAT_ERROR; + return 0; + } + // Parse the following, secondary or tertiary weight. + s = skipWhiteSpace(s + 1); + uint32_t w = parseWeight(s, ",]", 2, errorCode); + if(U_FAILURE(errorCode)) { return 0; } + if(*s == ']') { // [U+4E00, 10] + ++s; + // Set the tertiary weight to w. 
+ return (ce & INT64_C(0xffffffffffff0000)) | (w >> 16); + } + // Set the secondary weight to w: [U+9F9C, 70, 20] + ce = (ce & INT64_C(0xffffffff00000000)) | w; + // Parse and set the tertiary weight. + s = skipWhiteSpace(s + 1); + w = parseWeight(s, "]", 2, errorCode); + ++s; + return ce | (w >> 16); } else { - return 0; + uint32_t p = parseWeight(s, ",", 4, errorCode); + if(U_FAILURE(errorCode)) { return 0; } + int64_t ce = (int64_t)p << 32; + s = skipWhiteSpace(s + 1); + uint32_t w = parseWeight(s, ",", 2, errorCode); + if(U_FAILURE(errorCode)) { return 0; } + ce |= w; + s = skipWhiteSpace(s + 1); + w = parseWeight(s, "]", 2, errorCode); + ++s; + return ce | (w >> 16); } } -// static char* CHARACTER_CATEGORY_REORDER_CODES[] = { -// "Zs", "Nd", "Sc" -// }; -// static const uint16_t CHARACTER_CATEGORY_REORDER_CODE_OFFSET = 0x1000; -// static uint16_t CHARACTER_CATEGORY_REORDER_CODES_VALUE[] = { -// U_SPACE_SEPARATOR + CHARACTER_CATEGORY_REORDER_CODE_OFFSET, -// U_DECIMAL_DIGIT_NUMBER + CHARACTER_CATEGORY_REORDER_CODE_OFFSET, -// U_CURRENCY_SYMBOL + CHARACTER_CATEGORY_REORDER_CODE_OFFSET -// }; - static const struct { const char *name; int32_t code; @@ -514,7 +204,8 @@ static const struct { { "TERMINATOR", -2 }, // -2 means "ignore" { "LEVEL-SEPARATOR", -2 }, { "FIELD-SEPARATOR", -2 }, - { "COMPRESS", -2 }, // TODO: We should parse/store which lead bytes are compressible; there is a ticket for that. + { "COMPRESS", -3 }, + // The standard name is "PUNCT" but FractionalUCA.txt uses the long form. { "PUNCTUATION", UCOL_REORDER_CODE_PUNCTUATION }, { "IMPLICIT", USCRIPT_HAN }, // Implicit weights are usually for Han characters. Han & unassigned share a lead byte. { "TRAILING", -2 }, // We do not reorder trailing weights (those after implicits). 
@@ -522,11 +213,7 @@ static const struct { }; int32_t getReorderCode(const char* name) { - int32_t code = ucol_findReorderingEntry(name); - if (code >= 0) { - return code; - } - code = u_getPropertyValueEnum(UCHAR_SCRIPT, name); + int32_t code = CollationRuleParser::getReorderCode(name); if (code >= 0) { return code; } @@ -538,985 +225,818 @@ int32_t getReorderCode(const char* name) { return -1; // Same as UCHAR_INVALID_CODE or USCRIPT_INVALID_CODE. } -UCAElements *readAnElement(FILE *data, tempUCATable *t, UCAConstants *consts, LeadByteConstants *leadByteConstants, UErrorCode *status) { - static int itemsToDataBlock = 0; - static int scriptDataWritten = 0; - char buffer[2048], primary[100], secondary[100], tertiary[100]; - UChar uBuffer[2048]; - UChar uBuffer2[2048]; - UChar leadByte[100], scriptCode[100]; - int32_t i = 0; - unsigned int theValue; - char *pointer = NULL; - char *commentStart = NULL; - char *startCodePoint = NULL; - char *endCodePoint = NULL; - char *result = fgets(buffer, 2048, data); - int32_t buflen = (int32_t)uprv_strlen(buffer); - if(U_FAILURE(*status)) { - return 0; - } - *primary = *secondary = *tertiary = '\0'; - *leadByte = *scriptCode = '\0'; - if(result == NULL) { - if(feof(data)) { - return NULL; - } else { - fprintf(stderr, "empty line but no EOF!\n"); - *status = U_INVALID_FORMAT_ERROR; - return NULL; - } - } - while(buflen>0 && (buffer[buflen-1] == '\r' || buffer[buflen-1] == '\n')) { - buffer[--buflen] = 0; - } +enum ActionType { + READCE, + READPRIMARY, + READBYTE, + READUNIFIEDIDEOGRAPH, + READUCAVERSION, + READLEADBYTETOSCRIPTS, + IGNORE +}; - if(buffer[0] == 0 || buffer[0] == '#') { - return NULL; // just a comment, skip whole line - } +static struct { + const char *const name; + int64_t value; + const ActionType what_to_do; +} vt[] = { + {"[first tertiary ignorable", 0, IGNORE}, + {"[last tertiary ignorable", 0, IGNORE}, + {"[first secondary ignorable", 0, READCE}, + {"[last secondary ignorable", 0, READCE}, + {"[first primary 
ignorable", 0, READCE}, + {"[last primary ignorable", 0, READCE}, + {"[first variable", 0, READCE}, + {"[last variable", 0, READCE}, + {"[first regular", 0, READCE}, + {"[last regular", 0, READCE}, + {"[first implicit", 0, READCE}, + {"[last implicit", 0, READCE}, + {"[first trailing", 0, READCE}, + {"[last trailing", 0, READCE}, + + {"[Unified_Ideograph", 0, READUNIFIEDIDEOGRAPH}, + + {"[fixed first implicit byte", 0, IGNORE}, + {"[fixed last implicit byte", 0, IGNORE}, + {"[fixed first trail byte", 0, IGNORE}, + {"[fixed last trail byte", 0, IGNORE}, + {"[fixed first special byte", 0, IGNORE}, + {"[fixed last special byte", 0, IGNORE}, + {"[fixed secondary common byte", 0, READBYTE}, + {"[fixed last secondary common byte", 0, READBYTE}, + {"[fixed first ignorable secondary byte", 0, READBYTE}, + {"[fixed tertiary common byte", 0, READBYTE}, + {"[fixed first ignorable tertiary byte", 0, READBYTE}, + {"[variable top = ", 0, IGNORE}, + {"[UCA version = ", 0, READUCAVERSION}, + {"[top_byte", 0, READLEADBYTETOSCRIPTS}, + {"[reorderingTokens", 0, IGNORE}, + {"[categories", 0, IGNORE}, + {"[first tertiary in secondary non-ignorable", 0, IGNORE}, + {"[last tertiary in secondary non-ignorable", 0, IGNORE}, + {"[first secondary in primary non-ignorable", 0, IGNORE}, + {"[last secondary in primary non-ignorable", 0, IGNORE}, +}; - UCAElements *element = ≤ - memset(element, 0, sizeof(*element)); +static int64_t getOptionValue(const char *name) { + for (int32_t i = 0; i < LENGTHOF(vt); ++i) { + if(uprv_strcmp(name, vt[i].name) == 0) { + return vt[i].value; + } + } + return 0; +} - enum ActionType { - READCE, - READHEX1, - READHEX2, - READUCAVERSION, - READLEADBYTETOSCRIPTS, - READSCRIPTTOLEADBYTES, - IGNORE, - }; +static UnicodeString *leadByteScripts = NULL; - // Directives. 
- if(buffer[0] == '[') { - uint32_t cnt = 0; - static const struct { - char name[128]; - uint32_t *what; - ActionType what_to_do; - } vt[] = { {"[first tertiary ignorable", consts->UCA_FIRST_TERTIARY_IGNORABLE, READCE}, - {"[last tertiary ignorable", consts->UCA_LAST_TERTIARY_IGNORABLE, READCE}, - {"[first secondary ignorable", consts->UCA_FIRST_SECONDARY_IGNORABLE, READCE}, - {"[last secondary ignorable", consts->UCA_LAST_SECONDARY_IGNORABLE, READCE}, - {"[first primary ignorable", consts->UCA_FIRST_PRIMARY_IGNORABLE, READCE}, - {"[last primary ignorable", consts->UCA_LAST_PRIMARY_IGNORABLE, READCE}, - {"[first variable", consts->UCA_FIRST_VARIABLE, READCE}, - {"[last variable", consts->UCA_LAST_VARIABLE, READCE}, - {"[first regular", consts->UCA_FIRST_NON_VARIABLE, READCE}, - {"[last regular", consts->UCA_LAST_NON_VARIABLE, READCE}, - {"[first implicit", consts->UCA_FIRST_IMPLICIT, READCE}, - {"[last implicit", consts->UCA_LAST_IMPLICIT, READCE}, - {"[first trailing", consts->UCA_FIRST_TRAILING, READCE}, - {"[last trailing", consts->UCA_LAST_TRAILING, READCE}, - - {"[fixed top", &consts->UCA_PRIMARY_TOP_MIN, READHEX1}, - {"[fixed first implicit byte", &consts->UCA_PRIMARY_IMPLICIT_MIN, READHEX1}, - {"[fixed last implicit byte", &consts->UCA_PRIMARY_IMPLICIT_MAX, READHEX1}, - {"[fixed first trail byte", &consts->UCA_PRIMARY_TRAILING_MIN, READHEX1}, - {"[fixed last trail byte", &consts->UCA_PRIMARY_TRAILING_MAX, READHEX1}, - {"[fixed first special byte", &consts->UCA_PRIMARY_SPECIAL_MIN, READHEX1}, - {"[fixed last special byte", &consts->UCA_PRIMARY_SPECIAL_MAX, READHEX1}, - {"[variable top = ", &t->options->variableTopValue, READHEX2}, - {"[UCA version = ", NULL, READUCAVERSION}, - {"[top_byte", NULL, READLEADBYTETOSCRIPTS}, - {"[reorderingTokens", NULL, READSCRIPTTOLEADBYTES}, - {"[categories", NULL, IGNORE}, - {"[first tertiary in secondary non-ignorable", NULL, IGNORE}, - {"[last tertiary in secondary non-ignorable", NULL, IGNORE}, - {"[first secondary in 
primary non-ignorable", NULL, IGNORE}, - {"[last secondary in primary non-ignorable", NULL, IGNORE}, - }; - for (cnt = 0; cntsizePrim[0]=readElement(&pointer, primary, ',', status) / 2; - element->sizeSec[0]=readElement(&pointer, secondary, ',', status) / 2; - element->sizeTer[0]=readElement(&pointer, tertiary, ']', status) / 2; - vt[cnt].what[0] = getSingleCEValue(primary, secondary, tertiary, status); - if(element->sizePrim[0] > 2 || element->sizeSec[0] > 1 || element->sizeTer[0] > 1) { - uint32_t CEi = 1; - uint32_t value = UCOL_CONTINUATION_MARKER; /* Continuation marker */ - if(2*CEisizePrim[i]) { - value |= ((hex2num(*(primary+4*CEi))&0xF)<<28); - value |= ((hex2num(*(primary+4*CEi+1))&0xF)<<24); - } - - if(2*CEi+1sizePrim[i]) { - value |= ((hex2num(*(primary+4*CEi+2))&0xF)<<20); - value |= ((hex2num(*(primary+4*CEi+3))&0xF)<<16); - } - - if(CEisizeSec[i]) { - value |= ((hex2num(*(secondary+2*CEi))&0xF)<<12); - value |= ((hex2num(*(secondary+2*CEi+1))&0xF)<<8); + vt[cnt].value = parseCE(builder, pointer, *status); + if(U_SUCCESS(*status) && *pointer != ']') { + *status = U_INVALID_FORMAT_ERROR; + } + if(U_FAILURE(*status)) { + fprintf(stderr, "Syntax error: unable to parse the CE from line '%s'\n", buffer); + return; + } + } else if(what_to_do == READPRIMARY) { + vt[cnt].value = parseWeight(pointer, "]", 4, *status); + if(U_FAILURE(*status)) { + fprintf(stderr, "Value of \"%s\" is not a primary weight\n", buffer); + return; + } + } else if(what_to_do == READBYTE) { + vt[cnt].value = parseWeight(pointer, "]", 1, *status) >> 24; + if(U_FAILURE(*status)) { + fprintf(stderr, "Value of \"%s\" is not a valid byte\n", buffer); + return; + } + } else if(what_to_do == READUNIFIEDIDEOGRAPH) { + UVector32 unihan(*status); + if(U_FAILURE(*status)) { return; } + for(;;) { + if(*pointer == ']') { break; } + if(*pointer == 0) { + // Missing ] after ranges. 
+ *status = U_INVALID_FORMAT_ERROR; + return; } - - if(CEisizeTer[i]) { - value |= ((hex2num(*(tertiary+2*CEi))&0x3)<<4); - value |= (hex2num(*(tertiary+2*CEi+1))&0xF); + char *s = pointer; + while(*s != ' ' && *s != '\t' && *s != ']' && *s != '\0') { ++s; } + char c = *s; + *s = 0; + uint32_t start, end; + u_parseCodePointRange(pointer, &start, &end, status); + *s = c; + if(U_FAILURE(*status)) { + fprintf(stderr, "Syntax error: unable to parse one of the ranges from line '%s'\n", buffer); + *status = U_INVALID_FORMAT_ERROR; + return; } - - CEi++; - - vt[cnt].what[1] = value; - //element->CEs[CEindex++] = value; - } else { - vt[cnt].what[1] = 0; + unihan.addElement((UChar32)start, *status); + unihan.addElement((UChar32)end, *status); + pointer = skipWhiteSpace(s); } - } else { - fprintf(stderr, "Failed to read a CE from line %s\n", buffer); - } - } else if (what_to_do == READUCAVERSION) { //vt[cnt].what_to_do == READUCAVERSION - u_versionFromString(UCAVersion, buffer+vtLen); - if(beVerbose) { - char uca[U_MAX_VERSION_STRING_LENGTH]; - u_versionToString(UCAVersion, uca); - printf("UCA version %s\n", uca); - } - UVersionInfo UCDVersion; - u_getUnicodeVersion(UCDVersion); - if (UCAVersion[0] != UCDVersion[0] || UCAVersion[1] != UCDVersion[1]) { - char uca[U_MAX_VERSION_STRING_LENGTH]; - char ucd[U_MAX_VERSION_STRING_LENGTH]; - u_versionToString(UCAVersion, uca); - u_versionToString(UCDVersion, ucd); - // Warning, not error, to permit bootstrapping during a version upgrade. 
- fprintf(stderr, "warning: UCA version %s != UCD version %s\n", uca, ucd); - // *status = U_INVALID_FORMAT_ERROR; - // return NULL; - } - } else if (what_to_do == READLEADBYTETOSCRIPTS) { //vt[cnt].what_to_do == READLEADBYTETOSCRIPTS - pointer = buffer + vtLen; - skipWhiteSpace(&pointer, status); - - uint16_t leadByte = (hex2num(*pointer++) * 16) + hex2num(*pointer++); - //printf("~~~~ processing lead byte = %02x\n", leadByte); - if (leadByte >= leadByteConstants->LEAD_BYTE_TO_SCRIPTS_INDEX_LENGTH) { - fprintf(stderr, "Lead byte larger than allocated table!"); - // set status and return - *status = U_INTERNAL_PROGRAM_ERROR; - return NULL; + builder.initHanRanges(unihan.getBuffer(), unihan.size(), *status); + } else if (what_to_do == READUCAVERSION) { + u_versionFromString(UCAVersion, pointer); + if(beVerbose) { + char uca[U_MAX_VERSION_STRING_LENGTH]; + u_versionToString(UCAVersion, uca); + printf("UCA version %s\n", uca); } - skipWhiteSpace(&pointer, status); - - int32_t reorderCodeArray[100]; - uint32_t reorderCodeArrayCount = 0; - char scriptName[100]; - int32_t elementLength = 0; - while ((elementLength = readElement(&pointer, scriptName, ' ', status)) > 0) { - if (scriptName[0] == ']') { + UVersionInfo UCDVersion; + u_getUnicodeVersion(UCDVersion); + if (UCAVersion[0] != UCDVersion[0] || UCAVersion[1] != UCDVersion[1]) { + char uca[U_MAX_VERSION_STRING_LENGTH]; + char ucd[U_MAX_VERSION_STRING_LENGTH]; + u_versionToString(UCAVersion, uca); + u_versionToString(UCDVersion, ucd); + // Warning, not error, to permit bootstrapping during a version upgrade. + fprintf(stderr, "warning: UCA version %s != UCD version %s\n", uca, ucd); + } + } else if (what_to_do == READLEADBYTETOSCRIPTS) { + uint16_t leadByte = (hex2num(*pointer++) * 16); + leadByte += hex2num(*pointer++); + + if(0xe0 <= leadByte && leadByte < Collation::UNASSIGNED_IMPLICIT_BYTE) { + // Extend the Hani range to the end of what this implementation uses. 
+ // FractionalUCA.txt assumes a different algorithm for implicit primary weights, + // and different high-lead byte ranges. + leadByteScripts[leadByte] = leadByteScripts[0xdf]; + return; + } + + UnicodeString scripts; + for(;;) { + pointer = skipWhiteSpace(pointer); + if (*pointer == ']') { break; } + const char *scriptName = pointer; + char c; + while((c = *pointer) != 0 && c != ' ' && c != '\t' && c != ']') { ++pointer; } + if(c == 0) { + fprintf(stderr, "Syntax error: unterminated list of scripts: '%s'\n", buffer); + *status = U_INVALID_FORMAT_ERROR; + return; + } + *pointer = 0; int32_t reorderCode = getReorderCode(scriptName); + *pointer = c; + if (reorderCode == -3) { // COMPRESS + builder.setCompressibleLeadByte(leadByte); + continue; + } if (reorderCode == -2) { continue; // Ignore "TERMINATOR" etc. } - if (reorderCode < 0) { - printf("Syntax error: unable to parse reorder code from '%s'\n", scriptName); + if (reorderCode < 0 || 0xffff < reorderCode) { + fprintf(stderr, "Syntax error: unable to parse reorder code from '%s'\n", scriptName); *status = U_INVALID_FORMAT_ERROR; - return NULL; + return; } - if (reorderCodeArrayCount >= LENGTHOF(reorderCodeArray)) { - printf("reorder code array count is greater than allocated size!\n"); - *status = U_INTERNAL_PROGRAM_ERROR; - return NULL; - } - reorderCodeArray[reorderCodeArrayCount++] = reorderCode; - } - //printf("reorderCodeArrayCount = %d\n", reorderCodeArrayCount); - switch (reorderCodeArrayCount) { - case 0: - leadByteConstants->LEAD_BYTE_TO_SCRIPTS_INDEX[leadByte] = 0; - break; - case 1: - // TODO = move 0x8000 into defined constant - leadByteConstants->LEAD_BYTE_TO_SCRIPTS_INDEX[leadByte] = 0x8000 | reorderCodeArray[0]; - break; - default: - if (reorderCodeArrayCount + leadByteConstants->LEAD_BYTE_TO_SCRIPTS_DATA_OFFSET > leadByteConstants->LEAD_BYTE_TO_SCRIPTS_DATA_LENGTH) { - // Error condition - } - leadByteConstants->LEAD_BYTE_TO_SCRIPTS_INDEX[leadByte] = 
leadByteConstants->LEAD_BYTE_TO_SCRIPTS_DATA_OFFSET; - leadByteConstants->LEAD_BYTE_TO_SCRIPTS_DATA[leadByteConstants->LEAD_BYTE_TO_SCRIPTS_DATA_OFFSET++] = reorderCodeArrayCount; - for (int reorderCodeIndex = 0; reorderCodeIndex < reorderCodeArrayCount; reorderCodeIndex++) { - leadByteConstants->LEAD_BYTE_TO_SCRIPTS_DATA[leadByteConstants->LEAD_BYTE_TO_SCRIPTS_DATA_OFFSET++] = reorderCodeArray[reorderCodeIndex]; - } + scripts.append((UChar)reorderCode); } - } else if (what_to_do == READSCRIPTTOLEADBYTES) { //vt[cnt].what_to_do == READSCRIPTTOLEADBYTES - uint16_t leadByteArray[256]; - uint32_t leadByteArrayCount = 0; - char scriptName[100]; - - pointer = buffer + vtLen; - skipWhiteSpace(&pointer, status); - uint32_t scriptNameLength = readElement(&pointer, scriptName, '\t', status); - int32_t reorderCode = getReorderCode(scriptName); - if (reorderCode >= 0) { - //printf("^^^ processing reorder code = %04x (%s)\n", reorderCode, scriptName); - skipWhiteSpace(&pointer, status); - - int32_t elementLength = 0; - char leadByteString[100]; - while ((elementLength = readElement(&pointer, leadByteString, '=', status)) == 2) { - //printf("\tleadByteArrayCount = %d, elementLength = %d, leadByteString = %s\n", leadByteArrayCount, elementLength, leadByteString); - uint32_t leadByte = (hex2num(leadByteString[0]) * 16) + hex2num(leadByteString[1]); - leadByteArray[leadByteArrayCount++] = (uint16_t) leadByte; - skipUntilWhiteSpace(&pointer, status); - } - - if (leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT >= leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_LENGTH) { - //printf("\tError condition\n"); - //printf("\tindex count = %d, total index size = %d\n", leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT, sizeof(leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX) / sizeof(leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX[0])); - // Error condition - *status = U_INTERNAL_PROGRAM_ERROR; - return NULL; + if(!scripts.isEmpty()) { + if(leadByteScripts == NULL) { + leadByteScripts = new 
UnicodeString[256]; } - leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX[leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT].reorderCode = reorderCode; - - //printf("\tlead byte count = %d\n", leadByteArrayCount); - //printf("\tlead byte array = "); - //for (int i = 0; i < leadByteArrayCount; i++) { - // printf("%02x, ", leadByteArray[i]); - //} - //printf("\n"); - - switch (leadByteArrayCount) { - case 0: - leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX[leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT].offset = 0; - break; - case 1: - // TODO = move 0x8000 into defined constant - //printf("\t+++++ lead byte = &x\n", leadByteArray[0]); - leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX[leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT].offset = 0x8000 | leadByteArray[0]; - break; - default: - //printf("\t+++++ lead bytes written to data block - %d\n", itemsToDataBlock++); - //printf("\tlead bytes = "); - //for (int i = 0; i < leadByteArrayCount; i++) { - // printf("%02x, ", leadByteArray[i]); - //} - //printf("\n"); - //printf("\tBEFORE data bytes = "); - //for (int i = 0; i < leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_OFFSET; i++) { - // printf("%02x, ", leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA[i]); - //} - //printf("\n"); - //printf("\tdata offset = %d, data length = %d\n", leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_OFFSET, leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_LENGTH); - if ((leadByteArrayCount + leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_OFFSET) > leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_LENGTH) { - //printf("\tError condition\n"); - // Error condition - *status = U_INTERNAL_PROGRAM_ERROR; - return NULL; - } - leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX[leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT].offset = leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_OFFSET; - leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA[leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_OFFSET++] = leadByteArrayCount; - scriptDataWritten++; - 
memcpy(&leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA[leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_OFFSET], - leadByteArray, leadByteArrayCount * sizeof(leadByteArray[0])); - scriptDataWritten += leadByteArrayCount; - //printf("\tlead byte data written = %d\n", scriptDataWritten); - //printf("\tcurrentIndex.reorderCode = %04x, currentIndex.offset = %04x\n", - // leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT.reorderCode, leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT.offset); - leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_OFFSET += leadByteArrayCount; - //printf("\tdata offset = %d\n", leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_OFFSET); - //printf("\tAFTER data bytes = "); - //for (int i = 0; i < leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_OFFSET; i++) { - // printf("%02x, ", leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA[i]); - //} - //printf("\n"); - } - //if (reorderCode >= 0x1000) { - // printf("@@@@ reorderCode = %x, offset = %x\n", leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX[leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT].reorderCode, leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX[leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT].offset); - // for (int i = 0; i < leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_OFFSET; i++) { - // printf("%02x, ", leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA[i]); - // } - // printf("\n"); - // } - leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT++; + leadByteScripts[leadByte] = scripts; } } - return NULL; + return; + } + } + fprintf(stderr, "Warning: unrecognized option: %s\n", buffer); +} + +static UBool +readAnElement(FILE *data, + CollationBaseDataBuilder &builder, + UnicodeString &prefix, UnicodeString &s, + int64_t ces[32], int32_t &cesLength, + UErrorCode *status) { + if(U_FAILURE(*status)) { + return FALSE; + } + char buffer[2048]; + char *result = fgets(buffer, 2048, data); + if(result == NULL) { + if(feof(data)) { + return FALSE; + } else { + fprintf(stderr, "empty line but no EOF!\n"); + *status = 
U_INVALID_FORMAT_ERROR; + return FALSE; } - } - fprintf(stderr, "Warning: unrecognized option: %s\n", buffer); - //*status = U_INVALID_FORMAT_ERROR; - return NULL; + } + int32_t buflen = (int32_t)uprv_strlen(buffer); + while(buflen>0 && (buffer[buflen-1] == '\r' || buffer[buflen-1] == '\n')) { + buffer[--buflen] = 0; + } + + if(buffer[0] == 0 || buffer[0] == '#') { + return FALSE; // just a comment, skip whole line } - startCodePoint = buffer; - endCodePoint = strchr(startCodePoint, ';'); + // Directives. + if(buffer[0] == '[') { + readAnOption(builder, buffer, status); + return FALSE; + } - if(endCodePoint == 0) { + char *startCodePoint = buffer; + char *endCodePoint = strchr(startCodePoint, ';'); + if(endCodePoint == NULL) { fprintf(stderr, "error - line with no code point!\n"); *status = U_INVALID_FORMAT_ERROR; /* No code point - could be an error, but probably only an empty line */ - return NULL; + return FALSE; } else { - *(endCodePoint) = 0; + *endCodePoint = 0; } char *pipePointer = strchr(buffer, '|'); if (pipePointer != NULL) { // Read the prefix string which precedes the actual string. *pipePointer = 0; - element->prefixSize = + UChar *prefixChars = prefix.getBuffer(32); + int32_t prefixSize = u_parseString(startCodePoint, - element->prefixChars, LENGTHOF(element->prefixChars), + prefixChars, prefix.getCapacity(), NULL, status); if(U_FAILURE(*status)) { + prefix.releaseBuffer(0); fprintf(stderr, "error - parsing of prefix \"%s\" failed: %s\n", startCodePoint, u_errorName(*status)); *status = U_INVALID_FORMAT_ERROR; - return NULL; + return FALSE; } - element->prefix = element->prefixChars; + prefix.releaseBuffer(prefixSize); startCodePoint = pipePointer + 1; } // Read the string which gets the CE(s) assigned. 
- element->cSize = + UChar *uchars = s.getBuffer(32); + int32_t cSize = u_parseString(startCodePoint, - element->uchars, LENGTHOF(element->uchars), + uchars, s.getCapacity(), NULL, status); if(U_FAILURE(*status)) { + s.releaseBuffer(0); fprintf(stderr, "error - parsing of code point(s) \"%s\" failed: %s\n", startCodePoint, u_errorName(*status)); *status = U_INVALID_FORMAT_ERROR; - return NULL; + return FALSE; } - element->cPoints = element->uchars; + s.releaseBuffer(cSize); - startCodePoint = endCodePoint+1; + char *pointer = endCodePoint + 1; - commentStart = strchr(startCodePoint, '#'); + char *commentStart = strchr(pointer, '#'); if(commentStart == NULL) { - commentStart = strlen(startCodePoint) + startCodePoint; + commentStart = strchr(pointer, 0); } - i = 0; - uint32_t CEindex = 0; - element->noOfCEs = 0; + cesLength = 0; for(;;) { - endCodePoint = strchr(startCodePoint, ']'); - if(endCodePoint == NULL || endCodePoint >= commentStart) { + pointer = skipWhiteSpace(pointer); + if(pointer == commentStart) { break; } - pointer = strchr(startCodePoint, '['); - pointer++; - - element->sizePrim[i]=readElement(&pointer, primary, ',', status) / 2; - element->sizeSec[i]=readElement(&pointer, secondary, ',', status) / 2; - element->sizeTer[i]=readElement(&pointer, tertiary, ']', status) / 2; - - - /* I want to get the CEs entered right here, including continuation */ - element->CEs[CEindex++] = getSingleCEValue(primary, secondary, tertiary, status); - - uint32_t CEi = 1; - while(2*CEi<element->sizePrim[i] || CEi<element->sizeSec[i] || CEi<element->sizeTer[i]) { - uint32_t value = UCOL_CONTINUATION_MARKER; /* Continuation marker */ - if(2*CEi<element->sizePrim[i]) { - value |= ((hex2num(*(primary+4*CEi))&0xF)<<28); - value |= ((hex2num(*(primary+4*CEi+1))&0xF)<<24); - } - - if(2*CEi+1<element->sizePrim[i]) { - value |= ((hex2num(*(primary+4*CEi+2))&0xF)<<20); - value |= ((hex2num(*(primary+4*CEi+3))&0xF)<<16); - } - - if(CEi<element->sizeSec[i]) { - value |= ((hex2num(*(secondary+2*CEi))&0xF)<<12); - value |= 
((hex2num(*(secondary+2*CEi+1))&0xF)<<8); - } - - if(CEi<element->sizeTer[i]) { - value |= ((hex2num(*(tertiary+2*CEi))&0x3)<<4); - value |= (hex2num(*(tertiary+2*CEi+1))&0xF); - } - - CEi++; - - element->CEs[CEindex++] = value; + if(cesLength >= 31) { + fprintf(stderr, "Error: Too many CEs on line '%s'\n", buffer); + *status = U_INVALID_FORMAT_ERROR; + return FALSE; } - - startCodePoint = endCodePoint+1; - i++; - } - element->noOfCEs = CEindex; -#if 0 - element->isThai = UCOL_ISTHAIPREVOWEL(element->cPoints[0]); -#endif - // we don't want any strange stuff after useful data! - if (pointer == NULL) { - /* huh? Did we get ']' without the '['? Pair your brackets! */ - *status=U_INVALID_FORMAT_ERROR; - } - else { - while(pointer < commentStart) { - if(*pointer != ' ' && *pointer != '\t') - { - *status=U_INVALID_FORMAT_ERROR; - break; - } - pointer++; + ces[cesLength++] = parseCE(builder, pointer, *status); + if(U_FAILURE(*status)) { + fprintf(stderr, "Syntax error parsing CE from line '%s' - %s\n", + buffer, u_errorName(*status)); + return FALSE; } } - if(element->cSize == 1 && element->cPoints[0] == 0xfffe) { + + if(s.length() == 1 && s[0] == 0xfffe) { // UCA 6.0 gives U+FFFE a special minimum weight using the // byte 02 which is the merge-sort-key separator and illegal for any // other characters. } else { // Rudimentary check for valid bytes in CE weights. 
- // For a more comprehensive check see cintltst /tscoll/citertst/TestCEValidity - for (i = 0; i < (int32_t)CEindex; ++i) { - uint32_t value = element->CEs[i]; - uint8_t bytes[4] = { - (uint8_t)(value >> 24), - (uint8_t)(value >> 16), - (uint8_t)(value >> 8), - (uint8_t)(value & UCOL_NEW_TERTIARYORDERMASK) - }; - for (int j = 0; j < 4; ++j) { - if (0 != bytes[j] && bytes[j] < 3) { - fprintf(stderr, "Warning: invalid UCA weight byte %02X for %s\n", bytes[j], buffer); - return NULL; + // For a more comprehensive check see CollationTest::TestRootElements(), + // intltest collate/CollationTest/TestRootElements + for (int32_t i = 0; i < cesLength; ++i) { + int64_t ce = ces[i]; + UBool isCompressible = FALSE; + for (int j = 7; j >= 0; --j) { + uint8_t b = (uint8_t)(ce >> (j * 8)); + if(j <= 1) { b &= 0x3f; } // tertiary bytes use 6 bits + if (b == 1) { + fprintf(stderr, "Warning: invalid UCA weight byte 01 for %s\n", buffer); + return FALSE; + } + if ((j == 7 || j == 3 || j == 1) && b == 2) { + fprintf(stderr, "Warning: invalid UCA weight lead byte 02 for %s\n", buffer); + return FALSE; + } + if (j == 7) { + isCompressible = builder.isCompressibleLeadByte(b); + } else if (j == 6) { + // Primary second bytes 03 and FF are compression terminators. + // 02, 03 and FF are usable when the lead byte is not compressible. + // 02 is unusable and 03 is the low compression terminator when the lead byte is compressible. + if (isCompressible && (b <= 3 || b == 0xff)) { + fprintf(stderr, "Warning: invalid UCA primary second weight byte %02X for %s\n", + b, buffer); + return FALSE; + } } - } - // Primary second bytes 03 and FF are compression terminators. 
- if (!isContinuation(value) && (bytes[1] == 3 || bytes[1] == 0xFF)) { - fprintf(stderr, "Warning: invalid UCA primary second weight byte %02X for %s\n", - bytes[1], buffer); - return NULL; } } } - if(U_FAILURE(*status)) { - fprintf(stderr, "problem putting stuff in hash table %s\n", u_errorName(*status)); - *status = U_INTERNAL_PROGRAM_ERROR; - return NULL; - } - - return element; + return TRUE; } - -void writeOutData(UCATableHeader *data, - UCAConstants *consts, - LeadByteConstants *leadByteConstants, - UChar contractions[][MAX_UCA_CONTRACTION_LENGTH], - uint32_t noOfcontractions, - const char *outputDir, - const char *copyright, - UErrorCode *status) +static void +parseFractionalUCA(const char *filename, + CollationBaseDataBuilder &builder, + UErrorCode *status) { - if(U_FAILURE(*status)) { + if(U_FAILURE(*status)) { return; } + FILE *data = fopen(filename, "r"); + if(data == NULL) { + fprintf(stderr, "Couldn't open file: %s\n", filename); + *status = U_FILE_ACCESS_ERROR; return; } + uint32_t line = 0; - uint32_t size = data->size; + UChar32 maxCodePoint = 0; + while(!feof(data)) { + if(U_FAILURE(*status)) { + fprintf(stderr, "Something returned an error %i (%s) while processing line %u of %s. Exiting...\n", + *status, u_errorName(*status), (int)line, filename); + exit(*status); + } - data->UCAConsts = data->size; - data->size += paddedsize(sizeof(UCAConstants)); + line++; - if(noOfcontractions != 0) { - uprv_memset(&contractions[noOfcontractions][0], 0, MAX_UCA_CONTRACTION_LENGTH*U_SIZEOF_UCHAR); - noOfcontractions++; + UnicodeString prefix; + UnicodeString s; + int64_t ces[32]; + int32_t cesLength = 0; + if(readAnElement(data, builder, prefix, s, ces, cesLength, status)) { + // we have read the line, now do something sensible with the read data! 
+ uint32_t p = (uint32_t)(ces[0] >> 32); + + if(s.length() > 1 && s[0] == 0xFDD0) { + // FractionalUCA.txt contractions starting with U+FDD0 + // are only entered into the inverse table, + // not into the normal collation data. + builder.addRootElements(ces, cesLength, *status); + if(s.length() == 2 && s[1] == 0x34 && cesLength == 1) { + // Lead byte for numeric sorting. + builder.setNumericPrimary(p); + } + } else { + UChar32 c = s.char32At(0); + if(c > maxCodePoint) { maxCodePoint = c; } + + // We ignore the CEs for U+FFFD..U+FFFF and for the unassigned first primary. + // CollationBaseDataBuilder::init() maps them to special CEs. + // Except for U+FFFE, these have higher primaries in v2 than in FractionalUCA.txt. + if(0xfffd <= c && c <= 0xffff) { continue; } + if(s.length() == 2 && s[0] == 0xFDD1 && s[1] == 0xFDD0) { + continue; + } + if(0xe0000000 <= p && p < 0xf0000000) { + fprintf(stderr, + "Error: Unexpected mapping to an implicit or trailing primary" + " on line %u of %s.\n", + (int)line, filename); + exit(U_INVALID_FORMAT_ERROR); + } - data->contractionUCACombos = data->size; - data->contractionUCACombosWidth = (uint8_t)MAX_UCA_CONTRACTION_LENGTH; - data->contractionUCACombosSize = noOfcontractions; - data->size += paddedsize((noOfcontractions*MAX_UCA_CONTRACTION_LENGTH*U_SIZEOF_UCHAR)); + builder.add(prefix, s, ces, cesLength, *status); + } + } } - data->scriptToLeadByte = data->size; - //printf("@@@@ script to lead byte offset = 0x%x (%d)\n", data->size, data->size); - data->size += - sizeof(leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT) + // index table header - leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT * sizeof(leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX[0]) + // index table - sizeof(leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_OFFSET) + // data table header - leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_OFFSET * sizeof(leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA[0]); // data table - data->leadByteToScript = data->size; - 
//printf("@@@@ lead byte to script offset = 0x%x (%d)\n", data->size, data->size); - data->size += - sizeof(leadByteConstants->LEAD_BYTE_TO_SCRIPTS_INDEX_LENGTH) + // index table header - leadByteConstants->LEAD_BYTE_TO_SCRIPTS_INDEX_LENGTH * sizeof(leadByteConstants->LEAD_BYTE_TO_SCRIPTS_INDEX[0]) + // index table - sizeof(leadByteConstants->LEAD_BYTE_TO_SCRIPTS_DATA_OFFSET) + // data table header - leadByteConstants->LEAD_BYTE_TO_SCRIPTS_DATA_OFFSET * sizeof(leadByteConstants->LEAD_BYTE_TO_SCRIPTS_DATA[0]); // data table - - UNewDataMemory *pData; - - long dataLength; - UDataInfo ucaInfo; - uprv_memcpy(&ucaInfo, &ucaDataInfo, sizeof(UDataInfo)); - uprv_memcpy(ucaInfo.dataVersion, UCAVersion, U_MAX_VERSION_LENGTH); - - pData=udata_create(outputDir, UCA_DATA_TYPE, UCA_DATA_NAME, &ucaInfo, - copyright, status); - if(U_FAILURE(*status)) { - fprintf(stderr, "Error: unable to create %s"UCA_DATA_NAME", error %s\n", outputDir, u_errorName(*status)); + int32_t numRanges = 0; + int32_t numRangeCodePoints = 0; + UChar32 rangeFirst = U_SENTINEL; + UChar32 rangeLast = U_SENTINEL; + uint32_t rangeFirstPrimary = 0; + uint32_t rangeLastPrimary = 0; + int32_t rangeStep = -1; + + // Detect ranges of characters in primary code point order, + // with 3-byte primaries and + // with consistent "step" differences between adjacent primaries. + // This relies on the FractionalUCA generator using the same primary-weight incrementation. + // Start at U+0180: No ranges for common Latin characters. + // Go one beyond maxCodePoint in case a range ends there. + for(UChar32 c = 0x180; c <= (maxCodePoint + 1); ++c) { + UBool action; + uint32_t p = builder.getLongPrimaryIfSingleCE(c); + if(p != 0) { + // p is a "long" (three-byte) primary. + if(rangeFirst >= 0 && c == (rangeLast + 1) && p > rangeLastPrimary) { + // Find the offset between the two primaries. 
+ int32_t step = CollationBaseDataBuilder::diffThreeBytePrimaries( + rangeLastPrimary, p, builder.isCompressiblePrimary(p)); + if(rangeFirst == rangeLast && step >= 2) { + // c == rangeFirst + 1, store the "step" between range primaries. + rangeStep = step; + rangeLast = c; + rangeLastPrimary = p; + action = 0; // continue range + } else if(rangeStep == step) { + // Continue the range with the same "step" difference. + rangeLast = c; + rangeLastPrimary = p; + action = 0; // continue range + } else { + action = 1; // maybe finish range, start a new one + } + } else { + action = 1; // maybe finish range, start a new one + } + } else { + action = -1; // maybe finish range, do not start a new one + } + if(action != 0 && rangeFirst >= 0) { + // Finish a range. + // Set offset CE32s for a long range, leave single CEs for a short range. + UBool didSetRange = builder.maybeSetPrimaryRange( + rangeFirst, rangeLast, + rangeFirstPrimary, rangeStep, *status); + if(U_FAILURE(*status)) { + fprintf(stderr, + "failure setting code point order range U+%04lx..U+%04lx " + "%08lx..%08lx step %d - %s\n", + (long)rangeFirst, (long)rangeLast, + (long)rangeFirstPrimary, (long)rangeLastPrimary, + (int)rangeStep, u_errorName(*status)); + } else if(didSetRange) { + int32_t rangeLength = rangeLast - rangeFirst + 1; + if(beVerbose) { + printf("* set code point order range U+%04lx..U+%04lx [%d] " + "%08lx..%08lx step %d\n", + (long)rangeFirst, (long)rangeLast, + (int)rangeLength, + (long)rangeFirstPrimary, (long)rangeLastPrimary, + (int)rangeStep); + } + ++numRanges; + numRangeCodePoints += rangeLength; + } + rangeFirst = U_SENTINEL; + rangeStep = -1; + } + if(action > 0) { + // Start a new range. + rangeFirst = rangeLast = c; + rangeFirstPrimary = rangeLastPrimary = p; + } + } + printf("** set %d ranges with %d code points\n", (int)numRanges, (int)numRangeCodePoints); + + // Idea: Probably best to work in two passes. 
+ // Pass 1 for reading all data, setting isCompressible flags (and reordering groups) + // and finding ranges. + // Then set the ranges in a newly initialized builder + // for optimal compression (makes sure that adjacent blocks can overlap easily). + // Then set all mappings outside the ranges. + // + // In the first pass, we could store mappings in a simple list, + // with single-character/single-long-primary-CE mappings in a UTrie2; + // or store the mappings in a temporary builder; + // or we could just parse the input file again in the second pass. + // + // Ideally set/copy U+0000..U+017F before setting anything else, + // then set default Han/Hangul, then set the ranges, then copy non-range mappings. + // It should be easy to copy mappings from an un-built builder to a new one. + // Add CollationDataBuilder::copyFrom(builder, code point, errorCode) -- copy contexts & expansions. + + if(UCAVersion[0] == 0 && UCAVersion[1] == 0 && UCAVersion[2] == 0 && UCAVersion[3] == 0) { + fprintf(stderr, "UCA version not specified. 
Cannot create data file!\n"); + fclose(data); return; } - /* write the data to the file */ if (beVerbose) { - printf("Writing out UCA table: %s%c%s.%s\n", outputDir, - U_FILE_SEP_CHAR, - U_ICUDATA_NAME "_" UCA_DATA_NAME, - UCA_DATA_TYPE); + printf("\nLines read: %u\n", (int)line); } - udata_writeBlock(pData, data, size); - // output the constants here - udata_writeBlock(pData, consts, sizeof(UCAConstants)); + fclose(data); - if (beVerbose) { - printf("first tertiary ignorable = %x %x\n", consts->UCA_FIRST_TERTIARY_IGNORABLE[0], consts->UCA_FIRST_TERTIARY_IGNORABLE[1]); - printf("last tertiary ignorable = %x %x\n", consts->UCA_LAST_TERTIARY_IGNORABLE[0], consts->UCA_LAST_TERTIARY_IGNORABLE[1]); - printf("first secondary ignorable = %x %x\n", consts->UCA_FIRST_SECONDARY_IGNORABLE[0], consts->UCA_FIRST_SECONDARY_IGNORABLE[1]); - printf("contractionUCACombosSize = %d\n", data->contractionUCACombosSize); - printf("contractionSize = %d\n", data->contractionSize); - printf("number of UCA contractions = %d\n", noOfcontractions); + return; +} + +static void +buildAndWriteBaseData(CollationBaseDataBuilder &builder, + const char *path, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { return; } + + if(getOptionValue("[fixed secondary common byte") != Collation::COMMON_BYTE) { + fprintf(stderr, "error: unexpected [fixed secondary common byte]"); + errorCode = U_INVALID_FORMAT_ERROR; + return; } - - if(noOfcontractions != 0) { - udata_writeBlock(pData, contractions, noOfcontractions*MAX_UCA_CONTRACTION_LENGTH*U_SIZEOF_UCHAR); - udata_writePadding(pData, paddedsize((noOfcontractions*MAX_UCA_CONTRACTION_LENGTH*U_SIZEOF_UCHAR)) - noOfcontractions*MAX_UCA_CONTRACTION_LENGTH*U_SIZEOF_UCHAR); + if(getOptionValue("[fixed tertiary common byte") != Collation::COMMON_BYTE) { + fprintf(stderr, "error: unexpected [fixed tertiary common byte]"); + errorCode = U_INVALID_FORMAT_ERROR; + return; } - // output the script to lead bytes table here - if (beVerbose) { - printf("Writing Script 
to Lead Byte Data\n"); - printf("\tindex table size = %x\n", leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT); - printf("\tdata block size = %x\n", leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_OFFSET); + if(leadByteScripts != NULL) { + uint32_t firstLead = Collation::MERGE_SEPARATOR_BYTE + 1; + do { + // Find the range of lead bytes with this set of scripts. + const UnicodeString &firstScripts = leadByteScripts[firstLead]; + if(firstScripts.isEmpty()) { + fprintf(stderr, "[top_byte 0x%02X] has no reorderable scripts\n", (int)firstLead); + errorCode = U_INVALID_FORMAT_ERROR; + return; + } + uint32_t lead = firstLead; + for(;;) { + ++lead; + const UnicodeString &scripts = leadByteScripts[lead]; + // The scripts should either be the same or disjoint. + // We do not test if all reordering groups have disjoint sets of scripts. + if(scripts.isEmpty() || firstScripts.indexOf(scripts[0]) < 0) { break; } + if(scripts != firstScripts) { + fprintf(stderr, + "[top_byte 0x%02X] includes script %d from [top_byte 0x%02X] " + "but not all scripts match\n", + (int)firstLead, scripts[0], (int)lead); + errorCode = U_INVALID_FORMAT_ERROR; + return; + } + } + // lead is one greater than the last lead byte with the same set of scripts as firstLead. 
+ builder.addReorderingGroup(firstLead, lead - 1, firstScripts, errorCode); + if(U_FAILURE(errorCode)) { return; } + firstLead = lead; + } while(firstLead < Collation::UNASSIGNED_IMPLICIT_BYTE); + delete[] leadByteScripts; + } + + CollationData data(*Normalizer2Factory::getNFCImpl(errorCode)); + builder.enableFastLatin(); + builder.build(data, errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "builder.build() failed: %s\n", + u_errorName(errorCode)); + return; } - udata_write16(pData, leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT); - udata_write16(pData, leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_OFFSET); -// printf("#### Script to Lead Byte Index Before Sort\n"); -// for (int reorderCodeIndex = 0; reorderCodeIndex < leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT; reorderCodeIndex++) { -// printf("\t%04x = %04x\n", leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX[reorderCodeIndex].reorderCode, leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX[reorderCodeIndex].offset); -// } - qsort(leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX, leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT, sizeof(leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX[0]), ReorderIndexComparer); - udata_writeBlock(pData, leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX, leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT * sizeof(leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX[0])); -// printf("#### Script to Lead Byte Index After Sort\n"); -// for (int reorderCodeIndex = 0; reorderCodeIndex < leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT; reorderCodeIndex++) { -// printf("\t%04x = %04x\n", leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX[reorderCodeIndex].reorderCode, leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX[reorderCodeIndex].offset); -// } - - // write out the script to lead bytes data block - udata_writeBlock(pData, leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA, leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_OFFSET * sizeof(*leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA)); - - if 
(beVerbose) { - printf("Writing Lead Byte To Script Data\n"); - printf("\tindex table size = %x\n", leadByteConstants->LEAD_BYTE_TO_SCRIPTS_INDEX_LENGTH); - printf("\tdata block size = %x\n", leadByteConstants->LEAD_BYTE_TO_SCRIPTS_DATA_OFFSET); + + // The CollationSettings constructor gives us the properly encoded + // default options, so that we need not duplicate them here. + CollationSettings settings; + + UVector32 rootElements(errorCode); + for(int32_t i = 0; i < CollationRootElements::IX_COUNT; ++i) { + rootElements.addElement(0, errorCode); } - // output the header info - udata_write16(pData, leadByteConstants->LEAD_BYTE_TO_SCRIPTS_INDEX_LENGTH); - udata_write16(pData, leadByteConstants->LEAD_BYTE_TO_SCRIPTS_DATA_OFFSET); - - // output the index table - udata_writeBlock(pData, leadByteConstants->LEAD_BYTE_TO_SCRIPTS_INDEX, - leadByteConstants->LEAD_BYTE_TO_SCRIPTS_INDEX_LENGTH * sizeof(leadByteConstants->LEAD_BYTE_TO_SCRIPTS_INDEX)[0]); -// for (int leadByte = 0; leadByte < leadByteConstants->LEAD_BYTE_TO_SCRIPTS_INDEX_LENGTH; leadByte++) { -// printf("\t%02x = %04x\n", leadByte, leadByteConstants->LEAD_BYTE_TO_SCRIPTS_INDEX[leadByte]); -// } - - // output the data - udata_writeBlock(pData, leadByteConstants->LEAD_BYTE_TO_SCRIPTS_DATA, - leadByteConstants->LEAD_BYTE_TO_SCRIPTS_DATA_OFFSET * sizeof(*leadByteConstants->LEAD_BYTE_TO_SCRIPTS_DATA)); - - - /* finish up */ - dataLength=udata_finish(pData, status); - if(U_FAILURE(*status)) { - fprintf(stderr, "Error: error %d writing the output file\n", *status); + builder.buildRootElementsTable(rootElements, errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "builder.buildRootElementsTable() failed: %s\n", + u_errorName(errorCode)); return; } -} + int32_t index = CollationRootElements::IX_COUNT; + rootElements.setElementAt(index, CollationRootElements::IX_FIRST_TERTIARY_INDEX); -enum { - /* - * Maximum number of UCA contractions we can store. - * May need to be increased for a new Unicode version. 
- */ - MAX_UCA_CONTRACTIONS=2048 -}; + while((rootElements.elementAti(index) & 0xffff0000) == 0) { ++index; } + rootElements.setElementAt(index, CollationRootElements::IX_FIRST_SECONDARY_INDEX); -static int32_t -write_uca_table(const char *filename, - const char *outputDir, - const char *copyright, - UErrorCode *status) -{ - FILE *data = fopen(filename, "r"); - if(data == NULL) { - fprintf(stderr, "Couldn't open file: %s\n", filename); - return -1; + while((rootElements.elementAti(index) & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) { + ++index; } - uint32_t line = 0; - UCAElements *element = NULL; - UCATableHeader *myD = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader)); - /* test for NULL */ - if(myD == NULL) { - *status = U_MEMORY_ALLOCATION_ERROR; - fclose(data); - return 0; + rootElements.setElementAt(index, CollationRootElements::IX_FIRST_PRIMARY_INDEX); + + rootElements.setElementAt(Collation::COMMON_SEC_AND_TER_CE, + CollationRootElements::IX_COMMON_SEC_AND_TER_CE); + + int32_t secTerBoundaries = (int32_t)getOptionValue("[fixed last secondary common byte") << 24; + secTerBoundaries |= (int32_t)getOptionValue("[fixed first ignorable secondary byte") << 16; + secTerBoundaries |= (int32_t)getOptionValue("[fixed first ignorable tertiary byte"); + rootElements.setElementAt(secTerBoundaries, CollationRootElements::IX_SEC_TER_BOUNDARIES); + + LocalMemory<uint8_t> buffer; + int32_t capacity = 1000000; + uint8_t *dest = buffer.allocateInsteadAndCopy(capacity); + if(dest == NULL) { + fprintf(stderr, "memory allocation (%ld bytes) for file contents failed\n", + (long)capacity); + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; } - uprv_memset(myD, 0, sizeof(UCATableHeader)); - UColOptionSet *opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet)); - /* test for NULL */ - if(opts == NULL) { - *status = U_MEMORY_ALLOCATION_ERROR; - uprv_free(myD); - fclose(data); - return 0; + int32_t indexes[CollationDataReader::IX_TOTAL_SIZE + 1]; + int32_t totalSize = 
CollationDataWriter::writeBase( + data, settings, + rootElements.getBuffer(), rootElements.size(), + indexes, dest, capacity, + errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "CollationDataWriter::writeBase(capacity = %ld) failed: %s\n", + (long)capacity, u_errorName(errorCode)); + return; } - uprv_memset(opts, 0, sizeof(UColOptionSet)); - UChar contractions[MAX_UCA_CONTRACTIONS][MAX_UCA_CONTRACTION_LENGTH]; - uprv_memset(contractions, 0, sizeof(contractions)); - uint32_t noOfContractions = 0; - UCAConstants consts; - uprv_memset(&consts, 0, sizeof(consts)); -#if 0 - UCAConstants consts = { - UCOL_RESET_TOP_VALUE, - UCOL_FIRST_PRIMARY_IGNORABLE, - UCOL_LAST_PRIMARY_IGNORABLE, - UCOL_LAST_PRIMARY_IGNORABLE_CONT, - UCOL_FIRST_SECONDARY_IGNORABLE, - UCOL_LAST_SECONDARY_IGNORABLE, - UCOL_FIRST_TERTIARY_IGNORABLE, - UCOL_LAST_TERTIARY_IGNORABLE, - UCOL_FIRST_VARIABLE, - UCOL_LAST_VARIABLE, - UCOL_FIRST_NON_VARIABLE, - UCOL_LAST_NON_VARIABLE, - - UCOL_NEXT_TOP_VALUE, -/* - UCOL_NEXT_FIRST_PRIMARY_IGNORABLE, - UCOL_NEXT_LAST_PRIMARY_IGNORABLE, - UCOL_NEXT_FIRST_SECONDARY_IGNORABLE, - UCOL_NEXT_LAST_SECONDARY_IGNORABLE, - UCOL_NEXT_FIRST_TERTIARY_IGNORABLE, - UCOL_NEXT_LAST_TERTIARY_IGNORABLE, - UCOL_NEXT_FIRST_VARIABLE, - UCOL_NEXT_LAST_VARIABLE, -*/ - - PRIMARY_IMPLICIT_MIN, - PRIMARY_IMPLICIT_MAX - }; -#endif - - //printf("Allocating LeadByteConstants\n"); - LeadByteConstants leadByteConstants; - uprv_memset(&leadByteConstants, 0x00, sizeof(LeadByteConstants)); - - leadByteConstants.SCRIPT_TO_LEAD_BYTES_INDEX_LENGTH = 256; - leadByteConstants.SCRIPT_TO_LEAD_BYTES_INDEX = (ReorderIndex*) uprv_malloc(leadByteConstants.SCRIPT_TO_LEAD_BYTES_INDEX_LENGTH * sizeof(ReorderIndex)); - uprv_memset(leadByteConstants.SCRIPT_TO_LEAD_BYTES_INDEX, 0x00, leadByteConstants.SCRIPT_TO_LEAD_BYTES_INDEX_LENGTH * sizeof(ReorderIndex)); - leadByteConstants.SCRIPT_TO_LEAD_BYTES_DATA_LENGTH = 1024; - leadByteConstants.SCRIPT_TO_LEAD_BYTES_DATA = (uint16_t*) 
uprv_malloc(leadByteConstants.SCRIPT_TO_LEAD_BYTES_DATA_LENGTH * sizeof(uint16_t)); - uprv_memset(leadByteConstants.SCRIPT_TO_LEAD_BYTES_DATA, 0x00, leadByteConstants.SCRIPT_TO_LEAD_BYTES_DATA_LENGTH * sizeof(uint16_t)); - //printf("\tFinished Allocating LeadByteConstants\n"); - - leadByteConstants.LEAD_BYTE_TO_SCRIPTS_INDEX_LENGTH = 256; - leadByteConstants.LEAD_BYTE_TO_SCRIPTS_INDEX = (uint16_t*) uprv_malloc(leadByteConstants.LEAD_BYTE_TO_SCRIPTS_INDEX_LENGTH * sizeof(uint16_t)); - uprv_memset(leadByteConstants.LEAD_BYTE_TO_SCRIPTS_INDEX, 0x8000 | USCRIPT_INVALID_CODE, leadByteConstants.LEAD_BYTE_TO_SCRIPTS_INDEX_LENGTH * sizeof(uint16_t)); - leadByteConstants.LEAD_BYTE_TO_SCRIPTS_DATA_LENGTH = 1024; - leadByteConstants.LEAD_BYTE_TO_SCRIPTS_DATA_OFFSET = 1; // offset by 1 to leave zero location for those lead bytes with no reorder codes - leadByteConstants.LEAD_BYTE_TO_SCRIPTS_DATA = (uint16_t*) uprv_malloc(leadByteConstants.LEAD_BYTE_TO_SCRIPTS_DATA_LENGTH * sizeof(uint16_t)); - uprv_memset(leadByteConstants.LEAD_BYTE_TO_SCRIPTS_DATA, 0x00, leadByteConstants.LEAD_BYTE_TO_SCRIPTS_DATA_LENGTH * sizeof(uint16_t)); - - uprv_memset(inverseTable, 0xDA, sizeof(int32_t)*3*0xFFFF); - - opts->variableTopValue = 0; - opts->strength = UCOL_TERTIARY; - opts->frenchCollation = UCOL_OFF; - opts->alternateHandling = UCOL_NON_IGNORABLE; /* attribute for handling variable elements*/ - opts->caseFirst = UCOL_OFF; /* who goes first, lower case or uppercase */ - opts->caseLevel = UCOL_OFF; /* do we have an extra case level */ - opts->normalizationMode = UCOL_OFF; /* attribute for normalization */ - opts->hiraganaQ = UCOL_OFF; /* attribute for JIS X 4061, used only in Japanese */ - opts->numericCollation = UCOL_OFF; - myD->jamoSpecial = FALSE; - - tempUCATable *t = uprv_uca_initTempTable(myD, opts, NULL, IMPLICIT_TAG, LEAD_SURROGATE_TAG, status); - if(U_FAILURE(*status)) - { - fprintf(stderr, "Failed to init UCA temp table: %s\n", u_errorName(*status)); - uprv_free(opts); - 
uprv_free(myD); - fclose(data); - return -1; + printf("*** CLDR root collation part sizes ***\n"); + CollationInfo::printSizes(totalSize, indexes); + printf("*** CLDR root collation size: %6ld (with file header but no copyright string)\n", + (long)totalSize + 32); // 32 bytes = DataHeader rounded up to 16-byte boundary + + CollationTailoring::makeBaseVersion(UCAVersion, ucaDataInfo.dataVersion); + UNewDataMemory *pData=udata_create(path, "icu", "ucadata", &ucaDataInfo, + withCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "genuca: udata_create(%s, ucadata.icu) failed - %s\n", + path, u_errorName(errorCode)); + return; } - // * set to zero - struct { - UChar32 start; - UChar32 end; - int32_t value; - } ranges[] = - { - {0xAC00, 0xD7B0, UCOL_SPECIAL_FLAG | (HANGUL_SYLLABLE_TAG << 24) }, //0 HANGUL_SYLLABLE_TAG,/* AC00-D7AF*/ - //{0xD800, 0xDC00, UCOL_SPECIAL_FLAG | (LEAD_SURROGATE_TAG << 24) }, //1 LEAD_SURROGATE_TAG, already set in utrie_open() /* D800-DBFF*/ - {0xDC00, 0xE000, UCOL_SPECIAL_FLAG | (TRAIL_SURROGATE_TAG << 24) }, //2 TRAIL_SURROGATE DC00-DFFF - // Now directly handled in the collation code by the swapCJK function. 
- //{0x3400, 0x4DB6, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //3 CJK_IMPLICIT_TAG, /* 0x3400-0x4DB5*/ - //{0x4E00, 0x9FA6, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //4 CJK_IMPLICIT_TAG, /* 0x4E00-0x9FA5*/ - //{0xF900, 0xFA2E, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //5 CJK_IMPLICIT_TAG, /* 0xF900-0xFA2D*/ - //{0x20000, 0x2A6D7, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //6 CJK_IMPLICIT_TAG, /* 0x20000-0x2A6D6*/ - //{0x2F800, 0x2FA1E, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //7 CJK_IMPLICIT_TAG, /* 0x2F800-0x2FA1D*/ - }; - uint32_t i = 0; - - for(i = 0; i<sizeof(ranges)/sizeof(ranges[0]); i++) { - /*ucmpe32_setRange32(t->mapping, ranges[i].start, ranges[i].end, ranges[i].value); */ - utrie_setRange32(t->mapping, ranges[i].start, ranges[i].end, ranges[i].value, TRUE); + udata_writeBlock(pData, dest, totalSize); + long dataLength = udata_finish(pData, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "genuca: error %s writing the output file\n", u_errorName(errorCode)); + return; } + if(dataLength != (long)totalSize) { + fprintf(stderr, + "udata_finish(ucadata.icu) reports %ld bytes written but should be %ld\n", + dataLength, (long)totalSize); + errorCode=U_INTERNAL_PROGRAM_ERROR; + } +} - int32_t surrogateCount = 0; - while(!feof(data)) { - if(U_FAILURE(*status)) { - fprintf(stderr, "Something returned an error %i (%s) while processing line %u of %s. Exiting...\n", - *status, u_errorName(*status), (int)line, filename); - exit(*status); - } - - line++; - if(beVerbose) { - printf("%u ", (int)line); +/** + * Adds each lead surrogate to the bmp set if any of the 1024 + * associated supplementary code points is in the supp set. + * These can be one and the same set.
+ */ +static void +setLeadSurrogatesForAssociatedSupplementary(UnicodeSet &bmp, const UnicodeSet &supp) { + UChar32 c = 0x10000; + for(UChar lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) { + if(supp.containsSome(c, c + 0x3ff)) { + bmp.add(lead); } - element = readAnElement(data, t, &consts, &leadByteConstants, status); - if(element != NULL) { - // we have read the line, now do something sensible with the read data! + } +} - // if element is a contraction, we want to add it to contractions[] - int32_t length = (int32_t)element->cSize; - if(length > 1 && element->cPoints[0] != 0xFDD0) { // this is a contraction - if(U16_IS_LEAD(element->cPoints[0]) && U16_IS_TRAIL(element->cPoints[1]) && length == 2) { - surrogateCount++; - } else { - if(noOfContractions>=MAX_UCA_CONTRACTIONS) { - fprintf(stderr, - "\nMore than %d contractions. Please increase MAX_UCA_CONTRACTIONS in genuca.cpp. " - "Exiting...\n", - (int)MAX_UCA_CONTRACTIONS); - exit(U_BUFFER_OVERFLOW_ERROR); - } - if(length > MAX_UCA_CONTRACTION_LENGTH) { - fprintf(stderr, - "\nLine %d: Contraction of length %d is too long. Please increase MAX_UCA_CONTRACTION_LENGTH in genuca.cpp. 
" - "Exiting...\n", - (int)line, (int)length); - exit(U_BUFFER_OVERFLOW_ERROR); - } - UChar *t = &contractions[noOfContractions][0]; - u_memcpy(t, element->cPoints, length); - t += length; - for(; length < MAX_UCA_CONTRACTION_LENGTH; ++length) { - *t++ = 0; +static int32_t +makeBMPFoldedBitSet(const UnicodeSet &set, uint8_t index[0x800], uint32_t bits[256], + UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { return 0; } + bits[0] = 0; // no bits set + bits[1] = 0xffffffff; // all bits set + int32_t bitsLength = 2; + int32_t i = 0; + for(UChar32 c = 0; c <= 0xffff; c += 0x20, ++i) { + if(set.containsNone(c, c + 0x1f)) { + index[i] = 0; + } else if(set.contains(c, c + 0x1f)) { + index[i] = 1; + } else { + uint32_t b = 0; + for(int32_t j = 0; j <= 0x1f; ++j) { + if(set.contains(c + j)) { + b |= (uint32_t)1 << j; } - noOfContractions++; - } } - else { - // TODO (claireho): does this work? Need more tests - // The following code is to handle the UCA pre-context rules - // for L/l with middle dot. We share the structures for contractionCombos. - // The format for pre-context character is - // contractions[0]: codepoint in element->cPoints[0] - // contractions[1]: '\0' to differentiate from a contraction - // contractions[2]: prefix char - if (element->prefixSize>0) { - if(length > 1 || element->prefixSize > 1) { - fprintf(stderr, - "\nLine %d: Character with prefix, " - "either too many characters or prefix too long.\n", - (int)line); - exit(U_INTERNAL_PROGRAM_ERROR); - } - if(noOfContractions>=MAX_UCA_CONTRACTIONS) { - fprintf(stderr, - "\nMore than %d contractions. Please increase MAX_UCA_CONTRACTIONS in genuca.cpp. 
" - "Exiting...\n", - (int)MAX_UCA_CONTRACTIONS); - exit(U_BUFFER_OVERFLOW_ERROR); - } - UChar *t = &contractions[noOfContractions][0]; - t[0]=element->cPoints[0]; - t[1]=0; - t[2]=element->prefixChars[0]; - t += 3; - for(length = 3; length < MAX_UCA_CONTRACTION_LENGTH; ++length) { - *t++ = 0; + int32_t k; + for(k = 2;; ++k) { + if(k == bitsLength) { + // new bit combination + if(bitsLength == 256) { + errorCode = U_BUFFER_OVERFLOW_ERROR; + return 0; } - noOfContractions++; + bits[bitsLength++] = b; + break; + } + if(bits[k] == b) { + // duplicate bit combination + break; } } - - /* we're first adding to inverse, because addAnElement will reverse the order */ - /* of code points and stuff... we don't want that to happen */ - if((element->CEs[0] >> 24) != 2) { - // Add every element except for the special minimum-weight character U+FFFE - // which has 02 weights. - // If we had 02 weights in the invuca table, then tailoring primary - // after an ignorable would try to put a weight before 02 which is not valid. - // We could fix this in a complicated way in the from-rule-string builder, - // but omitting this special element from invuca is simple and effective. - addToInverse(element, status); - } - if(!(length > 1 && element->cPoints[0] == 0xFDD0)) { - uprv_uca_addAnElement(t, element, status); - } + index[i] = k; } } + return bitsLength; +} - if(UCAVersion[0] == 0 && UCAVersion[1] == 0 && UCAVersion[2] == 0 && UCAVersion[3] == 0) { - fprintf(stderr, "UCA version not specified. 
Cannot create data file!\n"); - uprv_uca_closeTempTable(t); - uprv_free(opts); - uprv_free(myD); - fclose(data); - return -1; - } -/* { - uint32_t trieWord = utrie_get32(t->mapping, 0xDC01, NULL); - }*/ - - if (beVerbose) { - printf("\nLines read: %u\n", (int)line); - printf("Surrogate count: %i\n", (int)surrogateCount); - printf("Raw data breakdown:\n"); - /*printf("Compact array stage1 top: %i, stage2 top: %i\n", t->mapping->stage1Top, t->mapping->stage2Top);*/ - printf("Number of contractions: %u\n", (int)noOfContractions); - printf("Contraction image size: %u\n", (int)t->image->contractionSize); - printf("Expansions size: %i\n", (int)t->expansions->position); - } - - - /* produce canonical closure for table */ - /* first set up constants for implicit calculation */ - uprv_uca_initImplicitConstants(status); - /* do the closure */ - UnicodeSet closed; - int32_t noOfClosures = uprv_uca_canonicalClosure(t, NULL, &closed, status); - if(noOfClosures != 0) { - fprintf(stderr, "Warning: %i canonical closures occured!\n", (int)noOfClosures); - UnicodeString pattern; - std::string utf8; - closed.toPattern(pattern, TRUE).toUTF8String(utf8); - fprintf(stderr, "UTF-8 pattern string: %s\n", utf8.c_str()); - } - - /* test */ - UCATableHeader *myData = uprv_uca_assembleTable(t, status); - - if (beVerbose) { - printf("Compacted data breakdown:\n"); - /*printf("Compact array stage1 top: %i, stage2 top: %i\n", t->mapping->stage1Top, t->mapping->stage2Top);*/ - printf("Number of contractions: %u\n", (int)noOfContractions); - printf("Contraction image size: %u\n", (int)t->image->contractionSize); - printf("Expansions size: %i\n", (int)t->expansions->position); - } +// TODO: Make preparseucd.py write fcd_data.h mapping code point ranges to FCD16 values, +// use that rather than properties APIs. +// Then consider moving related logic for the unsafeBwdSet back from the loader into this builder. 
- if(U_FAILURE(*status)) { - fprintf(stderr, "Error creating table: %s\n", u_errorName(*status)); - uprv_uca_closeTempTable(t); - uprv_free(opts); - uprv_free(myD); - fclose(data); - return -1; +/** + * Builds data for the FCD check fast path. + * For details see the CollationFCD class comments. + */ +static void +buildAndWriteFCDData(const char *path, UErrorCode &errorCode) { + UnicodeSet lcccSet(UNICODE_STRING_SIMPLE("[[:^lccc=0:][\\udc00-\\udfff]]"), errorCode); + UnicodeSet tcccSet(UNICODE_STRING_SIMPLE("[:^tccc=0:]"), errorCode); + if(U_FAILURE(errorCode)) { return; } + setLeadSurrogatesForAssociatedSupplementary(tcccSet, tcccSet); + // The following supp(lccc)->lead(tccc) should be unnecessary + // after the previous supp(tccc)->lead(tccc) + // because there should not be any characters with lccc!=0 and tccc=0. + // It is safe and harmless. + setLeadSurrogatesForAssociatedSupplementary(tcccSet, lcccSet); + setLeadSurrogatesForAssociatedSupplementary(lcccSet, lcccSet); + uint8_t lcccIndex[0x800], tcccIndex[0x800]; + uint32_t lcccBits[256], tcccBits[256]; + int32_t lcccBitsLength = makeBMPFoldedBitSet(lcccSet, lcccIndex, lcccBits, errorCode); + int32_t tcccBitsLength = makeBMPFoldedBitSet(tcccSet, tcccIndex, tcccBits, errorCode); + printf("@@@ lcccBitsLength=%d -> %d bytes\n", lcccBitsLength, 0x800 + lcccBitsLength * 4); + printf("@@@ tcccBitsLength=%d -> %d bytes\n", tcccBitsLength, 0x800 + tcccBitsLength * 4); + + if(U_FAILURE(errorCode)) { return; } + + FILE *f=usrc_create(path, "collationfcd.cpp", + "icu/tools/unicode/c/genuca/genuca.cpp"); + if(f==NULL) { + errorCode=U_FILE_ACCESS_ERROR; + return; } + fputs("#include \"unicode/utypes.h\"\n\n", f); + fputs("#if !UCONFIG_NO_COLLATION\n\n", f); + fputs("#include \"collationfcd.h\"\n\n", f); + fputs("U_NAMESPACE_BEGIN\n\n", f); + usrc_writeArray(f, + "const uint8_t CollationFCD::lcccIndex[%ld]={\n", + lcccIndex, 8, 0x800, + "\n};\n\n"); + usrc_writeArray(f, + "const uint32_t CollationFCD::lcccBits[%ld]={\n", + 
lcccBits, 32, lcccBitsLength, + "\n};\n\n"); + usrc_writeArray(f, + "const uint8_t CollationFCD::tcccIndex[%ld]={\n", + tcccIndex, 8, 0x800, + "\n};\n\n"); + usrc_writeArray(f, + "const uint32_t CollationFCD::tcccBits[%ld]={\n", + tcccBits, 32, tcccBitsLength, + "\n};\n\n"); + fputs("U_NAMESPACE_END\n\n", f); + fputs("#endif // !UCONFIG_NO_COLLATION\n", f); + fclose(f); +} - /* populate the version info struct with version info*/ - myData->version[0] = UCOL_BUILDER_VERSION; - myData->version[1] = UCAVersion[0]; - myData->version[2] = UCAVersion[1]; - myData->version[3] = UCAVersion[2]; - /*TODO:The fractional rules version should be taken from FractionalUCA.txt*/ - // Removed this macro. Instead, we use the fields below - //myD->version[1] = UCOL_FRACTIONAL_UCA_VERSION; - //myD->UCAVersion = UCAVersion; // out of FractionalUCA.txt - uprv_memcpy(myData->UCAVersion, UCAVersion, sizeof(UVersionInfo)); - u_getUnicodeVersion(myData->UCDVersion); - - writeOutData(myData, &consts, &leadByteConstants, contractions, noOfContractions, outputDir, copyright, status); - - InverseUCATableHeader *inverse = assembleInverseTable(status); - uprv_memcpy(inverse->UCAVersion, UCAVersion, sizeof(UVersionInfo)); - writeOutInverseData(inverse, outputDir, copyright, status); - - uprv_uca_closeTempTable(t); - uprv_free(myD); - uprv_free(opts); - - uprv_free(myData); - uprv_free(inverse); - - uprv_free(leadByteConstants.LEAD_BYTE_TO_SCRIPTS_INDEX); - uprv_free(leadByteConstants.LEAD_BYTE_TO_SCRIPTS_DATA); - uprv_free(leadByteConstants.SCRIPT_TO_LEAD_BYTES_INDEX); - uprv_free(leadByteConstants.SCRIPT_TO_LEAD_BYTES_DATA); - - fclose(data); - - return 0; +static void +parseAndWriteCollationRootData( + const char *fracUCAPath, + const char *binaryDataPath, + const char *sourceCodePath, + UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { return; } + CollationBaseDataBuilder builder(errorCode); + builder.init(errorCode); + parseFractionalUCA(fracUCAPath, builder, &errorCode); + 
buildAndWriteBaseData(builder, binaryDataPath, errorCode); + buildAndWriteFCDData(sourceCodePath, errorCode); } -#endif /* #if !UCONFIG_NO_COLLATION */ +// ------------------------------------------------------------------------- *** enum { HELP_H, HELP_QUESTION_MARK, - COPYRIGHT, - VERSION, VERBOSE, - ICUDATADIR + COPYRIGHT }; -/* Keep these values in sync with the above enums */ static UOption options[]={ UOPTION_HELP_H, UOPTION_HELP_QUESTION_MARK, - UOPTION_COPYRIGHT, - UOPTION_VERSION, UOPTION_VERBOSE, - UOPTION_ICUDATADIR + UOPTION_COPYRIGHT }; -int main(int argc, char* argv[]) { - uprv_memset(&UCAVersion, 0, 4); - +extern "C" int +main(int argc, char* argv[]) { U_MAIN_INIT_ARGS(argc, argv); + argc=u_parseArgs(argc, argv, LENGTHOF(options), options); /* error handling, printing usage message */ @@ -1525,108 +1045,61 @@ int main(int argc, char* argv[]) { "error in command line argument \"%s\"\n", argv[-argc]); } - if(argc<2 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) { + if( argc<2 || + options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur + ) { + /* + * Broken into chunks because the C89 standard says the minimum + * required supported string length is 509 bytes. + */ + fprintf(stderr, + "Usage: %s [-options] path/to/ICU/src/root\n" + "\n" + "Reads path/to/ICU/src/root/source/data/unidata/FractionalUCA.txt and\n" + "writes source and binary data files with the collation root data.\n" + "\n", + argv[0]); fprintf(stderr, - "usage: %s [-options] path/to/ICU/src/root\n" - "\tRead in UCA collation text data and write out the binary collation data\n" - "options:\n" + "Options:\n" "\t-h or -? 
or --help this usage text\n" - "\t-V or --version show a version message\n" - "\t-c or --copyright include a copyright notice\n" - "\t-v or --verbose turn on verbose output\n" - "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" - "\t followed by path, defaults to %s\n", - argv[0], u_getDataDirectory()); - return argc<2 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; - } - if(options[VERSION].doesOccur) { - printf("genuca version %hu.%hu, ICU tool to read UCA text data and create UCA data tables for collation.\n", -#if UCONFIG_NO_COLLATION - 0, 0 -#else - UCA_FORMAT_VERSION_0, UCA_FORMAT_VERSION_1 -#endif - ); - printf(U_COPYRIGHT_STRING"\n"); - exit(0); + "\t-v or --verbose verbose output\n" + "\t-c or --copyright include a copyright notice\n"); + return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; } - /* get the options values */ - beVerbose = options[VERBOSE].doesOccur; - - const char *copyright = NULL; - if (options[COPYRIGHT].doesOccur) { - copyright = U_COPYRIGHT_STRING; - } + beVerbose=options[VERBOSE].doesOccur; + withCopyright=options[COPYRIGHT].doesOccur; - if (options[ICUDATADIR].doesOccur) { - u_setDataDirectory(options[ICUDATADIR].value); - } - /* Initialize ICU */ IcuToolErrorCode errorCode("genuca"); - u_init(errorCode); - if (errorCode.isFailure() && errorCode.get() != U_FILE_ACCESS_ERROR) { - fprintf(stderr, "%s: can not initialize ICU. 
status = %s\n", - argv[0], errorCode.errorName()); - exit(errorCode.reset()); - } - errorCode.reset(); CharString icuSrcRoot(argv[1], errorCode); - CharString icuSourceData(icuSrcRoot, errorCode); - icuSourceData.appendPathPart("source", errorCode); - icuSourceData.appendPathPart("data", errorCode); - - CharString srcDir(icuSourceData, errorCode); - srcDir.appendPathPart("unidata", errorCode); - - CharString destDir(icuSourceData, errorCode); - destDir.appendPathPart("in", errorCode); - destDir.appendPathPart("coll", errorCode); + CharString icuSource(icuSrcRoot, errorCode); + icuSource.appendPathPart("source", errorCode); - CharString ucaFile(srcDir, errorCode); - ucaFile.appendPathPart("FractionalUCA.txt", errorCode); + CharString icuSourceData(icuSource, errorCode); + icuSourceData.appendPathPart("data", errorCode); - if(errorCode.isFailure()) { - fprintf(stderr, "genuca: unable to build file paths - %s\n", - errorCode.errorName()); - return errorCode.reset(); - } + CharString fracUCAPath(icuSourceData, errorCode); + fracUCAPath.appendPathPart("unidata", errorCode); + fracUCAPath.appendPathPart("FractionalUCA.txt", errorCode); -#if UCONFIG_NO_COLLATION + CharString sourceDataInColl(icuSourceData, errorCode); + sourceDataInColl.appendPathPart("in", errorCode); + sourceDataInColl.appendPathPart("coll", errorCode); - UNewDataMemory *pData; - const char *msg; - - msg = "genuca writes dummy " UCA_DATA_NAME "." UCA_DATA_TYPE " because of UCONFIG_NO_COLLATION, see uconfig.h"; - fprintf(stderr, "%s\n", msg); - pData = udata_create(destDir.data(), UCA_DATA_TYPE, UCA_DATA_NAME, &dummyDataInfo, - NULL, errorCode); - udata_writeBlock(pData, msg, strlen(msg)); - udata_finish(pData, errorCode); - - msg = "genuca writes dummy " INVC_DATA_NAME "." 
INVC_DATA_TYPE " because of UCONFIG_NO_COLLATION, see uconfig.h"; - fprintf(stderr, "%s\n", msg); - pData = udata_create(destDir.data(), INVC_DATA_TYPE, INVC_DATA_NAME, &dummyDataInfo, - NULL, errorCode); - udata_writeBlock(pData, msg, strlen(msg)); - udata_finish(pData, errorCode); - - return errorCode.reset(); + CharString sourceI18n(icuSource, errorCode); + sourceI18n.appendPathPart("i18n", errorCode); -#else + errorCode.assertSuccess(); - return write_uca_table(ucaFile.data(), destDir.data(), copyright, errorCode); + parseAndWriteCollationRootData( + fracUCAPath.data(), + sourceDataInColl.data(), + sourceI18n.data(), + errorCode); -#endif + return errorCode; } -/* - * Hey, Emacs, please set the following: - * - * Local Variables: - * indent-tabs-mode: nil - * End: - * - */ +#endif // UCONFIG_NO_COLLATION diff --git a/tools/unicode/c/genuca/genuca.h b/tools/unicode/c/genuca/genuca.h deleted file mode 100644 index a98e6699428..00000000000 --- a/tools/unicode/c/genuca/genuca.h +++ /dev/null @@ -1,47 +0,0 @@ -/* -******************************************************************************* -* -* Copyright (C) 2000-2004, International Business Machines -* Corporation and others. All Rights Reserved. -* -******************************************************************************* -* file name: genuca.h -* encoding: US-ASCII -* tab size: 8 (not used) -* indentation:4 -* -* created at the end of XX century -* created by: Vladimir Weinstein -* -* This program reads the Franctional UCA table and generates -* internal format for UCA table as well as inverse UCA table. 
-* It then writes binary files containing the data: ucadata.dat -* & invuca.dat -*/ - -#ifndef UCADATA_H -#define UCADATA_H - -#include "unicode/utypes.h" - -#if !UCONFIG_NO_COLLATION - -#include "ucol_elm.h" -#include -#include -#include "unicode/utypes.h" -#include "unicode/uchar.h" -#include "ucol_imp.h" -#include "uhash.h" -#include "unewdata.h" - - -void deleteElement(void *element); -int32_t readElement(char **from, char *to, char separator, UErrorCode *status); -uint32_t getSingleCEValue(char *primary, char *secondary, char *tertiary, UBool caseBit, UErrorCode *status); -void printOutTable(UCATableHeader *myData, UErrorCode *status); -UCAElements *readAnElement(FILE *data, tempUCATable *t, UCAConstants *consts, UErrorCode *status); - -#endif /* #if !UCONFIG_NO_COLLATION */ - -#endif diff --git a/tools/unicode/c/genuca/genuca.vcproj b/tools/unicode/c/genuca/genuca.vcproj index 0056316e727..9c6919350ae 100644 --- a/tools/unicode/c/genuca/genuca.vcproj +++ b/tools/unicode/c/genuca/genuca.vcproj @@ -402,10 +402,6 @@ Name="Header Files" Filter="h;hpp;hxx;hm;inl" > - -