OUTPUT_FILENAME,
UNICODE_VERSION,
WRITE_C_SOURCE,
+ WRITE_COMBINED_DATA,
OPT_FAST
};
UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),
UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
UOPTION_DEF("csource", '\1', UOPT_NO_ARG),
+ UOPTION_DEF("combined", '\1', UOPT_NO_ARG),
UOPTION_DEF("fast", '\1', UOPT_NO_ARG)
};
if( argc<2 ||
options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
) {
- /*
- * Broken into chunks because the C89 standard says the minimum
- * required supported string length is 509 bytes.
- */
fprintf(stderr,
"Usage: %s [-options] infiles+ -o outputfilename\n"
"\n"
"Reads the infiles with normalization data and\n"
- "creates a binary or C source file (outputfilename) with the data.\n"
+ "creates a binary file, or a C source file (--csource), with the data,\n"
+ "or writes a data file with the combined data (--combined).\n"
+ "See http://userguide.icu-project.org/transforms/normalization#TOC-Data-File-Syntax\n"
+ "\n"
+ "Alternate usage: %s [-options] a.txt b.txt minus p.txt q.txt -o outputfilename\n"
+ "\n"
+ "Computes the difference of (a, b) minus (p, q) and writes the diff data\n"
+ "in input-file syntax to the outputfilename.\n"
+ "It is then possible to build (p, q, diff) to get the same data as (a, b).\n"
+ "(Useful for computing minimal incremental mapping data files.)\n"
"\n",
- argv[0]);
+ argv[0], argv[0]);
fprintf(stderr,
"Options:\n"
"\t-h or -? or --help this usage text\n"
fprintf(stderr,
"\t-s or --sourcedir source directory, followed by the path\n"
"\t-o or --output output filename\n"
- "\t --csource writes a C source file with initializers\n");
+ "\t --csource writes a C source file with initializers\n"
+ "\t --combined writes a .txt file (input-file syntax) with the\n"
+ "\t combined data from all of the input files\n");
fprintf(stderr,
"\t --fast optimize the data for fast normalization,\n"
"\t which might increase its size (Writes fully decomposed\n"
#else
- LocalPointer<Normalizer2DataBuilder> builder(new Normalizer2DataBuilder(errorCode), errorCode);
+ LocalPointer<Normalizer2DataBuilder> b1(new Normalizer2DataBuilder(errorCode), errorCode);
+ LocalPointer<Normalizer2DataBuilder> b2;
+ LocalPointer<Normalizer2DataBuilder> diff;
+ Normalizer2DataBuilder *builder = b1.getAlias();
errorCode.assertSuccess();
if(options[UNICODE_VERSION].doesOccur) {
pathLength=filename.length();
}
+ bool doMinus = false;
for(int i=1; i<argc; ++i) {
printf("gennorm2: processing %s\n", argv[i]);
+ if(strcmp(argv[i], "minus") == 0) {
+ if(doMinus) {
+ fprintf(stderr, "gennorm2 error: only one 'minus' can be specified\n");
+ exit(U_ILLEGAL_ARGUMENT_ERROR);
+ }
+ // Data from previous input files has been collected in b1.
+ // Collect data from further input files in b2.
+ b2.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode);
+ diff.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode);
+ errorCode.assertSuccess();
+ builder = b2.getAlias();
+ if(options[UNICODE_VERSION].doesOccur) {
+ builder->setUnicodeVersion(options[UNICODE_VERSION].value);
+ }
+ if(options[OPT_FAST].doesOccur) {
+ builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
+ }
+ doMinus = true;
+ continue;
+ }
filename.append(argv[i], errorCode);
LocalStdioFilePointer f(fopen(filename.data(), "r"));
if(f==NULL) {
filename.truncate(pathLength);
}
- if(options[WRITE_C_SOURCE].doesOccur) {
+ if(doMinus) {
+ Normalizer2DataBuilder::computeDiff(*b1, *b2, *diff);
+ diff->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ true);
+ } else if(options[WRITE_COMBINED_DATA].doesOccur) {
+ builder->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ false);
+ } else if(options[WRITE_C_SOURCE].doesOccur) {
builder->writeCSourceFile(options[OUTPUT_FILENAME].value);
} else {
builder->writeBinaryFile(options[OUTPUT_FILENAME].value);
#include "unicode/localpointer.h"
#include "unicode/putil.h"
#include "unicode/udata.h"
+#include "unicode/uniset.h"
#include "unicode/unistr.h"
+#include "unicode/usetiter.h"
#include "unicode/ustring.h"
#include "charstr.h"
#include "extradata.h"
void Normalizer2DataBuilder::setCC(UChar32 c, uint8_t cc) {
norms.createNorm(c)->cc=cc;
+ norms.ccSet.add(c);
}
static UBool isWellFormed(const UnicodeString &s) {
p->mapping=new UnicodeString(m);
p->mappingType=Norm::ONE_WAY;
p->setMappingCP();
+ norms.mappingSet.add(c);
}
void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString &m) {
p->mapping=new UnicodeString(m);
p->mappingType=Norm::ROUND_TRIP;
p->mappingCP=U_SENTINEL;
+ norms.mappingSet.add(c);
}
void Normalizer2DataBuilder::removeMapping(UChar32 c) {
// createNorm(c), not getNorm(c), to record a non-mapping and detect conflicting data.
Norm *p=checkNormForMapping(norms.createNorm(c), c);
p->mappingType=Norm::REMOVED;
+ norms.mappingSet.add(c);
}
UBool Normalizer2DataBuilder::mappingHasCompBoundaryAfter(const BuilderReorderingBuffer &buffer) const {
fclose(f);
}
+namespace {
+
+bool equalStrings(const UnicodeString *s1, const UnicodeString *s2) {
+ if(s1 == nullptr) {
+ return s2 == nullptr;
+ } else if(s2 == nullptr) {
+ return false;
+ } else {
+ return *s1 == *s2;
+ }
+}
+
+const char *typeChars = "?-=>";
+
+void writeMapping(FILE *f, const UnicodeString *m) {
+ if(m != nullptr && !m->isEmpty()) {
+ int32_t i = 0;
+ UChar32 c = m->char32At(i);
+ fprintf(f, "%04lX", (long)c);
+ while((i += U16_LENGTH(c)) < m->length()) {
+ c = m->char32At(i);
+ fprintf(f, " %04lX", (long)c);
+ }
+ }
+ fputs("\n", f);
+}
+
+} // namespace
+
+void
+Normalizer2DataBuilder::writeDataFile(const char *filename, bool writeRemoved) const {
+ // Do not processData() before writing the input-syntax data file.
+ FILE *f = fopen(filename, "w");
+ if(f == nullptr) {
+ fprintf(stderr, "gennorm2/writeDataFile() error: unable to create the output file %s\n",
+ filename);
+ exit(U_FILE_ACCESS_ERROR);
+ return;
+ }
+
+ if(unicodeVersion[0] != 0 || unicodeVersion[1] != 0 ||
+ unicodeVersion[2] != 0 || unicodeVersion[3] != 0) {
+ char uv[U_MAX_VERSION_STRING_LENGTH];
+ u_versionToString(unicodeVersion, uv);
+ fprintf(f, "* Unicode %s\n\n", uv);
+ }
+
+ UnicodeSetIterator ccIter(norms.ccSet);
+ UChar32 start = U_SENTINEL;
+ UChar32 end = U_SENTINEL;
+ uint8_t prevCC = 0;
+ bool done = false;
+ bool didWrite = false;
+ do {
+ UChar32 c;
+ uint8_t cc;
+ if(ccIter.next() && !ccIter.isString()) {
+ c = ccIter.getCodepoint();
+ cc = norms.getCC(c);
+ } else {
+ c = 0x110000;
+ cc = 0;
+ done = true;
+ }
+ if(cc == prevCC && c == (end + 1)) {
+ end = c;
+ } else {
+ if(prevCC != 0) {
+ if(start == end) {
+ fprintf(f, "%04lX:%d\n", (long)start, (int)prevCC);
+ } else {
+ fprintf(f, "%04lX..%04lX:%d\n", (long)start, (long)end, (int)prevCC);
+ }
+ didWrite = true;
+ }
+ start = end = c;
+ prevCC = cc;
+ }
+ } while(!done);
+ if(didWrite) {
+ fputs("\n", f);
+ }
+
+ UnicodeSetIterator mIter(norms.mappingSet);
+ start = U_SENTINEL;
+ end = U_SENTINEL;
+ const UnicodeString *prevMapping = nullptr;
+ Norm::MappingType prevType = Norm::NONE;
+ done = false;
+ do {
+ UChar32 c;
+ const Norm *norm;
+ if(mIter.next() && !mIter.isString()) {
+ c = mIter.getCodepoint();
+ norm = norms.getNorm(c);
+ } else {
+ c = 0x110000;
+ norm = nullptr;
+ done = true;
+ }
+ const UnicodeString *mapping;
+ Norm::MappingType type;
+ if(norm == nullptr) {
+ mapping = nullptr;
+ type = Norm::NONE;
+ } else {
+ type = norm->mappingType;
+ if(type == Norm::NONE) {
+ mapping = nullptr;
+ } else {
+ mapping = norm->mapping;
+ }
+ }
+ if(type == prevType && equalStrings(mapping, prevMapping) && c == (end + 1)) {
+ end = c;
+ } else {
+ if(writeRemoved ? prevType != Norm::NONE : prevType > Norm::REMOVED) {
+ if(start == end) {
+ fprintf(f, "%04lX%c", (long)start, typeChars[prevType]);
+ } else {
+ fprintf(f, "%04lX..%04lX%c", (long)start, (long)end, typeChars[prevType]);
+ }
+ writeMapping(f, prevMapping);
+ }
+ start = end = c;
+ prevMapping = mapping;
+ prevType = type;
+ }
+ } while(!done);
+
+ fclose(f);
+}
+
+void
+Normalizer2DataBuilder::computeDiff(const Normalizer2DataBuilder &b1,
+ const Normalizer2DataBuilder &b2,
+ Normalizer2DataBuilder &diff) {
+ // Compute diff = b1 - b2
+ // so that we should be able to get b1 = b2 + diff.
+ if(0 != memcmp(b1.unicodeVersion, b2.unicodeVersion, U_MAX_VERSION_LENGTH)) {
+ memcpy(diff.unicodeVersion, b1.unicodeVersion, U_MAX_VERSION_LENGTH);
+ }
+
+ UnicodeSet ccSet(b1.norms.ccSet);
+ ccSet.addAll(b2.norms.ccSet);
+ UnicodeSetIterator ccIter(ccSet);
+ while(ccIter.next() && !ccIter.isString()) {
+ UChar32 c = ccIter.getCodepoint();
+ uint8_t cc1 = b1.norms.getCC(c);
+ uint8_t cc2 = b2.norms.getCC(c);
+ if(cc1 != cc2) {
+ diff.setCC(c, cc1);
+ }
+ }
+
+ UnicodeSet mSet(b1.norms.mappingSet);
+ mSet.addAll(b2.norms.mappingSet);
+ UnicodeSetIterator mIter(mSet);
+ while(mIter.next() && !mIter.isString()) {
+ UChar32 c = mIter.getCodepoint();
+ const Norm *norm1 = b1.norms.getNorm(c);
+ const Norm *norm2 = b2.norms.getNorm(c);
+ const UnicodeString *mapping1;
+ Norm::MappingType type1;
+ if(norm1 == nullptr || !norm1->hasMapping()) {
+ mapping1 = nullptr;
+ type1 = Norm::NONE;
+ } else {
+ mapping1 = norm1->mapping;
+ type1 = norm1->mappingType;
+ }
+ const UnicodeString *mapping2;
+ Norm::MappingType type2;
+ if(norm2 == nullptr || !norm2->hasMapping()) {
+ mapping2 = nullptr;
+ type2 = Norm::NONE;
+ } else {
+ mapping2 = norm2->mapping;
+ type2 = norm2->mappingType;
+ }
+ if(type1 == type2 && equalStrings(mapping1, mapping2)) {
+ // Nothing to do.
+ } else if(type1 == Norm::NONE) {
+ diff.removeMapping(c);
+ } else if(type1 == Norm::ROUND_TRIP) {
+ diff.setRoundTripMapping(c, *mapping1);
+ } else if(type1 == Norm::ONE_WAY) {
+ diff.setOneWayMapping(c, *mapping1);
+ }
+ }
+}
+
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_NORMALIZATION */