From: Markus Scherer Date: Wed, 23 Aug 2017 23:33:47 +0000 (+0000) Subject: ICU-13326 gennorm2 --combined option to write the combined data of the input files... X-Git-Tag: release-60-rc~175 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=d287dbbe3042067b8690739c913c920f00cf90ef;p=icu ICU-13326 gennorm2 --combined option to write the combined data of the input files; and gennorm2 minus operator to write the diffs of the combined data from two sets of input files X-SVN-Rev: 40349 --- diff --git a/icu4c/source/tools/gennorm2/gennorm2.cpp b/icu4c/source/tools/gennorm2/gennorm2.cpp index 477bfd62ad5..69d0a38c235 100644 --- a/icu4c/source/tools/gennorm2/gennorm2.cpp +++ b/icu4c/source/tools/gennorm2/gennorm2.cpp @@ -61,6 +61,7 @@ enum { OUTPUT_FILENAME, UNICODE_VERSION, WRITE_C_SOURCE, + WRITE_COMBINED_DATA, OPT_FAST }; @@ -73,6 +74,7 @@ static UOption options[]={ UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG), UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG), UOPTION_DEF("csource", '\1', UOPT_NO_ARG), + UOPTION_DEF("combined", '\1', UOPT_NO_ARG), UOPTION_DEF("fast", '\1', UOPT_NO_ARG) }; @@ -96,17 +98,22 @@ main(int argc, char* argv[]) { if( argc<2 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur ) { - /* - * Broken into chunks because the C89 standard says the minimum - * required supported string length is 509 bytes. - */ fprintf(stderr, "Usage: %s [-options] infiles+ -o outputfilename\n" "\n" "Reads the infiles with normalization data and\n" - "creates a binary or C source file (outputfilename) with the data.\n" + "creates a binary file, or a C source file (--csource), with the data,\n" + "or writes a data file with the combined data (--combined).\n" + "See http://userguide.icu-project.org/transforms/normalization#TOC-Data-File-Syntax\n" + "\n" + "Alternate usage: %s [-options] a.txt b.txt minus p.txt q.txt -o outputfilename\n" + "\n" + "Computes the difference of (a, b) minus (p, q) and writes the diff data\n" + "in input-file syntax to the outputfilename.\n" + "It is then possible to build (p, q, diff) to get the same data as (a, b).\n" + "(Useful for computing minimal incremental mapping data files.)\n" "\n", - argv[0]); + argv[0], argv[0]); fprintf(stderr, "Options:\n" "\t-h or -? or --help this usage text\n" @@ -116,7 +123,9 @@ main(int argc, char* argv[]) { fprintf(stderr, "\t-s or --sourcedir source directory, followed by the path\n" "\t-o or --output output filename\n" - "\t --csource writes a C source file with initializers\n"); + "\t --csource writes a C source file with initializers\n" + "\t --combined writes a .txt file (input-file syntax) with the\n" + "\t combined data from all of the input files\n"); fprintf(stderr, "\t --fast optimize the data for fast normalization,\n" "\t which might increase its size (Writes fully decomposed\n" @@ -144,7 +153,10 @@ main(int argc, char* argv[]) { #else - LocalPointer builder(new Normalizer2DataBuilder(errorCode), errorCode); + LocalPointer b1(new Normalizer2DataBuilder(errorCode), errorCode); + LocalPointer b2; + LocalPointer diff; + Normalizer2DataBuilder *builder = b1.getAlias(); errorCode.assertSuccess(); if(options[UNICODE_VERSION].doesOccur) { @@ -166,8 +178,29 @@ main(int argc, char* argv[]) { pathLength=filename.length(); } + bool doMinus = false; for(int i=1; isetUnicodeVersion(options[UNICODE_VERSION].value); + } + if(options[OPT_FAST].doesOccur) { + builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST); + } + doMinus = true; + continue; + } filename.append(argv[i], errorCode); LocalStdioFilePointer f(fopen(filename.data(), "r")); if(f==NULL) { @@ -179,7 +212,12 @@ main(int argc, char* argv[]) { filename.truncate(pathLength); } - if(options[WRITE_C_SOURCE].doesOccur) { + if(doMinus) { + Normalizer2DataBuilder::computeDiff(*b1, *b2, *diff); + diff->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ true); + } else if(options[WRITE_COMBINED_DATA].doesOccur) { + builder->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ false); + } else if(options[WRITE_C_SOURCE].doesOccur) { builder->writeCSourceFile(options[OUTPUT_FILENAME].value); } else { builder->writeBinaryFile(options[OUTPUT_FILENAME].value); diff --git a/icu4c/source/tools/gennorm2/n2builder.cpp b/icu4c/source/tools/gennorm2/n2builder.cpp index 9bad2d550e1..b457fe216ae 100644 --- a/icu4c/source/tools/gennorm2/n2builder.cpp +++ b/icu4c/source/tools/gennorm2/n2builder.cpp @@ -30,7 +30,9 @@ #include "unicode/localpointer.h" #include "unicode/putil.h" #include "unicode/udata.h" +#include "unicode/uniset.h" #include "unicode/unistr.h" +#include "unicode/usetiter.h" #include "unicode/ustring.h" #include "charstr.h" #include "extradata.h" @@ -146,6 +148,7 @@ void Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh) { void Normalizer2DataBuilder::setCC(UChar32 c, uint8_t cc) { norms.createNorm(c)->cc=cc; + norms.ccSet.add(c); } static UBool isWellFormed(const UnicodeString &s) { @@ -166,6 +169,7 @@ void Normalizer2DataBuilder::setOneWayMapping(UChar32 c, const UnicodeString &m) p->mapping=new UnicodeString(m); p->mappingType=Norm::ONE_WAY; p->setMappingCP(); + norms.mappingSet.add(c); } void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString &m) { @@ -195,12 +199,14 @@ void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString p->mapping=new UnicodeString(m); p->mappingType=Norm::ROUND_TRIP; p->mappingCP=U_SENTINEL; + norms.mappingSet.add(c); } void Normalizer2DataBuilder::removeMapping(UChar32 c) { // createNorm(c), not getNorm(c), to record a non-mapping and detect conflicting data. Norm *p=checkNormForMapping(norms.createNorm(c), c); p->mappingType=Norm::REMOVED; + norms.mappingSet.add(c); } UBool Normalizer2DataBuilder::mappingHasCompBoundaryAfter(const BuilderReorderingBuffer &buffer) const { @@ -832,6 +838,198 @@ Normalizer2DataBuilder::writeCSourceFile(const char *filename) { fclose(f); } +namespace { + +bool equalStrings(const UnicodeString *s1, const UnicodeString *s2) { + if(s1 == nullptr) { + return s2 == nullptr; + } else if(s2 == nullptr) { + return false; + } else { + return *s1 == *s2; + } +} + +const char *typeChars = "?-=>"; + +void writeMapping(FILE *f, const UnicodeString *m) { + if(m != nullptr && !m->isEmpty()) { + int32_t i = 0; + UChar32 c = m->char32At(i); + fprintf(f, "%04lX", (long)c); + while((i += U16_LENGTH(c)) < m->length()) { + c = m->char32At(i); + fprintf(f, " %04lX", (long)c); + } + } + fputs("\n", f); +} + +} // namespace + +void +Normalizer2DataBuilder::writeDataFile(const char *filename, bool writeRemoved) const { + // Do not processData() before writing the input-syntax data file. + FILE *f = fopen(filename, "w"); + if(f == nullptr) { + fprintf(stderr, "gennorm2/writeDataFile() error: unable to create the output file %s\n", + filename); + exit(U_FILE_ACCESS_ERROR); + return; + } + + if(unicodeVersion[0] != 0 || unicodeVersion[1] != 0 || + unicodeVersion[2] != 0 || unicodeVersion[3] != 0) { + char uv[U_MAX_VERSION_STRING_LENGTH]; + u_versionToString(unicodeVersion, uv); + fprintf(f, "* Unicode %s\n\n", uv); + } + + UnicodeSetIterator ccIter(norms.ccSet); + UChar32 start = U_SENTINEL; + UChar32 end = U_SENTINEL; + uint8_t prevCC = 0; + bool done = false; + bool didWrite = false; + do { + UChar32 c; + uint8_t cc; + if(ccIter.next() && !ccIter.isString()) { + c = ccIter.getCodepoint(); + cc = norms.getCC(c); + } else { + c = 0x110000; + cc = 0; + done = true; + } + if(cc == prevCC && c == (end + 1)) { + end = c; + } else { + if(prevCC != 0) { + if(start == end) { + fprintf(f, "%04lX:%d\n", (long)start, (int)prevCC); + } else { + fprintf(f, "%04lX..%04lX:%d\n", (long)start, (long)end, (int)prevCC); + } + didWrite = true; + } + start = end = c; + prevCC = cc; + } + } while(!done); + if(didWrite) { + fputs("\n", f); + } + + UnicodeSetIterator mIter(norms.mappingSet); + start = U_SENTINEL; + end = U_SENTINEL; + const UnicodeString *prevMapping = nullptr; + Norm::MappingType prevType = Norm::NONE; + done = false; + do { + UChar32 c; + const Norm *norm; + if(mIter.next() && !mIter.isString()) { + c = mIter.getCodepoint(); + norm = norms.getNorm(c); + } else { + c = 0x110000; + norm = nullptr; + done = true; + } + const UnicodeString *mapping; + Norm::MappingType type; + if(norm == nullptr) { + mapping = nullptr; + type = Norm::NONE; + } else { + type = norm->mappingType; + if(type == Norm::NONE) { + mapping = nullptr; + } else { + mapping = norm->mapping; + } + } + if(type == prevType && equalStrings(mapping, prevMapping) && c == (end + 1)) { + end = c; + } else { + if(writeRemoved ? prevType != Norm::NONE : prevType > Norm::REMOVED) { + if(start == end) { + fprintf(f, "%04lX%c", (long)start, typeChars[prevType]); + } else { + fprintf(f, "%04lX..%04lX%c", (long)start, (long)end, typeChars[prevType]); + } + writeMapping(f, prevMapping); + } + start = end = c; + prevMapping = mapping; + prevType = type; + } + } while(!done); + + fclose(f); +} + +void +Normalizer2DataBuilder::computeDiff(const Normalizer2DataBuilder &b1, + const Normalizer2DataBuilder &b2, + Normalizer2DataBuilder &diff) { + // Compute diff = b1 - b2 + // so that we should be able to get b1 = b2 + diff. + if(0 != memcmp(b1.unicodeVersion, b2.unicodeVersion, U_MAX_VERSION_LENGTH)) { + memcpy(diff.unicodeVersion, b1.unicodeVersion, U_MAX_VERSION_LENGTH); + } + + UnicodeSet ccSet(b1.norms.ccSet); + ccSet.addAll(b2.norms.ccSet); + UnicodeSetIterator ccIter(ccSet); + while(ccIter.next() && !ccIter.isString()) { + UChar32 c = ccIter.getCodepoint(); + uint8_t cc1 = b1.norms.getCC(c); + uint8_t cc2 = b2.norms.getCC(c); + if(cc1 != cc2) { + diff.setCC(c, cc1); + } + } + + UnicodeSet mSet(b1.norms.mappingSet); + mSet.addAll(b2.norms.mappingSet); + UnicodeSetIterator mIter(mSet); + while(mIter.next() && !mIter.isString()) { + UChar32 c = mIter.getCodepoint(); + const Norm *norm1 = b1.norms.getNorm(c); + const Norm *norm2 = b2.norms.getNorm(c); + const UnicodeString *mapping1; + Norm::MappingType type1; + if(norm1 == nullptr || !norm1->hasMapping()) { + mapping1 = nullptr; + type1 = Norm::NONE; + } else { + mapping1 = norm1->mapping; + type1 = norm1->mappingType; + } + const UnicodeString *mapping2; + Norm::MappingType type2; + if(norm2 == nullptr || !norm2->hasMapping()) { + mapping2 = nullptr; + type2 = Norm::NONE; + } else { + mapping2 = norm2->mapping; + type2 = norm2->mappingType; + } + if(type1 == type2 && equalStrings(mapping1, mapping2)) { + // Nothing to do. + } else if(type1 == Norm::NONE) { + diff.removeMapping(c); + } else if(type1 == Norm::ROUND_TRIP) { + diff.setRoundTripMapping(c, *mapping1); + } else if(type1 == Norm::ONE_WAY) { + diff.setOneWayMapping(c, *mapping1); + } + } +} + U_NAMESPACE_END #endif /* #if !UCONFIG_NO_NORMALIZATION */ diff --git a/icu4c/source/tools/gennorm2/n2builder.h b/icu4c/source/tools/gennorm2/n2builder.h index c1421599dac..eb92bf382f3 100644 --- a/icu4c/source/tools/gennorm2/n2builder.h +++ b/icu4c/source/tools/gennorm2/n2builder.h @@ -63,6 +63,11 @@ public: void writeBinaryFile(const char *filename); void writeCSourceFile(const char *filename); + void writeDataFile(const char *filename, bool writeRemoved) const; + + static void computeDiff(const Normalizer2DataBuilder &b1, + const Normalizer2DataBuilder &b2, + Normalizer2DataBuilder &diff); private: friend class Norm16Writer; diff --git a/icu4c/source/tools/gennorm2/norms.h b/icu4c/source/tools/gennorm2/norms.h index d41e9329967..4dd82c78a5a 100644 --- a/icu4c/source/tools/gennorm2/norms.h +++ b/icu4c/source/tools/gennorm2/norms.h @@ -15,6 +15,7 @@ #if !UCONFIG_NO_NORMALIZATION #include "unicode/errorcode.h" +#include "unicode/uniset.h" #include "unicode/unistr.h" #include "unicode/utf16.h" #include "normalizer2impl.h" @@ -183,6 +184,8 @@ public: void enumRanges(Enumerator &e); + UnicodeSet ccSet, mappingSet; + private: Norms(const Norms &other) = delete; Norms &operator=(const Norms &other) = delete;