From: Markus Scherer <markus.icu@gmail.com>
Date: Wed, 23 Aug 2017 23:33:47 +0000 (+0000)
Subject: ICU-13326 gennorm2 --combined option to write the combined data of the input files... 
X-Git-Tag: release-60-rc~175
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=d287dbbe3042067b8690739c913c920f00cf90ef;p=icu

ICU-13326 gennorm2 --combined option to write the combined data of the input files; and gennorm2 minus operator to write the diffs of the combined data from two sets of input files

X-SVN-Rev: 40349
---

diff --git a/icu4c/source/tools/gennorm2/gennorm2.cpp b/icu4c/source/tools/gennorm2/gennorm2.cpp
index 477bfd62ad5..69d0a38c235 100644
--- a/icu4c/source/tools/gennorm2/gennorm2.cpp
+++ b/icu4c/source/tools/gennorm2/gennorm2.cpp
@@ -61,6 +61,7 @@ enum {
     OUTPUT_FILENAME,
     UNICODE_VERSION,
     WRITE_C_SOURCE,
+    WRITE_COMBINED_DATA,
     OPT_FAST
 };
 
@@ -73,6 +74,7 @@ static UOption options[]={
     UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),
     UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
     UOPTION_DEF("csource", '\1', UOPT_NO_ARG),
+    UOPTION_DEF("combined", '\1', UOPT_NO_ARG),
     UOPTION_DEF("fast", '\1', UOPT_NO_ARG)
 };
 
@@ -96,17 +98,22 @@ main(int argc, char* argv[]) {
     if( argc<2 ||
         options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
     ) {
-        /*
-         * Broken into chunks because the C89 standard says the minimum
-         * required supported string length is 509 bytes.
-         */
         fprintf(stderr,
             "Usage: %s [-options] infiles+ -o outputfilename\n"
             "\n"
             "Reads the infiles with normalization data and\n"
-            "creates a binary or C source file (outputfilename) with the data.\n"
+            "creates a binary file, or a C source file (--csource), with the data,\n"
+            "or writes a data file with the combined data (--combined).\n"
+            "See http://userguide.icu-project.org/transforms/normalization#TOC-Data-File-Syntax\n"
+            "\n"
+            "Alternate usage: %s [-options] a.txt b.txt minus p.txt q.txt -o outputfilename\n"
+            "\n"
+            "Computes the difference of (a, b) minus (p, q) and writes the diff data\n"
+            "in input-file syntax to the outputfilename.\n"
+            "It is then possible to build (p, q, diff) to get the same data as (a, b).\n"
+            "(Useful for computing minimal incremental mapping data files.)\n"
             "\n",
-            argv[0]);
+            argv[0], argv[0]);
         fprintf(stderr,
             "Options:\n"
             "\t-h or -? or --help  this usage text\n"
@@ -116,7 +123,9 @@ main(int argc, char* argv[]) {
         fprintf(stderr,
             "\t-s or --sourcedir   source directory, followed by the path\n"
             "\t-o or --output      output filename\n"
-            "\t      --csource     writes a C source file with initializers\n");
+            "\t      --csource     writes a C source file with initializers\n"
+            "\t      --combined    writes a .txt file (input-file syntax) with the\n"
+            "\t                    combined data from all of the input files\n");
         fprintf(stderr,
             "\t      --fast        optimize the data for fast normalization,\n"
             "\t                    which might increase its size  (Writes fully decomposed\n"
@@ -144,7 +153,10 @@ main(int argc, char* argv[]) {
 
 #else
 
-    LocalPointer<Normalizer2DataBuilder> builder(new Normalizer2DataBuilder(errorCode), errorCode);
+    LocalPointer<Normalizer2DataBuilder> b1(new Normalizer2DataBuilder(errorCode), errorCode);
+    LocalPointer<Normalizer2DataBuilder> b2;
+    LocalPointer<Normalizer2DataBuilder> diff;
+    Normalizer2DataBuilder *builder = b1.getAlias();
     errorCode.assertSuccess();
 
     if(options[UNICODE_VERSION].doesOccur) {
@@ -166,8 +178,29 @@ main(int argc, char* argv[]) {
         pathLength=filename.length();
     }
 
+    bool doMinus = false;
     for(int i=1; i<argc; ++i) {
         printf("gennorm2: processing %s\n", argv[i]);
+        if(strcmp(argv[i], "minus") == 0) {
+            if(doMinus) {
+                fprintf(stderr, "gennorm2 error: only one 'minus' can be specified\n");
+                exit(U_ILLEGAL_ARGUMENT_ERROR);
+            }
+            // Data from previous input files has been collected in b1.
+            // Collect data from further input files in b2.
+            b2.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode);
+            diff.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode);
+            errorCode.assertSuccess();
+            builder = b2.getAlias();
+            if(options[UNICODE_VERSION].doesOccur) {
+                builder->setUnicodeVersion(options[UNICODE_VERSION].value);
+            }
+            if(options[OPT_FAST].doesOccur) {
+                builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
+            }
+            doMinus = true;
+            continue;
+        }
         filename.append(argv[i], errorCode);
         LocalStdioFilePointer f(fopen(filename.data(), "r"));
         if(f==NULL) {
@@ -179,7 +212,12 @@ main(int argc, char* argv[]) {
         filename.truncate(pathLength);
     }
 
-    if(options[WRITE_C_SOURCE].doesOccur) {
+    if(doMinus) {
+        Normalizer2DataBuilder::computeDiff(*b1, *b2, *diff);
+        diff->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ true);
+    } else if(options[WRITE_COMBINED_DATA].doesOccur) {
+        builder->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ false);
+    } else if(options[WRITE_C_SOURCE].doesOccur) {
         builder->writeCSourceFile(options[OUTPUT_FILENAME].value);
     } else {
         builder->writeBinaryFile(options[OUTPUT_FILENAME].value);
diff --git a/icu4c/source/tools/gennorm2/n2builder.cpp b/icu4c/source/tools/gennorm2/n2builder.cpp
index 9bad2d550e1..b457fe216ae 100644
--- a/icu4c/source/tools/gennorm2/n2builder.cpp
+++ b/icu4c/source/tools/gennorm2/n2builder.cpp
@@ -30,7 +30,9 @@
 #include "unicode/localpointer.h"
 #include "unicode/putil.h"
 #include "unicode/udata.h"
+#include "unicode/uniset.h"
 #include "unicode/unistr.h"
+#include "unicode/usetiter.h"
 #include "unicode/ustring.h"
 #include "charstr.h"
 #include "extradata.h"
@@ -146,6 +148,7 @@ void Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh) {
 
 void Normalizer2DataBuilder::setCC(UChar32 c, uint8_t cc) {
     norms.createNorm(c)->cc=cc;
+    norms.ccSet.add(c);
 }
 
 static UBool isWellFormed(const UnicodeString &s) {
@@ -166,6 +169,7 @@ void Normalizer2DataBuilder::setOneWayMapping(UChar32 c, const UnicodeString &m)
     p->mapping=new UnicodeString(m);
     p->mappingType=Norm::ONE_WAY;
     p->setMappingCP();
+    norms.mappingSet.add(c);
 }
 
 void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString &m) {
@@ -195,12 +199,14 @@ void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString
     p->mapping=new UnicodeString(m);
     p->mappingType=Norm::ROUND_TRIP;
     p->mappingCP=U_SENTINEL;
+    norms.mappingSet.add(c);
 }
 
 void Normalizer2DataBuilder::removeMapping(UChar32 c) {
     // createNorm(c), not getNorm(c), to record a non-mapping and detect conflicting data.
     Norm *p=checkNormForMapping(norms.createNorm(c), c);
     p->mappingType=Norm::REMOVED;
+    norms.mappingSet.add(c);
 }
 
 UBool Normalizer2DataBuilder::mappingHasCompBoundaryAfter(const BuilderReorderingBuffer &buffer) const {
@@ -832,6 +838,198 @@ Normalizer2DataBuilder::writeCSourceFile(const char *filename) {
     fclose(f);
 }
 
+namespace {
+
+bool equalStrings(const UnicodeString *s1, const UnicodeString *s2) {
+    if(s1 == nullptr) {
+        return s2 == nullptr;
+    } else if(s2 == nullptr) {
+        return false;
+    } else {
+        return *s1 == *s2;
+    }
+}
+
+const char *typeChars = "?-=>";
+
+void writeMapping(FILE *f, const UnicodeString *m) {
+    if(m != nullptr && !m->isEmpty()) {
+        int32_t i = 0;
+        UChar32 c = m->char32At(i);
+        fprintf(f, "%04lX", (long)c);
+        while((i += U16_LENGTH(c)) < m->length()) {
+            c = m->char32At(i);
+            fprintf(f, " %04lX", (long)c);
+        }
+    }
+    fputs("\n", f);
+}
+
+}  // namespace
+
+void
+Normalizer2DataBuilder::writeDataFile(const char *filename, bool writeRemoved) const {
+    // Do not processData() before writing the input-syntax data file.
+    FILE *f = fopen(filename, "w");
+    if(f == nullptr) {
+        fprintf(stderr, "gennorm2/writeDataFile() error: unable to create the output file %s\n",
+                filename);
+        exit(U_FILE_ACCESS_ERROR);
+        return;
+    }
+
+    if(unicodeVersion[0] != 0 || unicodeVersion[1] != 0 ||
+            unicodeVersion[2] != 0 || unicodeVersion[3] != 0) {
+        char uv[U_MAX_VERSION_STRING_LENGTH];
+        u_versionToString(unicodeVersion, uv);
+        fprintf(f, "* Unicode %s\n\n", uv);
+    }
+
+    UnicodeSetIterator ccIter(norms.ccSet);
+    UChar32 start = U_SENTINEL;
+    UChar32 end = U_SENTINEL;
+    uint8_t prevCC = 0;
+    bool done = false;
+    bool didWrite = false;
+    do {
+        UChar32 c;
+        uint8_t cc;
+        if(ccIter.next() && !ccIter.isString()) {
+            c = ccIter.getCodepoint();
+            cc = norms.getCC(c);
+        } else {
+            c = 0x110000;
+            cc = 0;
+            done = true;
+        }
+        if(cc == prevCC && c == (end + 1)) {
+            end = c;
+        } else {
+            if(prevCC != 0) {
+                if(start == end) {
+                    fprintf(f, "%04lX:%d\n", (long)start, (int)prevCC);
+                } else {
+                    fprintf(f, "%04lX..%04lX:%d\n", (long)start, (long)end, (int)prevCC);
+                }
+                didWrite = true;
+            }
+            start = end = c;
+            prevCC = cc;
+        }
+    } while(!done);
+    if(didWrite) {
+        fputs("\n", f);
+    }
+
+    UnicodeSetIterator mIter(norms.mappingSet);
+    start = U_SENTINEL;
+    end = U_SENTINEL;
+    const UnicodeString *prevMapping = nullptr;
+    Norm::MappingType prevType = Norm::NONE;
+    done = false;
+    do {
+        UChar32 c;
+        const Norm *norm;
+        if(mIter.next() && !mIter.isString()) {
+            c = mIter.getCodepoint();
+            norm = norms.getNorm(c);
+        } else {
+            c = 0x110000;
+            norm = nullptr;
+            done = true;
+        }
+        const UnicodeString *mapping;
+        Norm::MappingType type;
+        if(norm == nullptr) {
+            mapping = nullptr;
+            type = Norm::NONE;
+        } else {
+            type = norm->mappingType;
+            if(type == Norm::NONE) {
+                mapping = nullptr;
+            } else {
+                mapping = norm->mapping;
+            }
+        }
+        if(type == prevType && equalStrings(mapping, prevMapping) && c == (end + 1)) {
+            end = c;
+        } else {
+            if(writeRemoved ? prevType != Norm::NONE : prevType > Norm::REMOVED) {
+                if(start == end) {
+                    fprintf(f, "%04lX%c", (long)start, typeChars[prevType]);
+                } else {
+                    fprintf(f, "%04lX..%04lX%c", (long)start, (long)end, typeChars[prevType]);
+                }
+                writeMapping(f, prevMapping);
+            }
+            start = end = c;
+            prevMapping = mapping;
+            prevType = type;
+        }
+    } while(!done);
+
+    fclose(f);
+}
+
+void
+Normalizer2DataBuilder::computeDiff(const Normalizer2DataBuilder &b1,
+                                    const Normalizer2DataBuilder &b2,
+                                    Normalizer2DataBuilder &diff) {
+    // Compute diff = b1 - b2
+    // so that we should be able to get b1 = b2 + diff.
+    if(0 != memcmp(b1.unicodeVersion, b2.unicodeVersion, U_MAX_VERSION_LENGTH)) {
+        memcpy(diff.unicodeVersion, b1.unicodeVersion, U_MAX_VERSION_LENGTH);
+    }
+
+    UnicodeSet ccSet(b1.norms.ccSet);
+    ccSet.addAll(b2.norms.ccSet);
+    UnicodeSetIterator ccIter(ccSet);
+    while(ccIter.next() && !ccIter.isString()) {
+        UChar32 c = ccIter.getCodepoint();
+        uint8_t cc1 = b1.norms.getCC(c);
+        uint8_t cc2 = b2.norms.getCC(c);
+        if(cc1 != cc2) {
+            diff.setCC(c, cc1);
+        }
+    }
+
+    UnicodeSet mSet(b1.norms.mappingSet);
+    mSet.addAll(b2.norms.mappingSet);
+    UnicodeSetIterator mIter(mSet);
+    while(mIter.next() && !mIter.isString()) {
+        UChar32 c = mIter.getCodepoint();
+        const Norm *norm1 = b1.norms.getNorm(c);
+        const Norm *norm2 = b2.norms.getNorm(c);
+        const UnicodeString *mapping1;
+        Norm::MappingType type1;
+        if(norm1 == nullptr || !norm1->hasMapping()) {
+            mapping1 = nullptr;
+            type1 = Norm::NONE;
+        } else {
+            mapping1 = norm1->mapping;
+            type1 = norm1->mappingType;
+        }
+        const UnicodeString *mapping2;
+        Norm::MappingType type2;
+        if(norm2 == nullptr || !norm2->hasMapping()) {
+            mapping2 = nullptr;
+            type2 = Norm::NONE;
+        } else {
+            mapping2 = norm2->mapping;
+            type2 = norm2->mappingType;
+        }
+        if(type1 == type2 && equalStrings(mapping1, mapping2)) {
+            // Nothing to do.
+        } else if(type1 == Norm::NONE) {
+            diff.removeMapping(c);
+        } else if(type1 == Norm::ROUND_TRIP) {
+            diff.setRoundTripMapping(c, *mapping1);
+        } else if(type1 == Norm::ONE_WAY) {
+            diff.setOneWayMapping(c, *mapping1);
+        }
+    }
+}
+
 U_NAMESPACE_END
 
 #endif /* #if !UCONFIG_NO_NORMALIZATION */
diff --git a/icu4c/source/tools/gennorm2/n2builder.h b/icu4c/source/tools/gennorm2/n2builder.h
index c1421599dac..eb92bf382f3 100644
--- a/icu4c/source/tools/gennorm2/n2builder.h
+++ b/icu4c/source/tools/gennorm2/n2builder.h
@@ -63,6 +63,11 @@ public:
 
     void writeBinaryFile(const char *filename);
     void writeCSourceFile(const char *filename);
+    void writeDataFile(const char *filename, bool writeRemoved) const;
+
+    static void computeDiff(const Normalizer2DataBuilder &b1,
+                            const Normalizer2DataBuilder &b2,
+                            Normalizer2DataBuilder &diff);
 
 private:
     friend class Norm16Writer;
diff --git a/icu4c/source/tools/gennorm2/norms.h b/icu4c/source/tools/gennorm2/norms.h
index d41e9329967..4dd82c78a5a 100644
--- a/icu4c/source/tools/gennorm2/norms.h
+++ b/icu4c/source/tools/gennorm2/norms.h
@@ -15,6 +15,7 @@
 #if !UCONFIG_NO_NORMALIZATION
 
 #include "unicode/errorcode.h"
+#include "unicode/uniset.h"
 #include "unicode/unistr.h"
 #include "unicode/utf16.h"
 #include "normalizer2impl.h"
@@ -183,6 +184,8 @@ public:
 
     void enumRanges(Enumerator &e);
 
+    UnicodeSet ccSet, mappingSet;
+
 private:
     Norms(const Norms &other) = delete;
     Norms &operator=(const Norms &other) = delete;