ICU-13326 gennorm2 --combined option to write the combined data of the input files...

author Markus Scherer <markus.icu@gmail.com>

Wed, 23 Aug 2017 23:33:47 +0000 (23:33 +0000)

committer Markus Scherer <markus.icu@gmail.com>

Wed, 23 Aug 2017 23:33:47 +0000 (23:33 +0000)
author Markus Scherer <markus.icu@gmail.com>
Wed, 23 Aug 2017 23:33:47 +0000 (23:33 +0000)
committer Markus Scherer <markus.icu@gmail.com>
Wed, 23 Aug 2017 23:33:47 +0000 (23:33 +0000)
diff --git a/icu4c/source/tools/gennorm2/gennorm2.cpp b/icu4c/source/tools/gennorm2/gennorm2.cpp

index 477bfd62ad5b861ba6c42b28a40f74f560bd2c90..69d0a38c2351a8e44df37fe367f4cd3dc6ba6d98 100644 (file)
--- a/icu4c/source/tools/gennorm2/gennorm2.cpp
+++ b/icu4c/source/tools/gennorm2/gennorm2.cpp
@@ -61,6 +61,7 @@ enum {
      OUTPUT_FILENAME,
      UNICODE_VERSION,
      WRITE_C_SOURCE,
+    WRITE_COMBINED_DATA,
      OPT_FAST
  };
  
@@ -73,6 +74,7 @@ static UOption options[]={
      UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),
      UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
      UOPTION_DEF("csource", '\1', UOPT_NO_ARG),
+    UOPTION_DEF("combined", '\1', UOPT_NO_ARG),
      UOPTION_DEF("fast", '\1', UOPT_NO_ARG)
  };
  
@@ -96,17 +98,22 @@ main(int argc, char* argv[]) {
      if( argc<2 ||
          options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
      ) {
-        /*
-         * Broken into chunks because the C89 standard says the minimum
-         * required supported string length is 509 bytes.
-         */
          fprintf(stderr,
              "Usage: %s [-options] infiles+ -o outputfilename\n"
              "\n"
              "Reads the infiles with normalization data and\n"
-            "creates a binary or C source file (outputfilename) with the data.\n"
+            "creates a binary file, or a C source file (--csource), with the data,\n"
+            "or writes a data file with the combined data (--combined).\n"
+            "See http://userguide.icu-project.org/transforms/normalization#TOC-Data-File-Syntax\n"
+            "\n"
+            "Alternate usage: %s [-options] a.txt b.txt minus p.txt q.txt -o outputfilename\n"
+            "\n"
+            "Computes the difference of (a, b) minus (p, q) and writes the diff data\n"
+            "in input-file syntax to the outputfilename.\n"
+            "It is then possible to build (p, q, diff) to get the same data as (a, b).\n"
+            "(Useful for computing minimal incremental mapping data files.)\n"
              "\n",
-            argv[0]);
+            argv[0], argv[0]);
          fprintf(stderr,
              "Options:\n"
              "\t-h or -? or --help  this usage text\n"
@@ -116,7 +123,9 @@ main(int argc, char* argv[]) {
          fprintf(stderr,
              "\t-s or --sourcedir   source directory, followed by the path\n"
              "\t-o or --output      output filename\n"
-            "\t      --csource     writes a C source file with initializers\n");
+            "\t      --csource     writes a C source file with initializers\n"
+            "\t      --combined    writes a .txt file (input-file syntax) with the\n"
+            "\t                    combined data from all of the input files\n");
          fprintf(stderr,
              "\t      --fast        optimize the data for fast normalization,\n"
              "\t                    which might increase its size  (Writes fully decomposed\n"
@@ -144,7 +153,10 @@ main(int argc, char* argv[]) {
  
  #else
  
-    LocalPointer<Normalizer2DataBuilder> builder(new Normalizer2DataBuilder(errorCode), errorCode);
+    LocalPointer<Normalizer2DataBuilder> b1(new Normalizer2DataBuilder(errorCode), errorCode);
+    LocalPointer<Normalizer2DataBuilder> b2;
+    LocalPointer<Normalizer2DataBuilder> diff;
+    Normalizer2DataBuilder *builder = b1.getAlias();
      errorCode.assertSuccess();
  
      if(options[UNICODE_VERSION].doesOccur) {
@@ -166,8 +178,29 @@ main(int argc, char* argv[]) {
          pathLength=filename.length();
      }
  
+    bool doMinus = false;
      for(int i=1; i<argc; ++i) {
          printf("gennorm2: processing %s\n", argv[i]);
+        if(strcmp(argv[i], "minus") == 0) {
+            if(doMinus) {
+                fprintf(stderr, "gennorm2 error: only one 'minus' can be specified\n");
+                exit(U_ILLEGAL_ARGUMENT_ERROR);
+            }
+            // Data from previous input files has been collected in b1.
+            // Collect data from further input files in b2.
+            b2.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode);
+            diff.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode);
+            errorCode.assertSuccess();
+            builder = b2.getAlias();
+            if(options[UNICODE_VERSION].doesOccur) {
+                builder->setUnicodeVersion(options[UNICODE_VERSION].value);
+            }
+            if(options[OPT_FAST].doesOccur) {
+                builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
+            }
+            doMinus = true;
+            continue;
+        }
          filename.append(argv[i], errorCode);
          LocalStdioFilePointer f(fopen(filename.data(), "r"));
          if(f==NULL) {
@@ -179,7 +212,12 @@ main(int argc, char* argv[]) {
          filename.truncate(pathLength);
      }
  
-    if(options[WRITE_C_SOURCE].doesOccur) {
+    if(doMinus) {
+        Normalizer2DataBuilder::computeDiff(*b1, *b2, *diff);
+        diff->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ true);
+    } else if(options[WRITE_COMBINED_DATA].doesOccur) {
+        builder->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ false);
+    } else if(options[WRITE_C_SOURCE].doesOccur) {
          builder->writeCSourceFile(options[OUTPUT_FILENAME].value);
      } else {
          builder->writeBinaryFile(options[OUTPUT_FILENAME].value);
diff --git a/icu4c/source/tools/gennorm2/n2builder.cpp b/icu4c/source/tools/gennorm2/n2builder.cpp

index 9bad2d550e15f10ebf586ecef0905c524ff3c375..b457fe216aeb0e1530f0a9d417f1ccbbaf4fb2c0 100644 (file)
--- a/icu4c/source/tools/gennorm2/n2builder.cpp
+++ b/icu4c/source/tools/gennorm2/n2builder.cpp
@@ -30,7 +30,9 @@
  #include "unicode/localpointer.h"
  #include "unicode/putil.h"
  #include "unicode/udata.h"
+#include "unicode/uniset.h"
  #include "unicode/unistr.h"
+#include "unicode/usetiter.h"
  #include "unicode/ustring.h"
  #include "charstr.h"
  #include "extradata.h"
@@ -146,6 +148,7 @@ void Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh) {
  
  void Normalizer2DataBuilder::setCC(UChar32 c, uint8_t cc) {
      norms.createNorm(c)->cc=cc;
+    norms.ccSet.add(c);  // Remember c so that writeDataFile() and computeDiff() can enumerate code points with cc data.
  }
  
  static UBool isWellFormed(const UnicodeString &s) {
@@ -166,6 +169,7 @@ void Normalizer2DataBuilder::setOneWayMapping(UChar32 c, const UnicodeString &m)
      p->mapping=new UnicodeString(m);
      p->mappingType=Norm::ONE_WAY;
      p->setMappingCP();
+    norms.mappingSet.add(c);
  }
  
  void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString &m) {
@@ -195,12 +199,14 @@ void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString
      p->mapping=new UnicodeString(m);
      p->mappingType=Norm::ROUND_TRIP;
      p->mappingCP=U_SENTINEL;
+    norms.mappingSet.add(c);
  }
  
  void Normalizer2DataBuilder::removeMapping(UChar32 c) {
      // createNorm(c), not getNorm(c), to record a non-mapping and detect conflicting data.
      Norm *p=checkNormForMapping(norms.createNorm(c), c);
      p->mappingType=Norm::REMOVED;
+    norms.mappingSet.add(c);  // Removals are tracked too, so that writeDataFile() and computeDiff() visit c.
  }
  
  UBool Normalizer2DataBuilder::mappingHasCompBoundaryAfter(const BuilderReorderingBuffer &buffer) const {
@@ -832,6 +838,198 @@ Normalizer2DataBuilder::writeCSourceFile(const char *filename) {
      fclose(f);
  }
  
+namespace {
+// Null-safe string equality: two null pointers are equal, null vs. non-null is unequal, otherwise contents are compared.
+bool equalStrings(const UnicodeString *s1, const UnicodeString *s2) {
+    if(s1 == nullptr) {
+        return s2 == nullptr;
+    } else if(s2 == nullptr) {
+        return false;
+    } else {
+        return *s1 == *s2;
+    }
+}
+// Character that writeDataFile() prints after the code point or range; indexed by a Norm::MappingType value.
+const char *typeChars = "?-=>";  // presumably NONE, REMOVED, ROUND_TRIP, ONE_WAY in enum order -- confirm against norms.h
+// Writes the code points of *m to f as space-separated uppercase hex (at least 4 digits each), then a newline.
+void writeMapping(FILE *f, const UnicodeString *m) {
+    if(m != nullptr && !m->isEmpty()) {  // A null or empty mapping produces only the newline below.
+        int32_t i = 0;
+        UChar32 c = m->char32At(i);
+        fprintf(f, "%04lX", (long)c);
+        while((i += U16_LENGTH(c)) < m->length()) {  // Step over all UTF-16 code units of the previous code point.
+            c = m->char32At(i);
+            fprintf(f, " %04lX", (long)c);
+        }
+    }
+    fputs("\n", f);
+}
+
+}  // namespace
+// Writes this builder's cc values and mappings to filename as text in input-file syntax; exits if the file cannot be created.
+void
+Normalizer2DataBuilder::writeDataFile(const char *filename, bool writeRemoved) const {
+    // Do not processData() before writing the input-syntax data file.
+    FILE *f = fopen(filename, "w");
+    if(f == nullptr) {
+        fprintf(stderr, "gennorm2/writeDataFile() error: unable to create the output file %s\n",
+                filename);
+        exit(U_FILE_ACCESS_ERROR);
+        return;  // Not reached: exit() does not return.
+    }
+    // Header line "* Unicode <version>", written only if at least one version field is nonzero.
+    if(unicodeVersion[0] != 0 || unicodeVersion[1] != 0 ||
+            unicodeVersion[2] != 0 || unicodeVersion[3] != 0) {
+        char uv[U_MAX_VERSION_STRING_LENGTH];
+        u_versionToString(unicodeVersion, uv);
+        fprintf(f, "* Unicode %s\n\n", uv);
+    }
+    // Part 1: cc values. Consecutive code points with the same cc are coalesced into one start..end range.
+    UnicodeSetIterator ccIter(norms.ccSet);
+    UChar32 start = U_SENTINEL;
+    UChar32 end = U_SENTINEL;
+    uint8_t prevCC = 0;
+    bool done = false;
+    bool didWrite = false;
+    do {
+        UChar32 c;
+        uint8_t cc;
+        if(ccIter.next() && !ccIter.isString()) {
+            c = ccIter.getCodepoint();
+            cc = norms.getCC(c);
+        } else {
+            c = 0x110000;  // One past U+10FFFF: a final pseudo-entry with cc=0 that flushes a pending range.
+            cc = 0;
+            done = true;
+        }
+        if(cc == prevCC && c == (end + 1)) {
+            end = c;
+        } else {
+            if(prevCC != 0) {  // Ranges with cc==0 are never written.
+                if(start == end) {
+                    fprintf(f, "%04lX:%d\n", (long)start, (int)prevCC);
+                } else {
+                    fprintf(f, "%04lX..%04lX:%d\n", (long)start, (long)end, (int)prevCC);
+                }
+                didWrite = true;
+            }
+            start = end = c;
+            prevCC = cc;
+        }
+    } while(!done);
+    if(didWrite) {
+        fputs("\n", f);  // Blank line between the cc section and the mappings.
+    }
+    // Part 2: mappings. Same range coalescing, keyed on both the mapping type and the mapping string.
+    UnicodeSetIterator mIter(norms.mappingSet);
+    start = U_SENTINEL;
+    end = U_SENTINEL;
+    const UnicodeString *prevMapping = nullptr;
+    Norm::MappingType prevType = Norm::NONE;
+    done = false;
+    do {
+        UChar32 c;
+        const Norm *norm;
+        if(mIter.next() && !mIter.isString()) {
+            c = mIter.getCodepoint();
+            norm = norms.getNorm(c);
+        } else {
+            c = 0x110000;  // One past U+10FFFF: a final pseudo-entry without a Norm that flushes a pending range.
+            norm = nullptr;
+            done = true;
+        }
+        const UnicodeString *mapping;
+        Norm::MappingType type;
+        if(norm == nullptr) {
+            mapping = nullptr;
+            type = Norm::NONE;
+        } else {
+            type = norm->mappingType;
+            if(type == Norm::NONE) {
+                mapping = nullptr;
+            } else {
+                mapping = norm->mapping;
+            }
+        }
+        if(type == prevType && equalStrings(mapping, prevMapping) && c == (end + 1)) {
+            end = c;
+        } else {
+            if(writeRemoved ? prevType != Norm::NONE : prevType > Norm::REMOVED) {  // NONE is never written; REMOVED only if writeRemoved.
+                if(start == end) {
+                    fprintf(f, "%04lX%c", (long)start, typeChars[prevType]);
+                } else {
+                    fprintf(f, "%04lX..%04lX%c", (long)start, (long)end, typeChars[prevType]);
+                }
+                writeMapping(f, prevMapping);
+            }
+            start = end = c;
+            prevMapping = mapping;
+            prevType = type;
+        }
+    } while(!done);
+
+    fclose(f);
+}
+// Stores into diff every cc value and mapping of b1 that differs from b2's; diff receives data only through its setters.
+void
+Normalizer2DataBuilder::computeDiff(const Normalizer2DataBuilder &b1,
+                                    const Normalizer2DataBuilder &b2,
+                                    Normalizer2DataBuilder &diff) {
+    // Compute diff = b1 - b2
+    // so that we should be able to get b1 = b2 + diff.
+    if(0 != memcmp(b1.unicodeVersion, b2.unicodeVersion, U_MAX_VERSION_LENGTH)) {
+        memcpy(diff.unicodeVersion, b1.unicodeVersion, U_MAX_VERSION_LENGTH);  // Only a differing version is carried over.
+    }
+    // cc values: for each code point in either builder's ccSet, record b1's cc where it differs from b2's.
+    UnicodeSet ccSet(b1.norms.ccSet);
+    ccSet.addAll(b2.norms.ccSet);
+    UnicodeSetIterator ccIter(ccSet);
+    while(ccIter.next() && !ccIter.isString()) {
+        UChar32 c = ccIter.getCodepoint();
+        uint8_t cc1 = b1.norms.getCC(c);
+        uint8_t cc2 = b2.norms.getCC(c);
+        if(cc1 != cc2) {
+            diff.setCC(c, cc1);
+        }
+    }
+    // Mappings: for each code point in either builder's mappingSet, record b1's state where it differs from b2's.
+    UnicodeSet mSet(b1.norms.mappingSet);
+    mSet.addAll(b2.norms.mappingSet);
+    UnicodeSetIterator mIter(mSet);
+    while(mIter.next() && !mIter.isString()) {
+        UChar32 c = mIter.getCodepoint();
+        const Norm *norm1 = b1.norms.getNorm(c);
+        const Norm *norm2 = b2.norms.getNorm(c);
+        const UnicodeString *mapping1;
+        Norm::MappingType type1;
+        if(norm1 == nullptr || !norm1->hasMapping()) {  // No usable mapping in b1: compare as NONE.
+            mapping1 = nullptr;
+            type1 = Norm::NONE;
+        } else {
+            mapping1 = norm1->mapping;
+            type1 = norm1->mappingType;
+        }
+        const UnicodeString *mapping2;
+        Norm::MappingType type2;
+        if(norm2 == nullptr || !norm2->hasMapping()) {  // No usable mapping in b2: compare as NONE.
+            mapping2 = nullptr;
+            type2 = Norm::NONE;
+        } else {
+            mapping2 = norm2->mapping;
+            type2 = norm2->mappingType;
+        }
+        if(type1 == type2 && equalStrings(mapping1, mapping2)) {
+            // Nothing to do.
+        } else if(type1 == Norm::NONE) {  // Here type2 is not NONE: b2 maps c but b1 does not, so record a removal.
+            diff.removeMapping(c);
+        } else if(type1 == Norm::ROUND_TRIP) {
+            diff.setRoundTripMapping(c, *mapping1);
+        } else if(type1 == Norm::ONE_WAY) {
+            diff.setOneWayMapping(c, *mapping1);
+        }
+    }
+}
+
  U_NAMESPACE_END
  
  #endif /* #if !UCONFIG_NO_NORMALIZATION */
diff --git a/icu4c/source/tools/gennorm2/n2builder.h b/icu4c/source/tools/gennorm2/n2builder.h

index c1421599dac7adacbf137dcdc143d541d6020e5d..eb92bf382f3539dfb7cb65576145218342bd9455 100644 (file)
--- a/icu4c/source/tools/gennorm2/n2builder.h
+++ b/icu4c/source/tools/gennorm2/n2builder.h
@@ -63,6 +63,11 @@ public:
  
      void writeBinaryFile(const char *filename);
      void writeCSourceFile(const char *filename);
+    void writeDataFile(const char *filename, bool writeRemoved) const;  // Writes cc values and mappings as a text file.
+    // Computes diff = b1 - b2: the cc values and mappings of b1 that differ from those of b2.
+    static void computeDiff(const Normalizer2DataBuilder &b1,
+                            const Normalizer2DataBuilder &b2,
+                            Normalizer2DataBuilder &diff);
  
  private:
      friend class Norm16Writer;
diff --git a/icu4c/source/tools/gennorm2/norms.h b/icu4c/source/tools/gennorm2/norms.h

index d41e93299675fc18aa9f671cd2d343511efb9240..4dd82c78a5a5afe967d13ff7cc57890cb0487731 100644 (file)
--- a/icu4c/source/tools/gennorm2/norms.h
+++ b/icu4c/source/tools/gennorm2/norms.h
@@ -15,6 +15,7 @@
  #if !UCONFIG_NO_NORMALIZATION
  
  #include "unicode/errorcode.h"
+#include "unicode/uniset.h"
  #include "unicode/unistr.h"
  #include "unicode/utf16.h"
  #include "normalizer2impl.h"
@@ -183,6 +184,8 @@ public:
  
      void enumRanges(Enumerator &e);
  
+    UnicodeSet ccSet, mappingSet;  // Code points added by the builder's setCC() and by its set*Mapping()/removeMapping(), respectively.
+
  private:
      Norms(const Norms &other) = delete;
      Norms &operator=(const Norms &other) = delete;
author	Markus Scherer <markus.icu@gmail.com>
	Wed, 23 Aug 2017 23:33:47 +0000 (23:33 +0000)
committer	Markus Scherer <markus.icu@gmail.com>
	Wed, 23 Aug 2017 23:33:47 +0000 (23:33 +0000)
icu4c/source/tools/gennorm2/gennorm2.cpp		patch \| blob \| history
icu4c/source/tools/gennorm2/n2builder.cpp		patch \| blob \| history
icu4c/source/tools/gennorm2/n2builder.h		patch \| blob \| history
icu4c/source/tools/gennorm2/norms.h		patch \| blob \| history