ICU-13197 split gennorm2/n2builder into 3 more manageable pieces; no output change
authorMarkus Scherer <markus.icu@gmail.com>
Mon, 5 Jun 2017 03:53:14 +0000 (03:53 +0000)
committerMarkus Scherer <markus.icu@gmail.com>
Mon, 5 Jun 2017 03:53:14 +0000 (03:53 +0000)
X-SVN-Rev: 40150

icu4c/source/tools/gennorm2/Makefile.in
icu4c/source/tools/gennorm2/extradata.cpp [new file with mode: 0644]
icu4c/source/tools/gennorm2/extradata.h [new file with mode: 0644]
icu4c/source/tools/gennorm2/gennorm2.vcxproj
icu4c/source/tools/gennorm2/n2builder.cpp
icu4c/source/tools/gennorm2/n2builder.h
icu4c/source/tools/gennorm2/norms.cpp [new file with mode: 0644]
icu4c/source/tools/gennorm2/norms.h [new file with mode: 0644]

index 729a3d0c905b062ad4d2ee32bb4d77eb3d9a235c..afc3f9b00e65ba0c3936dc860ec8dae8a8cd067a 100644 (file)
@@ -27,7 +27,7 @@ TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT)
 CPPFLAGS += -I$(srcdir) -I$(top_srcdir)/common -I$(srcdir)/../toolutil
 LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M)
 
-OBJECTS = gennorm2.o n2builder.o
+OBJECTS = gennorm2.o n2builder.o extradata.o norms.o
 
 DEPS = $(OBJECTS:.o=.d)
 
diff --git a/icu4c/source/tools/gennorm2/extradata.cpp b/icu4c/source/tools/gennorm2/extradata.cpp
new file mode 100644 (file)
index 0000000..3a37554
--- /dev/null
@@ -0,0 +1,277 @@
+// © 2017 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+// extradata.cpp
+// created: 2017jun04 Markus W. Scherer
+// (pulled out of n2builder.cpp)
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_NORMALIZATION
+
+#include "unicode/errorcode.h"
+#include "unicode/unistr.h"
+#include "unicode/utf16.h"
+#include "extradata.h"
+#include "normalizer2impl.h"
+#include "norms.h"
+#include "toolutil.h"
+#include "utrie2.h"
+#include "uvectr32.h"
+
+U_NAMESPACE_BEGIN
+
+ExtraData::ExtraData(Norms &n, UBool fast) :
+        Norms::Enumerator(n),
+        yesYesCompositions(1000, (UChar32)0xffff, 2),  // 0=inert, 1=Jamo L, 2=start of compositions
+        yesNoMappingsAndCompositions(1000, (UChar32)0, 1),  // 0=Hangul, 1=start of normal data
+        optimizeFast(fast) {
+    memset(smallFCD, 0, sizeof(smallFCD));
+}
+
+void ExtraData::setSmallFCD(UChar32 c) {
+    UChar32 lead= c<=0xffff ? c : U16_LEAD(c);
+    smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7);
+}
+
+int32_t ExtraData::writeMapping(UChar32 c, const Norm &norm, UnicodeString &dataString) {
+    UnicodeString &m=*norm.mapping;
+    int32_t length=m.length();
+    if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) {
+        fprintf(stderr,
+                "gennorm2 error: "
+                "mapping for U+%04lX longer than maximum of %d\n",
+                (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
+        exit(U_INVALID_FORMAT_ERROR);
+    }
+    int32_t leadCC, trailCC;
+    if(length==0) {
+        leadCC=trailCC=0;
+    } else {
+        leadCC=norms.getCC(m.char32At(0));
+        trailCC=norms.getCC(m.char32At(length-1));
+    }
+    if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && (norm.cc!=0 || leadCC!=0)) {
+        fprintf(stderr,
+                "gennorm2 error: "
+                "U+%04lX below U+0300 has ccc!=0 or lccc!=0, not supported by ICU\n",
+                (long)c);
+        exit(U_INVALID_FORMAT_ERROR);
+    }
+    // Write small-FCD data.
+    if((leadCC|trailCC)!=0) {
+        setSmallFCD(c);
+    }
+    // Write the mapping & raw mapping extraData.
+    int32_t firstUnit=length|(trailCC<<8);
+    int32_t preMappingLength=0;
+    if(norm.rawMapping!=NULL) {
+        UnicodeString &rm=*norm.rawMapping;
+        int32_t rmLength=rm.length();
+        if(rmLength>Normalizer2Impl::MAPPING_LENGTH_MASK) {
+            fprintf(stderr,
+                    "gennorm2 error: "
+                    "raw mapping for U+%04lX longer than maximum of %d\n",
+                    (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
+            exit(U_INVALID_FORMAT_ERROR);
+        }
+        UChar rm0=rm.charAt(0);
+        if( rmLength==length-1 &&
+            // 99: overlong substring lengths get pinned to remainder lengths anyway
+            0==rm.compare(1, 99, m, 2, 99) &&
+            rm0>Normalizer2Impl::MAPPING_LENGTH_MASK
+        ) {
+            // Compression:
+            // rawMapping=rm0+mapping.substring(2) -> store only rm0
+            //
+            // The raw mapping is the same as the final mapping after replacing
+            // the final mapping's first two code units with the raw mapping's first one.
+            // In this case, we store only that first unit, rm0.
+            // This helps with a few hundred mappings.
+            dataString.append(rm0);
+            preMappingLength=1;
+        } else {
+            // Store the raw mapping with its length.
+            dataString.append(rm);
+            dataString.append((UChar)rmLength);
+            preMappingLength=rmLength+1;
+        }
+        firstUnit|=Normalizer2Impl::MAPPING_HAS_RAW_MAPPING;
+    }
+    int32_t cccLccc=norm.cc|(leadCC<<8);
+    if(cccLccc!=0) {
+        dataString.append((UChar)cccLccc);
+        ++preMappingLength;
+        firstUnit|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD;
+    }
+    if(norm.hasNoCompBoundaryAfter) {
+        firstUnit|=Normalizer2Impl::MAPPING_NO_COMP_BOUNDARY_AFTER;
+    }
+    dataString.append((UChar)firstUnit);
+    dataString.append(m);
+    return preMappingLength;
+}
+
+int32_t ExtraData::writeNoNoMapping(UChar32 c, const Norm &norm,
+                                    UnicodeString &dataString,
+                                    Hashtable &previousMappings) {
+    int32_t oldLength=dataString.length();
+    int32_t offset=oldLength+writeMapping(c, norm, dataString);
+    UnicodeString newMapping=dataString.tempSubString(oldLength);
+    int32_t previousOffset=previousMappings.geti(newMapping);
+    if(previousOffset!=0) {
+        // Duplicate, remove the new units and point to the old ones.
+        dataString.truncate(oldLength);
+        offset=previousOffset-1;
+    } else {
+        // Enter this new mapping into the hashtable, avoiding value 0 which is "not found".
+        IcuToolErrorCode errorCode("gennorm2/writeExtraData()/Hashtable.puti()");
+        previousMappings.puti(newMapping, offset+1, errorCode);
+    }
+    return offset;
+}
+
+void ExtraData::writeCompositions(UChar32 c, const Norm &norm, UnicodeString &dataString) {
+    if(norm.cc!=0) {
+        fprintf(stderr,
+                "gennorm2 error: "
+                "U+%04lX combines-forward and has ccc!=0, not possible in Unicode normalization\n",
+                (long)c);
+        exit(U_INVALID_FORMAT_ERROR);
+    }
+    int32_t length;
+    const CompositionPair *pairs=norm.getCompositionPairs(length);
+    for(int32_t i=0; i<length; ++i) {
+        const CompositionPair &pair=pairs[i];
+        // 22 bits for the composite character and whether it combines forward.
+        UChar32 compositeAndFwd=pair.composite<<1;
+        if(norms.getNormRef(pair.composite).compositions!=NULL) {
+            compositeAndFwd|=1;  // The composite character also combines-forward.
+        }
+        // Encode most pairs in two units and some in three.
+        int32_t firstUnit, secondUnit, thirdUnit;
+        if(pair.trail<Normalizer2Impl::COMP_1_TRAIL_LIMIT) {
+            if(compositeAndFwd<=0xffff) {
+                firstUnit=pair.trail<<1;
+                secondUnit=compositeAndFwd;
+                thirdUnit=-1;
+            } else {
+                firstUnit=(pair.trail<<1)|Normalizer2Impl::COMP_1_TRIPLE;
+                secondUnit=compositeAndFwd>>16;
+                thirdUnit=compositeAndFwd;
+            }
+        } else {
+            firstUnit=(Normalizer2Impl::COMP_1_TRAIL_LIMIT+
+                       (pair.trail>>Normalizer2Impl::COMP_1_TRAIL_SHIFT))|
+                      Normalizer2Impl::COMP_1_TRIPLE;
+            secondUnit=(pair.trail<<Normalizer2Impl::COMP_2_TRAIL_SHIFT)|
+                       (compositeAndFwd>>16);
+            thirdUnit=compositeAndFwd;
+        }
+        // Set the high bit of the first unit if this is the last composition pair.
+        if(i==(length-1)) {
+            firstUnit|=Normalizer2Impl::COMP_1_LAST_TUPLE;
+        }
+        dataString.append((UChar)firstUnit).append((UChar)secondUnit);
+        if(thirdUnit>=0) {
+            dataString.append((UChar)thirdUnit);
+        }
+    }
+}
+
+void ExtraData::rangeHandler(UChar32 start, UChar32 end, Norm &norm) {
+    if(start!=end) {
+        fprintf(stderr,
+                "gennorm2 error: unexpected shared data for "
+                "multiple code points U+%04lX..U+%04lX\n",
+                (long)start, (long)end);
+        exit(U_INTERNAL_PROGRAM_ERROR);
+    }
+    writeExtraData(start, norm);
+}
+
+void ExtraData::writeExtraData(UChar32 c, Norm &norm) {
+    if(!norm.hasMapping()) {
+        // Write small-FCD data.
+        // There is similar code in writeMapping() for characters that do have a mapping.
+        if(norm.cc!=0) {
+            if(c<Normalizer2Impl::MIN_CCC_LCCC_CP) {
+                fprintf(stderr,
+                        "gennorm2 error: "
+                        "U+%04lX below U+0300 has ccc!=0, not supported by ICU\n",
+                        (long)c);
+                exit(U_INVALID_FORMAT_ERROR);
+            }
+            setSmallFCD(c);
+        }
+    }
+    if(norm.combinesBack) {
+        if(norm.hasMapping()) {
+            fprintf(stderr,
+                    "gennorm2 error: "
+                    "U+%04lX combines-back and decomposes, not possible in Unicode normalization\n",
+                    (long)c);
+            exit(U_INVALID_FORMAT_ERROR);
+        }
+        if(norm.compositions!=NULL) {
+            norm.offset=
+                (maybeYesCompositions.length()<<Norm::OFFSET_SHIFT)|
+                Norm::OFFSET_MAYBE_YES;
+            writeCompositions(c, norm, maybeYesCompositions);
+        }
+    } else if(!norm.hasMapping()) {
+        if(norm.compositions!=NULL) {
+            norm.offset=
+                (yesYesCompositions.length()<<Norm::OFFSET_SHIFT)|
+                Norm::OFFSET_YES_YES;
+            writeCompositions(c, norm, yesYesCompositions);
+        }
+    } else if(norm.mappingType==Norm::ROUND_TRIP) {
+        if(norm.compositions!=NULL) {
+            int32_t offset=yesNoMappingsAndCompositions.length()+
+                           writeMapping(c, norm, yesNoMappingsAndCompositions);
+            norm.offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION;
+            writeCompositions(c, norm, yesNoMappingsAndCompositions);
+        } else {
+            int32_t offset=yesNoMappingsOnly.length()+
+                           writeMapping(c, norm, yesNoMappingsOnly);
+            norm.offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_ONLY;
+        }
+    } else /* one-way */ {
+        if(norm.compositions!=NULL) {
+            fprintf(stderr,
+                    "gennorm2 error: "
+                    "U+%04lX combines-forward and has a one-way mapping, "
+                    "not possible in Unicode normalization\n",
+                    (long)c);
+            exit(U_INVALID_FORMAT_ERROR);
+        }
+        if(norm.cc==0 && !optimizeFast) {
+            // Try a compact, algorithmic encoding.
+            // Only for ccc=0, because we can't store additional information
+            // and we do not recursively follow an algorithmic encoding for access to the ccc.
+            //
+            // Also, if hasNoCompBoundaryAfter is set, we can only use the algorithmic encoding
+            // if the mappingCP decomposes further, to ensure that there is a place to store it.
+            // We want to see that the final mapping does not have exactly 1 code point,
+            // or else we would have to recursively ensure that the final mapping is stored
+            // in normal extraData.
+            if(norm.mappingCP>=0 &&
+                    (!norm.hasNoCompBoundaryAfter || 1!=norm.mapping->countChar32())) {
+                int32_t delta=norm.mappingCP-c;
+                if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) {
+                    norm.offset=(delta<<Norm::OFFSET_SHIFT)|Norm::OFFSET_DELTA;
+                }
+            }
+        }
+        if(norm.offset==0) {
+            // TODO: minMappingNotCompYes, minMappingNoCompBoundaryBefore
+            int32_t offset=writeNoNoMapping(c, norm, noNoMappings, previousNoNoMappings);
+            norm.offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_NO_NO;
+        }
+    }
+}
+
+U_NAMESPACE_END
+
+#endif // #if !UCONFIG_NO_NORMALIZATION
diff --git a/icu4c/source/tools/gennorm2/extradata.h b/icu4c/source/tools/gennorm2/extradata.h
new file mode 100644 (file)
index 0000000..f652ed5
--- /dev/null
@@ -0,0 +1,65 @@
+// © 2017 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+// extradata.h
+// created: 2017jun04 Markus W. Scherer
+// (pulled out of n2builder.cpp)
+
+// Write mappings and compositions in compact form for Normalizer2 "extra data",
+// the data that does not fit into the trie itself.
+
+#ifndef __EXTRADATA_H__
+#define __EXTRADATA_H__
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_NORMALIZATION
+
+#include "unicode/errorcode.h"
+#include "unicode/unistr.h"
+#include "unicode/utf16.h"
+#include "hash.h"
+#include "norms.h"
+#include "toolutil.h"
+#include "utrie2.h"
+#include "uvectr32.h"
+
+U_NAMESPACE_BEGIN
+
+class ExtraData : public Norms::Enumerator {
+public:
+    ExtraData(Norms &n, UBool fast);
+
+    void rangeHandler(UChar32 start, UChar32 end, Norm &norm) override;
+
+    UnicodeString maybeYesCompositions;
+    UnicodeString yesYesCompositions;
+    UnicodeString yesNoMappingsAndCompositions;
+    UnicodeString yesNoMappingsOnly;
+    UnicodeString noNoMappings;
+    uint8_t smallFCD[0x100];
+
+private:
+    void setSmallFCD(UChar32 c);
+    /**
+     * Requires norm.hasMapping().
+     * Returns the offset of the "first unit" from the beginning of the extraData for c.
+     * That is the same as the length of the optional data
+     * for the raw mapping and the ccc/lccc word.
+     */
+    int32_t writeMapping(UChar32 c, const Norm &norm, UnicodeString &dataString);
+    int32_t writeNoNoMapping(UChar32 c, const Norm &norm,
+                             UnicodeString &dataString, Hashtable &previousMappings);
+    /** Requires norm.compositions!=nullptr. */
+    void writeCompositions(UChar32 c, const Norm &norm, UnicodeString &dataString);
+    void writeExtraData(UChar32 c, Norm &norm);
+
+    UBool optimizeFast;
+    Hashtable previousNoNoMappings;  // If constructed in runtime code, pass in UErrorCode.
+};
+
+U_NAMESPACE_END
+
+#endif // #if !UCONFIG_NO_NORMALIZATION
+
+#endif  // __EXTRADATA_H__
index 9f512f1205d6ac5c4c402e07f4375c2b80e5f55e..9feb10e45a0b9983790f5917879e2bc5a597a49a 100644 (file)
     </Link>
   </ItemDefinitionGroup>
   <ItemGroup>
+    <ClCompile Include="extradata.cpp" />
     <ClCompile Include="gennorm2.cpp" />
     <ClCompile Include="n2builder.cpp" />
+    <ClCompile Include="norms.cpp" />
   </ItemGroup>
   <ItemGroup>
+    <ClInclude Include="extradata.h" />
     <ClInclude Include="n2builder.h" />
+    <ClInclude Include="norms.h" />
   </ItemGroup>
   <ItemGroup>
     <ProjectReference Include="..\..\common\common.vcxproj">
index 66ccd700344127f71d48d793373dab2ad6779ed0..8bc5812d129577c0423a9ba30c7bae795fe98585 100644 (file)
 #include "unicode/localpointer.h"
 #include "unicode/putil.h"
 #include "unicode/udata.h"
-#include "unicode/uniset.h"
 #include "unicode/unistr.h"
 #include "unicode/ustring.h"
 #include "charstr.h"
+#include "extradata.h"
 #include "hash.h"
 #include "normalizer2impl.h"
+#include "norms.h"
 #include "toolutil.h"
 #include "unewdata.h"
 #include "utrie2.h"
@@ -90,105 +91,16 @@ const HangulIterator::Range HangulIterator::ranges[4]={
     { Hangul::HANGUL_BASE, Hangul::HANGUL_BASE+Hangul::HANGUL_COUNT, 0 },  // will become minYesNo
 };
 
-struct CompositionPair {
-    CompositionPair(UChar32 t, UChar32 c) : trail(t), composite(c) {}
-    UChar32 trail, composite;
-};
-
-struct Norm {
-    enum MappingType { NONE, REMOVED, ROUND_TRIP, ONE_WAY };
-
-    UBool hasMapping() const { return mappingType>REMOVED; }
-
-    // Requires hasMapping() and well-formed mapping.
-    void setMappingCP() {
-        UChar32 c;
-        if(!mapping->isEmpty() && mapping->length()==U16_LENGTH(c=mapping->char32At(0))) {
-            mappingCP=c;
-        } else {
-            mappingCP=U_SENTINEL;
-        }
-    }
-
-    const CompositionPair *getCompositionPairs(int32_t &length) const {
-        if(compositions==NULL) {
-            length=0;
-            return NULL;
-        } else {
-            length=compositions->size()/2;
-            return reinterpret_cast<const CompositionPair *>(compositions->getBuffer());
-        }
-    }
-
-    UnicodeString *mapping;
-    UnicodeString *rawMapping;  // non-NULL if the mapping is further decomposed
-    UChar32 mappingCP;  // >=0 if mapping to 1 code point
-    int32_t mappingPhase;
-    MappingType mappingType;
-
-    UVector32 *compositions;  // (trail, composite) pairs
-    uint8_t cc;
-    UBool combinesBack;
-    UBool hasNoCompBoundaryAfter;
-
-    enum OffsetType {
-        OFFSET_NONE,
-        // Composition for back-combining character. Allowed, but not normally used.
-        OFFSET_MAYBE_YES,
-        // Composition for a starter that does not have a decomposition mapping.
-        OFFSET_YES_YES,
-        // Round-trip mapping & composition for a starter.
-        OFFSET_YES_NO_MAPPING_AND_COMPOSITION,
-        // Round-trip mapping for a starter that itself does not combine-forward.
-        OFFSET_YES_NO_MAPPING_ONLY,
-        // One-way mapping.
-        OFFSET_NO_NO,
-        // Delta for an algorithmic one-way mapping.
-        OFFSET_DELTA
-    };
-    enum { OFFSET_SHIFT=4, OFFSET_MASK=(1<<OFFSET_SHIFT)-1 };
-    int32_t offset;
-};
-
-class Normalizer2DBEnumerator {
-public:
-    Normalizer2DBEnumerator(Normalizer2DataBuilder &b) : builder(b) {}
-    virtual ~Normalizer2DBEnumerator() {}
-    virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) = 0;
-    Normalizer2DBEnumerator *ptr() { return this; }
-protected:
-    Normalizer2DataBuilder &builder;
-};
-
-U_CDECL_BEGIN
-
-static UBool U_CALLCONV
-enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
-    return ((Normalizer2DBEnumerator *)context)->rangeHandler(start, end, value);
-}
-
-U_CDECL_END
-
 Normalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode &errorCode) :
+        norms(errorCode),
         phase(0), overrideHandling(OVERRIDE_PREVIOUS), optimization(OPTIMIZE_NORMAL),
-        norm16TrieLength(0) {
+        norm16Trie(nullptr), norm16TrieLength(0) {
     memset(unicodeVersion, 0, sizeof(unicodeVersion));
-    normTrie=utrie2_open(0, 0, &errorCode);
-    normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm));
-    norms=allocNorm();  // unused Norm struct at index 0
     memset(indexes, 0, sizeof(indexes));
     memset(smallFCD, 0, sizeof(smallFCD));
 }
 
 Normalizer2DataBuilder::~Normalizer2DataBuilder() {
-    utrie2_close(normTrie);
-    int32_t normsLength=utm_countItems(normMem);
-    for(int32_t i=1; i<normsLength; ++i) {
-        delete norms[i].mapping;
-        delete norms[i].rawMapping;
-        delete norms[i].compositions;
-    }
-    utm_close(normMem);
     utrie2_close(norm16Trie);
 }
 
@@ -209,42 +121,6 @@ Normalizer2DataBuilder::setUnicodeVersion(const char *v) {
     memcpy(unicodeVersion, version, U_MAX_VERSION_LENGTH);
 }
 
-Norm *Normalizer2DataBuilder::allocNorm() {
-    Norm *p=(Norm *)utm_alloc(normMem);
-    norms=(Norm *)utm_getStart(normMem);  // in case it got reallocated
-    return p;
-}
-
-/* get an existing Norm unit */
-Norm *Normalizer2DataBuilder::getNorm(UChar32 c) {
-    uint32_t i=utrie2_get32(normTrie, c);
-    if(i==0) {
-        return NULL;
-    }
-    return norms+i;
-}
-
-const Norm &Normalizer2DataBuilder::getNormRef(UChar32 c) const {
-    return norms[utrie2_get32(normTrie, c)];
-}
-
-/*
- * get or create a Norm unit;
- * get or create the intermediate trie entries for it as well
- */
-Norm *Normalizer2DataBuilder::createNorm(UChar32 c) {
-    uint32_t i=utrie2_get32(normTrie, c);
-    if(i!=0) {
-        return norms+i;
-    } else {
-        /* allocate Norm */
-        Norm *p=allocNorm();
-        IcuToolErrorCode errorCode("gennorm2/createNorm()");
-        utrie2_set32(normTrie, c, (uint32_t)(p-norms), errorCode);
-        return p;
-    }
-}
-
 Norm *Normalizer2DataBuilder::checkNormForMapping(Norm *p, UChar32 c) {
     if(p!=NULL) {
         if(p->mappingType!=Norm::NONE) {
@@ -271,11 +147,7 @@ void Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh) {
 }
 
 void Normalizer2DataBuilder::setCC(UChar32 c, uint8_t cc) {
-    createNorm(c)->cc=cc;
-}
-
-uint8_t Normalizer2DataBuilder::getCC(UChar32 c) const {
-    return getNormRef(c).cc;
+    norms.createNorm(c)->cc=cc;
 }
 
 static UBool isWellFormed(const UnicodeString &s) {
@@ -292,7 +164,7 @@ void Normalizer2DataBuilder::setOneWayMapping(UChar32 c, const UnicodeString &m)
                 (int)phase, (long)c);
         exit(U_INVALID_FORMAT_ERROR);
     }
-    Norm *p=checkNormForMapping(createNorm(c), c);
+    Norm *p=checkNormForMapping(norms.createNorm(c), c);
     p->mapping=new UnicodeString(m);
     p->mappingType=Norm::ONE_WAY;
     p->setMappingCP();
@@ -321,310 +193,19 @@ void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString
                 (int)phase, (long)c, (int)numCP);
         exit(U_INVALID_FORMAT_ERROR);
     }
-    Norm *p=checkNormForMapping(createNorm(c), c);
+    Norm *p=checkNormForMapping(norms.createNorm(c), c);
     p->mapping=new UnicodeString(m);
     p->mappingType=Norm::ROUND_TRIP;
     p->mappingCP=U_SENTINEL;
 }
 
 void Normalizer2DataBuilder::removeMapping(UChar32 c) {
-    Norm *p=checkNormForMapping(getNorm(c), c);
+    Norm *p=checkNormForMapping(norms.getNorm(c), c);
     if(p!=NULL) {
         p->mappingType=Norm::REMOVED;
     }
 }
 
-class CompositionBuilder : public Normalizer2DBEnumerator {
-public:
-    CompositionBuilder(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {}
-    virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
-        builder.addComposition(start, end, value);
-        return TRUE;
-    }
-};
-
-void
-Normalizer2DataBuilder::addComposition(UChar32 start, UChar32 end, uint32_t value) {
-    if(norms[value].mappingType==Norm::ROUND_TRIP) {
-        if(start!=end) {
-            fprintf(stderr,
-                    "gennorm2 error: same round-trip mapping for "
-                    "more than 1 code point U+%04lX..U+%04lX\n",
-                    (long)start, (long)end);
-            exit(U_INVALID_FORMAT_ERROR);
-        }
-        if(norms[value].cc!=0) {
-            fprintf(stderr,
-                    "gennorm2 error: "
-                    "U+%04lX has a round-trip mapping and ccc!=0, "
-                    "not possible in Unicode normalization\n",
-                    (long)start);
-            exit(U_INVALID_FORMAT_ERROR);
-        }
-        // setRoundTripMapping() ensured that there are exactly two code points.
-        const UnicodeString &m=*norms[value].mapping;
-        UChar32 lead=m.char32At(0);
-        UChar32 trail=m.char32At(m.length()-1);
-        if(getCC(lead)!=0) {
-            fprintf(stderr,
-                    "gennorm2 error: "
-                    "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, "
-                    "not possible in Unicode normalization\n",
-                    (long)start, (long)lead);
-            exit(U_INVALID_FORMAT_ERROR);
-        }
-        // Flag for trailing character.
-        createNorm(trail)->combinesBack=TRUE;
-        // Insert (trail, composite) pair into compositions list for the lead character.
-        IcuToolErrorCode errorCode("gennorm2/addComposition()");
-        Norm *leadNorm=createNorm(lead);
-        UVector32 *compositions=leadNorm->compositions;
-        int32_t i;
-        if(compositions==NULL) {
-            compositions=leadNorm->compositions=new UVector32(errorCode);
-            i=0;  // "insert" the first pair at index 0
-        } else {
-            // Insertion sort, and check for duplicate trail characters.
-            int32_t length;
-            const CompositionPair *pairs=leadNorm->getCompositionPairs(length);
-            for(i=0; i<length; ++i) {
-                if(trail==pairs[i].trail) {
-                    fprintf(stderr,
-                            "gennorm2 error: same round-trip mapping for "
-                            "more than 1 code point (e.g., U+%04lX) to U+%04lX + U+%04lX\n",
-                            (long)start, (long)lead, (long)trail);
-                    exit(U_INVALID_FORMAT_ERROR);
-                }
-                if(trail<pairs[i].trail) {
-                    break;
-                }
-            }
-        }
-        compositions->insertElementAt(trail, 2*i, errorCode);
-        compositions->insertElementAt(start, 2*i+1, errorCode);
-    }
-}
-
-UBool Normalizer2DataBuilder::combinesWithCCBetween(const Norm &norm,
-                                                    uint8_t lowCC, uint8_t highCC) const {
-    if((highCC-lowCC)>=2) {
-        int32_t length;
-        const CompositionPair *pairs=norm.getCompositionPairs(length);
-        for(int32_t i=0; i<length; ++i) {
-            uint8_t trailCC=getCC(pairs[i].trail);
-            if(lowCC<trailCC && trailCC<highCC) {
-                return TRUE;
-            }
-        }
-    }
-    return FALSE;
-}
-
-UChar32 Normalizer2DataBuilder::combine(const Norm &norm, UChar32 trail) const {
-    int32_t length;
-    const CompositionPair *pairs=norm.getCompositionPairs(length);
-    for(int32_t i=0; i<length; ++i) {
-        if(trail==pairs[i].trail) {
-            return pairs[i].composite;
-        }
-        if(trail<pairs[i].trail) {
-            break;
-        }
-    }
-    return U_SENTINEL;
-}
-
-class Decomposer : public Normalizer2DBEnumerator {
-public:
-    Decomposer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b), didDecompose(FALSE) {}
-    virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
-        didDecompose|=builder.decompose(start, end, value);
-        return TRUE;
-    }
-    UBool didDecompose;
-};
-
-UBool
-Normalizer2DataBuilder::decompose(UChar32 start, UChar32 end, uint32_t value) {
-    if(norms[value].hasMapping()) {
-        Norm &norm=norms[value];
-        const UnicodeString &m=*norm.mapping;
-        UnicodeString *decomposed=NULL;
-        const UChar *s=toUCharPtr(m.getBuffer());
-        int32_t length=m.length();
-        int32_t prev, i=0;
-        UChar32 c;
-        while(i<length) {
-            prev=i;
-            U16_NEXT(s, i, length, c);
-            if(start<=c && c<=end) {
-                fprintf(stderr,
-                        "gennorm2 error: U+%04lX maps to itself directly or indirectly\n",
-                        (long)c);
-                exit(U_INVALID_FORMAT_ERROR);
-            }
-            const Norm &cNorm=getNormRef(c);
-            if(cNorm.hasMapping()) {
-                if(norm.mappingType==Norm::ROUND_TRIP) {
-                    if(prev==0) {
-                        if(cNorm.mappingType!=Norm::ROUND_TRIP) {
-                            fprintf(stderr,
-                                    "gennorm2 error: "
-                                    "U+%04lX's round-trip mapping's starter "
-                                    "U+%04lX one-way-decomposes, "
-                                    "not possible in Unicode normalization\n",
-                                    (long)start, (long)c);
-                            exit(U_INVALID_FORMAT_ERROR);
-                        }
-                        uint8_t myTrailCC=getCC(m.char32At(i));
-                        UChar32 cTrailChar=cNorm.mapping->char32At(cNorm.mapping->length()-1);
-                        uint8_t cTrailCC=getCC(cTrailChar);
-                        if(cTrailCC>myTrailCC) {
-                            fprintf(stderr,
-                                    "gennorm2 error: "
-                                    "U+%04lX's round-trip mapping's starter "
-                                    "U+%04lX decomposes and the "
-                                    "inner/earlier tccc=%hu > outer/following tccc=%hu, "
-                                    "not possible in Unicode normalization\n",
-                                    (long)start, (long)c,
-                                    (short)cTrailCC, (short)myTrailCC);
-                            exit(U_INVALID_FORMAT_ERROR);
-                        }
-                    } else {
-                        fprintf(stderr,
-                                "gennorm2 error: "
-                                "U+%04lX's round-trip mapping's non-starter "
-                                "U+%04lX decomposes, "
-                                "not possible in Unicode normalization\n",
-                                (long)start, (long)c);
-                        exit(U_INVALID_FORMAT_ERROR);
-                    }
-                }
-                if(decomposed==NULL) {
-                    decomposed=new UnicodeString(m, 0, prev);
-                }
-                decomposed->append(*cNorm.mapping);
-            } else if(Hangul::isHangul(c)) {
-                UChar buffer[3];
-                int32_t hangulLength=Hangul::decompose(c, buffer);
-                if(norm.mappingType==Norm::ROUND_TRIP && prev!=0) {
-                    fprintf(stderr,
-                            "gennorm2 error: "
-                            "U+%04lX's round-trip mapping's non-starter "
-                            "U+%04lX decomposes, "
-                            "not possible in Unicode normalization\n",
-                            (long)start, (long)c);
-                    exit(U_INVALID_FORMAT_ERROR);
-                }
-                if(decomposed==NULL) {
-                    decomposed=new UnicodeString(m, 0, prev);
-                }
-                decomposed->append(buffer, hangulLength);
-            } else if(decomposed!=NULL) {
-                decomposed->append(m, prev, i-prev);
-            }
-        }
-        if(decomposed!=NULL) {
-            if(norm.rawMapping==NULL) {
-                // Remember the original mapping when decomposing recursively.
-                norm.rawMapping=norm.mapping;
-            } else {
-                delete norm.mapping;
-            }
-            norm.mapping=decomposed;
-            // Not  norm.setMappingCP();  because the original mapping
-            // is most likely to be encodable as a delta.
-            return TRUE;
-        }
-    }
-    return FALSE;
-}
-
-class BuilderReorderingBuffer {
-public:
-    BuilderReorderingBuffer() : fLength(0), fLastStarterIndex(-1), fDidReorder(FALSE) {}
-    void reset() {
-        fLength=0;
-        fLastStarterIndex=-1;
-        fDidReorder=FALSE;
-    }
-    int32_t length() const { return fLength; }
-    UBool isEmpty() const { return fLength==0; }
-    int32_t lastStarterIndex() const { return fLastStarterIndex; }
-    UChar32 charAt(int32_t i) const { return fArray[i]>>8; }
-    uint8_t ccAt(int32_t i) const { return (uint8_t)fArray[i]; }
-    UBool didReorder() const { return fDidReorder; }
-    void append(UChar32 c, uint8_t cc) {
-        if(cc==0 || fLength==0 || ccAt(fLength-1)<=cc) {
-            if(cc==0) {
-                fLastStarterIndex=fLength;
-            }
-            fArray[fLength++]=(c<<8)|cc;
-            return;
-        }
-        // Let this character bubble back to its canonical order.
-        int32_t i=fLength-1;
-        while(i>fLastStarterIndex && ccAt(i)>cc) {
-            --i;
-        }
-        ++i;  // after the last starter or prevCC<=cc
-        // Move this and the following characters forward one to make space.
-        for(int32_t j=fLength; i<j; --j) {
-            fArray[j]=fArray[j-1];
-        }
-        fArray[i]=(c<<8)|cc;
-        ++fLength;
-        fDidReorder=TRUE;
-    }
-    void toString(UnicodeString &dest) {
-        dest.remove();
-        for(int32_t i=0; i<fLength; ++i) {
-            dest.append(charAt(i));
-        }
-    }
-    void setComposite(UChar32 composite, int32_t combMarkIndex) {
-        fArray[fLastStarterIndex]=composite<<8;
-        // Remove the combining mark that contributed to the composite.
-        --fLength;
-        while(combMarkIndex<fLength) {
-            fArray[combMarkIndex]=fArray[combMarkIndex+1];
-            ++combMarkIndex;
-        }
-    }
-private:
-    int32_t fArray[Normalizer2Impl::MAPPING_LENGTH_MASK];
-    int32_t fLength;
-    int32_t fLastStarterIndex;
-    UBool fDidReorder;
-};
-
-void
-Normalizer2DataBuilder::reorder(Norm *p, BuilderReorderingBuffer &buffer) {
-    UnicodeString &m=*p->mapping;
-    int32_t length=m.length();
-    if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) {
-        return;  // writeMapping() will complain about it and print the code point.
-    }
-    const UChar *s=toUCharPtr(m.getBuffer());
-    int32_t i=0;
-    UChar32 c;
-    while(i<length) {
-        U16_NEXT(s, i, length, c);
-        buffer.append(c, getCC(c));
-    }
-    if(buffer.didReorder()) {
-        buffer.toString(m);
-    }
-}
-
-/*
- * Computes the flag for the last code branch in Normalizer2Impl::hasCompBoundaryAfter().
- * A starter character with a mapping does not have a composition boundary after it
- * if the character itself combines-forward (which is tested by the caller of this function),
- * or it is deleted (mapped to the empty string),
- * or its mapping contains no starter,
- * or the last starter combines-forward.
- */
 UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(BuilderReorderingBuffer &buffer) {
     if(buffer.isEmpty()) {
         return TRUE;  // maps-to-empty-string is no boundary of any kind
@@ -635,15 +216,14 @@ UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(BuilderReorderingBuffer &bu
     }
     UChar32 starter=buffer.charAt(lastStarterIndex);
     if( Hangul::isJamoL(starter) ||
-        (Hangul::isJamoV(starter) &&
-         0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1)))
-    ) {
+            (Hangul::isJamoV(starter) &&
+            0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1)))) {
         // A Jamo leading consonant or an LV pair combines-forward if it is at the end,
         // otherwise it is blocked.
         return lastStarterIndex==buffer.length()-1;
     }
     // Note: There can be no Hangul syllable in the fully decomposed mapping.
-    const Norm *starterNorm=&getNormRef(starter);
+    const Norm *starterNorm=&norms.getNormRef(starter);
     if(starterNorm->compositions==NULL) {
         return FALSE;  // the last starter does not combine forward
     }
@@ -651,17 +231,17 @@ UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(BuilderReorderingBuffer &bu
     uint8_t prevCC=0;
     for(int32_t combMarkIndex=lastStarterIndex+1; combMarkIndex<buffer.length();) {
         uint8_t cc=buffer.ccAt(combMarkIndex);  // !=0 because after last starter
-        if(combinesWithCCBetween(*starterNorm, prevCC, cc)) {
+        if(norms.combinesWithCCBetween(*starterNorm, prevCC, cc)) {
             return TRUE;
         }
-        if( prevCC<cc &&
-            (starter=combine(*starterNorm, buffer.charAt(combMarkIndex)))>=0
-        ) {
+        if(prevCC<cc && (starter=starterNorm->combine(buffer.charAt(combMarkIndex)))>=0) {
             buffer.setComposite(starter, combMarkIndex);
-            starterNorm=&getNormRef(starter);
+            starterNorm=&norms.getNormRef(starter);
             if(starterNorm->compositions==NULL) {
                 return FALSE;  // the composite does not combine further
             }
+            // The combining mark at combMarkIndex has been removed.
+            // Do not increment combMarkIndex now.
         } else {
             prevCC=cc;
             ++combMarkIndex;
@@ -671,320 +251,66 @@ UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(BuilderReorderingBuffer &bu
     return prevCC==0;
 }
 
-// Requires p->hasMapping().
-// Returns the offset of the "first unit" from the beginning of the extraData for c.
-// That is the same as the length of the optional data for the raw mapping and the ccc/lccc word.
-int32_t Normalizer2DataBuilder::writeMapping(UChar32 c, const Norm *p, UnicodeString &dataString) {
-    UnicodeString &m=*p->mapping;
-    int32_t length=m.length();
-    if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) {
-        fprintf(stderr,
-                "gennorm2 error: "
-                "mapping for U+%04lX longer than maximum of %d\n",
-                (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
-        exit(U_INVALID_FORMAT_ERROR);
-    }
-    int32_t leadCC, trailCC;
-    if(length==0) {
-        leadCC=trailCC=0;
-    } else {
-        leadCC=getCC(m.char32At(0));
-        trailCC=getCC(m.char32At(length-1));
-    }
-    if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && (p->cc!=0 || leadCC!=0)) {
-        fprintf(stderr,
-                "gennorm2 error: "
-                "U+%04lX below U+0300 has ccc!=0 or lccc!=0, not supported by ICU\n",
-                (long)c);
-        exit(U_INVALID_FORMAT_ERROR);
-    }
-    // Write small-FCD data.
-    if((leadCC|trailCC)!=0) {
-        UChar32 lead= c<=0xffff ? c : U16_LEAD(c);
-        smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7);
-    }
-    // Write the mapping & raw mapping extraData.
-    int32_t firstUnit=length|(trailCC<<8);
-    int32_t preMappingLength=0;
-    if(p->rawMapping!=NULL) {
-        UnicodeString &rm=*p->rawMapping;
-        int32_t rmLength=rm.length();
-        if(rmLength>Normalizer2Impl::MAPPING_LENGTH_MASK) {
-            fprintf(stderr,
-                    "gennorm2 error: "
-                    "raw mapping for U+%04lX longer than maximum of %d\n",
-                    (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
-            exit(U_INVALID_FORMAT_ERROR);
-        }
-        UChar rm0=rm.charAt(0);
-        if( rmLength==length-1 &&
-            // 99: overlong substring lengths get pinned to remainder lengths anyway
-            0==rm.compare(1, 99, m, 2, 99) &&
-            rm0>Normalizer2Impl::MAPPING_LENGTH_MASK
-        ) {
-            // Compression:
-            // rawMapping=rm0+mapping.substring(2) -> store only rm0
-            //
-            // The raw mapping is the same as the final mapping after replacing
-            // the final mapping's first two code units with the raw mapping's first one.
-            // In this case, we store only that first unit, rm0.
-            // This helps with a few hundred mappings.
-            dataString.append(rm0);
-            preMappingLength=1;
-        } else {
-            // Store the raw mapping with its length.
-            dataString.append(rm);
-            dataString.append((UChar)rmLength);
-            preMappingLength=rmLength+1;
-        }
-        firstUnit|=Normalizer2Impl::MAPPING_HAS_RAW_MAPPING;
-    }
-    int32_t cccLccc=p->cc|(leadCC<<8);
-    if(cccLccc!=0) {
-        dataString.append((UChar)cccLccc);
-        ++preMappingLength;
-        firstUnit|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD;
-    }
-    if(p->hasNoCompBoundaryAfter) {
-        firstUnit|=Normalizer2Impl::MAPPING_NO_COMP_BOUNDARY_AFTER;
-    }
-    dataString.append((UChar)firstUnit);
-    dataString.append(m);
-    return preMappingLength;
-}
-
-// Requires p->compositions!=NULL.
-void Normalizer2DataBuilder::writeCompositions(UChar32 c, const Norm *p, UnicodeString &dataString) {
-    if(p->cc!=0) {
-        fprintf(stderr,
-                "gennorm2 error: "
-                "U+%04lX combines-forward and has ccc!=0, not possible in Unicode normalization\n",
-                (long)c);
-        exit(U_INVALID_FORMAT_ERROR);
-    }
-    int32_t length;
-    const CompositionPair *pairs=p->getCompositionPairs(length);
-    for(int32_t i=0; i<length; ++i) {
-        const CompositionPair &pair=pairs[i];
-        // 22 bits for the composite character and whether it combines forward.
-        UChar32 compositeAndFwd=pair.composite<<1;
-        if(getNormRef(pair.composite).compositions!=NULL) {
-            compositeAndFwd|=1;  // The composite character also combines-forward.
-        }
-        // Encode most pairs in two units and some in three.
-        int32_t firstUnit, secondUnit, thirdUnit;
-        if(pair.trail<Normalizer2Impl::COMP_1_TRAIL_LIMIT) {
-            if(compositeAndFwd<=0xffff) {
-                firstUnit=pair.trail<<1;
-                secondUnit=compositeAndFwd;
-                thirdUnit=-1;
-            } else {
-                firstUnit=(pair.trail<<1)|Normalizer2Impl::COMP_1_TRIPLE;
-                secondUnit=compositeAndFwd>>16;
-                thirdUnit=compositeAndFwd;
-            }
-        } else {
-            firstUnit=(Normalizer2Impl::COMP_1_TRAIL_LIMIT+
-                       (pair.trail>>Normalizer2Impl::COMP_1_TRAIL_SHIFT))|
-                      Normalizer2Impl::COMP_1_TRIPLE;
-            secondUnit=(pair.trail<<Normalizer2Impl::COMP_2_TRAIL_SHIFT)|
-                       (compositeAndFwd>>16);
-            thirdUnit=compositeAndFwd;
-        }
-        // Set the high bit of the first unit if this is the last composition pair.
-        if(i==(length-1)) {
-            firstUnit|=Normalizer2Impl::COMP_1_LAST_TUPLE;
-        }
-        dataString.append((UChar)firstUnit).append((UChar)secondUnit);
-        if(thirdUnit>=0) {
-            dataString.append((UChar)thirdUnit);
-        }
-    }
-}
-
-class ExtraDataWriter : public Normalizer2DBEnumerator {
-public:
-    ExtraDataWriter(Normalizer2DataBuilder &b) :
-        Normalizer2DBEnumerator(b),
-        yesYesCompositions(1000, (UChar32)0xffff, 2),  // 0=inert, 1=Jamo L, 2=start of compositions
-        yesNoMappingsAndCompositions(1000, (UChar32)0, 1) {}  // 0=Hangul, 1=start of normal data
-    virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
-        if(value!=0) {
-            if(start!=end) {
-                fprintf(stderr,
-                        "gennorm2 error: unexpected shared data for "
-                        "multiple code points U+%04lX..U+%04lX\n",
-                        (long)start, (long)end);
-                exit(U_INTERNAL_PROGRAM_ERROR);
-            }
-            builder.writeExtraData(start, value, *this);
-        }
-        return TRUE;
-    }
-    UnicodeString maybeYesCompositions;
-    UnicodeString yesYesCompositions;
-    UnicodeString yesNoMappingsAndCompositions;
-    UnicodeString yesNoMappingsOnly;
-    UnicodeString noNoMappings;
-    Hashtable previousNoNoMappings;  // If constructed in runtime code, pass in UErrorCode.
-};
-
-void Normalizer2DataBuilder::writeExtraData(UChar32 c, uint32_t value, ExtraDataWriter &writer) {
-    Norm *p=norms+value;
-    if(!p->hasMapping()) {
-        // Write small-FCD data.
-        // There is similar code in writeMapping() for characters that do have a mapping.
-        if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && p->cc!=0) {
-            fprintf(stderr,
-                    "gennorm2 error: "
-                    "U+%04lX below U+0300 has ccc!=0, not supported by ICU\n",
-                    (long)c);
-            exit(U_INVALID_FORMAT_ERROR);
-        }
-        if(p->cc!=0) {
-            UChar32 lead= c<=0xffff ? c : U16_LEAD(c);
-            smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7);
-        }
-    }
-    if(p->combinesBack) {
-        if(p->hasMapping()) {
-            fprintf(stderr,
-                    "gennorm2 error: "
-                    "U+%04lX combines-back and decomposes, not possible in Unicode normalization\n",
-                    (long)c);
-            exit(U_INVALID_FORMAT_ERROR);
-        }
-        if(p->compositions!=NULL) {
-            p->offset=
-                (writer.maybeYesCompositions.length()<<Norm::OFFSET_SHIFT)|
-                Norm::OFFSET_MAYBE_YES;
-            writeCompositions(c, p, writer.maybeYesCompositions);
-        }
-    } else if(!p->hasMapping()) {
-        if(p->compositions!=NULL) {
-            p->offset=
-                (writer.yesYesCompositions.length()<<Norm::OFFSET_SHIFT)|
-                Norm::OFFSET_YES_YES;
-            writeCompositions(c, p, writer.yesYesCompositions);
-        }
-    } else if(p->mappingType==Norm::ROUND_TRIP) {
-        if(p->compositions!=NULL) {
-            int32_t offset=writer.yesNoMappingsAndCompositions.length()+
-                           writeMapping(c, p, writer.yesNoMappingsAndCompositions);
-            p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION;
-            writeCompositions(c, p, writer.yesNoMappingsAndCompositions);
-        } else {
-            int32_t offset=writer.yesNoMappingsOnly.length()+
-                           writeMapping(c, p, writer.yesNoMappingsOnly);
-            p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_ONLY;
-        }
-    } else /* one-way */ {
-        if(p->compositions!=NULL) {
-            fprintf(stderr,
-                    "gennorm2 error: "
-                    "U+%04lX combines-forward and has a one-way mapping, "
-                    "not possible in Unicode normalization\n",
-                    (long)c);
-            exit(U_INVALID_FORMAT_ERROR);
-        }
-        if(p->cc==0 && optimization!=OPTIMIZE_FAST) {
-            // Try a compact, algorithmic encoding.
-            // Only for ccc=0, because we can't store additional information
-            // and we do not recursively follow an algorithmic encoding for access to the ccc.
-            //
-            // Also, if hasNoCompBoundaryAfter is set, we can only use the algorithmic encoding
-            // if the mappingCP decomposes further, to ensure that there is a place to store it.
-            // We want to see that the final mapping does not have exactly 1 code point,
-            // or else we would have to recursively ensure that the final mapping is stored
-            // in normal extraData.
-            if(p->mappingCP>=0 && (!p->hasNoCompBoundaryAfter || 1!=p->mapping->countChar32())) {
-                int32_t delta=p->mappingCP-c;
-                if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) {
-                    p->offset=(delta<<Norm::OFFSET_SHIFT)|Norm::OFFSET_DELTA;
-                }
-            }
-        }
-        if(p->offset==0) {
-            int32_t oldNoNoLength=writer.noNoMappings.length();
-            int32_t offset=oldNoNoLength+writeMapping(c, p, writer.noNoMappings);
-            UnicodeString newMapping=writer.noNoMappings.tempSubString(oldNoNoLength);
-            int32_t previousOffset=writer.previousNoNoMappings.geti(newMapping);
-            if(previousOffset!=0) {
-                // Duplicate, remove the new units and point to the old ones.
-                writer.noNoMappings.truncate(oldNoNoLength);
-                p->offset=((previousOffset-1)<<Norm::OFFSET_SHIFT)|Norm::OFFSET_NO_NO;
-            } else {
-                // Enter this new mapping into the hashtable, avoiding value 0 which is "not found".
-                IcuToolErrorCode errorCode("gennorm2/writeExtraData()/Hashtable.puti()");
-                writer.previousNoNoMappings.puti(newMapping, offset+1, errorCode);
-                p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_NO_NO;
-            }
-        }
-    }
-}
-
-class Norm16Writer : public Normalizer2DBEnumerator {
+class Norm16Writer : public Norms::Enumerator {
 public:
-    Norm16Writer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {}
-    virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
-        builder.writeNorm16(start, end, value);
-        return TRUE;
+    Norm16Writer(Norms &n, Normalizer2DataBuilder &b) : Norms::Enumerator(n), builder(b) {}
+    void rangeHandler(UChar32 start, UChar32 end, Norm &norm) override {
+        builder.writeNorm16(start, end, norm);
     }
+    Normalizer2DataBuilder &builder;
 };
 
-void Normalizer2DataBuilder::writeNorm16(UChar32 start, UChar32 end, uint32_t value) {
-    if(value!=0) {
-        const Norm *p=norms+value;
-        int32_t offset=p->offset>>Norm::OFFSET_SHIFT;
-        int32_t norm16=0;
-        UBool isDecompNo=FALSE;
-        UBool isCompNoMaybe=FALSE;
-        switch(p->offset&Norm::OFFSET_MASK) {
-        case Norm::OFFSET_NONE:
-            // No mapping, no compositions list.
-            if(p->combinesBack) {
-                norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+p->cc;
-                isDecompNo=(UBool)(p->cc!=0);
-                isCompNoMaybe=TRUE;
-            } else if(p->cc!=0) {
-                norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-1+p->cc;
-                isDecompNo=isCompNoMaybe=TRUE;
-            }
-            break;
-        case Norm::OFFSET_MAYBE_YES:
-            norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+offset;
+void Normalizer2DataBuilder::writeNorm16(UChar32 start, UChar32 end, Norm &norm) {
+    int32_t offset=norm.offset>>Norm::OFFSET_SHIFT;
+    int32_t norm16=0;
+    UBool isDecompNo=FALSE;  // TRUE if need to ensure start>=minDecompNoCP
+    UBool isCompNoMaybe=FALSE;  // TRUE if need to ensure start>=minCompNoMaybeCP
+    switch(norm.offset&Norm::OFFSET_MASK) {
+    case Norm::OFFSET_NONE:
+        // No mapping, no compositions list.
+        if(norm.combinesBack) {
+            norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+norm.cc;
+            isDecompNo=(UBool)(norm.cc!=0);
             isCompNoMaybe=TRUE;
-            break;
-        case Norm::OFFSET_YES_YES:
-            norm16=offset;
-            break;
-        case Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION:
-            norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+offset;
-            isDecompNo=TRUE;
-            break;
-        case Norm::OFFSET_YES_NO_MAPPING_ONLY:
-            norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+offset;
-            isDecompNo=TRUE;
-            break;
-        case Norm::OFFSET_NO_NO:
-            norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+offset;
-            isDecompNo=isCompNoMaybe=TRUE;
-            break;
-        case Norm::OFFSET_DELTA:
-            norm16=getCenterNoNoDelta()+offset;
+        } else if(norm.cc!=0) {
+            norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-1+norm.cc;
             isDecompNo=isCompNoMaybe=TRUE;
-            break;
-        default:  // Should not occur.
-            exit(U_INTERNAL_PROGRAM_ERROR);
-        }
-        IcuToolErrorCode errorCode("gennorm2/writeNorm16()");
-        utrie2_setRange32(norm16Trie, start, end, (uint32_t)norm16, TRUE, errorCode);
-        if(isDecompNo && start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
-            indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=start;
-        }
-        if(isCompNoMaybe && start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {
-            indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=start;
         }
+        break;
+    case Norm::OFFSET_MAYBE_YES:
+        norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+offset;
+        isCompNoMaybe=TRUE;
+        break;
+    case Norm::OFFSET_YES_YES:
+        norm16=offset;
+        break;
+    case Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION:
+        norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+offset;
+        isDecompNo=TRUE;
+        break;
+    case Norm::OFFSET_YES_NO_MAPPING_ONLY:
+        norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+offset;
+        isDecompNo=TRUE;
+        break;
+    case Norm::OFFSET_NO_NO:
+        norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+offset;
+        isDecompNo=isCompNoMaybe=TRUE;
+        break;
+        // TODO: minMappingNotCompYes, minMappingNoCompBoundaryBefore
+    case Norm::OFFSET_DELTA:
+        norm16=getCenterNoNoDelta()+offset;
+        isDecompNo=isCompNoMaybe=TRUE;
+        break;
+    default:  // Should not occur.
+        exit(U_INTERNAL_PROGRAM_ERROR);
+    }
+    IcuToolErrorCode errorCode("gennorm2/writeNorm16()");
+    utrie2_setRange32(norm16Trie, start, end, (uint32_t)norm16, TRUE, errorCode);
+    if(isDecompNo && start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
+        indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=start;
+    }
+    if(isCompNoMaybe && start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {
+        indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=start;
     }
 }
 
@@ -1041,28 +367,29 @@ void Normalizer2DataBuilder::processData() {
     norm16Trie=utrie2_open(0, 0, errorCode);
     errorCode.assertSuccess();
 
-    utrie2_enum(normTrie, NULL, enumRangeHandler, CompositionBuilder(*this).ptr());
+    CompositionBuilder compBuilder(norms);
+    norms.enumRanges(compBuilder);
 
-    Decomposer decomposer(*this);
+    Decomposer decomposer(norms);
     do {
         decomposer.didDecompose=FALSE;
-        utrie2_enum(normTrie, NULL, enumRangeHandler, &decomposer);
+        norms.enumRanges(decomposer);
     } while(decomposer.didDecompose);
 
     BuilderReorderingBuffer buffer;
-    int32_t normsLength=utm_countItems(normMem);
+    int32_t normsLength=norms.length();
     for(int32_t i=1; i<normsLength; ++i) {
         // Set the hasNoCompBoundaryAfter flag for use by the last code branch
         // in Normalizer2Impl::hasCompBoundaryAfter().
         // For details see the comments on hasNoCompBoundaryAfter(buffer).
-        const Norm &norm=norms[i];
+        Norm &norm=norms.getNormRefByIndex(i);
         if(norm.hasMapping()) {
             if(norm.compositions!=NULL) {
-                norms[i].hasNoCompBoundaryAfter=TRUE;
+                norm.hasNoCompBoundaryAfter=TRUE;
             } else {
                 buffer.reset();
-                reorder(norms+i, buffer);
-                norms[i].hasNoCompBoundaryAfter=hasNoCompBoundaryAfter(buffer);
+                norms.reorder(norm, buffer);
+                norm.hasNoCompBoundaryAfter=hasNoCompBoundaryAfter(buffer);
             }
         }
     }
@@ -1070,33 +397,35 @@ void Normalizer2DataBuilder::processData() {
     indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000;
     indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000;
 
-    ExtraDataWriter extraDataWriter(*this);
-    utrie2_enum(normTrie, NULL, enumRangeHandler, &extraDataWriter);
+    ExtraData extra(norms, optimization==OPTIMIZE_FAST);
+    norms.enumRanges(extra);
 
-    extraData=extraDataWriter.maybeYesCompositions;
-    extraData.append(extraDataWriter.yesYesCompositions).
-              append(extraDataWriter.yesNoMappingsAndCompositions).
-              append(extraDataWriter.yesNoMappingsOnly).
-              append(extraDataWriter.noNoMappings);
+    extraData=extra.maybeYesCompositions;
+    extraData.append(extra.yesYesCompositions).
+              append(extra.yesNoMappingsAndCompositions).
+              append(extra.yesNoMappingsOnly).
+              append(extra.noNoMappings);
     // Pad to even length for 4-byte alignment of following data.
     if(extraData.length()&1) {
         extraData.append((UChar)0);
     }
+    memcpy(smallFCD, extra.smallFCD, sizeof(smallFCD));
 
     indexes[Normalizer2Impl::IX_MIN_YES_NO]=
-        extraDataWriter.yesYesCompositions.length();
+        extra.yesYesCompositions.length();
     indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]=
         indexes[Normalizer2Impl::IX_MIN_YES_NO]+
-        extraDataWriter.yesNoMappingsAndCompositions.length();
+        extra.yesNoMappingsAndCompositions.length();
     indexes[Normalizer2Impl::IX_MIN_NO_NO]=
         indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+
-        extraDataWriter.yesNoMappingsOnly.length();
+        extra.yesNoMappingsOnly.length();
+    // TODO: minMappingNotCompYes, minMappingNoCompBoundaryBefore
     indexes[Normalizer2Impl::IX_LIMIT_NO_NO]=
         indexes[Normalizer2Impl::IX_MIN_NO_NO]+
-        extraDataWriter.noNoMappings.length();
+        extra.noNoMappings.length();
     indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]=
         Normalizer2Impl::MIN_NORMAL_MAYBE_YES-
-        extraDataWriter.maybeYesCompositions.length();
+        extra.maybeYesCompositions.length();
 
     int32_t minNoNoDelta=getCenterNoNoDelta()-Normalizer2Impl::MAX_DELTA;
     if(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]>minNoNoDelta) {
@@ -1106,7 +435,8 @@ void Normalizer2DataBuilder::processData() {
         exit(U_BUFFER_OVERFLOW_ERROR);
     }
 
-    utrie2_enum(normTrie, NULL, enumRangeHandler, Norm16Writer(*this).ptr());
+    Norm16Writer norm16Writer(norms, *this);
+    norms.enumRanges(norm16Writer);
 
     setHangulData();
 
@@ -1177,6 +507,7 @@ void Normalizer2DataBuilder::processData() {
         printf("minCompNoMaybeCodePoint:           U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]);
         printf("minYesNo:                          0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO]);
         printf("minYesNoMappingsOnly:              0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]);
+        // TODO: minMappingNotCompYes, minMappingNoCompBoundaryBefore
         printf("minNoNo:                           0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO]);
         printf("limitNoNo:                         0x%04x\n", (int)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]);
         printf("minMaybeYes:                       0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]);
index c231a3d7f61edca9c8c377f4bfb4f4d22d2e1e02..42ec099556dd4acbfbf9bc1c83436803391780cd 100644 (file)
 #include "normalizer2impl.h"  // for IX_COUNT
 #include "toolutil.h"
 #include "utrie2.h"
+#include "norms.h"
 
 U_NAMESPACE_BEGIN
 
 extern UBool beVerbose, haveCopyright;
 
-struct Norm;
-
-class BuilderReorderingBuffer;
-class ExtraDataWriter;
-
 class Normalizer2DataBuilder {
 public:
     Normalizer2DataBuilder(UErrorCode &errorCode);
@@ -69,42 +65,36 @@ public:
     void writeCSourceFile(const char *filename);
 
 private:
-    friend class CompositionBuilder;
-    friend class Decomposer;
-    friend class ExtraDataWriter;
     friend class Norm16Writer;
 
-    // No copy constructor nor assignment operator.
-    Normalizer2DataBuilder(const Normalizer2DataBuilder &other);
-    Normalizer2DataBuilder &operator=(const Normalizer2DataBuilder &other);
+    Normalizer2DataBuilder(const Normalizer2DataBuilder &other) = delete;
+    Normalizer2DataBuilder &operator=(const Normalizer2DataBuilder &other) = delete;
 
-    Norm *allocNorm();
-    Norm *getNorm(UChar32 c);
-    Norm *createNorm(UChar32 c);
     Norm *checkNormForMapping(Norm *p, UChar32 c);  // check for permitted overrides
 
-    const Norm &getNormRef(UChar32 c) const;
-    uint8_t getCC(UChar32 c) const;
-    UBool combinesWithCCBetween(const Norm &norm, uint8_t lowCC, uint8_t highCC) const;
-    UChar32 combine(const Norm &norm, UChar32 trail) const;
-
-    void addComposition(UChar32 start, UChar32 end, uint32_t value);
-    UBool decompose(UChar32 start, UChar32 end, uint32_t value);
-    void reorder(Norm *p, BuilderReorderingBuffer &buffer);
+    /**
+     * Computes the MAPPING_NO_COMP_BOUNDARY_AFTER flag for a character's mapping
+     * (especially for a "YesNo" which has a round-trip mapping).
+     * This flag is used in Normalizer2Impl::hasCompBoundaryAfter().
+     *
+     * Modifies the buffer (partially composes it).
+     *
+     * A starter character with a mapping does not have a composition boundary after it
+     * if the character itself combines-forward (which is tested by the caller of this function),
+     * or it is deleted (mapped to the empty string),
+     * or its mapping contains no starter,
+     * or the last starter combines-forward.
+     */
     UBool hasNoCompBoundaryAfter(BuilderReorderingBuffer &buffer);
     void setHangulData();
-    int32_t writeMapping(UChar32 c, const Norm *p, UnicodeString &dataString);
-    void writeCompositions(UChar32 c, const Norm *p, UnicodeString &dataString);
-    void writeExtraData(UChar32 c, uint32_t value, ExtraDataWriter &writer);
+
     int32_t getCenterNoNoDelta() {
         return indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]-Normalizer2Impl::MAX_DELTA-1;
     }
-    void writeNorm16(UChar32 start, UChar32 end, uint32_t value);
+    void writeNorm16(UChar32 start, UChar32 end, Norm &norm);
     void processData();
 
-    UTrie2 *normTrie;
-    UToolMemory *normMem;
-    Norm *norms;
+    Norms norms;
 
     int32_t phase;
     OverrideHandling overrideHandling;
diff --git a/icu4c/source/tools/gennorm2/norms.cpp b/icu4c/source/tools/gennorm2/norms.cpp
new file mode 100644 (file)
index 0000000..e17c35f
--- /dev/null
@@ -0,0 +1,333 @@
+// © 2017 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+// norms.cpp
+// created: 2017jun04 Markus W. Scherer
+// (pulled out of n2builder.cpp)
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_NORMALIZATION
+
+#include "unicode/errorcode.h"
+#include "unicode/unistr.h"
+#include "unicode/utf16.h"
+#include "normalizer2impl.h"
+#include "norms.h"
+#include "toolutil.h"
+#include "utrie2.h"
+#include "uvectr32.h"
+
+U_NAMESPACE_BEGIN
+
+void BuilderReorderingBuffer::append(UChar32 c, uint8_t cc) {
+    if(cc==0 || fLength==0 || ccAt(fLength-1)<=cc) {
+        if(cc==0) {
+            fLastStarterIndex=fLength;
+        }
+        fArray[fLength++]=(c<<8)|cc;
+        return;
+    }
+    // Let this character bubble back to its canonical order.
+    int32_t i=fLength-1;
+    while(i>fLastStarterIndex && ccAt(i)>cc) {
+        --i;
+    }
+    ++i;  // after the last starter or prevCC<=cc
+    // Move this and the following characters forward one to make space.
+    for(int32_t j=fLength; i<j; --j) {
+        fArray[j]=fArray[j-1];
+    }
+    fArray[i]=(c<<8)|cc;
+    ++fLength;
+    fDidReorder=TRUE;
+}
+
+void BuilderReorderingBuffer::toString(UnicodeString &dest) {
+    dest.remove();
+    for(int32_t i=0; i<fLength; ++i) {
+        dest.append(charAt(i));
+    }
+}
+
+void BuilderReorderingBuffer::setComposite(UChar32 composite, int32_t combMarkIndex) {
+    fArray[fLastStarterIndex]=composite<<8;
+    // Remove the combining mark that contributed to the composite.
+    --fLength;
+    while(combMarkIndex<fLength) {
+        fArray[combMarkIndex]=fArray[combMarkIndex+1];
+        ++combMarkIndex;
+    }
+}
+
+UChar32 Norm::combine(UChar32 trail) const {
+    int32_t length;
+    const CompositionPair *pairs=getCompositionPairs(length);
+    for(int32_t i=0; i<length; ++i) {
+        if(trail==pairs[i].trail) {
+            return pairs[i].composite;
+        }
+        if(trail<pairs[i].trail) {
+            break;
+        }
+    }
+    return U_SENTINEL;
+}
+
+Norms::Norms(UErrorCode &errorCode) {
+    normTrie=utrie2_open(0, 0, &errorCode);
+    normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm));
+    norms=allocNorm();  // unused Norm struct at index 0
+}
+
+Norms::~Norms() {
+    utrie2_close(normTrie);
+    int32_t normsLength=utm_countItems(normMem);
+    for(int32_t i=1; i<normsLength; ++i) {
+        delete norms[i].mapping;
+        delete norms[i].rawMapping;
+        delete norms[i].compositions;
+    }
+    utm_close(normMem);
+}
+
+Norm *Norms::allocNorm() {
+    Norm *p=(Norm *)utm_alloc(normMem);
+    norms=(Norm *)utm_getStart(normMem);  // in case it got reallocated
+    return p;
+}
+
+Norm *Norms::getNorm(UChar32 c) {
+    uint32_t i=utrie2_get32(normTrie, c);
+    if(i==0) {
+        return nullptr;
+    }
+    return norms+i;
+}
+
+const Norm &Norms::getNormRef(UChar32 c) const {
+    return norms[utrie2_get32(normTrie, c)];
+}
+
+Norm *Norms::createNorm(UChar32 c) {
+    uint32_t i=utrie2_get32(normTrie, c);
+    if(i!=0) {
+        return norms+i;
+    } else {
+        /* allocate Norm */
+        Norm *p=allocNorm();
+        IcuToolErrorCode errorCode("gennorm2/createNorm()");
+        utrie2_set32(normTrie, c, (uint32_t)(p-norms), errorCode);
+        return p;
+    }
+}
+
+void Norms::reorder(Norm &norm, BuilderReorderingBuffer &buffer) const {
+    UnicodeString &m=*norm.mapping;
+    int32_t length=m.length();
+    if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) {
+        return;  // writeMapping() will complain about it and print the code point.
+    }
+    const UChar *s=toUCharPtr(m.getBuffer());
+    int32_t i=0;
+    UChar32 c;
+    while(i<length) {
+        U16_NEXT(s, i, length, c);
+        buffer.append(c, getCC(c));
+    }
+    if(buffer.didReorder()) {
+        buffer.toString(m);
+    }
+}
+
+UBool Norms::combinesWithCCBetween(const Norm &norm, uint8_t lowCC, uint8_t highCC) const {
+    if((highCC-lowCC)>=2) {
+        int32_t length;
+        const CompositionPair *pairs=norm.getCompositionPairs(length);
+        for(int32_t i=0; i<length; ++i) {
+            uint8_t trailCC=getCC(pairs[i].trail);
+            if(lowCC<trailCC && trailCC<highCC) {
+                return TRUE;
+            }
+        }
+    }
+    return FALSE;
+}
+
+U_CDECL_BEGIN
+
+static UBool U_CALLCONV
+enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
+    return ((Norms::Enumerator *)context)->rangeHandler(start, end, value);
+}
+
+U_CDECL_END
+
+void Norms::enumRanges(Enumerator &e) {
+    utrie2_enum(normTrie, nullptr, enumRangeHandler, &e);
+}
+
+Norms::Enumerator::~Enumerator() {}
+
+UBool Norms::Enumerator::rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
+    if(value!=0) {
+        rangeHandler(start, end, norms.getNormRefByIndex(value));
+    }
+    return TRUE;
+}
+
+void CompositionBuilder::rangeHandler(UChar32 start, UChar32 end, Norm &norm) {
+    if(norm.mappingType!=Norm::ROUND_TRIP) { return; }
+    if(start!=end) {
+        fprintf(stderr,
+                "gennorm2 error: same round-trip mapping for "
+                "more than 1 code point U+%04lX..U+%04lX\n",
+                (long)start, (long)end);
+        exit(U_INVALID_FORMAT_ERROR);
+    }
+    if(norm.cc!=0) {
+        fprintf(stderr,
+                "gennorm2 error: "
+                "U+%04lX has a round-trip mapping and ccc!=0, "
+                "not possible in Unicode normalization\n",
+                (long)start);
+        exit(U_INVALID_FORMAT_ERROR);
+    }
+    // setRoundTripMapping() ensured that there are exactly two code points.
+    const UnicodeString &m=*norm.mapping;
+    UChar32 lead=m.char32At(0);
+    UChar32 trail=m.char32At(m.length()-1);
+    if(norms.getCC(lead)!=0) {
+        fprintf(stderr,
+                "gennorm2 error: "
+                "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, "
+                "not possible in Unicode normalization\n",
+                (long)start, (long)lead);
+        exit(U_INVALID_FORMAT_ERROR);
+    }
+    // Flag for trailing character.
+    norms.createNorm(trail)->combinesBack=TRUE;
+    // Insert (trail, composite) pair into compositions list for the lead character.
+    IcuToolErrorCode errorCode("gennorm2/addComposition()");
+    Norm *leadNorm=norms.createNorm(lead);
+    UVector32 *compositions=leadNorm->compositions;
+    int32_t i;
+    if(compositions==nullptr) {
+        compositions=leadNorm->compositions=new UVector32(errorCode);
+        i=0;  // "insert" the first pair at index 0
+    } else {
+        // Insertion sort, and check for duplicate trail characters.
+        int32_t length;
+        const CompositionPair *pairs=leadNorm->getCompositionPairs(length);
+        for(i=0; i<length; ++i) {
+            if(trail==pairs[i].trail) {
+                fprintf(stderr,
+                        "gennorm2 error: same round-trip mapping for "
+                        "more than 1 code point (e.g., U+%04lX) to U+%04lX + U+%04lX\n",
+                        (long)start, (long)lead, (long)trail);
+                exit(U_INVALID_FORMAT_ERROR);
+            }
+            if(trail<pairs[i].trail) {
+                break;
+            }
+        }
+    }
+    compositions->insertElementAt(trail, 2*i, errorCode);
+    compositions->insertElementAt(start, 2*i+1, errorCode);
+}
+
+void Decomposer::rangeHandler(UChar32 start, UChar32 end, Norm &norm) {
+    if(!norm.hasMapping()) { return; }
+    const UnicodeString &m=*norm.mapping;
+    UnicodeString *decomposed=nullptr;
+    const UChar *s=toUCharPtr(m.getBuffer());
+    int32_t length=m.length();
+    int32_t prev, i=0;
+    UChar32 c;
+    while(i<length) {
+        prev=i;
+        U16_NEXT(s, i, length, c);
+        if(start<=c && c<=end) {
+            fprintf(stderr,
+                    "gennorm2 error: U+%04lX maps to itself directly or indirectly\n",
+                    (long)c);
+            exit(U_INVALID_FORMAT_ERROR);
+        }
+        const Norm &cNorm=norms.getNormRef(c);
+        if(cNorm.hasMapping()) {
+            if(norm.mappingType==Norm::ROUND_TRIP) {
+                if(prev==0) {
+                    if(cNorm.mappingType!=Norm::ROUND_TRIP) {
+                        fprintf(stderr,
+                                "gennorm2 error: "
+                                "U+%04lX's round-trip mapping's starter "
+                                "U+%04lX one-way-decomposes, "
+                                "not possible in Unicode normalization\n",
+                                (long)start, (long)c);
+                        exit(U_INVALID_FORMAT_ERROR);
+                    }
+                    uint8_t myTrailCC=norms.getCC(m.char32At(i));
+                    UChar32 cTrailChar=cNorm.mapping->char32At(cNorm.mapping->length()-1);
+                    uint8_t cTrailCC=norms.getCC(cTrailChar);
+                    if(cTrailCC>myTrailCC) {
+                        fprintf(stderr,
+                                "gennorm2 error: "
+                                "U+%04lX's round-trip mapping's starter "
+                                "U+%04lX decomposes and the "
+                                "inner/earlier tccc=%hu > outer/following tccc=%hu, "
+                                "not possible in Unicode normalization\n",
+                                (long)start, (long)c,
+                                (short)cTrailCC, (short)myTrailCC);
+                        exit(U_INVALID_FORMAT_ERROR);
+                    }
+                } else {
+                    fprintf(stderr,
+                            "gennorm2 error: "
+                            "U+%04lX's round-trip mapping's non-starter "
+                            "U+%04lX decomposes, "
+                            "not possible in Unicode normalization\n",
+                            (long)start, (long)c);
+                    exit(U_INVALID_FORMAT_ERROR);
+                }
+            }
+            if(decomposed==nullptr) {
+                decomposed=new UnicodeString(m, 0, prev);
+            }
+            decomposed->append(*cNorm.mapping);
+        } else if(Hangul::isHangul(c)) {
+            UChar buffer[3];
+            int32_t hangulLength=Hangul::decompose(c, buffer);
+            if(norm.mappingType==Norm::ROUND_TRIP && prev!=0) {
+                fprintf(stderr,
+                        "gennorm2 error: "
+                        "U+%04lX's round-trip mapping's non-starter "
+                        "U+%04lX decomposes, "
+                        "not possible in Unicode normalization\n",
+                        (long)start, (long)c);
+                exit(U_INVALID_FORMAT_ERROR);
+            }
+            if(decomposed==nullptr) {
+                decomposed=new UnicodeString(m, 0, prev);
+            }
+            decomposed->append(buffer, hangulLength);
+        } else if(decomposed!=nullptr) {
+            decomposed->append(m, prev, i-prev);
+        }
+    }
+    if(decomposed!=nullptr) {
+        if(norm.rawMapping==nullptr) {
+            // Remember the original mapping when decomposing recursively.
+            norm.rawMapping=norm.mapping;
+        } else {
+            delete norm.mapping;
+        }
+        norm.mapping=decomposed;
+        // Not  norm.setMappingCP();  because the original mapping
+        // is most likely to be encodable as a delta.
+        didDecompose|=TRUE;
+    }
+}
+
+U_NAMESPACE_END
+
+#endif // #if !UCONFIG_NO_NORMALIZATION
diff --git a/icu4c/source/tools/gennorm2/norms.h b/icu4c/source/tools/gennorm2/norms.h
new file mode 100644 (file)
index 0000000..01aaa22
--- /dev/null
@@ -0,0 +1,178 @@
+// © 2017 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+// norms.h
+// created: 2017jun04 Markus W. Scherer
+// (pulled out of n2builder.cpp)
+
+// Storing & manipulating Normalizer2 builder data.
+
+#ifndef __NORMS_H__
+#define __NORMS_H__
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_NORMALIZATION
+
+#include "unicode/errorcode.h"
+#include "unicode/unistr.h"
+#include "unicode/utf16.h"
+#include "normalizer2impl.h"
+#include "toolutil.h"
+#include "utrie2.h"
+#include "uvectr32.h"
+
+U_NAMESPACE_BEGIN
+
+class BuilderReorderingBuffer {
+public:
+    BuilderReorderingBuffer() : fLength(0), fLastStarterIndex(-1), fDidReorder(FALSE) {}
+    void reset() {
+        fLength=0;
+        fLastStarterIndex=-1;
+        fDidReorder=FALSE;
+    }
+    int32_t length() const { return fLength; }
+    UBool isEmpty() const { return fLength==0; }
+    int32_t lastStarterIndex() const { return fLastStarterIndex; }
+    UChar32 charAt(int32_t i) const { return fArray[i]>>8; }
+    uint8_t ccAt(int32_t i) const { return (uint8_t)fArray[i]; }
+    UBool didReorder() const { return fDidReorder; }
+
+    void append(UChar32 c, uint8_t cc);
+    void toString(UnicodeString &dest);
+    void setComposite(UChar32 composite, int32_t combMarkIndex);
+
+private:
+    int32_t fArray[Normalizer2Impl::MAPPING_LENGTH_MASK];
+    int32_t fLength;
+    int32_t fLastStarterIndex;
+    UBool fDidReorder;
+};
+
+struct CompositionPair {
+    CompositionPair(UChar32 t, UChar32 c) : trail(t), composite(c) {}
+    UChar32 trail, composite;
+};
+
+struct Norm {
+    enum MappingType { NONE, REMOVED, ROUND_TRIP, ONE_WAY };
+
+    UBool hasMapping() const { return mappingType>REMOVED; }
+
+    // Requires hasMapping() and well-formed mapping.
+    void setMappingCP() {
+        UChar32 c;
+        if(!mapping->isEmpty() && mapping->length()==U16_LENGTH(c=mapping->char32At(0))) {
+            mappingCP=c;
+        } else {
+            mappingCP=U_SENTINEL;
+        }
+    }
+
+    const CompositionPair *getCompositionPairs(int32_t &length) const {
+        if(compositions==nullptr) {
+            length=0;
+            return nullptr;
+        } else {
+            length=compositions->size()/2;
+            return reinterpret_cast<const CompositionPair *>(compositions->getBuffer());
+        }
+    }
+    UChar32 combine(UChar32 trail) const;
+
+    UnicodeString *mapping;
+    UnicodeString *rawMapping;  // non-nullptr if the mapping is further decomposed
+    UChar32 mappingCP;  // >=0 if mapping to 1 code point
+    int32_t mappingPhase;
+    MappingType mappingType;
+
+    UVector32 *compositions;  // (trail, composite) pairs
+    uint8_t cc;
+    UBool combinesBack;
+    UBool hasNoCompBoundaryAfter;
+
+    enum OffsetType {
+        OFFSET_NONE,
+        // Composition for back-combining character. Allowed, but not normally used.
+        OFFSET_MAYBE_YES,
+        // Composition for a starter that does not have a decomposition mapping.
+        OFFSET_YES_YES,
+        // Round-trip mapping & composition for a starter.
+        OFFSET_YES_NO_MAPPING_AND_COMPOSITION,
+        // Round-trip mapping for a starter that itself does not combine-forward.
+        OFFSET_YES_NO_MAPPING_ONLY,
+        // TODO: minMappingNotCompYes, minMappingNoCompBoundaryBefore
+        // One-way mapping.
+        OFFSET_NO_NO,
+        // Delta for an algorithmic one-way mapping.
+        OFFSET_DELTA
+    };
+    enum { OFFSET_SHIFT=4, OFFSET_MASK=(1<<OFFSET_SHIFT)-1 };
+    int32_t offset;
+};
+
+class Norms {
+public:
+    Norms(UErrorCode &errorCode);
+    ~Norms();
+
+    int32_t length() const { return utm_countItems(normMem); }
+    const Norm &getNormRefByIndex(int32_t i) const { return norms[i]; }
+    Norm &getNormRefByIndex(int32_t i) { return norms[i]; }
+
+    Norm *allocNorm();
+    /** Returns an existing Norm unit, or nullptr if c has no data. */
+    Norm *getNorm(UChar32 c);
+    /** Returns a Norm unit, creating a new one if necessary. */
+    Norm *createNorm(UChar32 c);
+    /** Returns an existing Norm unit, or an immutable empty object if c has no data. */
+    const Norm &getNormRef(UChar32 c) const;
+    uint8_t getCC(UChar32 c) const { return getNormRef(c).cc; }
+
+    void reorder(Norm &norm, BuilderReorderingBuffer &buffer) const;
+    UBool combinesWithCCBetween(const Norm &norm, uint8_t lowCC, uint8_t highCC) const;
+
+    class Enumerator {
+    public:
+        Enumerator(Norms &n) : norms(n) {}
+        virtual ~Enumerator();
+        /** Called for enumerated value!=0. */
+        virtual void rangeHandler(UChar32 start, UChar32 end, Norm &norm) = 0;
+        /** @internal Public only for C callback. */
+        UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value);
+    protected:
+        Norms &norms;
+    };
+
+    void enumRanges(Enumerator &e);
+
+private:
+    Norms(const Norms &other) = delete;
+    Norms &operator=(const Norms &other) = delete;
+
+    UTrie2 *normTrie;
+    UToolMemory *normMem;
+    Norm *norms;
+};
+
+class CompositionBuilder : public Norms::Enumerator {
+public:
+    CompositionBuilder(Norms &n) : Norms::Enumerator(n) {}
+    /** Adds a composition mapping for the first character in a round-trip mapping. */
+    void rangeHandler(UChar32 start, UChar32 end, Norm &norm) override;
+};
+
+class Decomposer : public Norms::Enumerator {
+public:
+    Decomposer(Norms &n) : Norms::Enumerator(n), didDecompose(FALSE) {}
+    /** Decomposes each character of the current mapping. Sets didDecompose if any. */
+    void rangeHandler(UChar32 start, UChar32 end, Norm &norm) override;
+    UBool didDecompose;
+};
+
+U_NAMESPACE_END
+
+#endif // #if !UCONFIG_NO_NORMALIZATION
+
+#endif  // __NORMS_H__