From 19d53e7641c166795bf326aaa1ff40ae60670da4 Mon Sep 17 00:00:00 2001
From: Markus Scherer <markus.icu@gmail.com>
Date: Wed, 7 Jun 2017 05:23:53 +0000
Subject: [PATCH] ICU-13197 more gennorm2 cleanup, set smallFCD for some
 characters with algorithmic mappings

X-SVN-Rev: 40155
---
 icu4c/source/data/in/nfkc_cf.nrm          | Bin 51864 -> 51864 bytes
 icu4c/source/data/in/uts46.nrm            | Bin 61060 -> 61060 bytes
 icu4c/source/tools/gennorm2/extradata.cpp | 129 ++++--------
 icu4c/source/tools/gennorm2/extradata.h   |   2 -
 icu4c/source/tools/gennorm2/n2builder.cpp | 237 ++++++++++++++--------
 icu4c/source/tools/gennorm2/n2builder.h   |   6 +-
 icu4c/source/tools/gennorm2/norms.cpp     |  25 +--
 icu4c/source/tools/gennorm2/norms.h       |  65 ++++--
 8 files changed, 248 insertions(+), 216 deletions(-)

diff --git a/icu4c/source/data/in/nfkc_cf.nrm b/icu4c/source/data/in/nfkc_cf.nrm
index d87e9e987c0802ae1781c2be2c396a0268a05237..29abf1330f44d7c266b8b82996e75e1b5a061d88 100644
GIT binary patch
delta 15
XcmbO+m3hWg<_$AWGCr6*^W=X3I%Wt*

delta 15
XcmbO+m3hWg<_$AWGA@`r^W=X3IVK1P

diff --git a/icu4c/source/data/in/uts46.nrm b/icu4c/source/data/in/uts46.nrm
index 11f1c299116c5347808971443ae080469a199a32..e3f0dca90c93d45e8a78b84ceccf0858a6b3f7aa 100644
GIT binary patch
delta 15
XcmZp<%iMC8c|*%v#s`yI-~I;xKe`Cd

delta 15
XcmZp<%iMC8c|*%v#s!mG-~I;xK6(g`

diff --git a/icu4c/source/tools/gennorm2/extradata.cpp b/icu4c/source/tools/gennorm2/extradata.cpp
index 3a37554dd0e..2c49c7d0417 100644
--- a/icu4c/source/tools/gennorm2/extradata.cpp
+++ b/icu4c/source/tools/gennorm2/extradata.cpp
@@ -25,14 +25,7 @@ ExtraData::ExtraData(Norms &n, UBool fast) :
         Norms::Enumerator(n),
         yesYesCompositions(1000, (UChar32)0xffff, 2),  // 0=inert, 1=Jamo L, 2=start of compositions
         yesNoMappingsAndCompositions(1000, (UChar32)0, 1),  // 0=Hangul, 1=start of normal data
-        optimizeFast(fast) {
-    memset(smallFCD, 0, sizeof(smallFCD));
-}
-
-void ExtraData::setSmallFCD(UChar32 c) {
-    UChar32 lead= c<=0xffff ? c : U16_LEAD(c);
-    smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7);
-}
+        optimizeFast(fast) {}
 
 int32_t ExtraData::writeMapping(UChar32 c, const Norm &norm, UnicodeString &dataString) {
     UnicodeString &m=*norm.mapping;
@@ -44,26 +37,8 @@ int32_t ExtraData::writeMapping(UChar32 c, const Norm &norm, UnicodeString &data
                 (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
         exit(U_INVALID_FORMAT_ERROR);
     }
-    int32_t leadCC, trailCC;
-    if(length==0) {
-        leadCC=trailCC=0;
-    } else {
-        leadCC=norms.getCC(m.char32At(0));
-        trailCC=norms.getCC(m.char32At(length-1));
-    }
-    if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && (norm.cc!=0 || leadCC!=0)) {
-        fprintf(stderr,
-                "gennorm2 error: "
-                "U+%04lX below U+0300 has ccc!=0 or lccc!=0, not supported by ICU\n",
-                (long)c);
-        exit(U_INVALID_FORMAT_ERROR);
-    }
-    // Write small-FCD data.
-    if((leadCC|trailCC)!=0) {
-        setSmallFCD(c);
-    }
     // Write the mapping & raw mapping extraData.
-    int32_t firstUnit=length|(trailCC<<8);
+    int32_t firstUnit=length|(norm.trailCC<<8);
     int32_t preMappingLength=0;
     if(norm.rawMapping!=NULL) {
         UnicodeString &rm=*norm.rawMapping;
@@ -98,7 +73,7 @@ int32_t ExtraData::writeMapping(UChar32 c, const Norm &norm, UnicodeString &data
         }
         firstUnit|=Normalizer2Impl::MAPPING_HAS_RAW_MAPPING;
     }
-    int32_t cccLccc=norm.cc|(leadCC<<8);
+    int32_t cccLccc=norm.cc|(norm.leadCC<<8);
     if(cccLccc!=0) {
         dataString.append((UChar)cccLccc);
         ++preMappingLength;
@@ -187,65 +162,31 @@ void ExtraData::rangeHandler(UChar32 start, UChar32 end, Norm &norm) {
                 (long)start, (long)end);
         exit(U_INTERNAL_PROGRAM_ERROR);
     }
+    if(norm.error!=nullptr) {
+        fprintf(stderr, "gennorm2 error: U+%04lX %s\n", (long)start, norm.error);
+        exit(U_INVALID_FORMAT_ERROR);
+    }
     writeExtraData(start, norm);
 }
 
 void ExtraData::writeExtraData(UChar32 c, Norm &norm) {
-    if(!norm.hasMapping()) {
-        // Write small-FCD data.
-        // There is similar code in writeMapping() for characters that do have a mapping.
-        if(norm.cc!=0) {
-            if(c<Normalizer2Impl::MIN_CCC_LCCC_CP) {
-                fprintf(stderr,
-                        "gennorm2 error: "
-                        "U+%04lX below U+0300 has ccc!=0, not supported by ICU\n",
-                        (long)c);
-                exit(U_INVALID_FORMAT_ERROR);
-            }
-            setSmallFCD(c);
-        }
-    }
-    if(norm.combinesBack) {
-        if(norm.hasMapping()) {
-            fprintf(stderr,
-                    "gennorm2 error: "
-                    "U+%04lX combines-back and decomposes, not possible in Unicode normalization\n",
-                    (long)c);
-            exit(U_INVALID_FORMAT_ERROR);
-        }
-        if(norm.compositions!=NULL) {
-            norm.offset=
-                (maybeYesCompositions.length()<<Norm::OFFSET_SHIFT)|
-                Norm::OFFSET_MAYBE_YES;
-            writeCompositions(c, norm, maybeYesCompositions);
-        }
-    } else if(!norm.hasMapping()) {
-        if(norm.compositions!=NULL) {
-            norm.offset=
-                (yesYesCompositions.length()<<Norm::OFFSET_SHIFT)|
-                Norm::OFFSET_YES_YES;
-            writeCompositions(c, norm, yesYesCompositions);
-        }
-    } else if(norm.mappingType==Norm::ROUND_TRIP) {
-        if(norm.compositions!=NULL) {
-            int32_t offset=yesNoMappingsAndCompositions.length()+
-                           writeMapping(c, norm, yesNoMappingsAndCompositions);
-            norm.offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION;
-            writeCompositions(c, norm, yesNoMappingsAndCompositions);
-        } else {
-            int32_t offset=yesNoMappingsOnly.length()+
-                           writeMapping(c, norm, yesNoMappingsOnly);
-            norm.offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_ONLY;
-        }
-    } else /* one-way */ {
-        if(norm.compositions!=NULL) {
-            fprintf(stderr,
-                    "gennorm2 error: "
-                    "U+%04lX combines-forward and has a one-way mapping, "
-                    "not possible in Unicode normalization\n",
-                    (long)c);
-            exit(U_INVALID_FORMAT_ERROR);
-        }
+    switch(norm.type) {
+    case Norm::INERT:
+        break;  // no extra data
+    case Norm::YES_YES_COMBINES_FWD:
+        norm.offset=yesYesCompositions.length();
+        writeCompositions(c, norm, yesYesCompositions);
+        break;
+    case Norm::YES_NO_COMBINES_FWD:
+        norm.offset=yesNoMappingsAndCompositions.length()+
+                writeMapping(c, norm, yesNoMappingsAndCompositions);
+        writeCompositions(c, norm, yesNoMappingsAndCompositions);
+        break;
+    case Norm::YES_NO_MAPPING_ONLY:
+        norm.offset=yesNoMappingsOnly.length()+
+                writeMapping(c, norm, yesNoMappingsOnly);
+        break;
+    case Norm::NO_NO:
         if(norm.cc==0 && !optimizeFast) {
             // Try a compact, algorithmic encoding.
             // Only for ccc=0, because we can't store additional information
@@ -260,15 +201,25 @@ void ExtraData::writeExtraData(UChar32 c, Norm &norm) {
                     (!norm.hasNoCompBoundaryAfter || 1!=norm.mapping->countChar32())) {
                 int32_t delta=norm.mappingCP-c;
                 if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) {
-                    norm.offset=(delta<<Norm::OFFSET_SHIFT)|Norm::OFFSET_DELTA;
+                    norm.type=Norm::NO_NO_DELTA;
+                    norm.offset=delta;
+                    break;
                 }
             }
         }
-        if(norm.offset==0) {
-            // TODO: minMappingNotCompYes, minMappingNoCompBoundaryBefore
-            int32_t offset=writeNoNoMapping(c, norm, noNoMappings, previousNoNoMappings);
-            norm.offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_NO_NO;
-        }
+        // TODO: minMappingNotCompYes, minMappingNoCompBoundaryBefore
+        norm.offset=writeNoNoMapping(c, norm, noNoMappings, previousNoNoMappings);
+        break;
+    case Norm::MAYBE_YES_COMBINES_FWD:
+        norm.offset=maybeYesCompositions.length();
+        writeCompositions(c, norm, maybeYesCompositions);
+        break;
+    case Norm::MAYBE_YES_SIMPLE:
+        break;  // no extra data
+    case Norm::YES_YES_WITH_CC:
+        break;  // no extra data
+    default:  // Should not occur.
+        exit(U_INTERNAL_PROGRAM_ERROR);
     }
 }
 
diff --git a/icu4c/source/tools/gennorm2/extradata.h b/icu4c/source/tools/gennorm2/extradata.h
index f652ed51ec4..cec9d9a94fb 100644
--- a/icu4c/source/tools/gennorm2/extradata.h
+++ b/icu4c/source/tools/gennorm2/extradata.h
@@ -37,10 +37,8 @@ public:
     UnicodeString yesNoMappingsAndCompositions;
     UnicodeString yesNoMappingsOnly;
     UnicodeString noNoMappings;
-    uint8_t smallFCD[0x100];
 
 private:
-    void setSmallFCD(UChar32 c);
     /**
      * Requires norm.hasMapping().
      * Returns the offset of the "first unit" from the beginning of the extraData for c.
diff --git a/icu4c/source/tools/gennorm2/n2builder.cpp b/icu4c/source/tools/gennorm2/n2builder.cpp
index 8bc5812d129..587f0dc9f46 100644
--- a/icu4c/source/tools/gennorm2/n2builder.cpp
+++ b/icu4c/source/tools/gennorm2/n2builder.cpp
@@ -206,7 +206,7 @@ void Normalizer2DataBuilder::removeMapping(UChar32 c) {
     }
 }
 
-UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(BuilderReorderingBuffer &buffer) {
+UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(const BuilderReorderingBuffer &buffer) {
     if(buffer.isEmpty()) {
         return TRUE;  // maps-to-empty-string is no boundary of any kind
     }
@@ -215,7 +215,7 @@ UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(BuilderReorderingBuffer &bu
         return TRUE;  // no starter
     }
     UChar32 starter=buffer.charAt(lastStarterIndex);
-    if( Hangul::isJamoL(starter) ||
+    if(Hangul::isJamoL(starter) ||
             (Hangul::isJamoV(starter) &&
             0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1)))) {
         // A Jamo leading consonant or an LV pair combines-forward if it is at the end,
@@ -229,26 +229,93 @@ UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(BuilderReorderingBuffer &bu
     }
     // Compose as far as possible, and see if further compositions are possible.
     uint8_t prevCC=0;
-    for(int32_t combMarkIndex=lastStarterIndex+1; combMarkIndex<buffer.length();) {
+    for(int32_t combMarkIndex=lastStarterIndex+1; combMarkIndex<buffer.length(); ++combMarkIndex) {
         uint8_t cc=buffer.ccAt(combMarkIndex);  // !=0 because after last starter
         if(norms.combinesWithCCBetween(*starterNorm, prevCC, cc)) {
             return TRUE;
         }
         if(prevCC<cc && (starter=starterNorm->combine(buffer.charAt(combMarkIndex)))>=0) {
-            buffer.setComposite(starter, combMarkIndex);
             starterNorm=&norms.getNormRef(starter);
             if(starterNorm->compositions==NULL) {
                 return FALSE;  // the composite does not combine further
             }
-            // The combining mark at combMarkIndex has been removed.
-            // Do not increment combMarkIndex now.
+            // Keep prevCC because we "removed" the combining mark.
         } else {
             prevCC=cc;
-            ++combMarkIndex;
         }
     }
     // TRUE if the final, forward-combining starter is at the end.
     return prevCC==0;
+    // TODO?! prevCC==0 || norms.combinesWithCCBetween(*starterNorm, prevCC, 0x100 /* needs int32_t */)
+    // TODO?! Actually, this should check whether the starter combines with any cc not seen here.
+}
+
+void Normalizer2DataBuilder::postProcess(Norm &norm) {
+    // Prerequisites: Compositions are built, mappings are recursively decomposed.
+    // Mappings are not yet in canonical order.
+    //
+    // This function works on a Norm struct. We do not know which code point(s) map(s) to it.
+    // Therefore, we cannot compute algorithmic mapping deltas here.
+    // Error conditions are checked, but printed later when we do know the offending code point.
+    if(norm.hasMapping()) {
+        // Ensure canonical order.
+        BuilderReorderingBuffer buffer;
+        if(norm.rawMapping!=nullptr) {
+            norms.reorder(*norm.rawMapping, buffer);
+            buffer.reset();
+        }
+        norms.reorder(*norm.mapping, buffer);
+        if(buffer.isEmpty()) {
+            norm.leadCC=norm.trailCC=0;
+        } else {
+            norm.leadCC=buffer.ccAt(0);
+            norm.trailCC=buffer.ccAt(buffer.length()-1);
+        }
+
+        // Set the hasNoCompBoundaryAfter flag for use by the last code branch
+        // in Normalizer2Impl::hasCompBoundaryAfter().
+        // For details see the comments on hasNoCompBoundaryAfter(buffer).
+        if(norm.compositions!=nullptr) {
+            norm.hasNoCompBoundaryAfter=TRUE;
+        } else {
+            norm.hasNoCompBoundaryAfter=hasNoCompBoundaryAfter(buffer);
+        }
+
+        if(norm.combinesBack) {
+            norm.error="combines-back and decomposes, not possible in Unicode normalization";
+        } else if(norm.mappingType==Norm::ROUND_TRIP) {
+            if(norm.compositions!=NULL) {
+                norm.type=Norm::YES_NO_COMBINES_FWD;
+            } else {
+                norm.type=Norm::YES_NO_MAPPING_ONLY;
+            }
+        } else {  // one-way mapping
+            if(norm.compositions!=NULL) {
+                norm.error="combines-forward and has a one-way mapping, "
+                           "not possible in Unicode normalization";
+            } else {
+                norm.type=Norm::NO_NO;
+            }
+        }
+    } else {  // no mapping
+        norm.leadCC=norm.trailCC=norm.cc;
+
+        if(norm.combinesBack) {
+            if(norm.compositions!=nullptr) {
+                // Earlier code checked ccc=0.
+                norm.type=Norm::MAYBE_YES_COMBINES_FWD;
+            } else {
+                norm.type=Norm::MAYBE_YES_SIMPLE;  // any ccc
+            }
+        } else if(norm.compositions!=nullptr) {
+            // Earlier code checked ccc=0.
+            norm.type=Norm::YES_YES_COMBINES_FWD;
+        } else if(norm.cc!=0) {
+            norm.type=Norm::YES_YES_WITH_CC;
+        } else {
+            norm.type=Norm::INERT;
+        }
+    }
 }
 
 class Norm16Writer : public Norms::Enumerator {
@@ -260,55 +327,70 @@ public:
     Normalizer2DataBuilder &builder;
 };
 
+void Normalizer2DataBuilder::setSmallFCD(UChar32 c) {
+    UChar32 lead= c<=0xffff ? c : U16_LEAD(c);
+    smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7);
+}
+
 void Normalizer2DataBuilder::writeNorm16(UChar32 start, UChar32 end, Norm &norm) {
-    int32_t offset=norm.offset>>Norm::OFFSET_SHIFT;
-    int32_t norm16=0;
-    UBool isDecompNo=FALSE;  // TRUE if need to ensure start>=minDecompNoCP
-    UBool isCompNoMaybe=FALSE;  // TRUE if need to ensure start>=minCompNoMaybeCP
-    switch(norm.offset&Norm::OFFSET_MASK) {
-    case Norm::OFFSET_NONE:
-        // No mapping, no compositions list.
-        if(norm.combinesBack) {
-            norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+norm.cc;
-            isDecompNo=(UBool)(norm.cc!=0);
-            isCompNoMaybe=TRUE;
-        } else if(norm.cc!=0) {
-            norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-1+norm.cc;
-            isDecompNo=isCompNoMaybe=TRUE;
+    if(start<Normalizer2Impl::MIN_CCC_LCCC_CP && (norm.cc!=0 || norm.leadCC!=0)) {
+        fprintf(stderr,
+                "gennorm2 error: "
+                "U+%04lX below U+0300 has ccc!=0 or lccc!=0, not supported by ICU\n",
+                (long)start);
+        exit(U_INVALID_FORMAT_ERROR);
+    }
+    if((norm.leadCC|norm.trailCC)!=0) {
+        for(UChar32 c=start; c<=end; ++c) {
+            setSmallFCD(c);
         }
+    }
+
+    int32_t norm16;
+    switch(norm.type) {
+    case Norm::INERT:
+        norm16=0;
         break;
-    case Norm::OFFSET_MAYBE_YES:
-        norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+offset;
-        isCompNoMaybe=TRUE;
+    case Norm::YES_YES_COMBINES_FWD:
+        norm16=norm.offset;
         break;
-    case Norm::OFFSET_YES_YES:
-        norm16=offset;
+    case Norm::YES_NO_COMBINES_FWD:
+        norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+norm.offset;
         break;
-    case Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION:
-        norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+offset;
-        isDecompNo=TRUE;
+    case Norm::YES_NO_MAPPING_ONLY:
+        norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+norm.offset;
         break;
-    case Norm::OFFSET_YES_NO_MAPPING_ONLY:
-        norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+offset;
-        isDecompNo=TRUE;
+        // TODO: minMappingNotCompYes, minMappingNoCompBoundaryBefore
+    case Norm::NO_NO:
+        norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+norm.offset;
         break;
-    case Norm::OFFSET_NO_NO:
-        norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+offset;
-        isDecompNo=isCompNoMaybe=TRUE;
+    case Norm::NO_NO_DELTA:
+        norm16=getCenterNoNoDelta()+norm.offset;
         break;
-        // TODO: minMappingNotCompYes, minMappingNoCompBoundaryBefore
-    case Norm::OFFSET_DELTA:
-        norm16=getCenterNoNoDelta()+offset;
-        isDecompNo=isCompNoMaybe=TRUE;
+    case Norm::MAYBE_YES_COMBINES_FWD:
+        norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+norm.offset;
+        break;
+    case Norm::MAYBE_YES_SIMPLE:
+        norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+norm.cc;  // ccc=0..255
+        break;
+    case Norm::YES_YES_WITH_CC:
+        U_ASSERT(norm.cc!=0);
+        norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-1+norm.cc;  // ccc=1..255
         break;
     default:  // Should not occur.
         exit(U_INTERNAL_PROGRAM_ERROR);
     }
     IcuToolErrorCode errorCode("gennorm2/writeNorm16()");
     utrie2_setRange32(norm16Trie, start, end, (uint32_t)norm16, TRUE, errorCode);
+
+    // Set the minimum code points for real data lookups in the quick check loops.
+    UBool isDecompNo=
+            (Norm::YES_NO_COMBINES_FWD<=norm.type && norm.type<=Norm::NO_NO_DELTA) ||
+            norm.cc!=0;
     if(isDecompNo && start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
         indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=start;
     }
+    UBool isCompNoMaybe= norm.type>=Norm::NO_NO;
     if(isCompNoMaybe && start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {
         indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=start;
     }
@@ -367,66 +449,49 @@ void Normalizer2DataBuilder::processData() {
     norm16Trie=utrie2_open(0, 0, errorCode);
     errorCode.assertSuccess();
 
+    // Build composition lists before recursive decomposition,
+    // so that we still have the raw, pair-wise mappings.
     CompositionBuilder compBuilder(norms);
     norms.enumRanges(compBuilder);
 
+    // Recursively decompose all mappings.
     Decomposer decomposer(norms);
     do {
         decomposer.didDecompose=FALSE;
         norms.enumRanges(decomposer);
     } while(decomposer.didDecompose);
 
-    BuilderReorderingBuffer buffer;
+    // Set the Norm::Type and other properties.
     int32_t normsLength=norms.length();
     for(int32_t i=1; i<normsLength; ++i) {
-        // Set the hasNoCompBoundaryAfter flag for use by the last code branch
-        // in Normalizer2Impl::hasCompBoundaryAfter().
-        // For details see the comments on hasNoCompBoundaryAfter(buffer).
-        Norm &norm=norms.getNormRefByIndex(i);
-        if(norm.hasMapping()) {
-            if(norm.compositions!=NULL) {
-                norm.hasNoCompBoundaryAfter=TRUE;
-            } else {
-                buffer.reset();
-                norms.reorder(norm, buffer);
-                norm.hasNoCompBoundaryAfter=hasNoCompBoundaryAfter(buffer);
-            }
-        }
+        postProcess(norms.getNormRefByIndex(i));
     }
 
-    indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000;
-    indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000;
-
+    // Write the properties, mappings and composition lists to
+    // appropriate parts of the "extra data" array.
     ExtraData extra(norms, optimization==OPTIMIZE_FAST);
     norms.enumRanges(extra);
 
-    extraData=extra.maybeYesCompositions;
-    extraData.append(extra.yesYesCompositions).
-              append(extra.yesNoMappingsAndCompositions).
-              append(extra.yesNoMappingsOnly).
-              append(extra.noNoMappings);
-    // Pad to even length for 4-byte alignment of following data.
-    if(extraData.length()&1) {
-        extraData.append((UChar)0);
-    }
-    memcpy(smallFCD, extra.smallFCD, sizeof(smallFCD));
-
-    indexes[Normalizer2Impl::IX_MIN_YES_NO]=
-        extra.yesYesCompositions.length();
-    indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]=
-        indexes[Normalizer2Impl::IX_MIN_YES_NO]+
-        extra.yesNoMappingsAndCompositions.length();
-    indexes[Normalizer2Impl::IX_MIN_NO_NO]=
-        indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+
-        extra.yesNoMappingsOnly.length();
+    extraData=extra.yesYesCompositions;
+    indexes[Normalizer2Impl::IX_MIN_YES_NO]=extraData.length();
+    extraData.append(extra.yesNoMappingsAndCompositions);
+    indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]=extraData.length();
+    extraData.append(extra.yesNoMappingsOnly);
     // TODO: minMappingNotCompYes, minMappingNoCompBoundaryBefore
-    indexes[Normalizer2Impl::IX_LIMIT_NO_NO]=
-        indexes[Normalizer2Impl::IX_MIN_NO_NO]+
-        extra.noNoMappings.length();
+    indexes[Normalizer2Impl::IX_MIN_NO_NO]=extraData.length();
+    extraData.append(extra.noNoMappings);
+    indexes[Normalizer2Impl::IX_LIMIT_NO_NO]=extraData.length();
+
+    extraData.insert(0, extra.maybeYesCompositions);
     indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]=
         Normalizer2Impl::MIN_NORMAL_MAYBE_YES-
         extra.maybeYesCompositions.length();
 
+    // Pad to even length for 4-byte alignment of following data.
+    if(extraData.length()&1) {
+        extraData.append((UChar)0);
+    }
+
     int32_t minNoNoDelta=getCenterNoNoDelta()-Normalizer2Impl::MAX_DELTA;
     if(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]>minNoNoDelta) {
         fprintf(stderr,
@@ -435,6 +500,13 @@ void Normalizer2DataBuilder::processData() {
         exit(U_BUFFER_OVERFLOW_ERROR);
     }
 
+    // writeNorm16() and setHangulData() reduce these as needed.
+    indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000;
+    indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000;
+
+    // Map each code point to its norm16 value,
+    // including the properties that fit directly,
+    // and the offset to the "extra data" if necessary.
     Norm16Writer norm16Writer(norms, *this);
     norms.enumRanges(norm16Writer);
 
@@ -442,7 +514,7 @@ void Normalizer2DataBuilder::processData() {
 
     // Look for the "worst" norm16 value of any supplementary code point
     // corresponding to a lead surrogate, and set it as that surrogate's value.
-    // Enables quick check inner loops to look at only code units.
+    // Enables UTF-16 quick check inner loops to look at only code units.
     //
     // We could be more sophisticated:
     // We could collect a bit set for whether there are values in the different
@@ -605,13 +677,6 @@ Normalizer2DataBuilder::writeCSourceFile(const char *filename) {
         line,
         smallFCD, 8, sizeof(smallFCD),
         "\n};\n\n");
-    /*fputs(  // TODO
-        "static const UCaseProps %s_singleton={\n"
-        "  NULL,\n"
-        "  %s_indexes,\n"
-        "  %s_extraData,\n"
-        "  %s_smallFCD,\n",
-        f);*/
     sprintf(line, "static const UTrie2 %s_trie={\n", dataName.data());
     char line2[100];
     sprintf(line2, "%s_trieIndex", dataName.data());
diff --git a/icu4c/source/tools/gennorm2/n2builder.h b/icu4c/source/tools/gennorm2/n2builder.h
index 42ec099556d..130a0bd6e6e 100644
--- a/icu4c/source/tools/gennorm2/n2builder.h
+++ b/icu4c/source/tools/gennorm2/n2builder.h
@@ -85,13 +85,15 @@ private:
      * or its mapping contains no starter,
      * or the last starter combines-forward.
      */
-    UBool hasNoCompBoundaryAfter(BuilderReorderingBuffer &buffer);
-    void setHangulData();
+    UBool hasNoCompBoundaryAfter(const BuilderReorderingBuffer &buffer);
+    void postProcess(Norm &norm);
 
+    void setSmallFCD(UChar32 c);
     int32_t getCenterNoNoDelta() {
         return indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]-Normalizer2Impl::MAX_DELTA-1;
     }
     void writeNorm16(UChar32 start, UChar32 end, Norm &norm);
+    void setHangulData();
     void processData();
 
     Norms norms;
diff --git a/icu4c/source/tools/gennorm2/norms.cpp b/icu4c/source/tools/gennorm2/norms.cpp
index e17c35fba8e..7ea93778d89 100644
--- a/icu4c/source/tools/gennorm2/norms.cpp
+++ b/icu4c/source/tools/gennorm2/norms.cpp
@@ -43,23 +43,13 @@ void BuilderReorderingBuffer::append(UChar32 c, uint8_t cc) {
     fDidReorder=TRUE;
 }
 
-void BuilderReorderingBuffer::toString(UnicodeString &dest) {
+void BuilderReorderingBuffer::toString(UnicodeString &dest) const {
     dest.remove();
     for(int32_t i=0; i<fLength; ++i) {
         dest.append(charAt(i));
     }
 }
 
-void BuilderReorderingBuffer::setComposite(UChar32 composite, int32_t combMarkIndex) {
-    fArray[fLastStarterIndex]=composite<<8;
-    // Remove the combining mark that contributed to the composite.
-    --fLength;
-    while(combMarkIndex<fLength) {
-        fArray[combMarkIndex]=fArray[combMarkIndex+1];
-        ++combMarkIndex;
-    }
-}
-
 UChar32 Norm::combine(UChar32 trail) const {
     int32_t length;
     const CompositionPair *pairs=getCompositionPairs(length);
@@ -77,7 +67,9 @@ UChar32 Norm::combine(UChar32 trail) const {
 Norms::Norms(UErrorCode &errorCode) {
     normTrie=utrie2_open(0, 0, &errorCode);
     normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm));
-    norms=allocNorm();  // unused Norm struct at index 0
+    // Default "inert" Norm struct at index 0. Practically immutable.
+    norms=allocNorm();
+    norms->type=Norm::INERT;
 }
 
 Norms::~Norms() {
@@ -122,13 +114,12 @@ Norm *Norms::createNorm(UChar32 c) {
     }
 }
 
-void Norms::reorder(Norm &norm, BuilderReorderingBuffer &buffer) const {
-    UnicodeString &m=*norm.mapping;
-    int32_t length=m.length();
+void Norms::reorder(UnicodeString &mapping, BuilderReorderingBuffer &buffer) const {
+    int32_t length=mapping.length();
     if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) {
         return;  // writeMapping() will complain about it and print the code point.
     }
-    const UChar *s=toUCharPtr(m.getBuffer());
+    const char16_t *s=mapping.getBuffer();
     int32_t i=0;
     UChar32 c;
     while(i<length) {
@@ -136,7 +127,7 @@ void Norms::reorder(Norm &norm, BuilderReorderingBuffer &buffer) const {
         buffer.append(c, getCC(c));
     }
     if(buffer.didReorder()) {
-        buffer.toString(m);
+        buffer.toString(mapping);
     }
 }
 
diff --git a/icu4c/source/tools/gennorm2/norms.h b/icu4c/source/tools/gennorm2/norms.h
index 01aaa2219ee..a7ee4974b0d 100644
--- a/icu4c/source/tools/gennorm2/norms.h
+++ b/icu4c/source/tools/gennorm2/norms.h
@@ -40,8 +40,7 @@ public:
     UBool didReorder() const { return fDidReorder; }
 
     void append(UChar32 c, uint8_t cc);
-    void toString(UnicodeString &dest);
-    void setComposite(UChar32 composite, int32_t combMarkIndex);
+    void toString(UnicodeString &dest) const;
 
 private:
     int32_t fArray[Normalizer2Impl::MAPPING_LENGTH_MASK];
@@ -88,28 +87,54 @@ struct Norm {
     MappingType mappingType;
 
     UVector32 *compositions;  // (trail, composite) pairs
-    uint8_t cc;
+    uint8_t cc, leadCC, trailCC;
     UBool combinesBack;
     UBool hasNoCompBoundaryAfter;
 
-    enum OffsetType {
-        OFFSET_NONE,
-        // Composition for back-combining character. Allowed, but not normally used.
-        OFFSET_MAYBE_YES,
-        // Composition for a starter that does not have a decomposition mapping.
-        OFFSET_YES_YES,
-        // Round-trip mapping & composition for a starter.
-        OFFSET_YES_NO_MAPPING_AND_COMPOSITION,
-        // Round-trip mapping for a starter that itself does not combine-forward.
-        OFFSET_YES_NO_MAPPING_ONLY,
+    /**
+     * Overall type of normalization properties.
+     * Set after most processing is done.
+     *
+     * Corresponds to the rows in the chart on
+     * http://site.icu-project.org/design/normalization/custom
+     * in numerical (but reverse visual) order.
+     *
+     * YES_NO means composition quick check=yes, decomposition QC=no -- etc.
+     */
+    enum Type {
+        /** Initial value until most processing is done. */
+        UNKNOWN,
+        /** No mapping, does not combine, ccc=0. */
+        INERT,
+        /** Starter, no mapping, has compositions. */
+        YES_YES_COMBINES_FWD,
+        /** Starter with a round-trip mapping and compositions. */
+        YES_NO_COMBINES_FWD,
+        /** Starter with a round-trip mapping but no compositions. */
+        YES_NO_MAPPING_ONLY,
         // TODO: minMappingNotCompYes, minMappingNoCompBoundaryBefore
-        // One-way mapping.
-        OFFSET_NO_NO,
-        // Delta for an algorithmic one-way mapping.
-        OFFSET_DELTA
-    };
-    enum { OFFSET_SHIFT=4, OFFSET_MASK=(1<<OFFSET_SHIFT)-1 };
+        /** Has a one-way mapping. */
+        NO_NO,
+        /** Has an algorithmic one-way mapping to a single code point. */
+        NO_NO_DELTA,
+        /**
+         * Combines both backward and forward, has compositions.
+         * Allowed, but not normally used.
+         */
+        MAYBE_YES_COMBINES_FWD,
+        /** Combines only backward. */
+        MAYBE_YES_SIMPLE,
+        /** Non-zero ccc but does not combine backward. */
+        YES_YES_WITH_CC
+    } type;
+    /** Offset into the type's part of the extra data, or the algorithmic-mapping delta. */
     int32_t offset;
+
+    /**
+     * Error string set by processing functions that do not have access
+     * to the code point, deferred for readable reporting.
+     */
+    const char *error;
 };
 
 class Norms {
@@ -130,7 +155,7 @@ public:
     const Norm &getNormRef(UChar32 c) const;
     uint8_t getCC(UChar32 c) const { return getNormRef(c).cc; }
 
-    void reorder(Norm &norm, BuilderReorderingBuffer &buffer) const;
+    void reorder(UnicodeString &mapping, BuilderReorderingBuffer &buffer) const;
     UBool combinesWithCCBetween(const Norm &norm, uint8_t lowCC, uint8_t highCC) const;
 
     class Enumerator {
-- 
2.40.0