From 19d53e7641c166795bf326aaa1ff40ae60670da4 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Wed, 7 Jun 2017 05:23:53 +0000 Subject: [PATCH] ICU-13197 more gennorm2 cleanup, set smallFCD for some characters with algorithmic mappings X-SVN-Rev: 40155 --- icu4c/source/data/in/nfkc_cf.nrm | Bin 51864 -> 51864 bytes icu4c/source/data/in/uts46.nrm | Bin 61060 -> 61060 bytes icu4c/source/tools/gennorm2/extradata.cpp | 129 ++++-------- icu4c/source/tools/gennorm2/extradata.h | 2 - icu4c/source/tools/gennorm2/n2builder.cpp | 237 ++++++++++++++-------- icu4c/source/tools/gennorm2/n2builder.h | 6 +- icu4c/source/tools/gennorm2/norms.cpp | 25 +-- icu4c/source/tools/gennorm2/norms.h | 65 ++++-- 8 files changed, 248 insertions(+), 216 deletions(-) diff --git a/icu4c/source/data/in/nfkc_cf.nrm b/icu4c/source/data/in/nfkc_cf.nrm index d87e9e987c0802ae1781c2be2c396a0268a05237..29abf1330f44d7c266b8b82996e75e1b5a061d88 100644 GIT binary patch delta 15 XcmbO+m3hWg<_$AWGCr6*^W=X3I%Wt* delta 15 XcmbO+m3hWg<_$AWGA@`r^W=X3IVK1P diff --git a/icu4c/source/data/in/uts46.nrm b/icu4c/source/data/in/uts46.nrm index 11f1c299116c5347808971443ae080469a199a32..e3f0dca90c93d45e8a78b84ceccf0858a6b3f7aa 100644 GIT binary patch delta 15 XcmZp<%iMC8c|*%v#s`yI-~I;xKe`Cd delta 15 XcmZp<%iMC8c|*%v#s!mG-~I;xK6(g` diff --git a/icu4c/source/tools/gennorm2/extradata.cpp b/icu4c/source/tools/gennorm2/extradata.cpp index 3a37554dd0e..2c49c7d0417 100644 --- a/icu4c/source/tools/gennorm2/extradata.cpp +++ b/icu4c/source/tools/gennorm2/extradata.cpp @@ -25,14 +25,7 @@ ExtraData::ExtraData(Norms &n, UBool fast) : Norms::Enumerator(n), yesYesCompositions(1000, (UChar32)0xffff, 2), // 0=inert, 1=Jamo L, 2=start of compositions yesNoMappingsAndCompositions(1000, (UChar32)0, 1), // 0=Hangul, 1=start of normal data - optimizeFast(fast) { - memset(smallFCD, 0, sizeof(smallFCD)); -} - -void ExtraData::setSmallFCD(UChar32 c) { - UChar32 lead= c<=0xffff ? c : U16_LEAD(c); - smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7); -} + optimizeFast(fast) {} int32_t ExtraData::writeMapping(UChar32 c, const Norm &norm, UnicodeString &dataString) { UnicodeString &m=*norm.mapping; @@ -44,26 +37,8 @@ int32_t ExtraData::writeMapping(UChar32 c, const Norm &norm, UnicodeString &data (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK); exit(U_INVALID_FORMAT_ERROR); } - int32_t leadCC, trailCC; - if(length==0) { - leadCC=trailCC=0; - } else { - leadCC=norms.getCC(m.char32At(0)); - trailCC=norms.getCC(m.char32At(length-1)); - } - if(ccountChar32())) { int32_t delta=norm.mappingCP-c; if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) { - norm.offset=(delta<combine(buffer.charAt(combMarkIndex)))>=0) { - buffer.setComposite(starter, combMarkIndex); starterNorm=&norms.getNormRef(starter); if(starterNorm->compositions==NULL) { return FALSE; // the composite does not combine further } - // The combining mark at combMarkIndex has been removed. - // Do not increment combMarkIndex now. + // Keep prevCC because we "removed" the combining mark. } else { prevCC=cc; - ++combMarkIndex; } } // TRUE if the final, forward-combining starter is at the end. return prevCC==0; + // TODO?! prevCC==0 || norms.combinesWithCCBetween(*starterNorm, prevCC, int32_t! 0x100) + // TODO?! actually, should check if it combines with any cc not seen here +} + +void Normalizer2DataBuilder::postProcess(Norm &norm) { + // Prerequisites: Compositions are built, mappings are recursively decomposed. + // Mappings are not yet in canonical order. + // + // This function works on a Norm struct. We do not know which code point(s) map(s) to it. + // Therefore, we cannot compute algorithmic mapping deltas here. + // Error conditions are checked, but printed later when we do know the offending code point. + if(norm.hasMapping()) { + // Ensure canonical order. + BuilderReorderingBuffer buffer; + if(norm.rawMapping!=nullptr) { + norms.reorder(*norm.rawMapping, buffer); + buffer.reset(); + } + norms.reorder(*norm.mapping, buffer); + if(buffer.isEmpty()) { + norm.leadCC=norm.trailCC=0; + } else { + norm.leadCC=buffer.ccAt(0); + norm.trailCC=buffer.ccAt(buffer.length()-1); + } + + // Set the hasNoCompBoundaryAfter flag for use by the last code branch + // in Normalizer2Impl::hasCompBoundaryAfter(). + // For details see the comments on hasNoCompBoundaryAfter(buffer). + if(norm.compositions!=nullptr) { + norm.hasNoCompBoundaryAfter=TRUE; + } else { + norm.hasNoCompBoundaryAfter=hasNoCompBoundaryAfter(buffer); + } + + if(norm.combinesBack) { + norm.error="combines-back and decomposes, not possible in Unicode normalization"; + } else if(norm.mappingType==Norm::ROUND_TRIP) { + if(norm.compositions!=NULL) { + norm.type=Norm::YES_NO_COMBINES_FWD; + } else { + norm.type=Norm::YES_NO_MAPPING_ONLY; + } + } else { // one-way mapping + if(norm.compositions!=NULL) { + norm.error="combines-forward and has a one-way mapping, " + "not possible in Unicode normalization"; + } else { + norm.type=Norm::NO_NO; + } + } + } else { // no mapping + norm.leadCC=norm.trailCC=norm.cc; + + if(norm.combinesBack) { + if(norm.compositions!=nullptr) { + // Earlier code checked ccc=0. + norm.type=Norm::MAYBE_YES_COMBINES_FWD; + } else { + norm.type=Norm::MAYBE_YES_SIMPLE; // any ccc + } + } else if(norm.compositions!=nullptr) { + // Earlier code checked ccc=0. + norm.type=Norm::YES_YES_COMBINES_FWD; + } else if(norm.cc!=0) { + norm.type=Norm::YES_YES_WITH_CC; + } else { + norm.type=Norm::INERT; + } + } } class Norm16Writer : public Norms::Enumerator { @@ -260,55 +327,70 @@ public: Normalizer2DataBuilder &builder; }; +void Normalizer2DataBuilder::setSmallFCD(UChar32 c) { + UChar32 lead= c<=0xffff ? c : U16_LEAD(c); + smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7); +} + void Normalizer2DataBuilder::writeNorm16(UChar32 start, UChar32 end, Norm &norm) { - int32_t offset=norm.offset>>Norm::OFFSET_SHIFT; - int32_t norm16=0; - UBool isDecompNo=FALSE; // TRUE if need to ensure start>=minDecompNoCP - UBool isCompNoMaybe=FALSE; // TRUE if need to ensure start>=minCompNoMaybeCP - switch(norm.offset&Norm::OFFSET_MASK) { - case Norm::OFFSET_NONE: - // No mapping, no compositions list. - if(norm.combinesBack) { - norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+norm.cc; - isDecompNo=(UBool)(norm.cc!=0); - isCompNoMaybe=TRUE; - } else if(norm.cc!=0) { - norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-1+norm.cc; - isDecompNo=isCompNoMaybe=TRUE; + if(start=Norm::NO_NO; if(isCompNoMaybe && startminNoNoDelta) { fprintf(stderr, @@ -435,6 +500,13 @@ void Normalizer2DataBuilder::processData() { exit(U_BUFFER_OVERFLOW_ERROR); } + // writeNorm16() and setHangulData() reduce these as needed. + indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000; + indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000; + + // Map each code point to its norm16 value, + // including the properties that fit directly, + // and the offset to the "extra data" if necessary. Norm16Writer norm16Writer(norms, *this); norms.enumRanges(norm16Writer); @@ -442,7 +514,7 @@ void Normalizer2DataBuilder::processData() { // Look for the "worst" norm16 value of any supplementary code point // corresponding to a lead surrogate, and set it as that surrogate's value. - // Enables quick check inner loops to look at only code units. + // Enables UTF-16 quick check inner loops to look at only code units. // // We could be more sophisticated: // We could collect a bit set for whether there are values in the different @@ -605,13 +677,6 @@ Normalizer2DataBuilder::writeCSourceFile(const char *filename) { line, smallFCD, 8, sizeof(smallFCD), "\n};\n\n"); - /*fputs( // TODO - "static const UCaseProps %s_singleton={\n" - " NULL,\n" - " %s_indexes,\n" - " %s_extraData,\n" - " %s_smallFCD,\n", - f);*/ sprintf(line, "static const UTrie2 %s_trie={\n", dataName.data()); char line2[100]; sprintf(line2, "%s_trieIndex", dataName.data()); diff --git a/icu4c/source/tools/gennorm2/n2builder.h b/icu4c/source/tools/gennorm2/n2builder.h index 42ec099556d..130a0bd6e6e 100644 --- a/icu4c/source/tools/gennorm2/n2builder.h +++ b/icu4c/source/tools/gennorm2/n2builder.h @@ -85,13 +85,15 @@ private: * or its mapping contains no starter, * or the last starter combines-forward. */ - UBool hasNoCompBoundaryAfter(BuilderReorderingBuffer &buffer); - void setHangulData(); + UBool hasNoCompBoundaryAfter(const BuilderReorderingBuffer &buffer); + void postProcess(Norm &norm); + void setSmallFCD(UChar32 c); int32_t getCenterNoNoDelta() { return indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]-Normalizer2Impl::MAX_DELTA-1; } void writeNorm16(UChar32 start, UChar32 end, Norm &norm); + void setHangulData(); void processData(); Norms norms; diff --git a/icu4c/source/tools/gennorm2/norms.cpp b/icu4c/source/tools/gennorm2/norms.cpp index e17c35fba8e..7ea93778d89 100644 --- a/icu4c/source/tools/gennorm2/norms.cpp +++ b/icu4c/source/tools/gennorm2/norms.cpp @@ -43,23 +43,13 @@ void BuilderReorderingBuffer::append(UChar32 c, uint8_t cc) { fDidReorder=TRUE; } -void BuilderReorderingBuffer::toString(UnicodeString &dest) { +void BuilderReorderingBuffer::toString(UnicodeString &dest) const { dest.remove(); for(int32_t i=0; itype=Norm::INERT; } Norms::~Norms() { @@ -122,13 +114,12 @@ Norm *Norms::createNorm(UChar32 c) { } } -void Norms::reorder(Norm &norm, BuilderReorderingBuffer &buffer) const { - UnicodeString &m=*norm.mapping; - int32_t length=m.length(); +void Norms::reorder(UnicodeString &mapping, BuilderReorderingBuffer &buffer) const { + int32_t length=mapping.length(); if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) { return; // writeMapping() will complain about it and print the code point. } - const UChar *s=toUCharPtr(m.getBuffer()); + const char16_t *s=mapping.getBuffer(); int32_t i=0; UChar32 c; while(i