]> granicus.if.org Git - icu/commitdiff
ICU-13197 Normalizer2::normalizeUTF8(StringPiece->ByteSink/Edits) compose=direct...
author Markus Scherer <markus.icu@gmail.com>
Wed, 31 May 2017 18:15:45 +0000 (18:15 +0000)
committer Markus Scherer <markus.icu@gmail.com>
Wed, 31 May 2017 18:15:45 +0000 (18:15 +0000)
X-SVN-Rev: 40147

icu4c/source/common/norm2allmodes.h
icu4c/source/common/normalizer2.cpp
icu4c/source/common/normalizer2impl.cpp
icu4c/source/common/normalizer2impl.h
icu4c/source/common/unicode/normalizer2.h
icu4c/source/test/intltest/normconf.cpp
icu4c/source/test/intltest/normconf.h

index 9516817e4aa8f3c198ac6ea9df2def2b5b298a17..91c8634a1f6d8d4a2607c2ea37c71a2299fddfd3 100644 (file)
@@ -18,6 +18,7 @@
 
 #if !UCONFIG_NO_NORMALIZATION
 
+#include "unicode/edits.h"
 #include "unicode/normalizer2.h"
 #include "unicode/unistr.h"
 #include "cpputils.h"
@@ -228,6 +229,22 @@ private:
         impl.compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);
     }
     using Normalizer2WithImpl::normalize;  // Avoid warning about hiding base class function.
+
+    /**
+     * UTF-8 entry point for this normalizer: resets the optional Edits,
+     * hands the raw bytes of src to impl.composeUTF8() in composing mode,
+     * and finally flushes the sink.
+     * Does nothing if errorCode already indicates a failure.
+     */
+    void
+    normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
+                  Edits *edits, UErrorCode &errorCode) const override {
+        if (U_SUCCESS(errorCode)) {
+            if (edits != nullptr) {
+                edits->reset();
+            }
+            const uint8_t *start = reinterpret_cast<const uint8_t *>(src.data());
+            const uint8_t *end = start + src.length();
+            impl.composeUTF8(options, start, end,
+                             onlyContiguous, TRUE, sink, edits, errorCode);
+            sink.Flush();
+        }
+    }
+
     virtual void
     normalizeAndAppend(const UChar *src, const UChar *limit, UBool doNormalize,
                        UnicodeString &safeMiddle,
index 77f6e27389691d38b35c0dd18f1af2b0d4548c34..efe9117c0ed18ae4ba914e37cc90138fbb4c2a2d 100644 (file)
@@ -20,6 +20,7 @@
 
 #if !UCONFIG_NO_NORMALIZATION
 
+#include "unicode/edits.h"
 #include "unicode/normalizer2.h"
 #include "unicode/unistr.h"
 #include "unicode/unorm.h"
@@ -42,6 +43,20 @@ U_NAMESPACE_BEGIN
 
 Normalizer2::~Normalizer2() {}
 
+/**
+ * Default UTF-8 normalization: converts src to UTF-16, normalizes that,
+ * and writes the result to the sink as UTF-8.
+ * Recording Edits is not supported here; a non-null edits pointer
+ * sets U_UNSUPPORTED_ERROR and nothing is written.
+ */
+void
+Normalizer2::normalizeUTF8(uint32_t /*options*/, StringPiece src, ByteSink &sink,
+                           Edits *edits, UErrorCode &errorCode) const {
+    if (U_FAILURE(errorCode)) {
+        return;
+    }
+    if (edits == nullptr) {
+        // Round-trip through UTF-16 and the UnicodeString-based normalize().
+        UnicodeString utf16 = UnicodeString::fromUTF8(src);
+        normalize(utf16, errorCode).toUTF8(sink);
+    } else {
+        errorCode = U_UNSUPPORTED_ERROR;
+    }
+}
+
 UBool
 Normalizer2::getRawDecomposition(UChar32, UnicodeString &) const {
     return FALSE;
@@ -74,6 +89,19 @@ class NoopNormalizer2 : public Normalizer2 {
         }
         return dest;
     }
+    /**
+     * No-op normalization of UTF-8: appends src to the sink as is,
+     * records the whole input as one unchanged span if edits is not nullptr,
+     * and flushes the sink. Does nothing on a prior failure in errorCode.
+     */
+    void
+    normalizeUTF8(uint32_t /*options*/, StringPiece src, ByteSink &sink,
+                  Edits *edits, UErrorCode &errorCode) const override {
+        if (U_FAILURE(errorCode)) {
+            return;
+        }
+        sink.Append(src.data(), src.length());
+        if (edits != nullptr) {
+            edits->reset();
+            edits->addUnchanged(src.length());
+        }
+        sink.Flush();
+    }
+
     virtual UnicodeString &
     normalizeSecondAndAppend(UnicodeString &first,
                              const UnicodeString &second,
index 67f4c1c25d82d91f8ba61803e29cfbdf34c5f4fd..6a6124f4b756cebfabb810badba9491c0f72c296 100644 (file)
 
 #if !UCONFIG_NO_NORMALIZATION
 
+#include "unicode/bytestream.h"
+#include "unicode/edits.h"
 #include "unicode/normalizer2.h"
+#include "unicode/ucasemap.h"  // UCASEMAP_OMIT_UNCHANGED_TEXT
 #include "unicode/udata.h"
 #include "unicode/ustring.h"
 #include "unicode/utf16.h"
+#include "unicode/utf8.h"
 #include "cmemory.h"
 #include "mutex.h"
 #include "normalizer2impl.h"
 
 U_NAMESPACE_BEGIN
 
+namespace {
+
+/**
+ * Computes a UTF-8 lead byte to compare against for minNoMaybeCP.
+ * The result may be lower than the real lead byte of c
+ * (everything above U+07FF is mapped to 0xE0).
+ * Typical inputs: U+0300 for NFC/NFD, U+00A0 for NFKC/NFKD, U+0041 for NFKC_Casefold.
+ */
+inline uint8_t leadByteForCP(UChar32 c) {
+    if (c > 0x7ff) {
+        // Should not occur because ccc(U+0300)!=0.
+        return 0xe0;
+    }
+    // One-byte (ASCII) form, or the lead byte of a two-byte sequence.
+    return (c <= 0x7f) ? (uint8_t)c : (uint8_t)(0xc0+(c>>6));
+}
+
+/**
+ * Decodes the one well-formed UTF-8 byte sequence that occupies exactly [src, limit[
+ * and returns its code point.
+ *
+ * UTrie2 UTF-8 macros do not assemble whole code points (for efficiency),
+ * so this function is called only when the code point itself is needed.
+ * It should not be needed for normalization-inert data (norm16==0).
+ * Illegal sequences yield the error value norm16==0 just like real normalization-inert code points.
+ */
+UChar32 codePointFromValidUTF8(const uint8_t *src, const uint8_t *limit) {
+    // Similar to U8_NEXT_UNSAFE(s, i, c), but the sequence length is given by the caller.
+    U_ASSERT(src < limit);
+    const uint8_t lead = *src;
+    const auto numBytes = limit - src;
+    if (numBytes == 1) {
+        return lead;
+    }
+    if (numBytes == 2) {
+        return ((lead&0x1f)<<6) | (src[1]&0x3f);
+    }
+    if (numBytes == 3) {
+        // no need for (lead&0xf) because the upper bits are truncated after <<12 in the cast to (UChar)
+        return (UChar)((lead<<12) | ((src[1]&0x3f)<<6) | (src[2]&0x3f));
+    }
+    if (numBytes == 4) {
+        return ((lead&7)<<18) | ((src[1]&0x3f)<<12) | ((src[2]&0x3f)<<6) | (src[3]&0x3f);
+    }
+    U_ASSERT(FALSE);  // Should not occur.
+    return U_SENTINEL;
+}
+
+/**
+ * If [src, limit[ is exactly one Jamo L code point,
+ * returns its offset from the Jamo L base.
+ * Otherwise returns a negative value.
+ */
+int32_t getJamoLMinusBase(const uint8_t *src, const uint8_t *limit) {
+    // Jamo L: E1 84 80..92
+    if ((limit - src) != 3 || *src != 0xe1 || src[1] != 0x84) {
+        return -1;
+    }
+    uint8_t offset = src[2] - 0x80;
+    return (offset < Hangul::JAMO_L_COUNT) ? offset : -1;
+}
+
+/**
+ * Returns the offset from the Jamo T base if [src, limit[ starts with a single Jamo T code point.
+ * Otherwise returns a negative value.
+ *
+ * Unlike getJamoLMinusBase(), this only requires at least three bytes at src;
+ * bytes after the first three are not inspected.
+ */
+int32_t getJamoTMinusBase(const uint8_t *src, const uint8_t *limit) {
+    // Jamo T: E1 86 A8..E1 87 82
+    if ((limit - src) >= 3 && *src == 0xe1) {
+        if (src[1] == 0x86) {
+            uint8_t t = src[2];
+            // The first Jamo T is U+11A8 but JAMO_T_BASE is 11A7.
+            // Offset 0 does not correspond to any conjoining Jamo.
+            if (0xa8 <= t && t <= 0xbf) {
+                return t - 0xa7;
+            }
+        } else if (src[1] == 0x87) {
+            uint8_t t = src[2];
+            // Signed 8-bit comparison: only the byte values 0x80..0x82 map to
+            // int8_t values that are <= (int8_t)0x82, so this accepts exactly E1 87 80..82.
+            if ((int8_t)t <= (int8_t)0x82) {
+                // Second byte 0x87 is 0x40 code points beyond second byte 0x86,
+                // hence the subtrahend is reduced by 0x40.
+                return t - (0xa7 - 0x40);
+            }
+        }
+    }
+    return -1;
+}
+
+/**
+ * Tells the sink how much output to expect by requesting an append buffer
+ * with desiredCapacity as the hint. The returned buffer is not used.
+ */
+void giveByteSinkAllocationHint(ByteSink &sink, int32_t desiredCapacity) {
+    char unusedScratch[1];
+    int32_t unusedCapacity;
+    sink.GetAppendBuffer(1, desiredCapacity,
+                         unusedScratch, UPRV_LENGTHOF(unusedScratch), &unusedCapacity);
+}
+
+/**
+ * The bytes at [src, nextSrc[ were mapped to valid (s16, s16Length).
+ *
+ * Converts the UTF-16 replacement text to UTF-8 and appends it to the sink,
+ * chunk by chunk, using buffers obtained from the sink (or the local scratch array).
+ * If edits is not nullptr, records one replacement of (nextSrc - src) source bytes
+ * by the total number of UTF-8 bytes written.
+ *
+ * @return TRUE on success; FALSE with errorCode set to U_INDEX_OUTOFBOUNDS_ERROR
+ *         if the UTF-8 output length would exceed INT32_MAX
+ */
+UBool
+appendChange(const uint8_t *src, const uint8_t *nextSrc,
+             const char16_t *s16, int32_t s16Length,
+             ByteSink &sink, Edits *edits, UErrorCode &errorCode) {
+    U_ASSERT(U_SUCCESS(errorCode));
+    U_ASSERT((nextSrc - src) <= INT32_MAX);  // ensured by caller
+    char scratch[200];
+    int32_t s8Length = 0;  // total number of UTF-8 bytes appended so far
+    for (int32_t i = 0; i < s16Length;) {
+        int32_t capacity;
+        // Capacity hint for the remaining UTF-16 units, saturating instead of overflowing.
+        int32_t desiredCapacity = s16Length - i;
+        if (desiredCapacity < (INT32_MAX / 3)) {
+            desiredCapacity *= 3;  // max 3 UTF-8 bytes per UTF-16 code unit
+        } else if (desiredCapacity < (INT32_MAX / 2)) {
+            desiredCapacity *= 2;
+        } else {
+            desiredCapacity = INT32_MAX;
+        }
+        char *buffer = sink.GetAppendBuffer(U8_MAX_LENGTH, desiredCapacity,
+                                            scratch, UPRV_LENGTHOF(scratch), &capacity);
+        // Keep U8_MAX_LENGTH-1 bytes in reserve so that a code point whose encoding
+        // starts at an index j < capacity still fits without a per-byte bounds check.
+        capacity -= U8_MAX_LENGTH - 1;
+        int32_t j = 0;
+        for (; i < s16Length && j < capacity;) {
+            UChar32 c;
+            U16_NEXT_UNSAFE(s16, i, c);
+            U8_APPEND_UNSAFE(buffer, j, c);
+        }
+        // Guard the running total against int32_t overflow before committing this chunk.
+        if (j > (INT32_MAX - s8Length)) {
+            errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
+            return FALSE;
+        }
+        sink.Append(buffer, j);
+        s8Length += j;
+    }
+    if (edits != nullptr) {
+        edits->addReplace((int32_t)(nextSrc - src), s8Length);
+    }
+    return TRUE;
+}
+
+/**
+ * The few bytes at [src, nextSrc[ were mapped to valid code point c.
+ * Encodes c as UTF-8, records the replacement in edits (if not nullptr),
+ * and appends the encoded bytes to the sink.
+ */
+void
+appendCodePoint(const uint8_t *src, const uint8_t *nextSrc, UChar32 c,
+                ByteSink &sink, Edits *edits) {
+    char utf8[U8_MAX_LENGTH];
+    int32_t utf8Length = 0;
+    U8_APPEND_UNSAFE(utf8, utf8Length, c);
+    if (edits != nullptr) {
+        const int32_t oldLength = (int32_t)(nextSrc - src);
+        edits->addReplace(oldLength, utf8Length);
+    }
+    sink.Append(utf8, utf8Length);
+}
+
+/**
+ * Handles the unchanged source bytes [s, limit[:
+ * records them in edits (if not nullptr) and copies them to the sink
+ * unless options contains UCASEMAP_OMIT_UNCHANGED_TEXT.
+ * An empty range is a no-op.
+ *
+ * @return TRUE on success; FALSE with errorCode set to U_INDEX_OUTOFBOUNDS_ERROR
+ *         if the range is longer than INT32_MAX
+ */
+UBool
+appendUnchanged(const uint8_t *s, const uint8_t *limit,
+                ByteSink &sink, uint32_t options, Edits *edits,
+                UErrorCode &errorCode) {
+    U_ASSERT(U_SUCCESS(errorCode));
+    if ((limit - s) > INT32_MAX) {
+        errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
+        return FALSE;
+    }
+    const int32_t numBytes = (int32_t)(limit - s);
+    if (numBytes <= 0) {
+        return TRUE;
+    }
+    if (edits != nullptr) {
+        edits->addUnchanged(numBytes);
+    }
+    const UBool omitUnchanged = (options & UCASEMAP_OMIT_UNCHANGED_TEXT) != 0;
+    if (!omitUnchanged) {
+        sink.Append(reinterpret_cast<const char *>(s), numBytes);
+    }
+    return TRUE;
+}
+
+}  // namespace
+
 // ReorderingBuffer -------------------------------------------------------- ***
 
 UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) {
@@ -69,6 +242,32 @@ UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit)
         0==u_memcmp(start, otherStart, length);
 }
 
+/**
+ * Compares the UTF-16 contents of this buffer with the UTF-8 text
+ * [otherStart, otherLimit[, code point by code point.
+ * Both strings are expected to be valid text between normalization boundaries.
+ */
+UBool ReorderingBuffer::equals(const uint8_t *otherStart, const uint8_t *otherLimit) const {
+    U_ASSERT((otherLimit - otherStart) <= INT32_MAX);  // ensured by caller
+    int32_t length = (int32_t)(limit - start);
+    int32_t otherLength = (int32_t)(otherLimit - otherStart);
+    // Quick length test: for equal strings,
+    // UTF-8 is at least as long as UTF-16, and at most three times as long.
+    if (otherLength < length || (otherLength / 3) > length) {
+        return FALSE;
+    }
+    // Compare valid strings from between normalization boundaries.
+    // (Invalid sequences are normalization-inert.)
+    int32_t i = 0;
+    int32_t j = 0;
+    while (i < length && j < otherLength) {
+        UChar32 c, other;
+        U16_NEXT_UNSAFE(start, i, c);
+        U8_NEXT_UNSAFE(otherStart, j, other);
+        if (c != other) {
+            return FALSE;
+        }
+    }
+    // Equal only if both strings were consumed completely.
+    return i >= length && j >= otherLength;
+}
+
 UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
     if(remainingCapacity<2 && !resize(2, errorCode)) {
         return FALSE;
@@ -615,6 +814,86 @@ UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
     }
 }
 
+/**
+ * Decomposes the UTF-8 text [src, limit[ into the ReorderingBuffer.
+ *
+ * If stopAtCompBoundary is true, stops before the first character that is
+ * compYes with ccc==0, or whose non-empty mapping has lccc==0 and starts with
+ * such a character; that character is not appended.
+ *
+ * @return the position in the source where decomposition stopped
+ *         (limit if the whole range was consumed),
+ *         or nullptr if errorCode indicated a failure on entry or appending to the buffer failed
+ */
+const uint8_t *
+Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit,
+                                UBool stopAtCompBoundary, ReorderingBuffer &buffer,
+                                UErrorCode &errorCode) const {
+    if (U_FAILURE(errorCode)) {
+        return nullptr;
+    }
+    // UTF-8 version of decomposeShort() + findNextCompBoundary() together
+    while (src < limit) {
+        const uint8_t *nextSrc = src;
+        uint16_t norm16;
+        UTRIE2_U8_NEXT16(normTrie, nextSrc, limit, norm16);
+        // Get the decomposition and the lead and trail cc's.
+        // Only loops for 1:1 algorithmic mappings.
+        // c stays negative until the code point is actually needed and decoded.
+        UChar32 c = U_SENTINEL;
+        for (;;) {
+            if (stopAtCompBoundary && isCompYesAndZeroCC(norm16)) {
+                return src;
+            }
+            // norm16!=0 guarantees that [src, nextSrc[ is valid UTF-8.
+            // We do not see invalid UTF-8 here because
+            // its norm16==0 is normalization-inert,
+            // so it gets copied unchanged in the fast path,
+            // and we stop the slow path where invalid UTF-8 begins.
+            U_ASSERT(norm16 != 0);
+            if (isDecompYes(norm16)) {
+                if (c < 0) {
+                    c = codePointFromValidUTF8(src, nextSrc);
+                }
+                // does not decompose
+                if (!buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode)) {
+                    return nullptr;
+                }
+                break;
+            } else if (isHangul(norm16)) {
+                // Hangul syllable: decompose algorithmically
+                if (c < 0) {
+                    c = codePointFromValidUTF8(src, nextSrc);
+                }
+                char16_t jamos[3];
+                if (!buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode)) {
+                    return nullptr;
+                }
+                break;
+            } else if (isDecompNoAlgorithmic(norm16)) {
+                if (c < 0) {
+                    c = codePointFromValidUTF8(src, nextSrc);
+                }
+                // Follow the 1:1 mapping and re-examine the mapped code point in the next iteration.
+                c = mapAlgorithmic(c, norm16);
+                norm16 = getNorm16(c);
+            } else {
+                // The character decomposes, get everything from the variable-length extra data.
+                const uint16_t *mapping = getMapping(norm16);
+                uint16_t firstUnit = *mapping;
+                int32_t length = firstUnit & MAPPING_LENGTH_MASK;
+                uint8_t leadCC;
+                if (firstUnit & MAPPING_HAS_CCC_LCCC_WORD) {
+                    leadCC = (uint8_t)(*(mapping-1) >> 8);
+                } else {
+                    leadCC = 0;
+                }
+                if (stopAtCompBoundary && length != 0 && leadCC == 0) {
+                    // Peek at the first code point of the mapping to test for a boundary before it.
+                    int32_t i = 1;  // skip over the firstUnit
+                    U16_NEXT_UNSAFE(mapping, i, c);
+                    if (isCompYesAndZeroCC(getNorm16(c))) {
+                        return src;
+                    }
+                }
+                uint8_t trailCC = (uint8_t)(firstUnit >> 8);
+                if (!buffer.append((const char16_t *)mapping+1, length, leadCC, trailCC, errorCode)) {
+                    return nullptr;
+                }
+                break;
+            }
+        }
+        src = nextSrc;
+    }
+    return src;
+}
+
 const UChar *
 Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const {
     const UChar *decomp=NULL;
@@ -1481,6 +1760,330 @@ void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit,
     }
 }
 
+namespace {
+
+// Bit-field constants for the value returned by Normalizer2Impl::getCompProps().
+
+// Code point field (bits 31..10) when there is no code point to report.
+const int32_t COMP_NO_CP = 0xfffffc00;  // U_SENTINEL << 10 (negative)
+// Bit 9: the character has a composition boundary before it.
+const int32_t COMP_BOUNDARY_BEFORE = 0x200;
+// Bit 8: the character has a composition boundary after it.
+const int32_t COMP_BOUNDARY_AFTER = 0x100;
+
+}  // namespace
+
+/**
+ * Returns composition properties as an int with bit fields.
+ * Bits 31..10: algorithmic-decomp cp if that is compYes, else U_SENTINEL
+ * Bit       9: has boundary before
+ * Bit       8: has boundary after
+ * Bits   7..0: tccc if decompNo, else 0
+ *
+ * [src, limit[ is the UTF-8 byte sequence of the character whose trie value is norm16;
+ * it is decoded only when the code point is needed.
+ * onlyContiguous additionally restricts boundary-after for characters with mappings
+ * (see the trailCC test near the end).
+ */
+int32_t
+Normalizer2Impl::getCompProps(const uint8_t *src, const uint8_t *limit,
+                              uint16_t norm16, UBool onlyContiguous) const {
+    // c remains U_SENTINEL unless an algorithmic mapping is followed below,
+    // so (c << 10) equals COMP_NO_CP in that case.
+    UChar32 c = U_SENTINEL;
+    for (;;) {
+        if (isInert(norm16)) {
+            return (c << 10) | COMP_BOUNDARY_BEFORE | COMP_BOUNDARY_AFTER;
+        } else if (norm16 <= minYesNo) {
+            int32_t props = COMP_BOUNDARY_BEFORE;
+            // Hangul: norm16==minYesNo
+            // Hangul LVT has a boundary after it.
+            // Hangul LV and non-inert yesYes characters combine forward.
+            if (isHangul(norm16)) {
+                // Do not modify c so that we don't confuse the fast path
+                // for algorithmic decompositions surrounded by boundaries.
+                UChar syllable;
+                if (c >= 0) {
+                    syllable = (UChar)c;
+                } else {
+                    // One branch of codePointFromValidUTF8(src, limit).
+                    U_ASSERT((limit - src) == 3);
+                    syllable = (UChar)((*src<<12) | ((src[1]&0x3f)<<6) | (src[2]&0x3f));
+                }
+                if (!Hangul::isHangulWithoutJamoT(syllable)) {
+                    props |= COMP_BOUNDARY_AFTER;
+                }
+            }
+            return (c << 10) | props;
+        } else if (norm16 >= minMaybeYes) {
+            if (norm16 >= MIN_YES_YES_WITH_CC) {
+                return (c << 10);
+            } else {
+                // Do not return c>=0 for a compMaybe character.
+                return COMP_NO_CP;
+            }
+        } else if (isDecompNoAlgorithmic(norm16)) {
+            if (c < 0) {
+                c = codePointFromValidUTF8(src, limit);
+            }
+            // Follow the algorithmic mapping and classify the mapped code point on the next iteration.
+            c = mapAlgorithmic(c, norm16);
+            norm16 = getNorm16(c);
+        } else {
+            // c decomposes, get everything from the variable-length extra data.
+            const uint16_t *mapping = getMapping(norm16);
+            uint16_t firstUnit = *mapping;
+            int32_t props = firstUnit >> 8;  // tccc
+            if (norm16 < minNoNo) {
+                props |= (c << 10) | COMP_BOUNDARY_BEFORE;
+            } else {
+                // Do not return c>=0 for a compNo character.
+                props |= COMP_NO_CP;
+                if ((firstUnit & MAPPING_LENGTH_MASK) != 0 &&
+                        ((firstUnit & MAPPING_HAS_CCC_LCCC_WORD) == 0 ||
+                            (*(mapping-1) & 0xff00) == 0)) {
+                    // The decomposition is not empty, and lccc==0.
+                    int32_t i = 1;  // skip over the firstUnit
+                    UChar32 firstCP;
+                    U16_NEXT_UNSAFE(mapping, i, firstCP);
+                    if (isCompYesAndZeroCC(getNorm16(firstCP))) {
+                        props |= COMP_BOUNDARY_BEFORE;
+                    }
+                }
+            }
+            // comp-boundary-after if
+            //   not MAPPING_NO_COMP_BOUNDARY_AFTER
+            //     (which is set if
+            //       c is not deleted, and
+            //       it and its decomposition do not combine forward, and it has a starter)
+            //   and if FCC then trailCC<=1
+            // (firstUnit <= 0x1ff is the trailCC<=1 test: trailCC is in the upper byte of firstUnit.)
+            if ((firstUnit & MAPPING_NO_COMP_BOUNDARY_AFTER) == 0 &&
+                    (!onlyContiguous || firstUnit <= 0x1ff)) {
+                props |= COMP_BOUNDARY_AFTER;
+            }
+            return props;
+        }
+    }
+}
+
+/**
+ * Composes the UTF-8 text [src, limit[, or only tests whether it is already composed.
+ *
+ * With doCompose, the result is appended to the sink. Unchanged and replaced spans are
+ * recorded in edits if edits is not nullptr, and unchanged text is not written to the sink
+ * when options contains UCASEMAP_OMIT_UNCHANGED_TEXT (see appendUnchanged()).
+ * Without doCompose, nothing is written or recorded; the function returns FALSE
+ * as soon as it finds text that composition would change.
+ *
+ * The sink is not flushed here; that is left to the caller.
+ * errorCode is expected to indicate success on entry
+ * (appendUnchanged() and appendChange() assert this).
+ *
+ * @param options        bit set of options, passed through to appendUnchanged()
+ * @param src            start of the UTF-8 input
+ * @param limit          end of the UTF-8 input, must not be nullptr
+ * @param onlyContiguous TRUE for contiguous composition (FCC)
+ * @param doCompose      TRUE to produce output, FALSE to only test the input
+ * @param sink           receives the output when doCompose
+ * @param edits          optional record of changes, may be nullptr
+ * @param errorCode      set on failure, e.g. U_INDEX_OUTOFBOUNDS_ERROR for overlong spans
+ * @return with doCompose always TRUE (check errorCode for failures);
+ *         without doCompose, TRUE if no change was found, FALSE otherwise
+ */
+UBool
+Normalizer2Impl::composeUTF8(uint32_t options,
+                             const uint8_t *src, const uint8_t *limit,
+                             UBool onlyContiguous, UBool doCompose,
+                             ByteSink &sink, Edits *edits, UErrorCode &errorCode) const {
+    U_ASSERT(limit != nullptr);
+    uint8_t minNoMaybeLead = leadByteForCP(minCompNoMaybeCP);
+
+    for (;;) {
+        // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
+        // or with (compYes && ccc==0) properties.
+        const uint8_t *prevSrc = src;
+        const uint8_t *nextSrc;
+        uint16_t norm16 = 0;
+        for (;;) {
+            if (src == limit) {
+                if (src != prevSrc && doCompose) {
+                    appendUnchanged(prevSrc, limit, sink, options, edits, errorCode);
+                }
+                return TRUE;
+            }
+            if (*src < minNoMaybeLead) {
+                ++src;
+            } else {
+                nextSrc = src;
+                UTRIE2_U8_NEXT16(normTrie, nextSrc, limit, norm16);
+                if (isCompYesAndZeroCC(norm16)) {
+                    src = nextSrc;
+                } else {
+                    break;
+                }
+            }
+        }
+
+        // Copy this sequence.
+        // Omit the last character if there is not a boundary between it and the current one.
+        int32_t props = getCompProps(src, nextSrc, norm16, onlyContiguous);
+        int32_t prevProps = 0;
+        if (src != prevSrc) {
+            const uint8_t *p = src;
+            if ((props & COMP_BOUNDARY_BEFORE) == 0) {
+                uint16_t prevNorm16 = 0;
+                UTRIE2_U8_PREV16(normTrie, prevSrc, p, prevNorm16);
+                prevProps = getCompProps(p, src, prevNorm16, onlyContiguous);
+                if (prevProps & COMP_BOUNDARY_AFTER) {
+                    p = src;
+                }
+            }
+            if (p != prevSrc) {
+                if (doCompose) {
+                    if ((limit - prevSrc) <= INT32_MAX) {
+                        // Allocation hint for the full remaining string,
+                        // not just for what we are copying now.
+                        giveByteSinkAllocationHint(sink, (int32_t)(limit - prevSrc));
+                    }
+                    if (!appendUnchanged(prevSrc, p, sink, options, edits, errorCode)) {
+                        break;
+                    }
+                }
+                prevSrc = p;
+            }
+        }
+
+        // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
+        // The current character is either a "noNo" (has a mapping)
+        // or a "maybeYes" (combines backward)
+        // or a "yesYes" with ccc!=0.
+        // It is not a Hangul syllable or Jamo L because those have "yes" properties.
+
+        // Medium-fast path: Handle cases that do not require full decomposition and recomposition.
+        if (!isMaybeOrNonZeroCC(norm16)) {  // minNoNo <= norm16 < minMaybeYes
+            if (!doCompose) {
+                return FALSE;
+            }
+            // Fast path for mapping a character that is immediately surrounded by boundaries.
+            // In this case, we need not decompose around the current character.
+            // So far, we only do this for algorithmic mappings to a compYes code point;
+            // props>=0 only when this is the case.
+            // Algorithmic mappings are common when the normalization data includes case folding.
+            // If we knew that a full decomposition is composition-normalized
+            // (does not partially recompose),
+            // or if we had the recomposition directly available in the data,
+            // then we could extend this fastpath to such mappings.
+            if (props >= 0 &&  // alg. decomp to compYes
+                    prevSrc == src &&  // has boundary before
+                    ((props & COMP_BOUNDARY_AFTER) || hasCompBoundaryBefore(nextSrc, limit))) {
+                appendCodePoint(src, nextSrc, props >> 10, sink, edits);
+                src = nextSrc;
+                continue;
+            }
+        } else if (isJamoVT(norm16) && prevSrc != src) {
+            // Jamo L: E1 84 80..92
+            // Jamo V: E1 85 A1..B5
+            // Jamo T: E1 86 A8..E1 87 82
+            U_ASSERT((nextSrc - src) == 3 && *src == 0xe1);
+            UChar32 prev;
+            if (src[1] == 0x85) {
+                // The current character is a Jamo Vowel,
+                // compose with previous Jamo L and following Jamo T.
+                if ((prev = getJamoLMinusBase(prevSrc, src)) >= 0) {
+                    if (!doCompose) {
+                        return FALSE;
+                    }
+                    UChar32 syllable = Hangul::HANGUL_BASE +
+                         (prev*Hangul::JAMO_V_COUNT + (src[2]-0xa1)) *
+                         Hangul::JAMO_T_COUNT;
+                    int32_t t = getJamoTMinusBase(nextSrc, limit);
+                    if (t >= 0) {
+                        nextSrc += 3;
+                        syllable += t;  // The next character was a Jamo T.
+                        appendCodePoint(prevSrc, nextSrc, syllable, sink, edits);
+                        src = nextSrc;
+                        continue;
+                    }
+                    // If we see L+V+x where x!=T then we drop to the slow path,
+                    // decompose and recompose.
+                    // This is to deal with NFKC finding normal L and V but a
+                    // compatibility variant of a T.
+                    // We need to either fully compose that combination here
+                    // (which would complicate the code and may not work with strange custom data)
+                    // or use the slow path.
+                }
+            } else if (Hangul::isHangulWithoutJamoT(prev = codePointFromValidUTF8(prevSrc, src))) {
+                // The current character is a Jamo Trailing consonant,
+                // compose with previous Hangul LV that does not contain a Jamo T.
+                if (!doCompose) {
+                    return FALSE;
+                }
+                UChar32 syllable = prev + getJamoTMinusBase(src, nextSrc);
+                appendCodePoint(prevSrc, nextSrc, syllable, sink, edits);
+                src = nextSrc;
+                continue;
+            }
+        } else if (norm16 > JAMO_VT) {  // norm16 >= MIN_YES_YES_WITH_CC
+            // One or more combining marks that do not combine-back:
+            // Check for canonical order, copy unchanged if ok and
+            // if followed by a character with a boundary-before.
+            uint8_t cc = (uint8_t)norm16;  // cc!=0
+            if (onlyContiguous /* FCC */ && (uint8_t)prevProps > cc) {
+                // Fails FCD test, need to decompose and contiguously recompose.
+                if (!doCompose) {
+                    return FALSE;
+                }
+            } else {
+                // If !onlyContiguous (not FCC), then we ignore the tccc of
+                // the previous character which passed the quick check "yes && ccc==0" test.
+                const uint8_t *p = nextSrc;
+                const uint8_t *q;
+                uint16_t n16;
+                for (;;) {
+                    if (p == limit) {
+                        if (doCompose) {
+                            appendUnchanged(prevSrc, limit, sink, options, edits, errorCode);
+                        }
+                        return TRUE;
+                    }
+                    uint8_t prevCC = cc;
+                    q = p;
+                    UTRIE2_U8_NEXT16(normTrie, q, limit, n16);
+                    if (n16 >= MIN_YES_YES_WITH_CC) {
+                        cc = (uint8_t)n16;
+                        if (prevCC > cc) {
+                            if (!doCompose) {
+                                return FALSE;
+                            }
+                            break;
+                        }
+                    } else {
+                        break;
+                    }
+                    p = q;
+                }
+                // p is after the last in-order combining mark.
+                // If there is a boundary here, then we copy and continue.
+                // Copy some of hasCompBoundaryBefore() to postpone decoding the code point.
+                if (isCompYesAndZeroCC(n16) ||
+                        (!isMaybeOrNonZeroCC(n16) &&
+                            hasCompBoundaryBefore(codePointFromValidUTF8(p, q), n16))) {
+                    if (doCompose && !appendUnchanged(prevSrc, p, sink, options, edits, errorCode)) {
+                        return TRUE;
+                    }
+                    src = p;
+                    continue;
+                }
+                // Use the slow path. There is no boundary in [nextSrc, p[.
+                nextSrc = p;
+            }
+        }
+
+        // Slow path: Find the nearest boundaries around the current character,
+        // decompose and recompose.
+        // TODO: Inefficient create&destroy of the buffer because
+        // we want to avoid creating one if we do not need it.
+        // Try to make it cheaper, try to use a plain char16_t[] on the stack
+        // until it overflows.
+        // TODO: Port this newer code with Edits support,
+        // and maybe with Appendable style if it does not noticeably hurt UnicodeString performance,
+        // back to UTF-16.
+        // Should be able to remove some then-unnecessary code from ReorderingBuffer.
+        // Might not need findNextCompBoundary() and such any more.
+        UnicodeString s16;
+        ReorderingBuffer buffer(*this, s16);
+        // Decompose the previous (if any) and current characters.
+        // We know there is not a boundary here.
+        // A failure here is reported via errorCode and makes the next call return nullptr.
+        decomposeShort(prevSrc, nextSrc, FALSE /* !stopAtCompBoundary */, buffer, errorCode);
+        // Decompose until the next boundary.
+        src = decomposeShort(nextSrc, limit, TRUE /* stopAtCompBoundary */, buffer, errorCode);
+        if (src == nullptr) {  // U_FAILURE
+            break;
+        }
+        if ((src - prevSrc) > INT32_MAX) {  // guard before buffer.equals()
+            errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
+            return TRUE;
+        }
+        recompose(buffer, 0, onlyContiguous);
+        if (buffer.equals(prevSrc, src)) {
+            if (doCompose && !appendUnchanged(prevSrc, src, sink, options, edits, errorCode)) {
+                break;
+            }
+        } else if (doCompose) {
+            if (!appendChange(prevSrc, src, buffer.getStart(), buffer.length(),
+                              sink, edits, errorCode)) {
+                break;
+            }
+        } else {
+            // Test-only mode and the recomposed text differs from the source:
+            // the input is not normalized, consistent with the other !doCompose exits.
+            return FALSE;
+        }
+    }
+    return TRUE;
+}
+
 /**
  * Does c have a composition boundary before it?
  * True if its decomposition begins with a character that has
@@ -1520,7 +2123,7 @@ UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBo
         uint16_t norm16=getNorm16(c);
         if(isInert(norm16)) {
             return TRUE;
-        } else if(norm16<=minYesNo) {
+        } else if(norm16<minYesNoMappingsOnly) {
             // Hangul: norm16==minYesNo
             // Hangul LVT has a boundary after it.
             // Hangul LV and non-inert yesYes characters combine forward.
@@ -1548,6 +2151,19 @@ UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBo
     }
 }
 
+/**
+ * UTF-8 variant: does the character at the start of [src, limit[
+ * have a composition boundary before it?
+ * Returns FALSE for an empty range.
+ */
+UBool Normalizer2Impl::hasCompBoundaryBefore(const uint8_t *src, const uint8_t *limit) const {
+    if (src == limit) {
+        return FALSE;
+    }
+    const uint8_t *next = src;
+    uint16_t norm16;
+    UTRIE2_U8_NEXT16(normTrie, next, limit, norm16);
+    // Copy some of hasCompBoundaryBefore() to postpone decoding the code point.
+    if (isCompYesAndZeroCC(norm16)) {
+        return TRUE;
+    }
+    if (isMaybeOrNonZeroCC(norm16)) {
+        return FALSE;
+    }
+    // Only now is the code point itself needed.
+    return hasCompBoundaryBefore(codePointFromValidUTF8(src, next), norm16) ? TRUE : FALSE;
+}
+
 const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p) const {
     BackwardUTrie2StringIterator iter(normTrie, start, p);
     uint16_t norm16;
index 946abee98f3df10afe5d38b802b23164cc1bb571..d2f64808a266595b18e60c226ba9d9ce9025f949 100644 (file)
@@ -35,6 +35,9 @@ U_NAMESPACE_BEGIN
 
 struct CanonIterData;
 
+class ByteSink;
+class Edits;
+
 class U_COMMON_API Hangul {
 public:
     /* Korean Hangul and Jamo constants */
@@ -135,6 +138,7 @@ public:
     uint8_t getLastCC() const { return lastCC; }
 
     UBool equals(const UChar *start, const UChar *limit) const;
+    UBool equals(const uint8_t *otherStart, const uint8_t *otherLimit) const;
 
     // For Hangul composition, replacing the Leading consonant Jamo with the syllable.
     void setLastChar(UChar c) {
@@ -457,6 +461,12 @@ public:
                           UnicodeString &safeMiddle,
                           ReorderingBuffer &buffer,
                           UErrorCode &errorCode) const;
+
+    UBool composeUTF8(uint32_t options,
+                      const uint8_t *src, const uint8_t *limit,
+                      UBool onlyContiguous, UBool doCompose,
+                      ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) const;
+
     const UChar *makeFCD(const UChar *src, const UChar *limit,
                          ReorderingBuffer *buffer, UErrorCode &errorCode) const;
     void makeFCDAndAppend(const UChar *src, const UChar *limit,
@@ -568,12 +578,21 @@ private:
     UBool decompose(UChar32 c, uint16_t norm16,
                     ReorderingBuffer &buffer, UErrorCode &errorCode) const;
 
+    const uint8_t *decomposeShort(const uint8_t *src, const uint8_t *limit,
+                                  UBool stopAtCompBoundary, ReorderingBuffer &buffer,
+                                  UErrorCode &errorCode) const;
+
     static int32_t combine(const uint16_t *list, UChar32 trail);
     void addComposites(const uint16_t *list, UnicodeSet &set) const;
     void recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
                    UBool onlyContiguous) const;
 
+    int32_t getCompProps(const uint8_t *src, const uint8_t *limit,
+                         uint16_t norm16, UBool onlyContiguous) const;
+
     UBool hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const;
+    UBool hasCompBoundaryBefore(const uint8_t *src, const uint8_t *limit) const;
+
     const UChar *findPreviousCompBoundary(const UChar *start, const UChar *p) const;
     const UChar *findNextCompBoundary(const UChar *p, const UChar *limit) const;
 
index d326da948a35736f868fb4abfcfe048b135f2b65..bb43439c2ca72f43225ddc89edae6c30a25d8c41 100644 (file)
 
 #if !UCONFIG_NO_NORMALIZATION
 
+#include "unicode/stringpiece.h"
 #include "unicode/uniset.h"
 #include "unicode/unistr.h"
 #include "unicode/unorm2.h"
 
 U_NAMESPACE_BEGIN
 
+class ByteSink;
+
 /**
  * Unicode normalization functionality for standard Unicode normalization or
  * for using custom mapping tables.
@@ -215,6 +218,34 @@ public:
     normalize(const UnicodeString &src,
               UnicodeString &dest,
               UErrorCode &errorCode) const = 0;
+
+    /**
+     * Normalizes a UTF-8 string and optionally records how source substrings
+     * relate to changed and unchanged result substrings.
+     *
+     * Currently implemented completely only for "compose" modes,
+     * such as for NFC, NFKC, and NFKC_Casefold
+     * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
+     * Otherwise currently converts to & from UTF-16 and does not support edits.
+     *
+     * @param options   Options bit set, usually 0. See UCASEMAP_OMIT_UNCHANGED_TEXT.
+     * @param src       Source UTF-8 string.
+     * @param sink      A ByteSink to which the normalized UTF-8 result string is written.
+     *                  sink.Flush() is called at the end.
+     * @param edits     Records edits for index mapping, working with styled text,
+     *                  and getting only changes (if any).
+     *                  The Edits contents is undefined if any error occurs.
+     *                  This function calls edits->reset() first. edits can be nullptr.
+     * @param errorCode Standard ICU error code. Its input value must
+     *                  pass the U_SUCCESS() test, or else the function returns
+     *                  immediately. Check for U_FAILURE() on output or use with
+     *                  function chaining. (See User Guide for details.)
+     * @internal ICU 60 technology preview, may be changed or removed in the future
+     */
+    virtual void
+    normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
+                  Edits *edits, UErrorCode &errorCode) const;
+
     /**
      * Appends the normalized form of the second string to the first string
      * (merging them at the boundary) and returns the first string.
index e8b4ea218840301cf39613eb1dcf9ec956e8dee0..33c3730ccb328827c87b5c17dca5916c2df6f253 100644 (file)
 
 #if !UCONFIG_NO_NORMALIZATION
 
+#include <string>
+#include "unicode/bytestream.h"
 #include "unicode/uchar.h"
+#include "unicode/normalizer2.h"
 #include "unicode/normlzr.h"
 #include "unicode/uniset.h"
 #include "unicode/putil.h"
 #include "cstring.h"
 #include "filestrm.h"
 #include "normconf.h"
+#include "uassert.h"
 #include <stdio.h>
 
-#define CASE(id,test,exec) case id:                          \
-                          name = #test;                 \
-                          if (exec) {                   \
-                              logln(#test "---");       \
-                              logln((UnicodeString)""); \
-                              test();                   \
-                          }                             \
-                          break
-
 void NormalizerConformanceTest::runIndexedTest(int32_t index, UBool exec, const char* &name, char* /*par*/) {
-    switch (index) {
-        CASE(0, TestConformance, exec);
-#if !UCONFIG_NO_FILE_IO && !UCONFIG_NO_LEGACY_CONVERSION
-        CASE(1, TestConformance32, exec);
-#endif
-        // CASE(2, TestCase6);
-        default: name = ""; break;
-    }
+    TESTCASE_AUTO_BEGIN;
+    TESTCASE_AUTO(TestConformance);
+    TESTCASE_AUTO(TestConformance32);  // NOTE(review): was guarded by !UCONFIG_NO_FILE_IO && !UCONFIG_NO_LEGACY_CONVERSION - confirm.
+    TESTCASE_AUTO(TestCase6);  // NOTE(review): was commented out in the replaced CASE list; now registered - confirm intended.
+    TESTCASE_AUTO_END;
 }
 
 #define FIELD_COUNT 5
 
 NormalizerConformanceTest::NormalizerConformanceTest() :
-    normalizer(UnicodeString(), UNORM_NFC) {}
+        normalizer(UnicodeString(), UNORM_NFC) {
+    UErrorCode errorCode = U_ZERO_ERROR;  // Shared by all four lookups below.
+    nfc = Normalizer2::getNFCInstance(errorCode);
+    nfd = Normalizer2::getNFDInstance(errorCode);
+    nfkc = Normalizer2::getNFKCInstance(errorCode);
+    nfkd = Normalizer2::getNFKDInstance(errorCode);
+    U_ASSERT(U_SUCCESS(errorCode));  // NOTE(review): presumably a no-op in non-debug builds; checkNorm() skips the UTF-8 check when norm2 is nullptr.
+}
 
 NormalizerConformanceTest::~NormalizerConformanceTest() {}
 
@@ -300,56 +299,17 @@ UBool NormalizerConformanceTest::checkConformance(const UnicodeString* field,
                                                   int32_t options,
                                                   UErrorCode &status) {
     UBool pass = TRUE, result;
-    //UErrorCode status = U_ZERO_ERROR;
     UnicodeString out, fcd;
     int32_t fieldNum;
 
     for (int32_t i=0; i<FIELD_COUNT; ++i) {
         fieldNum = i+1;
         if (i<3) {
-            Normalizer::normalize(field[i], UNORM_NFC, options, out, status);
-            if (U_FAILURE(status)) {
-                dataerrln("Error running normalize UNORM_NFC: %s", u_errorName(status));
-            } else {
-                pass &= assertEqual("C", field[i], out, field[1], "c2!=C(c", fieldNum);
-                iterativeNorm(field[i], UNORM_NFC, options, out, +1);
-                pass &= assertEqual("C(+1)", field[i], out, field[1], "c2!=C(c", fieldNum);
-                iterativeNorm(field[i], UNORM_NFC, options, out, -1);
-                pass &= assertEqual("C(-1)", field[i], out, field[1], "c2!=C(c", fieldNum);
-            }
-
-            Normalizer::normalize(field[i], UNORM_NFD, options, out, status);
-            if (U_FAILURE(status)) {
-                dataerrln("Error running normalize UNORM_NFD: %s", u_errorName(status));
-            } else {
-                pass &= assertEqual("D", field[i], out, field[2], "c3!=D(c", fieldNum);
-                iterativeNorm(field[i], UNORM_NFD, options, out, +1);
-                pass &= assertEqual("D(+1)", field[i], out, field[2], "c3!=D(c", fieldNum);
-                iterativeNorm(field[i], UNORM_NFD, options, out, -1);
-                pass &= assertEqual("D(-1)", field[i], out, field[2], "c3!=D(c", fieldNum);
-            }
-        }
-        Normalizer::normalize(field[i], UNORM_NFKC, options, out, status);
-        if (U_FAILURE(status)) {
-            dataerrln("Error running normalize UNORM_NFKC: %s", u_errorName(status));
-        } else {
-            pass &= assertEqual("KC", field[i], out, field[3], "c4!=KC(c", fieldNum);
-            iterativeNorm(field[i], UNORM_NFKC, options, out, +1);
-            pass &= assertEqual("KC(+1)", field[i], out, field[3], "c4!=KC(c", fieldNum);
-            iterativeNorm(field[i], UNORM_NFKC, options, out, -1);
-            pass &= assertEqual("KC(-1)", field[i], out, field[3], "c4!=KC(c", fieldNum);
-        }
-
-        Normalizer::normalize(field[i], UNORM_NFKD, options, out, status);
-        if (U_FAILURE(status)) {
-            dataerrln("Error running normalize UNORM_NFKD: %s", u_errorName(status));
-        } else {
-            pass &= assertEqual("KD", field[i], out, field[4], "c5!=KD(c", fieldNum);
-            iterativeNorm(field[i], UNORM_NFKD, options, out, +1);
-            pass &= assertEqual("KD(+1)", field[i], out, field[4], "c5!=KD(c", fieldNum);
-            iterativeNorm(field[i], UNORM_NFKD, options, out, -1);
-            pass &= assertEqual("KD(-1)", field[i], out, field[4], "c5!=KD(c", fieldNum);
+            pass &= checkNorm(UNORM_NFC, options, nfc, field[i], field[1], fieldNum);
+            pass &= checkNorm(UNORM_NFD, options, nfd, field[i], field[2], fieldNum);
         }
+        pass &= checkNorm(UNORM_NFKC, options, nfkc, field[i], field[3], fieldNum);
+        pass &= checkNorm(UNORM_NFKD, options, nfkd, field[i], field[4], fieldNum);
     }
     compare(field[1],field[2]);
     compare(field[0],field[1]);
@@ -444,6 +404,66 @@ UBool NormalizerConformanceTest::checkConformance(const UnicodeString* field,
     return pass;
 }
 
+static const char *const kModeStrings[UNORM_MODE_COUNT] = {  // Indexed by UNormalizationMode: 0 is unused, UNORM_NONE=1, UNORM_NFD=2, ...
+    "?", "none", "D", "KD", "C", "KC", "FCD"
+};
+
+static const char *const kMessages[UNORM_MODE_COUNT] = {  // Same indexing; snprintf() formats where %d is the field number.
+    "?!=?", "?!=?", "c3!=D(c%d)", "c5!=KD(c%d)", "c2!=C(c%d)", "c4!=KC(c%d)", "FCD"
+};
+
+UBool NormalizerConformanceTest::checkNorm(UNormalizationMode mode, int32_t options,
+                                           const Normalizer2 *norm2,
+                                           const UnicodeString &s, const UnicodeString &exp,
+                                           int32_t field) {  // Returns TRUE if all checks pass; stops at the first failure.
+    const char *modeString = kModeStrings[mode];
+    char msg[20];
+    snprintf(msg, sizeof(msg), kMessages[mode], field);  // Failure label for this mode and field number.
+    UnicodeString out;
+    UErrorCode errorCode = U_ZERO_ERROR;
+    Normalizer::normalize(s, mode, options, out, errorCode);  // Check 1: one-shot Normalizer API.
+    if (U_FAILURE(errorCode)) {
+        dataerrln("Error running normalize UNORM_NF%s: %s", modeString, u_errorName(errorCode));
+        return FALSE;
+    }
+    if (!assertEqual(modeString, "", s, out, exp, msg)) {
+        return FALSE;
+    }
+
+    iterativeNorm(s, mode, options, out, +1);  // Check 2: iterative API, direction +1.
+    if (!assertEqual(modeString, "(+1)", s, out, exp, msg)) {
+        return FALSE;
+    }
+
+    iterativeNorm(s, mode, options, out, -1);  // Check 3: iterative API, direction -1.
+    if (!assertEqual(modeString, "(-1)", s, out, exp, msg)) {
+        return FALSE;
+    }
+
+    if (norm2 == nullptr || options != 0) {  // The UTF-8 check needs a Normalizer2 instance and runs only for options==0.
+        return TRUE;
+    }
+
+    std::string s8;  // Check 4: normalizeUTF8() on the UTF-8 form of s must yield the UTF-8 form of exp.
+    s.toUTF8String(s8);
+    std::string exp8;
+    exp.toUTF8String(exp8);
+    std::string out8;
+    StringByteSink<std::string> sink(&out8);
+    norm2->normalizeUTF8(0, s8, sink, nullptr, errorCode);  // No Edits are requested (nullptr).
+    if (U_FAILURE(errorCode)) {
+        errln("Normalizer2.%s.normalizeUTF8(%s) failed: %s",
+              modeString, s8.c_str(), u_errorName(errorCode));
+        return FALSE;
+    }
+    if (out8 != exp8) {
+        errln("Normalizer2.%s.normalizeUTF8(%s)=%s != %s",
+              modeString, s8.c_str(), out8.c_str(), exp8.c_str());
+        return FALSE;
+    }
+    return TRUE;
+}
+
 /**
  * Do a normalization using the iterative API in the given direction.
  * @param dir either +1 or -1
@@ -475,21 +495,11 @@ void NormalizerConformanceTest::iterativeNorm(const UnicodeString& str,
     }
 }
 
-/**
- * @param op name of normalization form, e.g., "KC"
- * @param s string being normalized
- * @param got value received
- * @param exp expected value
- * @param msg description of this test
- * @param return true if got == exp
- */
-UBool NormalizerConformanceTest::assertEqual(const char *op,
+UBool NormalizerConformanceTest::assertEqual(const char *op, const char *op2,
                                              const UnicodeString& s,
                                              const UnicodeString& got,
                                              const UnicodeString& exp,
-                                             const char *msg,
-                                             int32_t field)
-{
+                                             const char *msg) {
     if (exp == got)
         return TRUE;
 
@@ -509,7 +519,7 @@ UBool NormalizerConformanceTest::assertEqual(const char *op,
     expPretty.extract(0, expPretty.length(), expChars, expPretty.length() + 1);
     expChars[expPretty.length()] = 0;
 
-    errln("    %s%d)%s(%s)=%s, exp. %s", msg, field, op, sChars, gotChars, expChars);
+    errln("    %s: %s%s(%s)=%s, exp. %s", msg, op, op2, sChars, gotChars, expChars);
 
     delete []sChars;
     delete []gotChars;
index aa37711bae4a298c8533eb3703556a9c1faebfae..9e5fea762c5e6bb665bbc9b5149d70e669ea5965 100644 (file)
@@ -14,6 +14,7 @@
 
 #if !UCONFIG_NO_NORMALIZATION
 
+#include "unicode/normalizer2.h"
 #include "unicode/normlzr.h"
 #include "intltest.h"
 
@@ -21,6 +22,7 @@ typedef struct _FileStream FileStream;
 
 class NormalizerConformanceTest : public IntlTest {
     Normalizer normalizer;
+    const Normalizer2 *nfc, *nfd, *nfkc, *nfkd;
 
  public:
     NormalizerConformanceTest();
@@ -63,6 +65,11 @@ class NormalizerConformanceTest : public IntlTest {
                            int32_t options,
                            UErrorCode &status);
 
+    UBool checkNorm(UNormalizationMode mode, int32_t options,
+                    const Normalizer2 *norm2,
+                    const UnicodeString &s, const UnicodeString &exp,
+                    int32_t field);
+
     void iterativeNorm(const UnicodeString& str,
                        UNormalizationMode mode, int32_t options,
                        UnicodeString& result,
@@ -70,18 +77,18 @@ class NormalizerConformanceTest : public IntlTest {
 
     /**
      * @param op name of normalization form, e.g., "KC"
+     * @param op2 name of test case variant, e.g., "(-1)"
      * @param s string being normalized
      * @param got value received
      * @param exp expected value
      * @param msg description of this test
      * @param return true if got == exp
      */
-    UBool assertEqual(const char *op,
+    UBool assertEqual(const char *op, const char *op2,
                       const UnicodeString& s,
                       const UnicodeString& got,
                       const UnicodeString& exp,
-                      const char *msg,
-                      int32_t field);
+                      const char *msg);
 
     /**
      * Split a string into pieces based on the given delimiter