]> granicus.if.org Git - icu/commitdiff
ICU-9567 Merged ucol_strcollUTF8 to trunk.
author Yoshito Umaoka <y.umaoka@gmail.com>
Fri, 5 Oct 2012 23:44:00 +0000 (23:44 +0000)
committer Yoshito Umaoka <y.umaoka@gmail.com>
Fri, 5 Oct 2012 23:44:00 +0000 (23:44 +0000)
X-SVN-Rev: 32534

icu4c/source/common/unicode/utrace.h
icu4c/source/i18n/ucol.cpp
icu4c/source/i18n/unicode/ucol.h
icu4c/source/test/cintltst/callcoll.c
icu4c/source/test/cintltst/capitst.c

index 3c8be9f7c3452f3de662be2ee3a377bfa2582118..82b0d85e6c4a2b079b8603d2acc9737fae6421f3 100644 (file)
@@ -1,7 +1,7 @@
 /*
 *******************************************************************************
 *
-*   Copyright (C) 2003-2006, International Business Machines
+*   Copyright (C) 2003-2012, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 *******************************************************************************
@@ -86,7 +86,8 @@ typedef enum UTraceFunctionNumber {
     UTRACE_UCOL_NEXTSORTKEYPART,
     UTRACE_UCOL_STRCOLLITER,
     UTRACE_UCOL_OPEN_FROM_SHORT_STRING,
-    UTRACE_COLLATION_LIMIT
+    UTRACE_COLLATION_LIMIT,
+    UTRACE_UCOL_STRCOLLUTF8
 } UTraceFunctionNumber;
 
 /**
index 9926135a283d7d60cd17dab10ffa296cb4a546e5..ba1d5d4988b26a8459bfae62d19a72baaf01108f 100644 (file)
@@ -25,6 +25,7 @@
 #include "unicode/unorm.h"
 #include "unicode/udata.h"
 #include "unicode/ustring.h"
+#include "unicode/utf8.h"
 
 #include "ucol_imp.h"
 #include "bocsu.h"
@@ -53,10 +54,11 @@ U_NAMESPACE_USE
 
 #define ZERO_CC_LIMIT_            0xC0
 
-// This is static pointer to the NFC implementation instance.
-// it is always the same between calls to u_cleanup
+// These are static pointers to the NFC/NFD implementation instance.
+// Each of them is always the same between calls to u_cleanup
 // and therefore writing to it is not synchronized.
-// It is cleaned in ucol_cleanup
+// They are cleaned in ucol_cleanup
+static const Normalizer2 *g_nfd = NULL;
 static const Normalizer2Impl *g_nfcImpl = NULL;
 
 // These are values from UCA required for
@@ -71,6 +73,7 @@ U_CDECL_BEGIN
 static UBool U_CALLCONV
 ucol_cleanup(void)
 {
+    g_nfd = NULL;
     g_nfcImpl = NULL;
     return TRUE;
 }
@@ -82,6 +85,18 @@ _getFoldingOffset(uint32_t data) {
 
 U_CDECL_END
 
+// Lazily caches the NFD Normalizer2 instance in g_nfd.
+// Returns TRUE if g_nfd was already set; otherwise returns whether *status
+// indicates success after asking Normalizer2Factory for the instance.
+static inline
+UBool initializeNFD(UErrorCode *status) {
+    if (g_nfd == NULL) {
+        // First use. The result is constant until the library is reloaded,
+        // so fetch it once and register ucol_cleanup, which resets g_nfd.
+        g_nfd = Normalizer2Factory::getNFDInstance(*status);
+        ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
+        return U_SUCCESS(*status);
+    }
+    return TRUE;
+}
+
 // init FCD data
 static inline
 UBool initializeFCD(UErrorCode *status) {
@@ -121,7 +136,11 @@ inline void IInit_collIterate(const UCollator *collator, const UChar *sourceStri
     (s)->offsetReturn = (s)->offsetStore = NULL;
     (s)->offsetRepeatCount = (s)->offsetRepeatValue = 0;
     (s)->coll = (collator);
-    (s)->nfd = Normalizer2Factory::getNFDInstance(*status);
+    if (initializeNFD(status)) {
+        (s)->nfd = g_nfd;
+    } else {
+        return;
+    }
     (s)->fcdPosition = 0;
     if(collator->normalizationMode == UCOL_ON) {
         (s)->flags |= UCOL_ITER_NORM;
@@ -8035,6 +8054,573 @@ endOfSecLoop:
     return UCOL_EQUAL;
 }
 
+/*
+  Slightly modified version of the U8_NEXT macro defined in utf8.h. U8_NEXT requires
+  the length of the UTF-8 string. This version assumes that the UTF-8 string is null
+  terminated and does not require the length as input.
+
+  Note: ucol_strcollUTF8 supports null terminated input. Calculating the length of
+  a null terminated input string would take an extra amount of CPU cycles.
+
+  The tables and constants that follow are used by utf8_nextCharSafeBodyNullTerm().
+*/
+/*
+  Smallest code point that may legally be encoded with a given number of trail
+  bytes (index = trail byte count). Only indexes 0..3 exist; sequences with 4 or
+  5 trail bytes are flagged illegal before this table is consulted.
+*/
+static const UChar32
+utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
+
+/* Error code points returned (when strict>=0) for ill-formed sequences. */
+#define UTF8_ERROR_VALUE_1 0x15
+#define UTF8_ERROR_VALUE_2 0x9f
+#define UTF_ERROR_VALUE 0xffff
+
+/*
+  Error values selected by a small count: either the number of trail bytes that
+  were skipped for an ill-formed sequence, or the trail byte count of a
+  noncharacter rejected in strict mode.
+*/
+static const UChar32
+utf8_errorValue[6]={
+    UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE, 0x10ffff,
+    0x3ffffff, 0x7fffffff
+};
+
+/*
+ * Decodes the rest of a UTF-8 sequence in a NUL-terminated string, after the
+ * caller has already read the first byte.
+ *
+ * @param s      The UTF-8 bytes; reading stops at a NUL byte.
+ * @param pi     In: index of the byte following the first byte.
+ *               Out: index after the bytes that were consumed.
+ * @param c      The first byte of the sequence. If it is 0, nothing is
+ *               consumed and 0 is returned.
+ * @param strict <0: return U_SENTINEL for ill-formed input (-2 additionally
+ *               lets surrogate code points through);
+ *               >=0: return a value from utf8_errorValue instead;
+ *               >0: also reject Unicode noncharacters.
+ * @return The decoded code point, or the error value described above.
+ */
+static
+UChar32 utf8_nextCharSafeBodyNullTerm(const uint8_t *s, int32_t *pi, UChar32 c, UBool strict) {
+    int32_t i=*pi;
+    uint8_t count=U8_COUNT_TRAIL_BYTES(c);
+    U_ASSERT(count <= 5); /* U8_COUNT_TRAIL_BYTES returns value 0...5 */
+
+    if (c) {
+        uint8_t trail, illegal=0;
+
+        U8_MASK_LEAD_BYTE((c), count);
+        /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
+        switch(count) {
+        /* cases 3, 2 and 1 fall through to the next one unless an error breaks out */
+        case 5:
+        case 4:
+            /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
+            illegal=1;
+            break;
+        case 3:
+            trail=s[(i)];
+            if (trail==0) {
+                /* string ended in the middle of the sequence */
+                illegal=1;
+                break;
+            }
+            (c)=((c)<<6)|(trail&0x3f);
+            if(c<0x110) {
+                illegal|=(trail&0xc0)^0x80;
+            } else {
+                /* code point>0x10ffff, outside Unicode */
+                illegal=1;
+                break;
+            }
+            ++(i);
+        case 2:
+            trail=s[(i)];
+            if (trail==0) {
+                /* string ended in the middle of the sequence */
+                illegal=1;
+                break;
+            }
+            (c)=((c)<<6)|(trail&0x3f);
+            illegal|=(trail&0xc0)^0x80;
+            ++(i);
+        case 1:
+            trail=s[(i)];
+            if (trail==0) {
+                /* string ended in the middle of the sequence */
+                illegal=1;
+                break;
+            }
+            (c)=((c)<<6)|(trail&0x3f);
+            illegal|=(trail&0xc0)^0x80;
+            ++(i);
+            break;
+        case 0:
+            /* returns directly; *pi is left as the caller set it */
+            if(strict>=0) {
+                return UTF8_ERROR_VALUE_1;
+            } else {
+                return U_SENTINEL;
+            }
+        /* no default branch to optimize switch()  - all values are covered */
+        }
+
+        /*
+         * All the error handling should return a value
+         * that needs count bytes so that UTF8_GET_CHAR_SAFE() works right.
+         *
+         * Starting with Unicode 3.0.1, non-shortest forms are illegal.
+         * Starting with Unicode 3.2, surrogate code points must not be
+         * encoded in UTF-8, and there are no irregular sequences any more.
+         *
+         * U8_ macros (new in ICU 2.4) return negative values for error conditions.
+         */
+
+        /* correct sequence - all trail bytes have (b7..b6)==(10)? */
+        /* illegal is also set if count>=4, so utf8_minLegal[count] is only read for count<=3 */
+        if(illegal || (c)<utf8_minLegal[count] || (U_IS_SURROGATE(c) && strict!=-2)) {
+            /* error handling */
+            uint8_t errorCount=count;
+            /* don't go beyond this sequence: restart after the first byte and skip at most count trail bytes */
+            i=*pi;
+            while(count>0 && U8_IS_TRAIL(s[i])) {
+                ++(i);
+                --count;
+            }
+            if(strict>=0) {
+                c=utf8_errorValue[errorCount-count];
+            } else {
+                c=U_SENTINEL;
+            }
+        } else if((strict)>0 && U_IS_UNICODE_NONCHAR(c)) {
+            /* strict: forbid non-characters like U+fffe */
+            c=utf8_errorValue[count];
+        }
+    }
+    *pi=i;
+    return c;
+}
+
+/*
+ * U8_NEXT_NULLTERM(s, i, c): read the code point at byte index i of the
+ * NUL-terminated UTF-8 string s into c and advance i past it.
+ *
+ * - At the terminating NUL, c is set to 0 and i is NOT advanced.
+ * - For ill-formed input, c is set to U_SENTINEL (the helper function is
+ *   called with strict=-1).
+ * - Two- and three-byte sequences are decoded inline where possible; all
+ *   other lead bytes go through utf8_nextCharSafeBodyNullTerm().
+ *
+ * NOTE(review): the "__t1 != 0" / "__t2 != 0" tests reject a trail byte of
+ * exactly 0x80 from the inline branches, so such sequences appear to take the
+ * function-call path instead; a NUL byte already fails the "<=0x3f" test
+ * because it becomes 0x80 after the subtraction. Confirm whether that is intended.
+ */
+#define U8_NEXT_NULLTERM(s, i, c) { \
+    (c)=(uint8_t)(s)[(i)]; \
+    if((c)>=0x80) { \
+        uint8_t __t1, __t2; \
+        if( /* handle U+1000..U+CFFF inline */ \
+            (0xe0<(c) && (c)<=0xec) && \
+            (__t1=(uint8_t)((s)[(i)+1]-0x80))<=0x3f && __t1 != 0 && \
+            (__t2=(uint8_t)((s)[(i)+2]-0x80))<= 0x3f && __t2 != 0 \
+        ) { \
+            /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
+            (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \
+            (i)+=3; \
+        } else if( /* handle U+0080..U+07FF inline */ \
+            ((c)<0xe0 && (c)>=0xc2) && \
+            (__t1=(uint8_t)((s)[(i)+1]-0x80))<=0x3f && __t1 != 0 \
+        ) { \
+            (c)=(UChar)((((c)&0x1f)<<6)|__t1); \
+            (i)+=2; \
+        } else if(U8_IS_LEAD(c)) { \
+            /* function call for "complicated" and error cases */ \
+            ++(i); \
+            (c)=utf8_nextCharSafeBodyNullTerm((const uint8_t *)s, &(i), c, -1); \
+        } else { \
+            (c)=U_SENTINEL; \
+            ++(i); \
+        } \
+    } else { \
+        if ((c)) { \
+            ++(i); \
+        } \
+    } \
+}
+
+/*
+ * U8_GET_NULLTERM(s, start, i, c): read into c the code point that contains
+ * byte index i of the NUL-terminated UTF-8 string s, without modifying i.
+ * A private copy of i is first moved back to the start of the code point
+ * (not before index start) and then decoded with U8_NEXT_NULLTERM.
+ */
+#define U8_GET_NULLTERM(s, start, i, c) { \
+    int32_t _u8_get_index=(int32_t)(i); \
+    U8_SET_CP_START(s, start, _u8_get_index); \
+    U8_NEXT_NULLTERM(s, _u8_get_index, c); \
+}
+
+
+static UCollationResult
+ucol_strcollRegularUTF8(
+                    const UCollator *coll,
+                    const char      *source,
+                    int32_t         sourceLength,
+                    const char      *target,
+                    int32_t         targetLength,
+                    UErrorCode      *status)
+{
+    UCharIterator src;
+    UCharIterator tgt;
+
+    uiter_setUTF8(&src, source, sourceLength);
+    uiter_setUTF8(&tgt, target, targetLength);
+
+    // Preparing the context objects for iterating over strings
+    collIterate sColl, tColl;
+    IInit_collIterate(coll, NULL, -1, &sColl, status);
+    IInit_collIterate(coll, NULL, -1, &tColl, status);
+    if(U_FAILURE(*status)) {
+        UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
+        return UCOL_EQUAL;
+    }
+    // The division for the array length may truncate the array size to
+    // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
+    // for all platforms anyway.
+    UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
+    UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
+    UNormIterator *sNormIter = NULL, *tNormIter = NULL;
+
+    sColl.iterator = &src;
+    sColl.flags |= UCOL_USE_ITERATOR;
+    tColl.flags |= UCOL_USE_ITERATOR;
+    tColl.iterator = &tgt;
+
+    if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
+        sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
+        sColl.iterator = unorm_setIter(sNormIter, &src, UNORM_FCD, status);
+        sColl.flags &= ~UCOL_ITER_NORM;
+
+        tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
+        tColl.iterator = unorm_setIter(tNormIter, &tgt, UNORM_FCD, status);
+        tColl.flags &= ~UCOL_ITER_NORM;
+    }
+
+    return ucol_strcollRegular(&sColl, &tColl, status);
+}
+
+static inline uint32_t
+ucol_getLatinOneContractionUTF8(const UCollator *coll, int32_t strength,
+                          uint32_t CE, const char *s, int32_t *index, int32_t len)
+{
+    const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
+    int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;
+    int32_t offset = 1;
+    UChar32 schar = 0, tchar = 0;
+
+    for(;;) {
+        if (len == -1) {
+            U8_GET_NULLTERM((const uint8_t*)s, 0, *index, schar);
+            if (schar == 0) {
+                return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
+            }
+        } else {
+            if (*index == len) {
+                return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
+            }
+            U8_GET((const uint8_t*)s, 0, *index, len, schar);
+        }
+        if (schar == -1) {
+            schar = 0xfffd;
+        }
+
+        while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
+            offset++;
+        }
+
+        if (schar == tchar) {
+            U8_FWD_1(s, *index, len);
+            return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]);
+        }
+        else
+        {
+            if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
+                return UCOL_BAIL_OUT_CE;
+            }
+            // skip completely ignorables
+            uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
+            if(isZeroCE == 0) { // we have to ignore completely ignorables
+                U8_FWD_1(s, *index, len);
+                continue;
+            }
+
+            return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
+        }
+    }
+}
+
+/*
+ * Latin-1 fast path for comparing two UTF-8 strings.
+ *
+ * Compares the strings level by level using the collator's latinOneCEs
+ * table: a primary pass, then a secondary pass if strength >= UCOL_SECONDARY
+ * (walking backwards when frenchCollation is not UCOL_OFF), then a tertiary
+ * pass if strength >= UCOL_TERTIARY.
+ *
+ * Falls back to ucol_strcollRegularUTF8() when the primary pass meets a code
+ * point above U+00FF (ill-formed bytes are mapped to U+FFFD first), a special
+ * CE that cannot be resolved as a contraction, or when French secondary
+ * ordering is requested and a contraction was seen.
+ *
+ * @param coll   The collator.
+ * @param source Source UTF-8 bytes.
+ * @param sLen   Length of source in bytes, or -1 if NUL-terminated.
+ * @param target Target UTF-8 bytes.
+ * @param tLen   Length of target in bytes, or -1 if NUL-terminated.
+ * @param status In/out error code; only used by the fallback.
+ * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER.
+ */
+static inline UCollationResult
+ucol_strcollUseLatin1UTF8(
+                const UCollator *coll,
+                const char      *source,
+                int32_t         sLen,
+                const char      *target,
+                int32_t         tLen,
+                UErrorCode      *status)
+{
+    U_ALIGN_CODE(16);
+    int32_t strength = coll->strength;
+
+    int32_t sIndex = 0, tIndex = 0;
+    UChar32 sChar = 0, tChar = 0;
+    uint32_t sOrder=0, tOrder=0;
+
+    UBool endOfSource = FALSE;
+
+    uint32_t *elements = coll->latinOneCEs;
+
+    UBool haveContractions = FALSE; // if we have contractions in our string
+                                    // we cannot do French secondary
+
+    // Do the primary level
+    for(;;) {
+        while(sOrder==0) { // this loop skips primary ignorables
+            // sOrder=getNextlatinOneCE(source);
+            if (sLen==-1) {
+                U8_NEXT_NULLTERM(source, sIndex, sChar);
+                if (sChar == 0) {
+                    endOfSource = TRUE;
+                    // sIndex was not advanced past the NUL, so it is the byte length
+                    sLen = sIndex;
+                    break;
+                }
+            } else {
+                if (sIndex == sLen) {
+                    endOfSource = TRUE;
+                    break;
+                }
+                U8_NEXT(source, sIndex, sLen ,sChar);
+            }
+            if (sChar == -1) {
+                sChar = 0xfffd; // fallback for the bad code
+            }
+            if(sChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
+                //fprintf(stderr, "R");
+                return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
+            }
+            sOrder = elements[sChar];
+            if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
+                // specials can basically be either contractions or bail-out signs. If we get anything
+                // else, we'll bail out anyway
+                if(getCETag(sOrder) == CONTRACTION_TAG) {
+                    sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
+                    haveContractions = TRUE; // if there are contractions, we cannot do French secondary
+                    // However, if there are contractions in the table, but we always use just one char,
+                    // we might be able to do French. This should be checked out.
+                }
+                if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
+                    //fprintf(stderr, "S");
+                    return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
+                }
+            }
+        }
+
+        while(tOrder==0) {  // this loop skips primary ignorables
+            // tOrder=getNextlatinOneCE(target);
+            if (tLen == -1) {
+                U8_NEXT_NULLTERM(target, tIndex, tChar);
+                if (tChar == 0) {
+                    if(endOfSource) {
+                        tLen = tIndex;
+                        goto endOfPrimLoopU8;
+                    } else {
+                        return UCOL_GREATER;
+                    }
+                }
+            } else {
+                if (tIndex == tLen) {
+                    if(endOfSource) {
+                        goto endOfPrimLoopU8;
+                    } else {
+                        return UCOL_GREATER;
+                    }
+                }
+                U8_NEXT(target, tIndex, tLen, tChar);
+            }
+            if (tChar == -1) {
+                tChar = 0xfffd;
+            }
+            if(tChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
+                //fprintf(stderr, "R");
+                return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
+            }
+            tOrder = elements[tChar];
+            if(tOrder >= UCOL_NOT_FOUND) {
+                // Handling specials, see the comments for source
+                if(getCETag(tOrder) == CONTRACTION_TAG) {
+                    tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
+                    haveContractions = TRUE;
+                }
+                if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
+                    //fprintf(stderr, "S");
+                    return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
+                }
+            }
+        }
+        if(endOfSource) { // source is finished, but target is not, say the result.
+            return UCOL_LESS;
+        }
+
+        if(sOrder == tOrder) { // if we have same CEs, we continue the loop
+            sOrder = 0; tOrder = 0;
+            continue;
+        } else {
+            // compare current top bytes
+            if(((sOrder^tOrder)&0xFF000000)!=0) {
+                // top bytes differ, return difference
+                if(sOrder < tOrder) {
+                    return UCOL_LESS;
+                } else if(sOrder > tOrder) {
+                    return UCOL_GREATER;
+                }
+                // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
+                // since we must return enum value
+            }
+
+            // top bytes match, continue with following bytes
+            sOrder<<=8;
+            tOrder<<=8;
+        }
+    }
+
+endOfPrimLoopU8:
+    // after primary loop, we definitely know the sizes of strings,
+    // so we set it and use simpler loop for secondaries and tertiaries
+    sLen = sIndex; tLen = tIndex;
+    if(strength >= UCOL_SECONDARY) {
+        // adjust the table beginning
+        elements += coll->latinOneTableLen;
+        endOfSource = FALSE;
+
+        if(coll->frenchCollation == UCOL_OFF) { // non French
+            // This loop is a simplified copy of primary loop
+            // at this point we know that whole strings are latin-1, so we don't
+            // check for that. We also know that we only have contractions as
+            // specials.
+            sIndex = 0; tIndex = 0;
+            for(;;) {
+                while(sOrder==0) {
+                    if(sIndex==sLen) {
+                        endOfSource = TRUE;
+                        break;
+                    }
+                    U_ASSERT(sLen >= 0);
+                    U8_NEXT(source, sIndex, sLen, sChar);
+                    U_ASSERT(sChar >= 0 && sChar <= 0xFF);
+                    sOrder = elements[sChar];
+                    if(sOrder > UCOL_NOT_FOUND) {
+                        sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
+                    }
+                }
+
+                while(tOrder==0) {
+                    if(tIndex==tLen) {
+                        if(endOfSource) {
+                            goto endOfSecLoopU8;
+                        } else {
+                            return UCOL_GREATER;
+                        }
+                    }
+                    U_ASSERT(tLen >= 0);
+                    U8_NEXT(target, tIndex, tLen, tChar);
+                    U_ASSERT(tChar >= 0 && tChar <= 0xFF);
+                    tOrder = elements[tChar];
+                    if(tOrder > UCOL_NOT_FOUND) {
+                        tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
+                    }
+                }
+                if(endOfSource) {
+                    return UCOL_LESS;
+                }
+
+                if(sOrder == tOrder) {
+                    sOrder = 0; tOrder = 0;
+                    continue;
+                } else {
+                    // see primary loop for comments on this
+                    if(((sOrder^tOrder)&0xFF000000)!=0) {
+                        if(sOrder < tOrder) {
+                            return UCOL_LESS;
+                        } else if(sOrder > tOrder) {
+                            return UCOL_GREATER;
+                        }
+                    }
+                    sOrder<<=8;
+                    tOrder<<=8;
+                }
+            }
+        } else { // French
+            if(haveContractions) { // if we have contractions, we have to bail out
+                // since we don't really know how to handle them here
+                return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
+            }
+            // For French, we go backwards
+            sIndex = sLen; tIndex = tLen;
+            for(;;) {
+                while(sOrder==0) {
+                    if(sIndex==0) {
+                        endOfSource = TRUE;
+                        break;
+                    }
+                    U8_PREV(source, 0, sIndex, sChar);
+                    U_ASSERT(sChar >= 0 && sChar <= 0xFF);
+                    sOrder = elements[sChar];
+                    // don't even look for contractions
+                }
+
+                while(tOrder==0) {
+                    if(tIndex==0) {
+                        if(endOfSource) {
+                            goto endOfSecLoopU8;
+                        } else {
+                            return UCOL_GREATER;
+                        }
+                    }
+                    U8_PREV(target, 0, tIndex, tChar);
+                    U_ASSERT(tChar >= 0 && tChar <= 0xFF);
+                    tOrder = elements[tChar];
+                    // don't even look for contractions
+                }
+                if(endOfSource) {
+                    return UCOL_LESS;
+                }
+
+                if(sOrder == tOrder) {
+                    sOrder = 0; tOrder = 0;
+                    continue;
+                } else {
+                    // see the primary loop for comments
+                    if(((sOrder^tOrder)&0xFF000000)!=0) {
+                        if(sOrder < tOrder) {
+                            return UCOL_LESS;
+                        } else if(sOrder > tOrder) {
+                            return UCOL_GREATER;
+                        }
+                    }
+                    sOrder<<=8;
+                    tOrder<<=8;
+                }
+            }
+        }
+    }
+
+endOfSecLoopU8:
+    if(strength >= UCOL_TERTIARY) {
+        // tertiary loop is the same as secondary (except no French)
+        elements += coll->latinOneTableLen;
+        sIndex = 0; tIndex = 0;
+        endOfSource = FALSE;
+        for(;;) {
+            while(sOrder==0) {
+                if(sIndex==sLen) {
+                    endOfSource = TRUE;
+                    break;
+                }
+                U_ASSERT(sLen >= 0);
+                U8_NEXT(source, sIndex, sLen, sChar);
+                U_ASSERT(sChar >= 0 && sChar <= 0xFF);
+                sOrder = elements[sChar];
+                if(sOrder > UCOL_NOT_FOUND) {
+                    sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
+                }
+            }
+            while(tOrder==0) {
+                if(tIndex==tLen) {
+                    if(endOfSource) {
+                        return UCOL_EQUAL; // if both strings are at the end, they are equal
+                    } else {
+                        return UCOL_GREATER;
+                    }
+                }
+                U_ASSERT(tLen >= 0);
+                U8_NEXT(target, tIndex, tLen, tChar);
+                U_ASSERT(tChar >= 0 && tChar <= 0xFF);
+                tOrder = elements[tChar];
+                if(tOrder > UCOL_NOT_FOUND) {
+                    tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
+                }
+            }
+            if(endOfSource) {
+                return UCOL_LESS;
+            }
+            if(sOrder == tOrder) {
+                sOrder = 0; tOrder = 0;
+                continue;
+            } else {
+                if(((sOrder^tOrder)&0xff000000)!=0) {
+                    if(sOrder < tOrder) {
+                        return UCOL_LESS;
+                    } else if(sOrder > tOrder) {
+                        return UCOL_GREATER;
+                    }
+                }
+                sOrder<<=8;
+                tOrder<<=8;
+            }
+        }
+    }
+    return UCOL_EQUAL;
+}
 
 U_CAPI UCollationResult U_EXPORT2
 ucol_strcollIter( const UCollator    *coll,
@@ -8272,6 +8858,194 @@ ucol_strcoll( const UCollator    *coll,
     return returnVal;
 }
 
+U_DRAFT UCollationResult U_EXPORT2
+ucol_strcollUTF8(
+        const UCollator *coll,
+        const char      *source,
+        int32_t         sourceLength,
+        const char      *target,
+        int32_t         targetLength,
+        UErrorCode      *status)
+{
+    U_ALIGN_CODE(16);
+
+    UTRACE_ENTRY(UTRACE_UCOL_STRCOLLUTF8);
+    if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
+        UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
+        UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vb ", source, sourceLength);
+        UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vb ", target, targetLength);
+    }
+
+    if(source == NULL || target == NULL) {
+        // do not crash, but return. Should have
+        // status argument to return error.
+        UTRACE_EXIT_VALUE(UCOL_EQUAL);
+        return UCOL_EQUAL;
+    }
+
+    /* Quick check if source and target are same strings. */
+    /* They should either both be NULL terminated or the explicit length should be set on both. */
+    if (source==target && sourceLength==targetLength) {
+        UTRACE_EXIT_VALUE(UCOL_EQUAL);
+        return UCOL_EQUAL;
+    }
+
+    // TODO - provider support
+/*
+    if(coll->delegate != NULL) {
+        UErrorCode status = U_ZERO_ERROR;
+        return ((const Collator*)coll->delegate)->compare(source,sourceLength,target,targetLength, status);
+    }
+*/
+
+    /* Scan the strings.  Find:                                                             */
+    /*    The length of any leading portion that is equal                                   */
+    /*    Whether they are exactly equal.  (in which case we just return)                   */
+    const char  *pSrc = source;
+    const char  *pTarg = target;
+    UBool       bSrcLimit = FALSE;
+    UBool       bTargLimit = FALSE;
+
+    if (sourceLength == -1 && targetLength == -1) {
+        // Both strings are null terminated.
+        //    Scan through any leading equal portion.
+        while (*pSrc == *pTarg && *pSrc != 0) {
+            pSrc++;
+            pTarg++;
+        }
+        if (*pSrc == 0 && *pTarg == 0) {
+            UTRACE_EXIT_VALUE(UCOL_EQUAL);
+            return UCOL_EQUAL;
+        }
+        bSrcLimit = (*pSrc == 0);
+        bTargLimit = (*pTarg == 0);
+    }
+    else
+    {
+        // One or both strings has an explicit length.
+        const char *pSrcEnd = source + sourceLength;
+        const char *pTargEnd = target + targetLength;
+
+        // Scan while the strings are bitwise ==, or until one is exhausted.
+        for (;;) {
+            if (pSrc == pSrcEnd || pTarg == pTargEnd) {
+                break;
+            }
+            if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
+                break;
+            }
+            if (*pSrc != *pTarg) {
+                break;
+            }
+            pSrc++;
+            pTarg++;
+        }
+        bSrcLimit = (pSrc ==pSrcEnd  || (pSrcEnd <pSrc  && *pSrc==0));
+        bTargLimit = (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0));
+
+        // If we made it all the way through both strings, we are done.  They are ==
+        if (bSrcLimit &&    /* At end of src string, however it was specified. */
+            bTargLimit)     /* and also at end of dest string                  */
+        {
+            UTRACE_EXIT_VALUE(UCOL_EQUAL);
+            return UCOL_EQUAL;
+        }
+    }
+
+    U_ASSERT(!(bSrcLimit && bTargLimit));
+
+    int32_t    equalLength = pSrc - source;
+    UBool       bSawNonLatin1 = FALSE;
+
+    if (equalLength > 0) {
+        // Align position to the start of UTF-8 code point.
+        if (bTargLimit) {
+            U8_SET_CP_START((const uint8_t*)source, 0, equalLength);
+        } else {
+            U8_SET_CP_START((const uint8_t*)target, 0, equalLength);
+        }
+        pSrc = source + equalLength;
+        pTarg = target + equalLength;
+    }
+
+    if (equalLength > 0) {
+        /* There is an identical portion at the beginning of the two strings.        */
+        /*   If the identical portion ends within a contraction or a comibining      */
+        /*   character sequence, back up to the start of that sequence.              */
+        UBool bUnsafeCP = FALSE;
+        UChar32 uc32 = -1;
+
+        if (!bSrcLimit) {
+            if (sourceLength >= 0) {
+                U8_GET((uint8_t*)source, 0, equalLength, sourceLength, uc32);
+            } else {
+                U8_GET_NULLTERM((uint8_t*)source, 0, equalLength, uc32);
+            }
+            if (uc32 == -1) {
+                uc32 = 0xfffd;
+                bSawNonLatin1 |= TRUE;
+            } else {
+                if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) {
+                    bUnsafeCP = TRUE;
+                }
+                bSawNonLatin1 |= (uc32 > 0xff);
+            }
+        }
+        if (!bTargLimit) {
+            if (targetLength >= 0) {
+                U8_GET((uint8_t*)target, 0, equalLength, targetLength, uc32);
+            } else {
+                U8_GET_NULLTERM((uint8_t*)target, 0, equalLength, uc32);
+            }
+            if (uc32 == -1) {
+                uc32 = 0xfffd;
+                bSawNonLatin1 |= TRUE;
+            } else {
+                if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) {
+                    bUnsafeCP = TRUE;
+                }
+                bSawNonLatin1 |= (uc32 > 0xff);
+            }
+        }
+
+        if (bUnsafeCP) {
+            while (equalLength > 0) {
+                // We are stopped in the middle of a contraction.
+                // Scan backwards through the == part of the string looking for the start of the contraction.
+                //   It doesn't matter which string we scan, since they are the same in this region.
+                U8_PREV((uint8_t*)source, 0, equalLength, uc32);
+                bSawNonLatin1 |= (uc32 > 0xff);
+                if (uc32 < 0x10000 && !ucol_unsafeCP((UChar)uc32, coll)) {
+                    break;
+                }
+            }
+        }
+        source += equalLength;
+        target += equalLength;
+        if (sourceLength > 0) {
+            sourceLength -= equalLength;
+        }
+        if (targetLength > 0) {
+            targetLength -= equalLength;
+        }
+    } else {
+        // Lead byte of Latin 1 character is 0x00 - 0xC3
+        bSawNonLatin1 = (source && (sourceLength != 0) && (*source > -61 && *source < 0));
+        bSawNonLatin1 |= (target && (targetLength != 0) && (*target > -61 && *target < 0));
+    }
+
+    UCollationResult returnVal;
+
+    if(!coll->latinOneUse || bSawNonLatin1) {
+        returnVal = ucol_strcollRegularUTF8(coll, source, sourceLength, target, targetLength, status);
+    } else {
+        returnVal = ucol_strcollUseLatin1UTF8(coll, source, sourceLength, target, targetLength, status);
+    }
+    UTRACE_EXIT_VALUE(returnVal);
+    return returnVal;
+}
+
+
 /* convenience function for comparing strings */
 U_CAPI UBool U_EXPORT2
 ucol_greater(    const    UCollator        *coll,
index 8580cbe582357fa2c36cfea33d97201abb8e49c1..cb77cd8587b2d865f987ffee6f0ab1379c54adaf 100644 (file)
@@ -533,6 +533,33 @@ ucol_strcoll(    const    UCollator    *coll,
         const    UChar        *target,
         int32_t            targetLength);
 
+/**
+* Compare two strings in UTF-8.
+* The strings will be compared using the options already specified.
+* Note: When an input string contains a malformed UTF-8 byte sequence,
+* this function treats those bytes as REPLACEMENT CHARACTER (U+FFFD).
+* @param coll The UCollator containing the comparison rules.
+* @param source The source UTF-8 string.
+* @param sourceLength The length of source, or -1 if null-terminated.
+* @param target The target UTF-8 string.
+* @param targetLength The length of target, or -1 if null-terminated.
+* @param status A pointer to a UErrorCode to receive any errors.
+* @return The result of comparing the strings; one of UCOL_EQUAL,
+* UCOL_GREATER, UCOL_LESS
+* @see ucol_greater
+* @see ucol_greaterOrEqual
+* @see ucol_equal
+* @draft ICU 50
+*/ 
+U_DRAFT UCollationResult U_EXPORT2
+ucol_strcollUTF8(
+        const UCollator *coll,
+        const char      *source,
+        int32_t         sourceLength,
+        const char      *target,
+        int32_t         targetLength,
+        UErrorCode      *status);
+
 /**
  * Determine if one string is greater than another.
  * This function is equivalent to {@link #ucol_strcoll } == UCOL_GREATER
index b1b53964208d695b0b5a334e7637e6510c48602e..12f9ac3ef58f3ff2b35b9615f81e799b6a1386a3 100644 (file)
@@ -279,6 +279,18 @@ static void doTestVariant(UCollator* myCollation, const UChar source[], const UC
     UColAttributeValue norm = ucol_getAttribute(myCollation, UCOL_NORMALIZATION_MODE, &status);
 
     UCharIterator sIter, tIter;
+
+    compareResult  = ucol_strcoll(myCollation, source, sLen, target, tLen);
+    if (compareResult != result) {
+        log_err("ucol_strcoll with explicit length returned wrong result (%i exp. %i): %s, %s\n",
+            compareResult, result, aescstrdup(source,-1), aescstrdup(target,-1));
+    }
+    compareResulta = ucol_strcoll(myCollation, source, -1,   target, -1); 
+    if (compareResulta != result) {
+        log_err("ucol_strcoll with null terminated strings returned wrong result (%i exp. %i): %s, %s\n",
+            compareResult, result, aescstrdup(source,-1), aescstrdup(target,-1));
+    }
+
     uiter_setString(&sIter, source, sLen);
     uiter_setString(&tIter, target, tLen);
     compareResultIter = ucol_strcollIter(myCollation, &sIter, &tIter, &status);
@@ -286,42 +298,65 @@ static void doTestVariant(UCollator* myCollation, const UChar source[], const UC
         log_err("different results in iterative comparison for UTF-16 encoded strings. %s, %s\n", aescstrdup(source,-1), aescstrdup(target,-1));
     }
 
-    /* convert the strings to UTF-8 and do try comparing with char iterator */
-    if(getTestOption(QUICK_OPTION) <= 0) { /*!QUICK*/
-      char utf8Source[256], utf8Target[256];
-      int32_t utf8SourceLen = 0, utf8TargetLen = 0;
-      u_strToUTF8(utf8Source, 256, &utf8SourceLen, source, sLen, &status);
-      if(U_FAILURE(status)) { /* probably buffer is not big enough */
-        log_verbose("Src UTF-8 buffer too small! Will not compare!\n");
-      } else {
-        u_strToUTF8(utf8Target, 256, &utf8TargetLen, target, tLen, &status);
-        if(U_SUCCESS(status)) { /* probably buffer is not big enough */
-          UCollationResult compareResultUTF8 = result, compareResultUTF8Norm = result;
-          /*UCharIterator sIter, tIter;*/
-          /*log_verbose("Strings converted to UTF-8:%s, %s\n", aescstrdup(source,-1), aescstrdup(target,-1));*/
-          uiter_setUTF8(&sIter, utf8Source, utf8SourceLen);
-          uiter_setUTF8(&tIter, utf8Target, utf8TargetLen);
-       /*uiter_setString(&sIter, source, sLen);
-      uiter_setString(&tIter, target, tLen);*/
-          compareResultUTF8 = ucol_strcollIter(myCollation, &sIter, &tIter, &status);
-          ucol_setAttribute(myCollation, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
-          sIter.move(&sIter, 0, UITER_START);
-          tIter.move(&tIter, 0, UITER_START);
-          compareResultUTF8Norm = ucol_strcollIter(myCollation, &sIter, &tIter, &status);
-          ucol_setAttribute(myCollation, UCOL_NORMALIZATION_MODE, norm, &status);
-          if(compareResultUTF8 != compareResultIter) {
-            log_err("different results in iterative comparison for UTF-16 and UTF-8 encoded strings. %s, %s\n", aescstrdup(source,-1), aescstrdup(target,-1));
-          }
-          if(compareResultUTF8 != compareResultUTF8Norm) {
-            log_err("different results in iterative when normalization is turned on with UTF-8 strings. %s, %s\n", aescstrdup(source,-1), aescstrdup(target,-1));
-          }
+    /* convert the strings to UTF-8 and do try comparing with char iterator and ucol_strcollUTF8 */
+    {
+        char utf8Source[256], utf8Target[256];
+        int32_t utf8SourceLen = 0, utf8TargetLen = 0;
+
+        u_strToUTF8(utf8Source, 256, &utf8SourceLen, source, sLen, &status);
+        if(U_FAILURE(status)) { /* probably buffer is not big enough */
+            log_verbose("Src UTF-8 buffer too small! Will not compare!\n");
         } else {
-          log_verbose("Target UTF-8 buffer too small! Did not compare!\n");
-        }
-        if(U_FAILURE(status)) {
-          log_verbose("UTF-8 strcoll failed! Ignoring result\n");
+            u_strToUTF8(utf8Target, 256, &utf8TargetLen, target, tLen, &status);
+            if(U_SUCCESS(status)) {
+                {
+                    /* ucol_strcollUTF8 */
+                    compareResulta = ucol_strcollUTF8(myCollation, utf8Source, utf8SourceLen, utf8Target, utf8TargetLen, &status);
+                    if (U_FAILURE(status)) {
+                        log_err("Error in ucol_strcollUTF8 with explicit length\n");
+                        status = U_ZERO_ERROR;
+                    } else if (compareResulta != result) {
+                        log_err("ucol_strcollUTF8 with explicit length returned wrong result (%i exp. %i): %s, %s\n",
+                            compareResulta, result, aescstrdup(source,-1), aescstrdup(target,-1));
+                    }
+                    compareResulta = ucol_strcollUTF8(myCollation, utf8Source, -1, utf8Target, -1, &status);
+                    if (U_FAILURE(status)) {
+                        log_err("Error in ucol_strcollUTF8 with null terminated strings\n");
+                        status = U_ZERO_ERROR;
+                    } else if (compareResulta != result) {
+                        log_err("ucol_strcollUTF8 with null terminated strings returned wrong result (%i exp. %i): %s, %s\n",
+                            compareResulta, result, aescstrdup(source,-1), aescstrdup(target,-1));
+                    }
+                }
+
+                {
+                    /* char iterator over UTF8 */
+                    UCollationResult compareResultUTF8Iter = result, compareResultUTF8IterNorm = result;
+
+                    uiter_setUTF8(&sIter, utf8Source, utf8SourceLen);
+                    uiter_setUTF8(&tIter, utf8Target, utf8TargetLen);
+                    compareResultUTF8Iter = ucol_strcollIter(myCollation, &sIter, &tIter, &status);
+
+                    ucol_setAttribute(myCollation, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
+                    sIter.move(&sIter, 0, UITER_START);
+                    tIter.move(&tIter, 0, UITER_START);
+                    compareResultUTF8IterNorm = ucol_strcollIter(myCollation, &sIter, &tIter, &status);
+
+                    ucol_setAttribute(myCollation, UCOL_NORMALIZATION_MODE, norm, &status);
+                    if(compareResultUTF8Iter != compareResultIter) {
+                        log_err("different results in iterative comparison for UTF-16 and UTF-8 encoded strings. %s, %s\n", aescstrdup(source,-1), aescstrdup(target,-1));
+                    }
+                    if(compareResultUTF8Iter != compareResultUTF8IterNorm) {
+                        log_err("different results in iterative when normalization is turned on with UTF-8 strings. %s, %s\n", aescstrdup(source,-1), aescstrdup(target,-1));
+                    }
+                }
+            } else {
+                log_verbose("Target UTF-8 buffer too small! Did not compare!\n");
+            }
+            if(U_FAILURE(status)) {
+                log_verbose("UTF-8 strcoll failed! Ignoring result\n");
+            }
         }
-      }
     }
 
     /* testing the partial sortkeys */
@@ -358,13 +393,6 @@ static void doTestVariant(UCollator* myCollation, const UChar source[], const UC
       /*log_verbose("\n");*/
     }
 
-    
-    compareResult  = ucol_strcoll(myCollation, source, sLen, target, tLen);
-    compareResulta = ucol_strcoll(myCollation, source, -1,   target, -1); 
-    if (compareResult != compareResulta) {
-        log_err("ucol_strcoll result from null terminated and explicit length strings differs.\n");
-    }
-
     sortklen1=ucol_getSortKey(myCollation, source, sLen,  NULL, 0);
     sortklen2=ucol_getSortKey(myCollation, target, tLen,  NULL, 0);
 
index 3450ae4f22bb54360c39978e7fcb77aaf3f0f5c5..ddb7bd1e555593ac30cdb7b38426856a1d9d2736 100644 (file)
@@ -169,6 +169,42 @@ static void doAssert(int condition, const char *message)
     }
 }
 
+#define UTF8_BUF_SIZE 128 /* byte capacity of each UTF-8 scratch buffer used by doStrcoll */
+/**
+ * Test helper: checks that the UTF-16 comparison and the equivalent UTF-8
+ * comparison both produce the expected collation result.
+ *
+ * It first calls ucol_strcoll on src/tgt, then converts both strings with
+ * u_strToUTF8 and calls ucol_strcollUTF8 on the converted bytes. Every
+ * mismatch or conversion problem is reported through log_err; the function
+ * returns nothing. A failed UTF-16 check does not stop the UTF-8 check.
+ *
+ * @param coll     collator handed to both comparison functions
+ * @param src      source string (UChar)
+ * @param srcLen   length of src; when negative, -1 is forwarded to
+ *                 ucol_strcollUTF8 instead of the converted length
+ * @param tgt      target string (UChar)
+ * @param tgtLen   length of tgt; handled the same way as srcLen
+ * @param expected result that both comparisons must return
+ * @param message  text inserted into the log_err output on a mismatch
+ */
+static void doStrcoll(const UCollator* coll, const UChar* src, int32_t srcLen, const UChar* tgt, int32_t tgtLen,
+                    UCollationResult expected, const char *message) {
+    UErrorCode err = U_ZERO_ERROR;
+    char srcU8[UTF8_BUF_SIZE], tgtU8[UTF8_BUF_SIZE];
+    int32_t srcU8Len = -1, tgtU8Len = -1; /* stay -1 unless the caller supplied an explicit length */
+    int32_t len = 0; /* receives the converted length from each u_strToUTF8 call */
+    /* Step 1: compare the original UChar strings. */
+    if (ucol_strcoll(coll, src, srcLen, tgt, tgtLen) != expected) {
+        log_err("ERROR :  %s\n", message);
+    }
+    /* Step 2: convert the source. A result that fills the whole buffer is
+     * rejected as well -- presumably because it leaves no room for a
+     * terminating NUL, which the -1 length case relies on (confirm against
+     * the u_strToUTF8 documentation). */
+    u_strToUTF8(srcU8, UTF8_BUF_SIZE, &len, src, srcLen, &err);
+    if (U_FAILURE(err) || len >= UTF8_BUF_SIZE) {
+        log_err("ERROR : UTF-8 conversion error\n");
+        return;
+    }
+    if (srcLen >= 0) {
+        srcU8Len = len; /* explicit-length input: pass the explicit UTF-8 length */
+    }
+    u_strToUTF8(tgtU8, UTF8_BUF_SIZE, &len, tgt, tgtLen, &err); /* same conversion and checks for the target */
+    if (U_FAILURE(err) || len >= UTF8_BUF_SIZE) {
+        log_err("ERROR : UTF-8 conversion error\n");
+        return;
+    }
+    if (tgtLen >= 0) {
+        tgtU8Len = len;
+    }
+    /* Step 3: the UTF-8 comparison must give the same expected result and leave err without a failure code. */
+    if (ucol_strcollUTF8(coll, srcU8, srcU8Len, tgtU8, tgtU8Len, &err) != expected
+        || U_FAILURE(err)) {
+        log_err("ERROR: %s (strcollUTF8)\n", message);
+    }
+}
+
 #if 0
 /* We don't have default rules, at least not in the previous sense */
 void TestGetDefaultRules(){
@@ -233,7 +269,8 @@ void TestProperty()
     UCollator *col, *ruled;
     UChar *disName;
     int32_t len = 0;
-    UChar *source, *target;
+    UChar source[12], target[12];
+    char sourceU8[36], targetU8[36];
     int32_t tempLength;
     UErrorCode status = U_ZERO_ERROR;
     /*
@@ -279,37 +316,31 @@ void TestProperty()
               versionUCAArray[0], versionUCAArray[1], versionUCAArray[2], versionUCAArray[3]);
     }
 
-    source=(UChar*)malloc(sizeof(UChar) * 12);
-    target=(UChar*)malloc(sizeof(UChar) * 12);
-
-
     u_uastrcpy(source, "ab");
     u_uastrcpy(target, "abc");
 
-    doAssert((ucol_strcoll(col, source, u_strlen(source), target, u_strlen(target)) == UCOL_LESS), "ab < abc comparison failed");
+    doStrcoll(col, source, u_strlen(source), target, u_strlen(target), UCOL_LESS, "ab < abc comparison failed");
 
     u_uastrcpy(source, "ab");
     u_uastrcpy(target, "AB");
 
-    doAssert((ucol_strcoll(col, source, u_strlen(source), target, u_strlen(target)) == UCOL_LESS), "ab < AB comparison failed");
-/*    u_uastrcpy(source, "black-bird");
-    u_uastrcpy(target, "blackbird"); */
-    u_uastrcpy(target, "black-bird");
+    doStrcoll(col, source, u_strlen(source), target, u_strlen(target), UCOL_LESS, "ab < AB comparison failed");
+
     u_uastrcpy(source, "blackbird");
+    u_uastrcpy(target, "black-bird");
+
+    doStrcoll(col, source, u_strlen(source), target, u_strlen(target), UCOL_GREATER, "black-bird > blackbird comparison failed");
 
-    doAssert((ucol_strcoll(col, source, u_strlen(source), target, u_strlen(target)) == UCOL_GREATER),
-        "black-bird > blackbird comparison failed");
     u_uastrcpy(source, "black bird");
     u_uastrcpy(target, "black-bird");
-    doAssert((ucol_strcoll(col, source, u_strlen(source), target, u_strlen(target)) == UCOL_LESS),
-        "black bird < black-bird comparison failed");
+
+    doStrcoll(col, source, u_strlen(source), target, u_strlen(target), UCOL_LESS, "black bird < black-bird comparison failed");
+
     u_uastrcpy(source, "Hello");
     u_uastrcpy(target, "hello");
 
-    doAssert((ucol_strcoll(col, source, u_strlen(source), target, u_strlen(target)) == UCOL_GREATER),
-        "Hello > hello comparison failed");
-    free(source);
-    free(target);
+    doStrcoll(col, source, u_strlen(source), target, u_strlen(target), UCOL_GREATER, "Hello > hello comparison failed");
+
     log_verbose("Test ucol_strcoll ends.\n");
 
     log_verbose("testing ucol_getStrength() method ...\n");