From: Markus Scherer Date: Fri, 3 May 2013 22:46:57 +0000 (+0000) Subject: ICU-10081 stable sort via insertion sort with binary search; test it well X-Git-Tag: milestone-59-0-1~2918 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=fe3b2a4118f9bf99c1fd0f3ad61a0b7c3ef9d8d1;p=icu ICU-10081 stable sort via insertion sort with binary search; test it well X-SVN-Rev: 33588 --- diff --git a/icu4c/source/common/uarrsort.c b/icu4c/source/common/uarrsort.c index 8bc967ce161..31ec0c121e5 100644 --- a/icu4c/source/common/uarrsort.c +++ b/icu4c/source/common/uarrsort.c @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 2003, International Business Machines +* Copyright (C) 2003-2013, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* @@ -51,29 +51,77 @@ uprv_uint32Comparator(const void *context, const void *left, const void *right) } } -/* Straight insertion sort from Knuth vol. III, pg. 81 ---------------------- */ +/* Insertion sort using binary search --------------------------------------- */ -static void -doInsertionSort(char *array, int32_t start, int32_t limit, int32_t itemSize, - UComparator *cmp, const void *context, void *pv) { - int32_t i, j; +/* TODO: Make this binary search function more generally available in ICU. */ +/** + * Much like Java Collections.binarySearch(List, Element, Comparator). + * + * @return the index>=0 where the item was found: + * the largest such index, if multiple, for stable sorting; + * or the index<0 for inserting the item at ~index in sorted order + */ +static int32_t +binarySearch(char *array, int32_t limit, int32_t itemSize, void *item, + UComparator *cmp, const void *context) { + int32_t start=0; + UBool found=FALSE; + + /* Binary search until we get down to a tiny sub-array. */ + while((limit-start)>8) { + int32_t i=(start+limit)/2; + int32_t diff=cmp(context, item, array+i*itemSize); + if(diff==0) { + /* + * Found the item. We look for the *last* occurrence of such + * an item, for stable sorting. + * If we knew that there will be only few equal items, + * we could break now and enter the linear search. + * However, if there are many equal items, then it should be + * faster to continue with the binary search. + * It seems likely that we either have all unique items + * or potentially many duplicates. + */ + found=TRUE; + start=i+1; + } else if(diff<0) { + limit=i; + } else { + start=i; + } + } - for(j=start+1; jstart; --i) { - if(/* v>=array[i-1] */ cmp(context, pv, array+(i-1)*itemSize)>=0) { - break; - } +static void +doInsertionSort(char *array, int32_t length, int32_t itemSize, + UComparator *cmp, const void *context, void *pv) { + int32_t j; - /* array[i]=array[i-1]; */ - uprv_memcpy(array+i*itemSize, array+(i-1)*itemSize, itemSize); + for(j=1; j=limit) { - doInsertionSort(array, start, limit, itemSize, cmp, context, px); + doInsertionSort(array+start*itemSize, limit-start, itemSize, cmp, context, px); break; } @@ -229,7 +277,6 @@ uprv_sortArray(void *array, int32_t length, int32_t itemSize, return; } else if(length + #include "unicode/utypes.h" +#include "unicode/ucol.h" +#include "unicode/ustring.h" #include "cmemory.h" #include "cintltst.h" #include "uarrsort.h" @@ -24,7 +28,7 @@ #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) static void -SortTest(void) { +SortTest() { uint16_t small[]={ 8, 1, 2, 5, 4, 3, 7, 6 }; int32_t medium[]={ 10, 8, 1, 2, 5, 5, -1, 6, 4, 3, 9, 7, 5 }; uint32_t large[]={ 21, 10, 20, 19, 11, 12, 13, 10, 10, 10, 10, @@ -80,10 +84,131 @@ SortTest(void) { } } +#if !UCONFIG_NO_COLLATION + +/* + * Fill an array with semi-random short strings. + * Vary them enough to be interesting, but create duplicates. + * With CYCLE=10 characters per STR_LEN=3 string positions there are only 1000 unique strings. + * NUM_LINES should be larger than this. + */ +#define NUM_LINES 10000 +#define STR_LEN 3 +#define CYCLE 10 + +/* + * Use characters beyond the Latin Extended A block to avoid a collator fastpath. + * They should sort unique, so that we can later use a binary comparison for string equality. + */ +#define BASE_CHAR 0x200 + +typedef struct Line { + UChar s[STR_LEN]; + int32_t recordNumber; +} Line; + +static void +printLines(const Line *lines) { +#if 0 + int32_t i, j; + for(i=0; is[j]); + } + printf(" #%5d\n", line->recordNumber); + } +#endif +} + +/* Use a collator so that the comparisons are not essentially free, for simple benchmarking. */ +static int32_t U_EXPORT2 +linesComparator(const void *context, const void *left, const void *right) { + const UCollator *coll=(const UCollator *)context; + const Line *leftLine=(const Line *)left; + const Line *rightLine=(const Line *)right; + /* compare the strings but not the record number */ + return ucol_strcoll(coll, leftLine->s, STR_LEN, rightLine->s, STR_LEN); +} + +static void StableSortTest() { + UErrorCode errorCode=U_ZERO_ERROR; + UCollator *coll; + Line *lines, *p; + UChar s[STR_LEN]; + int32_t i, j; + + coll=ucol_open("root", &errorCode); + if(U_FAILURE(errorCode)) { + log_data_err("ucol_open(root) failed - %s\n", u_errorName(errorCode)); + return; + } + + lines=p=(Line *)uprv_malloc(NUM_LINES*sizeof(Line)); + uprv_memset(lines, 0, NUM_LINES*sizeof(Line)); /* avoid uninitialized memory */ + + for(j=0; js, s, STR_LEN); + p->recordNumber=i; + /* Modify the string for the next line. */ + c=s[j]+1; + if(c==BASE_CHAR+CYCLE) { c=BASE_CHAR; } + s[j]=c; + if(++j==STR_LEN) { j=0; } + ++p; + } + puts("\n* lines before sorting"); + printLines(lines); + + uprv_sortArray(lines, NUM_LINES, (int32_t)sizeof(Line), + linesComparator, coll, TRUE, &errorCode); + if(U_FAILURE(errorCode)) { + log_err("uprv_sortArray() failed - %s\n", u_errorName(errorCode)); + return; + } + puts("* lines after sorting"); + printLines(lines); + + /* Verify that the array is sorted correctly. */ + p=lines; + for(i=1; is, STR_LEN, q->s, STR_LEN, FALSE); + if(diff==0) { + if(p->recordNumber>=q->recordNumber) { + log_err("equal strings %d and %d out of order at sorted index %d\n", + (int)p->recordNumber, (int)q->recordNumber, (int)i); + break; + } + } else { + /* Compare unequal strings with the collator. */ + diff=ucol_strcoll(coll, p->s, STR_LEN, q->s, STR_LEN); + if(diff>=0) { + log_err("unequal strings %d and %d out of order at sorted index %d\n", + (int)p->recordNumber, (int)q->recordNumber, (int)i); + break; + } + } + p=q; + } + + uprv_free(lines); + ucol_close(coll); +} + +#endif /* !UCONFIG_NO_COLLATION */ + void addSortTest(TestNode** root); void addSortTest(TestNode** root) { addTest(root, &SortTest, "tsutil/sorttest/SortTest"); +#if !UCONFIG_NO_COLLATION + addTest(root, &StableSortTest, "tsutil/sorttest/StableSortTest"); +#endif }