From 757fb1e01f3cf21ef582c68d9b150ac53bd09e03 Mon Sep 17 00:00:00 2001 From: Andrei Zmievski Date: Thu, 21 Dec 2006 21:47:56 +0000 Subject: [PATCH] Bite the bullet and port the natural comparison algorithm to support UChar strings. Also, simplify the original code. # Argggghh, post-incremented iteration sucks. That means you, U16_* stuff. --- ext/standard/array.c | 33 +++--- ext/standard/php_string.h | 5 + ext/standard/string.c | 24 ++-- ext/standard/strnatcmp.c | 232 ++++++++++++++++++++++++++++++-------- unicode-progress.txt | 14 +-- 5 files changed, 224 insertions(+), 84 deletions(-) diff --git a/ext/standard/array.c b/ext/standard/array.c index 3029953ebb..9c2cb62535 100644 --- a/ext/standard/array.c +++ b/ext/standard/array.c @@ -394,6 +394,7 @@ static int array_natural_general_compare(const void *a, const void *b, int fold_ zval *fval, *sval; zval first, second; int result; + zend_uchar type; f = *((Bucket **) a); s = *((Bucket **) b); @@ -402,20 +403,26 @@ static int array_natural_general_compare(const void *a, const void *b, int fold_ sval = *((zval **) s->pData); first = *fval; second = *sval; - if (Z_TYPE_P(fval) != IS_STRING) { + + type = zend_get_unified_string_type(2 TSRMLS_CC, Z_TYPE_P(fval), Z_TYPE_P(sval)); + if (Z_TYPE_P(fval) != type) { zval_copy_ctor(&first); - convert_to_string(&first); + convert_to_explicit_type(&first, type); } - if (Z_TYPE_P(sval) != IS_STRING) { + if (Z_TYPE_P(sval) != type) { zval_copy_ctor(&second); - convert_to_string(&second); + convert_to_explicit_type(&second, type); } - result = strnatcmp_ex(Z_STRVAL(first), Z_STRLEN(first), Z_STRVAL(second), Z_STRLEN(second), fold_case); + if (type == IS_UNICODE) { + result = u_strnatcmp_ex(Z_USTRVAL(first), Z_USTRLEN(first), Z_USTRVAL(second), Z_USTRLEN(second), fold_case); + } else { + result = strnatcmp_ex(Z_STRVAL(first), Z_STRLEN(first), Z_STRVAL(second), Z_STRLEN(second), fold_case); + } - if (Z_TYPE_P(fval) != IS_STRING) + if (Z_TYPE_P(fval) != type) zval_dtor(&first); - if (Z_TYPE_P(sval) != IS_STRING) + if (Z_TYPE_P(sval) != type) zval_dtor(&second); return result; @@ -433,14 +440,14 @@ static int array_natural_case_compare(const void *a, const void *b TSRMLS_DC) static void php_natsort(INTERNAL_FUNCTION_PARAMETERS, int fold_case) { - zval **array; + zval *array; HashTable *target_hash; - if (ZEND_NUM_ARGS() != 1 || zend_get_parameters_ex(1, &array) == FAILURE) { - WRONG_PARAM_COUNT; + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "z", &array) == FAILURE) { + return; } - target_hash = HASH_OF(*array); + target_hash = HASH_OF(array); if (!target_hash) { php_error_docref(NULL TSRMLS_CC, E_WARNING, "The argument should be an array"); return; @@ -460,7 +467,7 @@ static void php_natsort(INTERNAL_FUNCTION_PARAMETERS, int fold_case) } -/* {{{ proto void natsort(array &array_arg) +/* {{{ proto void natsort(array &array_arg) U Sort an array using natural sort */ PHP_FUNCTION(natsort) { @@ -469,7 +476,7 @@ PHP_FUNCTION(natsort) /* }}} */ -/* {{{ proto void natcasesort(array &array_arg) +/* {{{ proto void natcasesort(array &array_arg) U Sort an array using case-insensitive natural sort */ PHP_FUNCTION(natcasesort) { diff --git a/ext/standard/php_string.h b/ext/standard/php_string.h index 0fc493f0e7..d8c14b24f2 100644 --- a/ext/standard/php_string.h +++ b/ext/standard/php_string.h @@ -111,7 +111,12 @@ PHP_MINIT_FUNCTION(nl_langinfo); strnatcmp_ex(a, strlen(a), b, strlen(b), 0) #define strnatcasecmp(a, b) \ strnatcmp_ex(a, strlen(a), b, strlen(b), 1) +#define u_strnatcmp(a, b) \ + u_strnatcmp_ex(a, u_strlen(a), b, strlen(b), 0) +#define u_strnatcasecmp(a, b) \ + u_strnatcmp_ex(a, u_strlen(a), b, strlen(b), 1) PHPAPI int strnatcmp_ex(char const *a, size_t a_len, char const *b, size_t b_len, int fold_case); +PHPAPI int u_strnatcmp_ex(UChar const *a, size_t a_len, UChar const *b, size_t b_len, int fold_case); #ifdef HAVE_LOCALECONV PHPAPI struct lconv *localeconv_r(struct lconv *out); diff --git a/ext/standard/string.c b/ext/standard/string.c index d92d257367..61671eb6bf 100644 --- a/ext/standard/string.c +++ b/ext/standard/string.c @@ -6970,22 +6970,24 @@ PHP_FUNCTION(count_chars) */ static void php_strnatcmp(INTERNAL_FUNCTION_PARAMETERS, int fold_case) { - zval **s1, **s2; + zstr s1, s2; + int s1_len, s2_len; + zend_uchar type; - if (ZEND_NUM_ARGS()!=2 || zend_get_parameters_ex(2, &s1, &s2) == FAILURE) { - WRONG_PARAM_COUNT; + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "TT", &s1, &s1_len, + &type, &s2, &s2_len, &type) == FAILURE) { + return; } - convert_to_string_ex(s1); - convert_to_string_ex(s2); - - RETURN_LONG(strnatcmp_ex(Z_STRVAL_PP(s1), Z_STRLEN_PP(s1), - Z_STRVAL_PP(s2), Z_STRLEN_PP(s2), - fold_case)); + if (type == IS_UNICODE) { + RETURN_LONG(u_strnatcmp_ex(s1.u, s1_len, s2.u, s2_len, fold_case)); + } else { + RETURN_LONG(strnatcmp_ex(s1.s, s1_len, s2.s, s2_len, fold_case)); + } } /* }}} */ -/* {{{ proto int strnatcmp(string s1, string s2) +/* {{{ proto int strnatcmp(string s1, string s2) U Returns the result of string comparison using 'natural' algorithm */ PHP_FUNCTION(strnatcmp) { @@ -7083,7 +7085,7 @@ PHP_FUNCTION(localeconv) } /* }}} */ -/* {{{ proto int strnatcasecmp(string s1, string s2) +/* {{{ proto int strnatcasecmp(string s1, string s2) U Returns the result of case-insensitive string comparison using 'natural' algorithm */ PHP_FUNCTION(strnatcasecmp) { diff --git a/ext/standard/strnatcmp.c b/ext/standard/strnatcmp.c index e1f491a3df..39ab9a49c7 100644 --- a/ext/standard/strnatcmp.c +++ b/ext/standard/strnatcmp.c @@ -1,9 +1,9 @@ /* -*- mode: c; c-file-style: "k&r" -*- - Modified for PHP by Andrei Zmievski + Modified for PHP by Andrei Zmievski strnatcmp.c -- Perform 'natural order' comparisons of strings in C. - Copyright (C) 2000 by Martin Pool + Copyright (C) 2000, 2004 by Martin Pool This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages @@ -43,7 +43,7 @@ static char const *version UNUSED = /* {{{ compare_right */ static int -compare_right(char const **a, char const *aend, char const **b, char const *bend) +compare_right(char const *a, char const *b) { int bias = 0; @@ -51,20 +51,22 @@ compare_right(char const **a, char const *aend, char const **b, char const *bend value wins, but we can't know that it will until we've scanned both numbers to know that they have the same magnitude, so we remember it in BIAS. */ - for(;; (*a)++, (*b)++) { - if ((*a == aend || !isdigit((int)(unsigned char)**a)) && - (*b == bend || !isdigit((int)(unsigned char)**b))) + for(;; a++, b++) { + if (!isdigit((int)(unsigned char)*a) && + !isdigit((int)(unsigned char)*b)) return bias; - else if (*a == aend || !isdigit((int)(unsigned char)**a)) + else if (!isdigit((int)(unsigned char)*a)) return -1; - else if (*b == bend || !isdigit((int)(unsigned char)**b)) + else if (!isdigit((int)(unsigned char)*b)) return +1; - else if (**a < **b) { + else if (*a < *b) { if (!bias) bias = -1; - } else if (**a > **b) { + } else if (*a > *b) { if (!bias) bias = +1; + } else if (!*a && !*b) { + return bias; } } @@ -75,21 +77,21 @@ compare_right(char const **a, char const *aend, char const **b, char const *bend /* {{{ compare_left */ static int -compare_left(char const **a, char const *aend, char const **b, char const *bend) +compare_left(char const *a, char const *b) { /* Compare two left-aligned numbers: the first to have a different value wins. */ - for(;; (*a)++, (*b)++) { - if ((*a == aend || !isdigit((int)(unsigned char)**a)) && - (*b == bend || !isdigit((int)(unsigned char)**b))) + for(;; a++, b++) { + if (!isdigit((int)(unsigned char)*a) && + !isdigit((int)(unsigned char)*b)) return 0; - else if (*a == aend || !isdigit((int)(unsigned char)**a)) + else if (!isdigit((int)(unsigned char)*a)) return -1; - else if (*b == bend || !isdigit((int)(unsigned char)**b)) + else if (!isdigit((int)(unsigned char)*b)) return +1; - else if (**a < **b) + else if (*a < *b) return -1; - else if (**a > **b) + else if (*a > *b) return +1; } @@ -102,46 +104,40 @@ compare_left(char const **a, char const *aend, char const **b, char const *bend) PHPAPI int strnatcmp_ex(char const *a, size_t a_len, char const *b, size_t b_len, int fold_case) { char ca, cb; - char const *ap, *bp; - char const *aend = a + a_len, - *bend = b + b_len; + int ai, bi; int fractional, result; - if (a_len == 0 || b_len == 0) - return a_len - b_len; - - ap = a; - bp = b; + ai = bi = 0; while (1) { - ca = *ap; cb = *bp; + ca = a[ai]; cb = b[bi]; /* skip over leading spaces or zeros */ while (isspace((int)(unsigned char)ca)) - ca = *++ap; + ca = a[++ai]; while (isspace((int)(unsigned char)cb)) - cb = *++bp; + cb = b[++bi]; /* process run of digits */ if (isdigit((int)(unsigned char)ca) && isdigit((int)(unsigned char)cb)) { fractional = (ca == '0' || cb == '0'); - if (fractional) - result = compare_left(&ap, aend, &bp, bend); - else - result = compare_right(&ap, aend, &bp, bend); - - if (result != 0) - return result; - else if (ap == aend && bp == bend) - /* End of the strings. Let caller sort them out. */ - return 0; - else { - /* Keep on comparing from the current point. */ - ca = *ap; cb = *bp; + if (fractional) { + if ((result = compare_left(a+ai, b+bi)) != 0) { + return result; + } + } else { + if ((result = compare_right(a+ai, b+bi)) != 0) + return result; } } + if (!ca && !cb) { + /* The strings compare the same. Perhaps the caller + will want to call strcmp to break the tie. */ + return 0; + } + if (fold_case) { ca = toupper((int)(unsigned char)ca); cb = toupper((int)(unsigned char)cb); @@ -152,19 +148,159 @@ PHPAPI int strnatcmp_ex(char const *a, size_t a_len, char const *b, size_t b_len else if (ca > cb) return +1; - ++ap; ++bp; - if (ap >= aend && bp >= bend) + ++ai; ++bi; + } +} +/* }}} */ + +/* {{{ u_compare_right + */ +static int +u_compare_right(UChar const *a, int a_len, int *a_curr, UChar const *b, int b_len, int *b_curr) +{ + UChar32 ca, cb; + int a_off, b_off; + int bias = 0; + + /* The longest run of digits wins. That aside, the greatest + value wins, but we can't know that it will until we've scanned + both numbers to know that they have the same magnitude, so we + remember it in BIAS. */ + + for (;;) { + a_off = *a_curr; + b_off = *b_curr; + U16_NEXT(a, a_off, a_len, ca); + U16_NEXT(b, b_off, b_len, cb); + + if (!u_isdigit(ca) && !u_isdigit(cb)) { + return bias; + } else if (!u_isdigit(ca)) { + return -1; + } else if (!u_isdigit(cb)) { + return +1; + } else if (ca < cb) { + if (!bias) + bias = -1; + } else if (ca > cb) { + if (!bias) + bias = +1; + } else if (ca == 0 && cb == 0) { + return bias; + } + *a_curr = a_off; + *b_curr = b_off; + } + + return 0; +} +/* }}} */ + +/* {{{ u_compare_left + */ +static int +u_compare_left(UChar const *a, int a_len, int *a_curr, UChar const *b, int b_len, int *b_curr) +{ + int a_off, b_off; + UChar32 ca, cb; + + /* Compare two left-aligned numbers: the first to have a + different value wins. */ + for (;;) { + a_off = *a_curr; + b_off = *b_curr; + U16_NEXT(a, a_off, a_len, ca); + U16_NEXT(b, b_off, b_len, cb); + + if (!u_isdigit(ca) && !u_isdigit(cb)) { + return 0; + } else if (!u_isdigit(ca)) { + return -1; + } else if (!u_isdigit(cb)) { + return +1; + } else if (ca < cb) { + return -1; + } else if (ca > cb) { + return +1; + } + *a_curr = a_off; + *b_curr = b_off; + } + + return 0; +} +/* }}} */ + +/* {{{ u_strnatcmp_ex + */ +PHPAPI int u_strnatcmp_ex(UChar const *a, size_t a_len, UChar const *b, size_t b_len, int fold_case) +{ + UChar ca, cb; + UChar const *ap, *bp; + int fractional, result; + int a_off, b_off; + int a_curr, b_curr; + + if (a_len == 0 || b_len == 0) + return a_len - b_len; + + ap = a; + bp = b; + a_curr = b_curr = 0; + + while (1) { + a_off = a_curr; + b_off = b_curr; + U16_NEXT(a, a_curr, a_len, ca); + U16_NEXT(b, b_curr, b_len, cb); + + /* skip over leading spaces */ + for ( ; u_isspace(ca) && a_curr < a_len; ) { + a_off = a_curr; + U16_NEXT(a, a_curr, a_len, ca); + } + + for ( ; u_isspace(cb) && b_curr < b_len; ) { + b_off = b_curr; + U16_NEXT(b, b_curr, b_len, cb); + } + + /* process run of digits */ + if (u_isdigit(ca) && u_isdigit(cb)) { + fractional = (ca == 0x30 /*'0'*/ || cb == 0x30 /*'0'*/); + + if (fractional) { + if ((result = u_compare_left(a, a_len, &a_off, b, b_len, &b_off)) != 0) { + return result; + } + } else { + if ((result = u_compare_right(a, a_len, &a_off, b, b_len, &b_off)) != 0) { + return result; + } + } + + a_curr = a_off; + b_curr = b_off; + } + + if (ca == 0 && cb == 0) { /* The strings compare the same. Perhaps the caller will want to call strcmp to break the tie. */ return 0; - else if (ap >= aend) + } + + if (fold_case) { + ca = u_toupper(ca); + cb = u_toupper(cb); + } + + if (ca < cb) return -1; - else if (bp >= bend) - return 1; + else if (ca > cb) + return +1; } } /* }}} */ - /* * Local variables: * tab-width: 4 diff --git a/unicode-progress.txt b/unicode-progress.txt index 8e8bc303cc..c8ba7af02b 100644 --- a/unicode-progress.txt +++ b/unicode-progress.txt @@ -11,15 +11,6 @@ ext/standard Params API, what encoding to use for the message, handling email option - array.c - ------- - natsort(), natcasesort() - Params API - Either port strnatcmp() to support Unicode or maybe use ICU's - numeric collation. Update: can't seem to get the right collation - parameters to duplicate strnatcmp() functionality. Conclusion: port - to support Unicode. - string.c -------- parse_str() @@ -28,9 +19,6 @@ ext/standard sscanf() Params API. Rest - no idea yet. - strnatcmp(), strnatcasecmp() - Params API. The rest depends on porting of strnatcmp.c - wordwrap() Upgrade, do wordwrapping on codepoint (or glyph ?) level, maybe use additional whitespace chars instead of just space. @@ -538,6 +526,7 @@ ext/standard in_array() min() max() + natsort(), natcasesort() range() shuffle() @@ -609,6 +598,7 @@ ext/standard stripslashes() stripos() stristr() + strnatcmp(), strnatcasecmp() strpbrk() strpos() strrchr() -- 2.40.0