granicus.if.org Git - php/commitdiff
Unicode support in count_chars(). Note that due to the size of the
authorAndrei Zmievski <andrei@php.net>
Fri, 22 Sep 2006 17:47:09 +0000 (17:47 +0000)
committerAndrei Zmievski <andrei@php.net>
Fri, 22 Sep 2006 17:47:09 +0000 (17:47 +0000)
Unicode character set only mode=1 is supported.

ext/standard/string.c
unicode-progress.txt

index 12e38db77641f0a2c418c2de35c9b942b906d1b2..84252705b32542f2492166a2693a81bccc94579e 100644 (file)
@@ -6295,79 +6295,122 @@ PHP_FUNCTION(str_repeat)
 }
 /* }}} */
 
-/* {{{ proto mixed count_chars(string input [, int mode])
+/* {{{ proto mixed count_chars(string input [, int mode]) U
    Returns info about what characters are used in input */
 PHP_FUNCTION(count_chars)
 {
-       zval **input, **mode;
+       zstr input;
+       int input_len;
+       zend_uchar type;
+       long mode = 0;
        int chars[256];
-       int ac=ZEND_NUM_ARGS();
-       int mymode=0;
+       HashTable uchars;
+       UChar32 cp;
+       int *uchar_cnt_ptr, uchar_cnt;
        unsigned char *buf;
-       int len, inx;
+       int inx;
        char retstr[256];
        int retlen=0;
 
-       if (ac < 1 || ac > 2 || zend_get_parameters_ex(ac, &input, &mode) == FAILURE) {
-               WRONG_PARAM_COUNT;
+       if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "t|l", &input, &input_len,
+                                                         &type, &mode) == FAILURE) {
+               return;
        }
 
-       convert_to_string_ex(input);
-
-       if (ac == 2) {
-               convert_to_long_ex(mode);
-               mymode = Z_LVAL_PP(mode);
-
-               if (mymode < 0 || mymode > 4) {
+       if (ZEND_NUM_ARGS() > 1) {
+               if (mode < 0 || mode > 4) {
                        php_error_docref(NULL TSRMLS_CC, E_WARNING, "Unknown mode.");
                        RETURN_FALSE;
                }
+               /* Due to the size of the Unicode character set only mode 1 is offered for
+                * Unicode input; check the parsed argument type, not the global switch. */
+               if (type == IS_UNICODE && mode != 1) {
+                       php_error_docref(NULL TSRMLS_CC, E_WARNING, "Only mode=1 is supported with Unicode strings");
+                       RETURN_FALSE;
+               }
        }
 
-       len = Z_STRLEN_PP(input);
-       buf = (unsigned char *) Z_STRVAL_PP(input);
-       memset((void*) chars, 0, sizeof(chars));
+       if (type == IS_UNICODE) {
+               UChar buf[3];
+               int buf_len;
+               ulong cp_key;
 
-       while (len > 0) {
-               chars[*buf]++;
-               buf++;
-               len--;
-       }
+               zend_hash_init(&uchars, 0, NULL, NULL, 0);
 
-       if (mymode < 3) {
-               array_init(return_value);
-       }
+               /* Tally occurrences per code point; uchars maps code point -> int count. */
+               inx = 0;
+               while (inx < input_len) {
+                       U16_NEXT(input.u, inx, input_len, cp);
+                       if (zend_hash_index_find(&uchars, cp, (void**)&uchar_cnt_ptr) == SUCCESS) {
+                               (*uchar_cnt_ptr)++;
+                       } else {
+                               uchar_cnt = 1;
+                               zend_hash_index_update(&uchars, cp, &uchar_cnt, sizeof(int), NULL);
+                       }
+               }
 
-       for (inx = 0; inx < 256; inx++) {
-               switch (mymode) {
-                       case 0:
-                               add_index_long(return_value, inx, chars[inx]);
-                               break;
-                       case 1:
-                               if (chars[inx] != 0) {
-                                       add_index_long(return_value, inx, chars[inx]);
-                               }
-                               break;
-                       case 2:
-                               if (chars[inx] == 0) {
-                                       add_index_long(return_value, inx, chars[inx]);
-                               }
-                               break;
-                       case 3:
-                               if (chars[inx] != 0) {
-                                       retstr[retlen++] = inx;
-                               }
-                               break;
-                       case 4:
-                               if (chars[inx] == 0) {
-                                       retstr[retlen++] = inx;
-                               }
-                               break;
+               if (mode < 3) {
+                       array_init(return_value);
                }
-       }
 
-       if (mymode >= 3 && mymode <= 4) {
-               RETURN_STRINGL(retstr, retlen, 1);
+               for (zend_hash_internal_pointer_reset(&uchars);
+                        zend_hash_get_current_data(&uchars, (void**)&uchar_cnt_ptr) == SUCCESS;
+                        zend_hash_move_forward(&uchars)) {
+
+                       /* The key is written through a ulong pointer, so it needs ulong-sized storage. */
+                       zend_hash_get_current_key(&uchars, NULL, &cp_key, 0);
+
+                       buf_len = zend_codepoint_to_uchar((UChar32)cp_key, buf);
+                       buf[buf_len] = 0;
+                       add_u_assoc_long_ex(return_value, IS_UNICODE, ZSTR(buf), buf_len+1, *uchar_cnt_ptr);
+               }
+
+               zend_hash_destroy(&uchars);
+       } else {
+               /* Byte strings: count each of the 256 possible byte values. */
+               buf = (unsigned char *) input.s;
+               memset((void*) chars, 0, sizeof(chars));
+
+               while (input_len > 0) {
+                       chars[*buf]++;
+                       buf++;
+                       input_len--;
+               }
+
+               if (mode < 3) {
+                       array_init(return_value);
+               }
+
+               for (inx = 0; inx < 256; inx++) {
+                       switch (mode) {
+                               case 0:
+                                       add_index_long(return_value, inx, chars[inx]);
+                                       break;
+                               case 1:
+                                       if (chars[inx] != 0) {
+                                               add_index_long(return_value, inx, chars[inx]);
+                                       }
+                                       break;
+                               case 2:
+                                       if (chars[inx] == 0) {
+                                               add_index_long(return_value, inx, chars[inx]);
+                                       }
+                                       break;
+                               case 3:
+                                       if (chars[inx] != 0) {
+                                               retstr[retlen++] = inx;
+                                       }
+                                       break;
+                               case 4:
+                                       if (chars[inx] == 0) {
+                                               retstr[retlen++] = inx;
+                                       }
+                                       break;
+                       }
+               }
+               if (mode >= 3 && mode <= 4) {
+                       RETURN_STRINGL(retstr, retlen, 1);
+               }
        }
 }
 /* }}} */
index e79126d09ad77638670f081f7a1de177a0a93b91..a54f12421c56d23ebde9af561ee455ca6fa80642 100644 (file)
@@ -16,10 +16,6 @@ ext/standard
 
   string.c
   --------
-    count_chars()
-        Params API. Do we really want to go through the whole Unicode table?
-        May need to use hashtable instead of array.
-
     hebrev(), hebrevc()
         Figure out if this is something we can use ICU for, internally.
         Check with Zeev.
@@ -122,7 +118,6 @@ ext/standard
     max()
     range()
     shuffle()
-    strrchr()
 
     end(), prev(), next(), reset(), current(), key()
 
@@ -150,6 +145,7 @@ ext/standard
     bin2hex()
     chr()
     chunk_split()
+    count_chars()
     dirname()
     explode()
     implode()