From d77ad27415a34e4f5908cb262567b7b6f0eca17f Mon Sep 17 00:00:00 2001 From: legale Date: Sat, 9 Feb 2019 20:27:48 +0100 Subject: [PATCH] Implement mb_str_split() RFC: https://wiki.php.net/rfc/mb_str_split --- UPGRADING | 5 + ext/mbstring/mbstring.c | 170 ++++++++++++++++++ ext/mbstring/mbstring.h | 1 + ext/mbstring/tests/mb_str_split_jp.phpt | 76 ++++++++ ext/mbstring/tests/mb_str_split_ru.phpt | 75 ++++++++ .../tests/mb_str_split_utf8_utf16.phpt | 81 +++++++++ 6 files changed, 408 insertions(+) create mode 100644 ext/mbstring/tests/mb_str_split_jp.phpt create mode 100644 ext/mbstring/tests/mb_str_split_ru.phpt create mode 100644 ext/mbstring/tests/mb_str_split_utf8_utf16.phpt diff --git a/UPGRADING b/UPGRADING index fdd1c58326..714ccd2d99 100644 --- a/UPGRADING +++ b/UPGRADING @@ -114,6 +114,11 @@ PHP 7.4 UPGRADE NOTES native variables and create/access data structures defined in C libraries. RFC: https://wiki.php.net/rfc/ffi +- Mbstring: + . Added mb_str_split() function, which provides the same functionality as + str_split(), but operating on code points rather than bytes. + RFC: https://wiki.php.net/rfc/mb_str_split + - OPcache: . Support for preloading code has been added. 
RFC: https://wiki.php.net/rfc/preload
diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c
index 3e292a0804..004a1d40d6 100644
--- a/ext/mbstring/mbstring.c
+++ b/ext/mbstring/mbstring.c
@@ -229,6 +229,12 @@ ZEND_BEGIN_ARG_INFO_EX(arginfo_mb_output_handler, 0, 0, 2)
 	ZEND_ARG_INFO(0, status)
 ZEND_END_ARG_INFO()
 
+ZEND_BEGIN_ARG_INFO_EX(arginfo_mb_str_split, 0, 0, 1)
+	ZEND_ARG_INFO(0, str)
+	ZEND_ARG_INFO(0, split_length)
+	ZEND_ARG_INFO(0, encoding)
+ZEND_END_ARG_INFO()
+
 ZEND_BEGIN_ARG_INFO_EX(arginfo_mb_strlen, 0, 0, 1)
 	ZEND_ARG_INFO(0, str)
 	ZEND_ARG_INFO(0, encoding)
@@ -526,6 +532,7 @@ static const zend_function_entry mbstring_functions[] = {
 	PHP_FE(mb_parse_str,			arginfo_mb_parse_str)
 	PHP_FE(mb_output_handler,		arginfo_mb_output_handler)
 	PHP_FE(mb_preferred_mime_name,	arginfo_mb_preferred_mime_name)
+	PHP_FE(mb_str_split,			arginfo_mb_str_split)
 	PHP_FE(mb_strlen,				arginfo_mb_strlen)
 	PHP_FE(mb_strpos,				arginfo_mb_strpos)
 	PHP_FE(mb_strrpos,				arginfo_mb_strrpos)
@@ -2273,6 +2280,182 @@ PHP_FUNCTION(mb_output_handler)
 }
 /* }}} */
 
+/* {{{ proto array mb_str_split(string str [, int split_length] [, string encoding])
+   Convert a multibyte string to an array. If split_length is specified,
+   break the string down into chunks each split_length characters long. */
+
+/* structure to pass split params to the callback */
+struct mbfl_split_params {
+	zval *return_value; /* php function return value (the array being filled) */
+	mbfl_string *result_string; /* scratch string that receives each finished chunk */
+	size_t mb_chunk_length; /* number of characters collected for the current chunk */
+	size_t split_length; /* requested chunk length in characters */
+	mbfl_convert_filter *next_filter; /* wchar to target encoding converter */
+};
+
+/* Output callback of the "encoding to wchar" filter: forwards every character to
+   the wchar-to-encoding filter and, once split_length characters have been
+   collected, appends the bytes gathered in the memory device to the result array. */
+static int mbfl_split_output(int c, void *data)
+{
+	struct mbfl_split_params *params = (struct mbfl_split_params *)data;
+
+	(*params->next_filter->filter_function)(c, params->next_filter); /* re-encode the character */
+
+	if (params->split_length == ++params->mb_chunk_length) { /* chunk is complete (or was forced complete by the caller) */
+		mbfl_convert_filter_flush(params->next_filter); /* let the encoder emit any pending output */
+		mbfl_memory_device *device = (mbfl_memory_device *)params->next_filter->data; /* bytes container */
+		mbfl_string *chunk = params->result_string;
+		mbfl_memory_device_result(device, chunk); /* make chunk */
+		add_next_index_stringl(params->return_value, (const char *)chunk->val, chunk->len); /* add chunk to the array */
+		efree(chunk->val);
+		params->mb_chunk_length = 0; /* start counting the next chunk */
+	}
+	return 0;
+}
+
+PHP_FUNCTION(mb_str_split)
+{
+	zend_string *str, *encoding = NULL;
+	size_t mb_len, chunks, chunk_len;
+	const char *p, *last; /* string cursor and one-past-the-end pointer */
+	mbfl_string string, result_string;
+	const mbfl_encoding *mbfl_encoding;
+	zend_long split_length = 1;
+
+	ZEND_PARSE_PARAMETERS_START(1, 3)
+		Z_PARAM_STR(str)
+		Z_PARAM_OPTIONAL
+		Z_PARAM_LONG(split_length)
+		Z_PARAM_STR(encoding)
+	ZEND_PARSE_PARAMETERS_END();
+
+	if (split_length <= 0) {
+		php_error_docref(NULL, E_WARNING, "The length of each segment must be greater than zero");
+		RETURN_FALSE;
+	}
+
+	/* fill mbfl_string structure */
+	string.val = (unsigned char *) ZSTR_VAL(str);
+	string.len = ZSTR_LEN(str);
+	string.no_language = MBSTRG(language);
+	string.encoding = php_mb_get_encoding(encoding);
+	if (!string.encoding) {
+		RETURN_FALSE;
+	}
+
+	p = ZSTR_VAL(str); /* string cursor pointer */
+	last = ZSTR_VAL(str) + ZSTR_LEN(str); /* one past the last byte of the string */
+
+	mbfl_encoding = string.encoding;
+
+	/* first scenario: 1,2,4-bytes fixed width encodings (head part) */
+	if (mbfl_encoding->flag & MBFL_ENCTYPE_SBCS) { /* 1 byte */
+		mb_len = string.len;
+		chunk_len = (size_t)split_length; /* chunk length in bytes */
+	} else if (mbfl_encoding->flag & (MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE)) { /* 2 bytes */
+		mb_len = string.len / 2;
+		chunk_len = (size_t)split_length * 2; /* unsigned math: no signed overflow for huge lengths */
+	} else if (mbfl_encoding->flag & (MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) { /* 4 bytes */
+		mb_len = string.len / 4;
+		chunk_len = (size_t)split_length * 4;
+	} else if (mbfl_encoding->mblen_table != NULL) {
+		/* second scenario: variable width encodings with length table */
+		char unsigned const *mbtab = mbfl_encoding->mblen_table;
+
+		/* assume that we have 1-byte characters */
+		array_init_size(return_value, (string.len + split_length) / split_length); /* round up */
+
+		while (p < last) { /* split cycle work until the cursor has reached the last byte */
+			char const *chunk_p = p; /* chunk first byte pointer */
+			chunk_len = 0; /* chunk length in bytes */
+			for (zend_long char_count = 0; char_count < split_length && p < last; ++char_count) {
+				char unsigned const m = mbtab[*(const unsigned char *)p]; /* byte length of the current character */
+				chunk_len += m;
+				p += m;
+			}
+			if (p >= last) {
+				chunk_len -= p - last; /* a truncated trailing character must not run past the end */
+			}
+			add_next_index_stringl(return_value, chunk_p, chunk_len);
+		}
+		return;
+	} else {
+		/* third scenario: other multibyte encodings */
+		mbfl_convert_filter *filter, *decoder;
+
+		/* memory device that collects the re-encoded bytes of the current chunk.
+		   NOTE(review): the size is treated as an initial capacity only, so it is capped by the input length */
+		mbfl_memory_device device;
+		size_t device_size = ((size_t)split_length < string.len ? (size_t)split_length : string.len) + 1;
+		mbfl_memory_device_init(&device, device_size, 0);
+
+		/* decoder filter to decode wchar to encoding */
+		decoder = mbfl_convert_filter_new(
+				&mbfl_encoding_wchar,
+				string.encoding,
+				mbfl_memory_device_output,
+				NULL,
+				&device);
+		/* if something is wrong with the decoder */
+		if (decoder == NULL) {
+			mbfl_memory_device_clear(&device); /* do not leak the device buffer */
+			RETURN_FALSE;
+		}
+
+		/* wchar filter */
+		mbfl_string_init(&result_string); /* mbfl_string to store chunk in the callback */
+		struct mbfl_split_params params = { /* init callback function params structure */
+			.return_value = return_value,
+			.result_string = &result_string,
+			.mb_chunk_length = 0,
+			.split_length = (size_t)split_length,
+			.next_filter = decoder,
+		};
+
+		filter = mbfl_convert_filter_new(
+				string.encoding,
+				&mbfl_encoding_wchar,
+				mbfl_split_output,
+				NULL,
+				&params);
+		/* if something is wrong with the filter */
+		if (filter == NULL) {
+			mbfl_convert_filter_delete(decoder);
+			mbfl_memory_device_clear(&device); /* release the device buffer as well */
+			RETURN_FALSE;
+		}
+
+		/* create the result array only now, so the failure paths above have nothing to release;
+		   assume that we have 1-byte characters */
+		array_init_size(return_value, (string.len + split_length) / split_length); /* round up */
+
+		if (p < last) { /* an empty string has no "last byte" to feed */
+			while (p < last - 1) { /* cycle each byte except last with callback function */
+				(*filter->filter_function)((unsigned char)*p++, filter); /* pass the byte value, not a sign-extended char */
+			}
+			params.mb_chunk_length = split_length - 1; /* force to finish current chunk */
+			(*filter->filter_function)((unsigned char)*p++, filter); /* process last byte */
+		}
+
+		mbfl_convert_filter_delete(decoder);
+		mbfl_convert_filter_delete(filter);
+		mbfl_memory_device_clear(&device); /* free whatever is left in the device */
+		return;
+	}
+
+	/* first scenario: 1,2,4-bytes fixed width encodings (tail part) */
+	chunks = (mb_len + split_length - 1) / split_length; /* (round up idiom) */
+	array_init_size(return_value, chunks);
+	if (chunks != 0) {
+		for (size_t i = 0; i < chunks - 1; p += chunk_len, ++i) {
+			add_next_index_stringl(return_value, p, chunk_len);
+		}
+		add_next_index_stringl(return_value, p, last - p);
+	}
+}
+/* }}} */
+
 /* {{{ proto int mb_strlen(string str [, string encoding])
    Get character numbers of a string */
 PHP_FUNCTION(mb_strlen)
diff --git a/ext/mbstring/mbstring.h 
b/ext/mbstring/mbstring.h index 37965ec289..7321525064 100644 --- a/ext/mbstring/mbstring.h +++ b/ext/mbstring/mbstring.h @@ -78,6 +78,7 @@ PHP_FUNCTION(mb_substitute_character); PHP_FUNCTION(mb_preferred_mime_name); PHP_FUNCTION(mb_parse_str); PHP_FUNCTION(mb_output_handler); +PHP_FUNCTION(mb_str_split); PHP_FUNCTION(mb_strlen); PHP_FUNCTION(mb_strpos); PHP_FUNCTION(mb_strrpos); diff --git a/ext/mbstring/tests/mb_str_split_jp.phpt b/ext/mbstring/tests/mb_str_split_jp.phpt new file mode 100644 index 0000000000..84f63030d6 --- /dev/null +++ b/ext/mbstring/tests/mb_str_split_jp.phpt @@ -0,0 +1,76 @@ +--TEST-- +mb_str_split() tests for the japanese language +--SKIPIF-- + +--INI-- +output_handler= +mbstring.func_overload=0 +--FILE-- + +--EXPECT-- +BIG-5: a4e9 a5bb +EUC-JP: c6fc cbdc +ISO-2022-JP: 1b2442467c1b2842 1b24424b5c1b2842 +SJIS: 93fa 967b +UTF-16BE: 65e5 672c +UTF-16LE: e565 2c67 +UTF-32BE: 000065e5 0000672c +UTF-32LE: e5650000 2c670000 +UTF-8: e697a5 e69cac diff --git a/ext/mbstring/tests/mb_str_split_ru.phpt b/ext/mbstring/tests/mb_str_split_ru.phpt new file mode 100644 index 0000000000..75e49275d7 --- /dev/null +++ b/ext/mbstring/tests/mb_str_split_ru.phpt @@ -0,0 +1,75 @@ +--TEST-- +mb_str_split() tests for the russian language +--SKIPIF-- + +--INI-- +output_handler= +mbstring.func_overload=0 +--FILE-- + +--EXPECT-- +EUC-JP: a7e2 a7d1 a7db 20 a7e2 a7d1 a7db 20 a7e2 a7d1 a7db 20 +CP866: e0 a0 a9 20 e0 a0 a9 20 e0 a0 a9 20 +KOI8-R: d2 c1 ca 20 d2 c1 ca 20 d2 c1 ca 20 +UTF-16BE: 0440 0430 0439 0020 0440 0430 0439 0020 0440 0430 0439 0020 +UTF-16LE: 4004 3004 3904 2000 4004 3004 3904 2000 4004 3004 3904 2000 +UTF-32BE: 00000440 00000430 00000439 00000020 00000440 00000430 00000439 00000020 00000440 00000430 00000439 00000020 +UTF-32LE: 40040000 30040000 39040000 20000000 40040000 30040000 39040000 20000000 40040000 30040000 39040000 20000000 +UTF-8: d180 d0b0 d0b9 20 d180 d0b0 d0b9 20 d180 d0b0 d0b9 20 diff --git 
a/ext/mbstring/tests/mb_str_split_utf8_utf16.phpt b/ext/mbstring/tests/mb_str_split_utf8_utf16.phpt new file mode 100644 index 0000000000..b8234bb322 --- /dev/null +++ b/ext/mbstring/tests/mb_str_split_utf8_utf16.phpt @@ -0,0 +1,81 @@ +--TEST-- +mb_str_split() tests UTF-8 illegal chars & UTF-16 surrogate pairs +--SKIPIF-- + +--INI-- +output_handler= +mbstring.func_overload=0 +--FILE-- + +--EXPECT-- +UTF-8: l:2 v:3132 l:5 v:33f09280a9 +BAD UTF-8: l:2 v:3132 l:3 v:33f092 +UTF-16BE: l:4 v:d800dc00 l:4 v:dbffdfff +UTF-16LE: l:4 v:00d800dc l:4 v:ffdbffdf +BAD UTF-16BE: l:4 v:d800dc00 l:2 v:003f l:2 v:003f +BAD UTF-16LE: l:4 v:00d800dc l:2 v:3f00 l:2 v:3f00 + + -- 2.50.0