/* UTODO
* - PCRE_NO_UTF8_CHECK option for Unicode strings
- * - add_offset_pair() should convert offset to refer to codepoints or bytes,
- * depending on whether subject string is IS_UNICODE or IS_STRING
*
* php_pcre_split_impl():
* - Avoid the /./ bump for Unicode strings with U8_FWD_1()
PHP_PCRE_BAD_UTF8_ERROR,
};
+typedef struct { /* cursor used by add_offset_pair() to turn byte offsets into codepoint offsets incrementally */
+ char *str; /* string that is walked with U8_FWD_1(); callers initialise it to the subject */
+ int byte_offset; /* byte position in str reached by the previous conversion (starts at 0) */
+ int cp_offset; /* number of codepoints stepped over to reach byte_offset; this is the offset value reported */
+} offset_map_t;
ZEND_DECLARE_MODULE_GLOBALS(pcre);
/* }}} */
/* {{{ add_offset_pair */
-static inline void add_offset_pair(zval *result, char *str, int len, int offset, char *name, zend_bool make_unicode TSRMLS_DC)
+static inline void add_offset_pair(zval *result, char *str, int len, int offset, char *name, offset_map_t *prev TSRMLS_DC)
{
zval *match_pair;
+ int tmp;
ALLOC_ZVAL(match_pair);
array_init(match_pair);
INIT_PZVAL(match_pair);
/* Add (match, offset) to the return value */
- if (make_unicode) {
- add_next_index_utf8_stringl(match_pair, str, len, 1);
- } else {
- add_next_index_stringl(match_pair, str, len, 1);
+ add_next_index_utf8_stringl(match_pair, str, len, 1);
+
+ /* Calculate codepoint offset from the previous chunk */
+ if (offset) {
+ tmp = prev->byte_offset;
+ while (tmp < offset) {
+ U8_FWD_1(prev->str, tmp, offset);
+ prev->cp_offset++;
+ }
+ prev->byte_offset = tmp;
}
- add_next_index_long(match_pair, offset);
+ add_next_index_long(match_pair, prev->cp_offset);
if (name) {
zval_add_ref(&match_pair);
- if (make_unicode) {
+ if (UG(unicode)) {
UErrorCode status = U_ZERO_ERROR;
UChar *u = NULL;
int u_len;
/* If subpatterns array has been passed, fill it in with values. */
if (subpats != NULL) {
+ offset_map_t map = { subject, 0, 0 };
+
/* Try to get the list of substrings and display a warning if failed. */
if (pcre_get_substring_list(subject, offsets, count, &stringlist) < 0) {
efree(subpat_names);
for (i = 0; i < count; i++) {
if (offset_capture) {
add_offset_pair(match_sets[i], (char *)stringlist[i],
- offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1], NULL, UG(unicode) TSRMLS_CC);
+ offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1], NULL, &map TSRMLS_CC);
} else {
add_next_index_utf8_stringl(match_sets[i], (char *)stringlist[i],
offsets[(i<<1)+1] - offsets[i<<1], 1);
if (offset_capture) {
add_offset_pair(result_set, (char *)stringlist[i],
offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1],
- subpat_names[i], UG(unicode) TSRMLS_CC);
+ subpat_names[i], &map TSRMLS_CC);
} else {
if (subpat_names[i]) {
add_assoc_utf8_stringl(result_set, subpat_names[i], (char *)stringlist[i],
if (offset_capture) {
add_offset_pair(subpats, (char *)stringlist[i],
offsets[(i<<1)+1] - offsets[i<<1],
- offsets[i<<1], subpat_names[i], UG(unicode) TSRMLS_CC);
+ offsets[i<<1], subpat_names[i], &map TSRMLS_CC);
} else {
if (subpat_names[i]) {
add_assoc_utf8_stringl(subpats, subpat_names[i], (char *)stringlist[i],
int no_empty; /* If NO_EMPTY flag is set */
int delim_capture; /* If delimiters should be captured */
int offset_capture; /* If offsets should be captured */
+ offset_map_t map = { subject, 0, 0 };
no_empty = flags & PREG_SPLIT_NO_EMPTY;
delim_capture = flags & PREG_SPLIT_DELIM_CAPTURE;
if (offset_capture) {
/* Add (match, offset) pair to the return value */
- add_offset_pair(return_value, last_match, &subject[offsets[0]]-last_match, next_offset, NULL, UG(unicode) TSRMLS_CC);
+ add_offset_pair(return_value, last_match, &subject[offsets[0]]-last_match, next_offset, NULL, &map TSRMLS_CC);
} else {
/* Add the piece to the return value */
add_next_index_utf8_stringl(return_value, last_match,
if (!no_empty || match_len > 0) {
if (offset_capture) {
add_offset_pair(return_value, &subject[offsets[i<<1]], match_len,
- offsets[i<<1], NULL, UG(unicode) TSRMLS_CC);
+ offsets[i<<1], NULL, &map TSRMLS_CC);
} else {
add_next_index_utf8_stringl(return_value, &subject[offsets[i<<1]],
match_len, 1);
if (offset_capture) {
/* Add the last (match, offset) pair to the return value */
add_offset_pair(return_value, &subject[start_offset],
- subject_len - start_offset, start_offset, NULL, UG(unicode) TSRMLS_CC);
+ subject_len - start_offset, start_offset, NULL, &map TSRMLS_CC);
} else {
/* Add the last piece to the return value */
add_next_index_utf8_stringl(return_value, last_match, subject + subject_len - last_match, 1);