From 5418ae7976e3d756db21d9023a2afa70cd9866f4 Mon Sep 17 00:00:00 2001
From: Andrei Zmievski
Date: Sat, 11 Feb 2006 00:16:43 +0000
Subject: [PATCH] Implement character/word/line/sentence iterators and the reverse counterparts.

---
Reviewer notes (text between the three-dash line and the diff is not part of
the commit message):

 * Purpose: adds four TextIterator types (CHARACTER, WORD, LINE, SENTENCE)
   that share one ops table, text_iter_brk_ops. The table walks boundaries
   reported by a UBreakIterator stored in the new u.brk union member;
   brk_type_map translates ITER_CHARACTER..ITER_SENTENCE to the matching
   UBRK_* constant by subtracting ITER_CHARACTER.
 * Line structure was rebuilt from a copy of this patch whose newlines had
   been collapsed; tab indentation is reconstructed. Hunk line counts and the
   diffstat are unchanged.
 * The target of the added #include was missing from that copy. It is
   restored here as <unicode/ubrk.h> (presumably the ICU header declaring the
   ubrk_* functions used below) -- TODO confirm against the original commit.
 * Fix: text_iterator_free_storage() tested "type > ITER_CHARACTER", so the
   break iterator opened for ITER_CHARACTER objects was never closed. The
   test is now ">=", matching the condition under which the constructor
   calls ubrk_open().
 * Fix: the constructor's ubrk_open() guard read ti_type, which the visible
   code only assigns when a flags argument was passed. It now reads
   intern->type, the same value the iter_ops[] dispatch right after it
   already depends on.
 * Because two added lines were edited, the blob id on the "index" line no
   longer describes the resulting file.

 ext/unicode/unicode_iterators.c | 124 +++++++++++++++++++++++++++++++-
 1 file changed, 121 insertions(+), 3 deletions(-)

diff --git a/ext/unicode/unicode_iterators.c b/ext/unicode/unicode_iterators.c
index 0674799366..47dc7f7e1d 100644
--- a/ext/unicode/unicode_iterators.c
+++ b/ext/unicode/unicode_iterators.c
@@ -28,11 +28,16 @@
 #include "php.h"
 #include "zend_interfaces.h"
 #include "zend_exceptions.h"
+#include <unicode/ubrk.h>
 
 typedef enum {
 	ITER_CODE_UNIT,
 	ITER_CODE_POINT,
 	ITER_COMB_SEQUENCE,
+	ITER_CHARACTER,
+	ITER_WORD,
+	ITER_LINE,
+	ITER_SENTENCE,
 	ITER_TYPE_LAST,
 } text_iter_type;
 
@@ -60,6 +65,12 @@ typedef struct {
 			int32_t start;
 			int32_t end;
 		} cs;
+		struct {
+			UBreakIterator *iter;
+			int32_t index;
+			int32_t start;
+			int32_t end;
+		} brk;
 	} u;
 } text_iter_obj;
 
@@ -76,6 +87,13 @@ typedef struct {
 	void (*rewind) (text_iter_obj* object TSRMLS_DC);
 } text_iter_ops;
 
+enum UBreakIteratorType brk_type_map[] = {
+	UBRK_CHARACTER,
+	UBRK_WORD,
+	UBRK_LINE,
+	UBRK_SENTENCE,
+};
+
 PHPAPI zend_class_entry* text_iterator_aggregate_ce;
 PHPAPI zend_class_entry* text_iterator_ce;
 PHPAPI zend_class_entry* rev_text_iterator_ce;
@@ -276,12 +294,95 @@ static text_iter_ops text_iter_cs_ops = {
 };
 
 
+/* UBreakIterator Character Ops */
+
+static int text_iter_brk_char_valid(text_iter_obj* object TSRMLS_DC)
+{
+	if (object->flags & ITER_REVERSE) {
+		return (object->u.brk.start != UBRK_DONE);
+	} else {
+		return (object->u.brk.end != UBRK_DONE);
+	}
+}
+
+static void text_iter_brk_char_current(text_iter_obj* object TSRMLS_DC)
+{
+	uint32_t length;
+	int32_t start = object->u.brk.start;
+	int32_t end = object->u.brk.end;
+
+	if (object->flags & ITER_REVERSE) {
+		if (end == UBRK_DONE) {
+			end = object->text_len;
+		}
+	} else {
+		if (start == UBRK_DONE) {
+			start = 0;
+		}
+	}
+	length = end - start;
+	if (length > object->current_alloc-1) {
+		object->current_alloc = length+1;
+		Z_USTRVAL_P(object->current) = eurealloc(Z_USTRVAL_P(object->current), object->current_alloc);
+	}
+	u_memcpy(Z_USTRVAL_P(object->current), object->text + start, length);
+	Z_USTRVAL_P(object->current)[length] = 0;
+	Z_USTRLEN_P(object->current) = length;
+}
+
+static int text_iter_brk_char_key(text_iter_obj* object TSRMLS_DC)
+{
+	return object->u.brk.index;
+}
+
+static void text_iter_brk_char_next(text_iter_obj* object TSRMLS_DC)
+{
+	if (object->flags & ITER_REVERSE) {
+		if (object->u.brk.start != UBRK_DONE) {
+			object->u.brk.end = object->u.brk.start;
+			object->u.brk.start = ubrk_previous(object->u.brk.iter);
+			object->u.brk.index++;
+		}
+	} else {
+		if (object->u.brk.end != UBRK_DONE) {
+			object->u.brk.start = object->u.brk.end;
+			object->u.brk.end = ubrk_next(object->u.brk.iter);
+			object->u.brk.index++;
+		}
+	}
+}
+
+static void text_iter_brk_char_rewind(text_iter_obj *object TSRMLS_DC)
+{
+	if (object->flags & ITER_REVERSE) {
+		object->u.brk.end = ubrk_last(object->u.brk.iter);
+		object->u.brk.start = ubrk_previous(object->u.brk.iter);
+	} else {
+		object->u.brk.start = ubrk_first(object->u.brk.iter);
+		object->u.brk.end = ubrk_next(object->u.brk.iter);
+	}
+	object->u.brk.index = 0;
+}
+
+static text_iter_ops text_iter_brk_ops = {
+	text_iter_brk_char_valid,
+	text_iter_brk_char_current,
+	text_iter_brk_char_key,
+	text_iter_brk_char_next,
+	text_iter_brk_char_rewind,
+};
+
+
 /* Ops array */
 
 static text_iter_ops* iter_ops[] = {
 	&text_iter_cu_ops,
 	&text_iter_cp_ops,
 	&text_iter_cs_ops,
+	&text_iter_brk_ops,
+	&text_iter_brk_ops,
+	&text_iter_brk_ops,
+	&text_iter_brk_ops,
 };
 
 /* Iterator Funcs */
@@ -376,6 +477,9 @@ static void text_iterator_free_storage(void *object TSRMLS_DC)
 	if (intern->text) {
 		efree(intern->text);
 	}
+	if (intern->type >= ITER_CHARACTER && intern->u.brk.iter) { /* >= so ITER_CHARACTER iterators are closed too */
+		ubrk_close(intern->u.brk.iter);
+	}
 	zval_ptr_dtor(&intern->current);
 	efree(object);
 }
@@ -399,6 +503,7 @@ static zend_object_value text_iterator_new(zend_class_entry *class_type TSRMLS_D
 	intern->current_alloc = 3;
 	Z_USTRVAL_P(intern->current) = eumalloc(3);
 	Z_USTRVAL_P(intern->current)[0] = 0;
+	Z_USTRLEN_P(intern->current) = 0;
 	Z_TYPE_P(intern->current) = IS_UNICODE;
 
 	retval.handle = zend_objects_store_put(intern, (zend_objects_store_dtor_t)zend_objects_destroy_object, (zend_objects_free_object_storage_t) text_iterator_free_storage, NULL TSRMLS_CC);
@@ -426,11 +531,11 @@ PHP_METHOD(TextIterator, __construct)
 	intern->text_len = text_len;
 	if (ZEND_NUM_ARGS() > 1) {
 		ti_type = flags & ITER_TYPE_MASK;
-		if (ti_type < ITER_TYPE_LAST) {
-			intern->type = ti_type;
-		} else {
+		if (ti_type < 0 || ti_type >= ITER_TYPE_LAST) {
 			php_error(E_WARNING, "Invalid iterator type in TextIterator constructor");
+			ti_type = ITER_CODE_POINT;
 		}
+		intern->type = ti_type;
 		intern->flags = flags;
 	}
 
@@ -438,6 +543,15 @@ PHP_METHOD(TextIterator, __construct)
 		intern->flags |= ITER_REVERSE;
 	}
 
+	if (intern->type >= ITER_CHARACTER && intern->type < ITER_TYPE_LAST) { /* ti_type is only assigned when flags were passed */
+		UErrorCode status = U_ZERO_ERROR;
+		intern->u.brk.iter = ubrk_open(brk_type_map[intern->type - ITER_CHARACTER], UG(default_locale), text, text_len, &status);
+		if (!U_SUCCESS(status)) {
+			php_error(E_RECOVERABLE_ERROR, "Could not create UBreakIterator: %s", u_errorName(status));
+			return;
+		}
+	}
+
 	iter_ops[intern->type]->rewind(intern TSRMLS_CC);
 }
 
@@ -513,6 +627,10 @@ void php_register_unicode_iterators(TSRMLS_D)
 	zend_declare_class_constant_long(text_iterator_ce, "CODE_UNIT", sizeof("CODE_UNIT")-1, ITER_CODE_UNIT TSRMLS_CC);
 	zend_declare_class_constant_long(text_iterator_ce, "CODE_POINT", sizeof("CODE_POINT")-1, ITER_CODE_POINT TSRMLS_CC);
 	zend_declare_class_constant_long(text_iterator_ce, "COMB_SEQUENCE", sizeof("COMB_SEQUENCE")-1, ITER_COMB_SEQUENCE TSRMLS_CC);
+	zend_declare_class_constant_long(text_iterator_ce, "CHARACTER", sizeof("CHARACTER")-1, ITER_CHARACTER TSRMLS_CC);
+	zend_declare_class_constant_long(text_iterator_ce, "WORD", sizeof("WORD")-1, ITER_WORD TSRMLS_CC);
+	zend_declare_class_constant_long(text_iterator_ce, "LINE", sizeof("LINE")-1, ITER_LINE TSRMLS_CC);
+	zend_declare_class_constant_long(text_iterator_ce, "SENTENCE", sizeof("SENTENCE")-1, ITER_SENTENCE TSRMLS_CC);
 }
 
 /*
-- 
2.40.0