From: Andrei Zmievski Date: Mon, 27 Mar 2006 03:19:30 +0000 (+0000) Subject: Rewrite unicode_encode() and unicode_decode() functions. Apply the new X-Git-Tag: RELEASE_1_3~246 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=b36d2dfef6d2b35fcc9fe22b8be7144c9d2d7b03;p=php Rewrite unicode_encode() and unicode_decode() functions. Apply the new conversion error semantics. --- diff --git a/ext/unicode/unicode.c b/ext/unicode/unicode.c index 23c1a489b2..e468feabd8 100644 --- a/ext/unicode/unicode.c +++ b/ext/unicode/unicode.c @@ -23,92 +23,112 @@ void php_register_unicode_iterators(TSRMLS_D); -/* {{{ proto unicode unicode_decode(string input, string encoding) U - Takes a string in the source encoding and converts it to a UTF-16 unicode string, returning the result */ +/* {{{ proto unicode unicode_decode(binary input, string encoding [, int flags]) U + Takes a binary string and converts it to a Unicode string using the specified encoding */ static PHP_FUNCTION(unicode_decode) { - union { - void *vptr; - char *bin; - } input; - zend_uchar type; - int len; - char *encoding; - int enclen; + char *str, *enc; + int str_len, enc_len; + long flags; + UChar *dest; + int dest_len; UErrorCode status; UConverter *conv = NULL; - UChar *target; - int targetlen; + int num_conv; - if (FAILURE == zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ts", &input.vptr, &len, &type, &encoding, &enclen)) { + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "Ss|l", &str, &str_len, &enc, &enc_len, &flags)) { return; } - if (type == IS_UNICODE) { - php_error_docref(NULL TSRMLS_CC, E_WARNING, "input string is already unicode"); - RETURN_FALSE; + if (ZEND_NUM_ARGS() > 2) { + if ((flags & 0xff) > ZEND_CONV_ERROR_LAST_ENUM) { + php_error_docref(NULL TSRMLS_CC, E_WARNING, "illegal value for conversion error mode"); + RETURN_FALSE; + } + } else { + flags = UG(to_error_mode); } status = U_ZERO_ERROR; - conv = ucnv_open(encoding, &status); - if (!conv) { - php_error_docref(NULL TSRMLS_CC, 
E_WARNING, "could not locate converter for %s", encoding); + conv = ucnv_open(enc, &status); + if (U_FAILURE(status)) { + php_error_docref(NULL TSRMLS_CC, E_WARNING, "could not create converter for '%s' encoding", enc); RETURN_FALSE; } + zend_set_converter_error_mode(conv, ZEND_TO_UNICODE, flags); + status = U_ZERO_ERROR; - zend_convert_to_unicode(conv, &target, &targetlen, input.bin, len, &status); + num_conv = zend_convert_to_unicode(conv, &dest, &dest_len, str, str_len, &status); if (U_FAILURE(status)) { - /* TODO: error handling semantics ? */ - php_error_docref(NULL TSRMLS_CC, E_WARNING, "conversion was not entirely successful: %d", status); + zend_raise_conversion_error_ex("could not decode binary string", conv, ZEND_TO_UNICODE, num_conv, (flags & ZEND_CONV_ERROR_EXCEPTION) TSRMLS_CC); + efree(dest); + ucnv_close(conv); + RETURN_FALSE; } - RETVAL_UNICODEL(target, targetlen, 0); - ucnv_close(conv); -} -/* }}} */ -/* {{{ proto bool unicode_semantics() U - Check whether unicode semantics are enabled */ -static PHP_FUNCTION(unicode_semantics) -{ - RETURN_BOOL(UG(unicode)); + RETVAL_UNICODEL(dest, dest_len, 0); } /* }}} */ -/* {{{ proto string unicode_encode(unicode input, string encoding) U - Takes a unicode string and converts it to a string in the specified encoding */ +/* {{{ proto binary unicode_encode(unicode input, string encoding [, int flags]) U + Takes a Unicode string and converts it to a binary string using the specified encoding */ static PHP_FUNCTION(unicode_encode) { UChar *uni; - int len; - char *encoding; - int enclen; + char *enc; + int uni_len, enc_len; + long flags; + char *dest; + int dest_len; UErrorCode status; UConverter *conv = NULL; - char *target; - int targetlen; + int num_conv; - if (FAILURE == zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "us", &uni, &len, &encoding, &enclen)) { + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "Us|l", &uni, &uni_len, &enc, &enc_len, &flags) == FAILURE) { return; } + if (ZEND_NUM_ARGS() > 2) { 
+ if ((flags & 0xff) > ZEND_CONV_ERROR_LAST_ENUM) { + php_error_docref(NULL TSRMLS_CC, E_WARNING, "illegal value for conversion error mode"); + RETURN_FALSE; + } + } else { + flags = UG(from_error_mode); + } + status = U_ZERO_ERROR; - conv = ucnv_open(encoding, &status); - if (!conv) { - php_error_docref(NULL TSRMLS_CC, E_WARNING, "could not locate converter for %s", encoding); + conv = ucnv_open(enc, &status); + if (U_FAILURE(status)) { + php_error_docref(NULL TSRMLS_CC, E_WARNING, "could not create converter for '%s' encoding", enc); RETURN_FALSE; } + zend_set_converter_error_mode(conv, ZEND_FROM_UNICODE, flags); + zend_set_converter_subst_char(conv, UG(from_subst_char)); + status = U_ZERO_ERROR; - zend_convert_from_unicode(conv, &target, &targetlen, uni, len, &status); + num_conv = zend_convert_from_unicode(conv, &dest, &dest_len, uni, uni_len, &status); if (U_FAILURE(status)) { - /* TODO: error handling semantics ? */ - php_error_docref(NULL TSRMLS_CC, E_WARNING, "conversion was not entirely successful: %d", status); + int32_t offset = u_countChar32(uni, num_conv); + zend_raise_conversion_error_ex("could not encode Unicode string", conv, ZEND_FROM_UNICODE, offset, (flags & ZEND_CONV_ERROR_EXCEPTION) TSRMLS_CC); + efree(dest); + ucnv_close(conv); + RETURN_FALSE; } - RETVAL_STRINGL(target, targetlen, 0); - ucnv_close(conv); + + RETVAL_STRINGL(dest, dest_len, 0); +} +/* }}} */ + +/* {{{ proto bool unicode_semantics() U + Check whether unicode semantics are enabled */ +static PHP_FUNCTION(unicode_semantics) +{ + RETURN_BOOL(UG(unicode)); } /* }}} */