From: Andrei Zmievski Date: Sun, 26 Mar 2006 06:19:24 +0000 (+0000) Subject: Implement to-Unicode conversion error behavior. Note the adjusted APIs. X-Git-Tag: RELEASE_1_3~255 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=170942849492363826f6ede8a029c82aeb8e830e;p=php Implement to-Unicode conversion error behavior. Note the adjusted APIs. --- diff --git a/Zend/zend.c b/Zend/zend.c index a16729f4aa..695fe6bc56 100644 --- a/Zend/zend.c +++ b/Zend/zend.c @@ -110,7 +110,8 @@ static ZEND_INI_MH(OnUpdateEncoding) *converter = NULL; } if (*converter) { - zend_set_converter_error_mode(*converter, UG(from_error_mode)); + zend_set_converter_error_mode(*converter, ZEND_FROM_UNICODE, UG(from_error_mode)); + zend_set_converter_error_mode(*converter, ZEND_TO_UNICODE, UG(to_error_mode)); zend_set_converter_subst_char(*converter, UG(from_subst_char)); } @@ -153,15 +154,17 @@ static ZEND_INI_MH(OnUpdateErrorMode) void zend_update_converters_error_behavior(TSRMLS_D) { if (UG(fallback_encoding_conv)) { - zend_set_converter_error_mode(UG(fallback_encoding_conv), UG(from_error_mode)); + zend_set_converter_error_mode(UG(fallback_encoding_conv), ZEND_FROM_UNICODE, UG(from_error_mode)); + zend_set_converter_error_mode(UG(fallback_encoding_conv), ZEND_TO_UNICODE, UG(to_error_mode)); zend_set_converter_subst_char(UG(fallback_encoding_conv), UG(from_subst_char)); } if (UG(runtime_encoding_conv)) { - zend_set_converter_error_mode(UG(runtime_encoding_conv), UG(from_error_mode)); + zend_set_converter_error_mode(UG(runtime_encoding_conv), ZEND_FROM_UNICODE, UG(from_error_mode)); + zend_set_converter_error_mode(UG(runtime_encoding_conv), ZEND_TO_UNICODE, UG(to_error_mode)); zend_set_converter_subst_char(UG(runtime_encoding_conv), UG(from_subst_char)); } if (UG(output_encoding_conv)) { - zend_set_converter_error_mode(UG(output_encoding_conv), UG(from_error_mode)); + zend_set_converter_error_mode(UG(output_encoding_conv), ZEND_FROM_UNICODE, UG(from_error_mode)); 
zend_set_converter_subst_char(UG(output_encoding_conv), UG(from_subst_char)); } } @@ -911,6 +914,7 @@ static void unicode_globals_ctor(zend_unicode_globals *unicode_globals TSRMLS_DC unicode_globals->from_error_mode = ZEND_CONV_ERROR_SUBST; memset(unicode_globals->from_subst_char, 0, 3 * sizeof(UChar)); zend_codepoint_to_uchar(0x3f, unicode_globals->from_subst_char); + unicode_globals->to_error_mode = ZEND_CONV_ERROR_STOP; zend_hash_init_ex(&unicode_globals->flex_compatible, 0, NULL, NULL, 1, 0); } diff --git a/Zend/zend_globals.h b/Zend/zend_globals.h index 90bee6e899..43059d4fd1 100644 --- a/Zend/zend_globals.h +++ b/Zend/zend_globals.h @@ -304,7 +304,6 @@ struct _zend_unicode_globals { uint16_t from_error_mode; UChar from_subst_char[3]; uint16_t to_error_mode; - UChar to_subst_char[3]; char *default_locale; UCollator *default_collator; diff --git a/Zend/zend_operators.c b/Zend/zend_operators.c index bb35e8f3e4..ecbdc03b43 100644 --- a/Zend/zend_operators.c +++ b/Zend/zend_operators.c @@ -588,9 +588,7 @@ ZEND_API void _convert_to_unicode(zval *op TSRMLS_DC ZEND_FILE_LINE_DC) case IS_UNICODE: break; case IS_STRING: - if (zval_string_to_unicode(op TSRMLS_CC) == FAILURE) { - zend_error(E_WARNING, "Could not convert binary string to Unicode string"); - } + zval_string_to_unicode(op TSRMLS_CC); return; case IS_BOOL: if (Z_LVAL_P(op)) { diff --git a/Zend/zend_unicode.c b/Zend/zend_unicode.c index 366890431a..f72ea784de 100644 --- a/Zend/zend_unicode.c +++ b/Zend/zend_unicode.c @@ -33,41 +33,69 @@ ZEND_API zend_unicode_globals unicode_globals; ZEND_API zend_class_entry *unicodeConversionException; /* {{{ zend_set_converter_error_mode */ -void zend_set_converter_error_mode(UConverter *conv, uint8_t error_mode) +void zend_set_converter_error_mode(UConverter *conv, zend_conv_direction direction, uint16_t error_mode) { UErrorCode status = U_ZERO_ERROR; - switch (error_mode) { + switch (error_mode & 0xff) { case ZEND_CONV_ERROR_STOP: - ucnv_setFromUCallBack(conv, 
UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status); + if (direction == ZEND_FROM_UNICODE) + ucnv_setFromUCallBack(conv, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status); + else + ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &status); break; case ZEND_CONV_ERROR_SKIP: - ucnv_setFromUCallBack(conv, UCNV_FROM_U_CALLBACK_SKIP, UCNV_SKIP_STOP_ON_ILLEGAL, NULL, NULL, &status); + if (direction == ZEND_FROM_UNICODE) + ucnv_setFromUCallBack(conv, UCNV_FROM_U_CALLBACK_SKIP, UCNV_SKIP_STOP_ON_ILLEGAL, NULL, NULL, &status); + else + ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_SKIP, UCNV_SKIP_STOP_ON_ILLEGAL, NULL, NULL, &status); break; case ZEND_CONV_ERROR_SUBST: - ucnv_setFromUCallBack(conv, UCNV_FROM_U_CALLBACK_SUBSTITUTE, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &status); + if (direction == ZEND_FROM_UNICODE) + ucnv_setFromUCallBack(conv, UCNV_FROM_U_CALLBACK_SUBSTITUTE, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &status); + else + ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_SUBSTITUTE, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &status); break; case ZEND_CONV_ERROR_ESCAPE_UNICODE: - ucnv_setFromUCallBack(conv, UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_UNICODE, NULL, NULL, &status); + if (direction == ZEND_FROM_UNICODE) + ucnv_setFromUCallBack(conv, UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_UNICODE, NULL, NULL, &status); + else + ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_UNICODE, NULL, NULL, &status); break; case ZEND_CONV_ERROR_ESCAPE_ICU: - ucnv_setFromUCallBack(conv, UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_ICU, NULL, NULL, &status); + if (direction == ZEND_FROM_UNICODE) + ucnv_setFromUCallBack(conv, UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_ICU, NULL, NULL, &status); + else + ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_ICU, NULL, NULL, &status); break; case ZEND_CONV_ERROR_ESCAPE_JAVA: - ucnv_setFromUCallBack(conv, UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_JAVA, NULL, NULL, &status); + if (direction == 
ZEND_FROM_UNICODE) + ucnv_setFromUCallBack(conv, UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_JAVA, NULL, NULL, &status); + else + /* + * use C escape, even though JAVA is requested, so that we don't + * have to expose another constant + */ + ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C, NULL, NULL, &status); break; case ZEND_CONV_ERROR_ESCAPE_XML_DEC: - ucnv_setFromUCallBack(conv, UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, NULL, NULL, &status); + if (direction == ZEND_FROM_UNICODE) + ucnv_setFromUCallBack(conv, UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, NULL, NULL, &status); + else + ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, NULL, NULL, &status); break; case ZEND_CONV_ERROR_ESCAPE_XML_HEX: - ucnv_setFromUCallBack(conv, UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX, NULL, NULL, &status); + if (direction == ZEND_FROM_UNICODE) + ucnv_setFromUCallBack(conv, UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX, NULL, NULL, &status); + else + ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX, NULL, NULL, &status); break; default: @@ -164,7 +192,7 @@ int zend_copy_converter(UConverter **target, UConverter *source) /* }}} */ /* {{{ zend_convert_to_unicode */ -ZEND_API void zend_convert_to_unicode(UConverter *conv, UChar **target, int *target_len, const char *source, int source_len, UErrorCode *status) +ZEND_API int zend_convert_to_unicode(UConverter *conv, UChar **target, int *target_len, const char *source, int source_len, UErrorCode *status) { UChar *buffer = NULL; UChar *output; @@ -174,7 +202,7 @@ ZEND_API void zend_convert_to_unicode(UConverter *conv, UChar **target, int *tar UConverterType conv_type; if (U_FAILURE(*status)) { - return; + return 0; } ucnv_resetToUnicode(conv); @@ -230,6 +258,8 @@ ZEND_API void zend_convert_to_unicode(UConverter *conv, UChar **target, int *tar buffer[converted] = 0; *target = buffer; *target_len = converted; + + return input - source; } /* }}} */ @@ 
-326,21 +356,23 @@ ZEND_API void zend_convert_encodings(UConverter *target_conv, UConverter *source /* }}} */ /* {{{ zend_raise_conversion_error_ex */ -ZEND_API void zend_raise_conversion_error_ex(char *message, UConverter *conv, int error_char_offset, int use_exception TSRMLS_DC) +ZEND_API void zend_raise_conversion_error_ex(char *message, UConverter *conv, zend_conv_direction dir, int error_char_offset, int use_exception TSRMLS_DC) { - UChar err_char[U16_MAX_LENGTH]; - int8_t err_char_len = sizeof(err_char); - UChar32 codepoint; const char *conv_name; UErrorCode status = U_ZERO_ERROR; - char *reason_fmt = "%s (converter %s failed on character {U+%04X} at offset %d)"; - char *no_reason_fmt = "%s"; - char *message_fmt; if (!message) return; - ucnv_getInvalidUChars(conv, err_char, &err_char_len, &status); + if (!conv) { + if (use_exception) { + zend_throw_exception_ex(unicodeConversionException, 0 TSRMLS_CC, "%s", message); + } else { + zend_error(E_WARNING, "%s", message); + } + return; + } + conv_name = ucnv_getName(conv, &status); /* * UTODO @@ -348,15 +380,42 @@ ZEND_API void zend_raise_conversion_error_ex(char *message, UConverter *conv, in * internal converter name? ponder */ conv_name = ucnv_getStandardName(conv_name, "MIME", &status); - codepoint = (err_char_len < 2) ? err_char[0] : U16_GET_SUPPLEMENTARY(err_char[0], err_char[1]); - ; + status = U_ZERO_ERROR; + + if (dir == ZEND_FROM_UNICODE) { + UChar err_char[U16_MAX_LENGTH]; + int8_t err_char_len = sizeof(err_char); + UChar32 codepoint; + char *message_fmt = "%s (converter %s failed on character {U+%04X} at offset %d)"; - message_fmt = conv ? reason_fmt : no_reason_fmt; + ucnv_getInvalidUChars(conv, err_char, &err_char_len, &status); + codepoint = (err_char_len < 2) ? 
err_char[0] : U16_GET_SUPPLEMENTARY(err_char[0], err_char[1]); - if (use_exception) { - zend_throw_exception_ex(unicodeConversionException, 0 TSRMLS_CC, message_fmt, message, conv_name?conv_name:"<unknown>", codepoint, error_char_offset); + if (use_exception) { + zend_throw_exception_ex(unicodeConversionException, 0 TSRMLS_CC, message_fmt, message, conv_name?conv_name:"<unknown>", codepoint, error_char_offset-1); + } else { + zend_error(E_WARNING, message_fmt, message, conv_name?conv_name:"<unknown>", codepoint, error_char_offset-1); + } } else { - zend_error(E_WARNING, message_fmt, message, conv_name?conv_name:"<unknown>", codepoint, error_char_offset); + char err_char[8]; /* UTF-8 uses up to 8 bytes */ + char buf[32]; /* 4x number of error bytes */ + int8_t err_char_len = sizeof(err_char); + char *message_fmt = "%s (converter %s failed on bytes (%s) at offset %d)"; + char *p; + int i; + + ucnv_getInvalidChars(conv, err_char, &err_char_len, &status); + p = buf; + for (i = 0; i < err_char_len; i++) { + sprintf(p, "0x%02X%s", (unsigned char)err_char[i], (i+1<err_char_len)?",":""); + p += 5; + } + if (use_exception) { + zend_throw_exception_ex(unicodeConversionException, 0 TSRMLS_CC, message_fmt, message, conv_name?conv_name:"<unknown>", buf, error_char_offset-err_char_len); + } else { + zend_error(E_WARNING, message_fmt, message, conv_name?conv_name:"<unknown>", buf, error_char_offset-err_char_len); + } } } /* }}} */ @@ -376,14 +435,14 @@ ZEND_API int zval_unicode_to_string(zval *string, UConverter *conv TSRMLS_DC) num_conv = zend_convert_from_unicode(conv, &s, &s_len, u, u_len, &status); if (U_FAILURE(status)) { - int32_t offset = u_countChar32(u, num_conv)-1; + int32_t offset = u_countChar32(u, num_conv); /* XXX needs to be fixed, but a leak is better than invalid memory if (s) { efree(s); } */ - zend_raise_conversion_error_ex("Could not convert Unicode string to binary string", conv, offset, (UG(from_error_mode) & ZEND_CONV_ERROR_EXCEPTION) TSRMLS_CC); + zend_raise_conversion_error_ex("Could not convert Unicode string to binary string", conv, ZEND_FROM_UNICODE, offset, (UG(from_error_mode) & ZEND_CONV_ERROR_EXCEPTION) TSRMLS_CC); retval = FAILURE; } @@ -400,19 +459,20 @@ ZEND_API int 
zval_string_to_unicode_ex(zval *string, UConverter *conv) UErrorCode status = U_ZERO_ERROR; int retval = TRUE; UChar *u = NULL; - int u_len; + int u_len, num_conv; char *s = Z_STRVAL_P(string); int s_len = Z_STRLEN_P(string); - Z_TYPE_P(string) = IS_UNICODE; - zend_convert_to_unicode(conv, &u, &u_len, s, s_len, &status); - ZVAL_UNICODEL(string, u, u_len, 0); + num_conv = zend_convert_to_unicode(conv, &u, &u_len, s, s_len, &status); if (U_FAILURE(status)) { + zend_raise_conversion_error_ex("Could not convert binary string to Unicode string", conv, ZEND_TO_UNICODE, num_conv, (UG(to_error_mode) & ZEND_CONV_ERROR_EXCEPTION) TSRMLS_CC); retval = FALSE; } + ZVAL_UNICODEL(string, u, u_len, 0); + efree(s); return retval; } diff --git a/Zend/zend_unicode.h b/Zend/zend_unicode.h index 3a760dfd43..273eb84d82 100644 --- a/Zend/zend_unicode.h +++ b/Zend/zend_unicode.h @@ -54,15 +54,15 @@ extern ZEND_API zend_class_entry *unicodeConversionException; /* internal functions */ int zend_set_converter_encoding(UConverter **converter, const char *encoding); +void zend_set_converter_error_mode(UConverter *conv, zend_conv_direction dir, uint16_t error_mode); void zend_set_converter_subst_char(UConverter *conv, UChar *subst_char); -void zend_set_converter_error_mode(UConverter *conv, uint8_t error_mode); void zend_register_unicode_exceptions(TSRMLS_D); void zend_update_converters_error_behavior(TSRMLS_D); /* API functions */ -ZEND_API void zend_convert_to_unicode(UConverter *conv, UChar **target, int *target_len, const char *source, int source_len, UErrorCode *status); +ZEND_API int zend_convert_to_unicode(UConverter *conv, UChar **target, int *target_len, const char *source, int source_len, UErrorCode *status); ZEND_API int zend_convert_from_unicode(UConverter *conv, char **target, int *target_len, const UChar *source, int source_len, UErrorCode *status); ZEND_API void zend_convert_encodings(UConverter *target_conv, UConverter *source_conv, char **target, int *target_len, const char 
*source, int source_len, UErrorCode *status); ZEND_API int zval_string_to_unicode_ex(zval *string, UConverter *conv); @@ -78,8 +78,8 @@ ZEND_API int zend_is_valid_identifier(UChar *ident, int ident_len); ZEND_API int zend_normalize_identifier(UChar **dest, int *dest_len, UChar *ident, int ident_len, zend_bool fold_case); #define zend_raise_conversion_error(message, exception) \ - zend_raise_conversion_error_ex(message, NULL, 0, exception TSRMLS_CC) -ZEND_API void zend_raise_conversion_error_ex(char *message, UConverter *conv, int error_char_offset, int use_exception TSRMLS_DC); + zend_raise_conversion_error_ex(message, NULL, 0, 0, exception TSRMLS_CC) +ZEND_API void zend_raise_conversion_error_ex(char *message, UConverter *conv, zend_conv_direction dir, int error_char_offset, int use_exception TSRMLS_DC); /* * Function to get a codepoint at position n. Iterates over codepoints starting from the diff --git a/ext/unicode/unicode.c b/ext/unicode/unicode.c index 445219be9f..e019d245c6 100644 --- a/ext/unicode/unicode.c +++ b/ext/unicode/unicode.c @@ -136,6 +136,8 @@ PHP_FUNCTION(unicode_set_error_mode) if (direction == ZEND_FROM_UNICODE) { UG(from_error_mode) = mode; + } else { + UG(to_error_mode) = mode; } zend_update_converters_error_behavior(TSRMLS_C); @@ -143,25 +145,17 @@ PHP_FUNCTION(unicode_set_error_mode) } /* }}} */ -/* {{{ proto bool unicode_set_subst_char(int direction, string character) U - Sets global substitution character for the specified conversion direction */ +/* {{{ proto bool unicode_set_subst_char(string character) U + Sets global substitution character for conversion from Unicode to codepage */ PHP_FUNCTION(unicode_set_subst_char) { - zend_conv_direction direction; UChar *subst_char; UChar32 cp; - int subst_char_len; - long tmp; + int subst_char_len, len; - if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "lu", &tmp, &subst_char, &subst_char_len) == FAILURE) { + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "u", &subst_char, 
&subst_char_len) == FAILURE) { return; } - direction = (zend_conv_direction) tmp; - - if (direction != ZEND_FROM_UNICODE && direction != ZEND_TO_UNICODE) { - php_error(E_WARNING, "Invalid conversion direction value"); - RETURN_FALSE; - } if (subst_char_len < 1 ) { php_error(E_WARNING, "Empty substitution character"); @@ -175,12 +169,8 @@ PHP_FUNCTION(unicode_set_subst_char) RETURN_FALSE; } - if (direction == ZEND_FROM_UNICODE) { - int len; - len = zend_codepoint_to_uchar(cp, UG(from_subst_char)); - UG(from_subst_char)[len] = 0; - } - + len = zend_codepoint_to_uchar(cp, UG(from_subst_char)); + UG(from_subst_char)[len] = 0; zend_update_converters_error_behavior(TSRMLS_C); RETURN_TRUE; } diff --git a/main/main.c b/main/main.c index 609d0516cb..ba2dd6cb4c 100644 --- a/main/main.c +++ b/main/main.c @@ -220,7 +220,7 @@ static ZEND_INI_MH(OnUpdateOutputEncoding) UG(output_encoding_conv) = NULL; } if (UG(output_encoding_conv)) { - zend_set_converter_error_mode(UG(output_encoding_conv), UG(from_error_mode)); + zend_set_converter_error_mode(UG(output_encoding_conv), ZEND_FROM_UNICODE, UG(from_error_mode)); zend_set_converter_subst_char(UG(output_encoding_conv), UG(from_subst_char)); if (stage == ZEND_INI_STAGE_RUNTIME) { sapi_update_default_charset(TSRMLS_C); diff --git a/main/streams/filter.c b/main/streams/filter.c index 2b2009ca94..f074ae7c7b 100644 --- a/main/streams/filter.c +++ b/main/streams/filter.c @@ -710,9 +710,9 @@ PHPAPI int _php_stream_bucket_convert(php_stream_bucket *bucket, unsigned char t num_conv = zend_convert_from_unicode(conv, &dest, &destlen, bucket->buf.u, bucket->buflen, &status); if (U_FAILURE(status)) { - int32_t offset = u_countChar32(bucket->buf.u, num_conv)-1; + int32_t offset = u_countChar32(bucket->buf.u, num_conv); - zend_raise_conversion_error_ex("Could not convert Unicode string to binary string", conv, offset, (UG(from_error_mode) & ZEND_CONV_ERROR_EXCEPTION) TSRMLS_CC); + zend_raise_conversion_error_ex("Could not convert Unicode string to 
binary string", conv, ZEND_FROM_UNICODE, offset, (UG(from_error_mode) & ZEND_CONV_ERROR_EXCEPTION) TSRMLS_CC); } if (bucket->own_buf) { diff --git a/main/streams/streams.c b/main/streams/streams.c index 87814e275b..281cd69408 100755 --- a/main/streams/streams.c +++ b/main/streams/streams.c @@ -1261,9 +1261,9 @@ static size_t _php_stream_write_buffer(php_stream *stream, int buf_type, zstr bu num_conv = zend_convert_from_unicode(stream->output_encoding, &dest, &destlen, buf.u, buflen, &status); if (U_FAILURE(status)) { - int32_t offset = u_countChar32(buf.u, num_conv)-1; + int32_t offset = u_countChar32(buf.u, num_conv); - zend_raise_conversion_error_ex("Could not convert Unicode string to binary string", stream->output_encoding, offset, (UG(from_error_mode) & ZEND_CONV_ERROR_EXCEPTION) TSRMLS_CC); + zend_raise_conversion_error_ex("Could not convert Unicode string to binary string", stream->output_encoding, ZEND_FROM_UNICODE, offset, (UG(from_error_mode) & ZEND_CONV_ERROR_EXCEPTION) TSRMLS_CC); } freeme = buf.s = dest; buflen = destlen; @@ -2293,7 +2293,7 @@ PHPAPI php_stream *_php_stream_open_wrapper_ex(char *path, char *mode, int optio } } else { /* UTODO: (Maybe?) Allow overriding the default error handlers on a per-stream basis via context params */ - zend_set_converter_error_mode(stream->output_encoding, UG(from_error_mode)); + zend_set_converter_error_mode(stream->output_encoding, ZEND_FROM_UNICODE, UG(from_error_mode)); zend_set_converter_subst_char(stream->output_encoding, UG(from_subst_char)); } }