From: Rui Hirokawa
Date: Wed, 8 May 2002 12:33:44 +0000 (+0000)
Subject: Added conversion support from script character encoding to internal character encodin...
X-Git-Tag: php-4.3.0dev-ZendEngine2-Preview1~192
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=f30b722f14521fbad2fabe5fdcaa2b60fe97eebb;p=php

Added conversion support from script character encoding to internal character encoding. This feature is very useful for japanese who uses Shift_JIS encoding because some of characters in Shift_JIS are including '0x5c' and it causes some troubles on Zend parser. This patch is made by Masaki Fujimoto.
---

diff --git a/ext/mbstring/mbfilter.c b/ext/mbstring/mbfilter.c
index 3e065fa453..4e335f12fb 100644
--- a/ext/mbstring/mbfilter.c
+++ b/ext/mbstring/mbfilter.c
@@ -685,12 +685,12 @@ static mbfl_encoding mbfl_encoding_2022jp = {
 
 
 #if defined(HAVE_MBSTR_CN)
-static const char *mbfl_encoding_euc_cn_aliases[] = {"EUC_CN", "eucCN", "x-euc-cn", NULL};
+static const char *mbfl_encoding_euc_cn_aliases[] = {"CN-GB", "EUC_CN", "eucCN", "x-euc-cn", NULL};
 
 static mbfl_encoding mbfl_encoding_euc_cn = {
 	mbfl_no_encoding_euc_cn,
 	"EUC-CN",
-	"EUC-CN",
+	"CN-GB",
 	(const char *(*)[])&mbfl_encoding_euc_cn_aliases,
 	mblen_table_euccn,
 	MBFL_ENCTYPE_MBCS
@@ -721,12 +721,12 @@ static mbfl_encoding mbfl_encoding_euc_tw = {
 	MBFL_ENCTYPE_MBCS
 };
 
-static const char *mbfl_encoding_big5_aliases[] = {"big5", "CP950", NULL};
+static const char *mbfl_encoding_big5_aliases[] = {"CN-BIG5", "BIG5", "BIG-FIVE", "BIGFIVE", "CP950", NULL};
 
 static mbfl_encoding mbfl_encoding_big5 = {
 	mbfl_no_encoding_big5,
 	"BIG-5",
-	"BIG-5",
+	"CN-BIG5",
 	(const char *(*)[])&mbfl_encoding_big5_aliases,
 	mblen_table_big5,
 	MBFL_ENCTYPE_MBCS
@@ -6995,7 +6995,69 @@ mbfl_strlen(mbfl_string *string TSRMLS_DC)
 	return len;
 }
 
+#ifdef ZEND_MULTIBYTE
+/*
+ * oddlen
+ *
+ * Reports how many bytes the last character in string->val runs past
+ * string->len, using the encoding's byte-length table (0 when the buffer
+ * ends on a character boundary).
+ * Returns -1 when string is NULL or mbfl_no2encoding() yields no encoding,
+ * and 0 for single-byte encodings, for the WCS2/WCS4 encodings (see the
+ * note in the body) and for encodings without a length table.
+ */
+int
+mbfl_oddlen(mbfl_string *string)
+{
+	int len, n, m, k;
+	unsigned char *p;
+	const unsigned char *mbtab;
+	mbfl_encoding *encoding;
+
+	/* validate string before it is dereferenced for the encoding lookup */
+	if (string == NULL) {
+		return -1;
+	}
+	encoding = mbfl_no2encoding(string->no_encoding);
+	if (encoding == NULL) {
+		return -1;
+	}
+	/*
+	 * NOTE(review): len is never loaded from string->len, so the two
+	 * fixed-width branches below always return 0 -- confirm whether
+	 * string->len was intended.
+	 */
+	len = 0;
+	if (encoding->flag & MBFL_ENCTYPE_SBCS) {
+		return 0;
+	} else if (encoding->flag & (MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE)) {
+		return len % 2;
+	} else if (encoding->flag & (MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) {
+		return len % 4;
+	} else if (encoding->mblen_table != NULL) {
+		mbtab = encoding->mblen_table;
+		n = 0;
+		p = string->val;
+		k = string->len;
+		/* step character by character; n overshoots k by the missing bytes */
+		if (p != NULL) {
+			while (n < k) {
+				m = mbtab[*p];
+				n += m;
+				p += m;
+			};
+		}
+		return n-k;
+	} else {
+		/* how can i do ? */
+		return 0;
+	}
+	/* NOT REACHED */
+}
+#endif /* ZEND_MULTIBYTE */
+
+
 
 /*
  * strpos
  */
diff --git a/ext/mbstring/mbfilter.h b/ext/mbstring/mbfilter.h
index 65ee94b573..a5077bd578 100644
--- a/ext/mbstring/mbfilter.h
+++ b/ext/mbstring/mbfilter.h
@@ -461,6 +461,14 @@ mbfl_identify_encoding_no(mbfl_string *string, enum mbfl_no_encoding *elist, int
 int
 mbfl_strlen(mbfl_string *string TSRMLS_DC);
 
+#ifdef ZEND_MULTIBYTE
+/*
+ * oddlen
+ */
+int
+mbfl_oddlen(mbfl_string *string);
+#endif /* ZEND_MULTIBYTE */
+
 /*
  * strpos
  */
diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c
index 40d16522d2..5b40ff3f89 100644
--- a/ext/mbstring/mbstring.c
+++ b/ext/mbstring/mbstring.c
@@ -65,6 +65,10 @@
 #include "php_content_types.h"
 #include "SAPI.h"
 
+#ifdef ZEND_MULTIBYTE
+#include "zend_multibyte.h"
+#endif /* ZEND_MULTIBYTE */
+
 #if HAVE_MBSTRING
 
 #if HAVE_MBREGEX
@@ -524,6 +528,25 @@ static PHP_INI_MH(OnUpdate_mbstring_internal_encoding)
 	return SUCCESS;
 }
 
+#ifdef ZEND_MULTIBYTE
+static PHP_INI_MH(OnUpdate_mbstring_script_encoding)
+{
+	int *list, size;
+
+	if (php_mbstring_parse_encoding_list(new_value, new_value_length, &list, &size, 1)) {
+		if (MBSTRG(script_encoding_list) != NULL) {
+			free(MBSTRG(script_encoding_list));
+		}
+		MBSTRG(script_encoding_list) = list;
+		MBSTRG(script_encoding_list_size) = size;
+	} else {
+		return FAILURE;
+	}
+
+	return SUCCESS;
+}
+#endif /* ZEND_MULTIBYTE */
+
 static PHP_INI_MH(OnUpdate_mbstring_substitute_character)
 {
 	if (new_value != NULL) {
@@ -546,6 +569,9 @@ PHP_INI_BEGIN()
 	PHP_INI_ENTRY("mbstring.http_input", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_input)
 	PHP_INI_ENTRY("mbstring.http_output", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_output)
 	PHP_INI_ENTRY("mbstring.internal_encoding", NULL, PHP_INI_ALL, OnUpdate_mbstring_internal_encoding)
+#ifdef ZEND_MULTIBYTE
+	PHP_INI_ENTRY("mbstring.script_encoding", NULL, PHP_INI_ALL, OnUpdate_mbstring_script_encoding)
+#endif /* ZEND_MULTIBYTE */
 	PHP_INI_ENTRY("mbstring.substitute_character", NULL, PHP_INI_ALL, OnUpdate_mbstring_substitute_character)
 	STD_PHP_INI_ENTRY("mbstring.func_overload", "0", PHP_INI_SYSTEM, OnUpdateInt, func_overload, zend_mbstring_globals, mbstring_globals)
 PHP_INI_END()
@@ -579,6 +605,10 @@ php_mbstring_init_globals(zend_mbstring_globals *pglobals TSRMLS_DC)
 	MBSTRG(internal_encoding) = mbfl_no_encoding_euc_jp;
 	MBSTRG(current_internal_encoding) = mbfl_no_encoding_euc_jp;
 #endif
+#ifdef ZEND_MULTIBYTE
+	MBSTRG(script_encoding_list) = NULL;
+	MBSTRG(script_encoding_list_size) = 0;
+#endif /* ZEND_MULTIBYTE */
 	MBSTRG(http_output_encoding) = mbfl_no_encoding_pass;
 	MBSTRG(current_http_output_encoding) = mbfl_no_encoding_pass;
 	MBSTRG(http_input_identify) = mbfl_no_encoding_invalid;
@@ -640,6 +670,11 @@ PHP_MSHUTDOWN_FUNCTION(mbstring)
 	if (MBSTRG(http_input_list)) {
 		free(MBSTRG(http_input_list));
 	}
+#ifdef ZEND_MULTIBYTE
+	if (MBSTRG(script_encoding_list)) {
+		free(MBSTRG(script_encoding_list));
+	}
+#endif /* ZEND_MULTIBYTE */
 	if (MBSTRG(detect_order_list)) {
 		free(MBSTRG(detect_order_list));
 	}
@@ -858,6 +893,9 @@ PHP_FUNCTION(mb_internal_encoding)
 			RETURN_FALSE;
 		} else {
 			MBSTRG(current_internal_encoding) = no_encoding;
+#ifdef ZEND_MULTIBYTE
+			zend_multibyte_set_internal_encoding(Z_STRVAL_PP(arg1), Z_STRLEN_PP(arg1) TSRMLS_CC);
+#endif /* ZEND_MULTIBYTE */
 			RETURN_TRUE;
 		}
 	} else {
@@ -3174,6 +3212,208 @@ PHP_FUNCTION(mb_get_info)
 }
 /* }}} */
 
+
+#ifdef ZEND_MULTIBYTE
+/*
+ * Passes mbstring's script encoding settings to the Zend Engine: joins the
+ * names of MBSTRG(script_encoding_list) with ',' for
+ * zend_multibyte_set_script_encoding() and registers the detector and
+ * oddlen callbacks (plus the converter when MBSTR_ENC_TRANS is defined).
+ * Returns 0, or -1 if the name buffer could not be allocated.
+ */
+PHPAPI int php_mbstring_set_zend_encoding(TSRMLS_D)
+{
+	/* 'd better use mbfl_memory_device? */
+	char *name, *list = NULL;
+	int n, *entry, list_size = 0;
+	zend_encoding_detector encoding_detector;
+	zend_encoding_converter encoding_converter;
+	zend_multibyte_oddlen multibyte_oddlen;
+
+	/* notify script encoding to Zend Engine */
+	entry = MBSTRG(script_encoding_list);
+	n = MBSTRG(script_encoding_list_size);
+	while (n > 0) {
+		name = (char *)mbfl_no_encoding2name(*entry);
+		if (name) {
+			/* the extra byte is the terminator for the first name and
+			   the ',' separator for each following one */
+			list_size += strlen(name) + 1;
+			if (!list) {
+				list = (char *)emalloc(list_size);
+				if (!list) {
+					return -1;
+				}
+				*list = '\0';
+			} else {
+				list = (char *)erealloc(list, list_size);
+				if (!list) {
+					return -1;
+				}
+				strcat(list, ",");
+			}
+			strcat(list, name);
+		}
+		entry++;
+		n--;
+	}
+	zend_multibyte_set_script_encoding(list, (list ? strlen(list) : 0) TSRMLS_CC);
+	if (list) {
+		efree(list);
+	}
+
+	encoding_detector = php_mbstring_encoding_detector;
+	encoding_converter = NULL;
+	multibyte_oddlen = php_mbstring_oddlen;
+
+#if defined(MBSTR_ENC_TRANS)
+	/* notify internal encoding to Zend Engine */
+	name = (char*)mbfl_no_encoding2name(MBSTRG(current_internal_encoding));
+	if (name != NULL) {
+		/* same guard as in the loop above: strlen(NULL) must be avoided */
+		zend_multibyte_set_internal_encoding(name, strlen(name) TSRMLS_CC);
+	}
+
+	encoding_converter = php_mbstring_encoding_converter;
+#endif /* defined(MBSTR_ENC_TRANS) */
+
+	zend_multibyte_set_functions(encoding_detector, encoding_converter,
+			multibyte_oddlen TSRMLS_CC);
+
+	return 0;
+}
+
+/*
+ * mb_detect_encoding (interface for Zend Engine)
+ *
+ * Detects the encoding of arg_string among the encodings named in arg_list.
+ * Returns an estrdup()'ed encoding name, or NULL when arg_list is NULL,
+ * yields no encodings, or no encoding could be identified.
+ */
+char* php_mbstring_encoding_detector(char *arg_string, int arg_length, char *arg_list TSRMLS_DC)
+{
+	mbfl_string string;
+	const char *ret;
+	enum mbfl_no_encoding *elist;
+	int size, *list;
+
+	/* make encoding list */
+	list = NULL;
+	size = 0;
+	if (arg_list != NULL) {
+		/* strlen(NULL) is undefined, so only parse a real list */
+		php_mbstring_parse_encoding_list(arg_list, strlen(arg_list), &list, &size, 0);
+	}
+	if (size <= 0) {
+		return NULL;
+	}
+
+	/* size > 0 here, so the detect-order fallback only applies if the
+	   parser reported entries without handing back a list */
+	if (size > 0 && list != NULL) {
+		elist = list;
+	} else {
+		elist = MBSTRG(current_detect_order_list);
+		size = MBSTRG(current_detect_order_list_size);
+	}
+
+	mbfl_string_init(&string);
+	string.no_language = MBSTRG(current_language);
+	string.val = arg_string;
+	string.len = arg_length;
+	ret = mbfl_identify_encoding_name(&string, elist, size);
+	if (list != NULL) {
+		efree((void *)list);
+	}
+	if (ret != NULL) {
+		return estrdup(ret);
+	} else {
+		return NULL;
+	}
+}
+
+
+/*
+ * mb_convert_encoding (interface for Zend Engine)
+ *
+ * Converts from/from_length from encoding_from to encoding_to. On success
+ * stores the converted buffer and its length in *to and *to_length and
+ * returns 0. Returns -1, leaving *to and *to_length untouched, when
+ * mbfl_name2no_encoding() rejects either name, the converter cannot be
+ * created, or the conversion yields no result.
+ */
+int php_mbstring_encoding_converter(char **to, int *to_length, char *from,
+		int from_length, const char *encoding_to, const char *encoding_from
+		TSRMLS_DC)
+{
+	mbfl_string string, result, *ret;
+	enum mbfl_no_encoding from_encoding, to_encoding;
+	mbfl_buffer_converter *convd;
+
+	/* new encoding */
+	to_encoding = mbfl_name2no_encoding(encoding_to);
+	if (to_encoding == mbfl_no_encoding_invalid) {
+		return -1;
+	}
+
+	/* old encoding */
+	from_encoding = mbfl_name2no_encoding(encoding_from);
+	if (from_encoding == mbfl_no_encoding_invalid) {
+		return -1;
+	}
+
+	/* initialize string */
+	mbfl_string_init(&string);
+	mbfl_string_init(&result);
+	string.no_encoding = from_encoding;
+	string.no_language = MBSTRG(current_language);
+	string.val = from;
+	string.len = from_length;
+
+	/* initialize converter */
+	convd = mbfl_buffer_converter_new(from_encoding, to_encoding, string.len);
+	if (convd == NULL) {
+		return -1;
+	}
+	mbfl_buffer_converter_illegal_mode(convd, MBSTRG(current_filter_illegal_mode));
+	mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar));
+
+	/* do it */
+	ret = mbfl_buffer_converter_feed_result(convd, &string, &result);
+	if (ret != NULL) {
+		*to = ret->val;
+		*to_length = ret->len;
+	}
+	mbfl_buffer_converter_delete(convd);
+
+	return ret ? 0 : -1;
+}
+
+
+/*
+ * returns the number of odd bytes (e.g. only the first byte of a multibyte
+ * character is present) reported by mbfl_oddlen() for the given buffer,
+ * or 0 when mbfl_name2no_encoding() rejects the encoding name
+ */
+int php_mbstring_oddlen(char *string, int length, const char *encoding TSRMLS_DC)
+{
+	mbfl_string mb_string;
+
+	mbfl_string_init(&mb_string);
+	mb_string.no_language = MBSTRG(current_language);
+	mb_string.no_encoding = mbfl_name2no_encoding(encoding);
+	mb_string.val = string;
+	mb_string.len = length;
+
+	if (mb_string.no_encoding == mbfl_no_encoding_invalid) {
+		return 0;
+	}
+
+	return mbfl_oddlen(&mb_string);
+}
+
+#endif /* ZEND_MULTIBYTE */
+
 #endif /* HAVE_MBSTRING */
 
 /*
diff --git a/ext/mbstring/mbstring.h b/ext/mbstring/mbstring.h
index 22ff290c5d..98623e77b4 100644
--- a/ext/mbstring/mbstring.h
+++ b/ext/mbstring/mbstring.h
@@ -129,6 +129,10 @@ ZEND_BEGIN_MODULE_GLOBALS(mbstring)
 	int current_language;
 	int internal_encoding;
 	int current_internal_encoding;
+#ifdef ZEND_MULTIBYTE
+	int *script_encoding_list;
+	int script_encoding_list_size;
+#endif /* ZEND_MULTIBYTE */
 	int http_output_encoding;
 	int current_http_output_encoding;
 	int http_input_identify;
@@ -177,6 +181,16 @@ struct mb_overload_def {
 #define MBSTRG(v) (mbstring_globals.v)
 #endif
 
+#ifdef ZEND_MULTIBYTE
+PHPAPI int php_mbstring_set_zend_encoding(TSRMLS_D);
+char* php_mbstring_encoding_detector(char *string, int length, char *list
+		TSRMLS_DC);
+int php_mbstring_encoding_converter(char **to, int *to_length, char *from,
+		int from_length, const char *encoding_to, const char *encoding_from
+		TSRMLS_DC);
+int php_mbstring_oddlen(char *string, int length, const char *encoding TSRMLS_DC);
+#endif /* ZEND_MULTIBYTE */
+
 #else /* HAVE_MBSTRING */
 
 #define mbstring_module_ptr NULL
diff --git a/main/main.c b/main/main.c
index fcd73d8239..16b47b6cea 100644
--- a/main/main.c
+++ b/main/main.c
@@ -75,6 +75,10 @@
 #include "php_logos.h"
 #include "php_streams.h"
 
+#if defined(ZEND_MULTIBYTE) && defined(HAVE_MBSTRING)
+#include "ext/mbstring/mbstring.h"
+#endif /* defined(ZEND_MULTIBYTE) && defined(HAVE_MBSTRING) */
+
 #include "SAPI.h"
 /* }}} */
 
@@ -1402,6 +1406,9 @@ PHPAPI int php_execute_script(zend_file_handle *primary_file TSRMLS_DC)
 		} else {
 			append_file_p = NULL;
 		}
+#if defined(ZEND_MULTIBYTE) && defined(HAVE_MBSTRING)
+		php_mbstring_set_zend_encoding(TSRMLS_C);
+#endif /* ZEND_MULTIBYTE && HAVE_MBSTRING */
 		retval = (zend_execute_scripts(ZEND_REQUIRE TSRMLS_CC, NULL, 3, prepend_file_p, primary_file, append_file_p) == SUCCESS);
 	} zend_end_try();
 
diff --git a/sapi/apache/mod_php4.c b/sapi/apache/mod_php4.c
index d3ed4ef0bd..9c23523178 100644
--- a/sapi/apache/mod_php4.c
+++ b/sapi/apache/mod_php4.c
@@ -21,6 +21,10 @@
 
 #include "php_apache_http.h"
 
+#if defined(ZEND_MULTIBYTE) && defined(HAVE_MBSTRING)
+#include "ext/mbstring/mbstring.h"
+#endif /* defined(ZEND_MULTIBYTE) && defined(HAVE_MBSTRING) */
+
 #undef shutdown
 
 /* {{{ Prototypes
@@ -459,6 +463,11 @@ static int send_php(request_rec *r, int display_source_mode, char *filename)
 	fh.opened_path = NULL;
 	fh.free_filename = 0;
 	fh.type = ZEND_HANDLE_FILENAME;
+
+#if defined(ZEND_MULTIBYTE) && defined(HAVE_MBSTRING)
+	php_mbstring_set_zend_encoding(TSRMLS_C);
+#endif /* defined(ZEND_MULTIBYTE) && defined(HAVE_MBSTRING) */
+
 	zend_execute_scripts(ZEND_INCLUDE TSRMLS_CC, NULL, 1, &fh);
 	return OK;
 }