From 755c2cd0d85b65f35abb2d54204fa7d38b820268 Mon Sep 17 00:00:00 2001 From: Dmitry Stogov Date: Wed, 8 Dec 2010 11:27:34 +0000 Subject: [PATCH] Removed compile time dependency from ext/mbstring --- NEWS | 1 + Zend/zend_compile.c | 3 - Zend/zend_globals.h | 5 - Zend/zend_multibyte.c | 56 ++++++++--- Zend/zend_multibyte.h | 14 ++- ext/exif/exif.c | 96 ++++++++---------- ext/mbstring/mbstring.c | 117 ++++++++++++++++++++-- ext/standard/html.c | 83 +-------------- main/rfc1867.c | 217 +++++++++++++++++++--------------------- main/rfc1867.h | 13 +++ tests/basic/028.phpt | 104 +++++++++++++++++++ tests/basic/029.phpt | 51 ++++++++++ 12 files changed, 481 insertions(+), 279 deletions(-) create mode 100644 tests/basic/028.phpt create mode 100644 tests/basic/029.phpt diff --git a/NEWS b/NEWS index 54f5e4ca67..c20cf4ee4d 100644 --- a/NEWS +++ b/NEWS @@ -32,6 +32,7 @@ PHP NEWS . Added multibyte suppport by default. Previosly php had to be compiled with --enable-zend-multibyte. Now it can be enabled or disabled throug zend.multibyte directive in php.ini (Dmitry) + . Removed compile time dependency from ext/mbstring (Dmitry) . Added scalar typehints to the parser and the reflection API. (Ilia, Derick) . Added support for Traits. (Stefan) . Added closure $this support back. 
(Stas) diff --git a/Zend/zend_compile.c b/Zend/zend_compile.c index 5656b2dedc..ac31da4157 100644 --- a/Zend/zend_compile.c +++ b/Zend/zend_compile.c @@ -200,9 +200,6 @@ void zend_init_compiler_data_structures(TSRMLS_D) /* {{{ */ CG(script_encoding_list) = NULL; CG(script_encoding_list_size) = 0; CG(internal_encoding) = NULL; - CG(encoding_detector) = NULL; - CG(encoding_converter) = NULL; - CG(encoding_oddlen) = NULL; CG(encoding_declared) = 0; } /* }}} */ diff --git a/Zend/zend_globals.h b/Zend/zend_globals.h index 1f6b8d4f1f..8ec2a88c04 100644 --- a/Zend/zend_globals.h +++ b/Zend/zend_globals.h @@ -155,11 +155,6 @@ struct _zend_compiler_globals { zend_encoding *internal_encoding; - /* multibyte utility functions */ - zend_encoding_detector encoding_detector; - zend_encoding_converter encoding_converter; - zend_encoding_oddlen encoding_oddlen; - #ifdef ZTS zval ***static_members_table; int last_static_member; diff --git a/Zend/zend_multibyte.c b/Zend/zend_multibyte.c index 33d8688b37..0843413d05 100644 --- a/Zend/zend_multibyte.c +++ b/Zend/zend_multibyte.c @@ -513,7 +513,36 @@ static zend_encoding *zend_encoding_table[] = { NULL }; +static char* dummy_encoding_detector(const unsigned char *string, size_t length, char *list TSRMLS_DC) +{ + return NULL; +} + +static int dummy_encoding_converter(unsigned char **to, size_t *to_length, const unsigned char *from, size_t from_length, const char *encoding_to, const char *encoding_from TSRMLS_DC) +{ + return -1; +} +static size_t dummy_encoding_oddlen(const unsigned char *string, size_t length, const char *encoding TSRMLS_DC) +{ + return 0; +} + +static int dummy_encoding_list_checker(const char *encoding_list TSRMLS_DC) +{ + return 1; /* no checker installed: accept the list, callers treat 0 as "illegal encoding" */ +} + +static const char* dummy_get_internal_encoding(TSRMLS_D) +{ + return NULL; +} + +ZEND_API zend_encoding_detector zend_multibyte_encoding_detector = dummy_encoding_detector; +ZEND_API zend_encoding_converter zend_multibyte_encoding_converter = dummy_encoding_converter; +ZEND_API 
zend_encoding_oddlen zend_multibyte_encoding_oddlen = dummy_encoding_oddlen; +ZEND_API zend_encoding_list_checker zend_multibyte_check_encoding_list = dummy_encoding_list_checker; +ZEND_API zend_encoding_name_getter zend_multibyte_get_internal_encoding = dummy_get_internal_encoding; ZEND_API int zend_multibyte_set_script_encoding(const char *encoding_list, size_t encoding_list_size TSRMLS_DC) @@ -540,11 +569,13 @@ ZEND_API int zend_multibyte_set_internal_encoding(const char *encoding_name TSRM return 0; } -ZEND_API int zend_multibyte_set_functions(zend_encoding_detector encoding_detector, zend_encoding_converter encoding_converter, zend_encoding_oddlen encoding_oddlen TSRMLS_DC) +ZEND_API int zend_multibyte_set_functions(zend_encoding_detector encoding_detector, zend_encoding_converter encoding_converter, zend_encoding_oddlen encoding_oddlen, zend_encoding_list_checker encoding_list_checker, zend_encoding_name_getter get_internal_encoding TSRMLS_DC) { - CG(encoding_detector) = encoding_detector; - CG(encoding_converter) = encoding_converter; - CG(encoding_oddlen) = encoding_oddlen; + zend_multibyte_encoding_detector = encoding_detector; + zend_multibyte_encoding_converter = encoding_converter; + zend_multibyte_encoding_oddlen = encoding_oddlen; + zend_multibyte_check_encoding_list = encoding_list_checker; + zend_multibyte_get_internal_encoding = get_internal_encoding; return 0; } @@ -659,18 +690,16 @@ static size_t zend_multibyte_encoding_filter(unsigned char **to, size_t *to_leng { size_t oddlen; - if (!CG(encoding_converter)) { + if (zend_multibyte_encoding_converter == dummy_encoding_converter) { return 0; } - if (CG(encoding_oddlen)) { - oddlen = CG(encoding_oddlen)(from, from_length, from_encoding TSRMLS_CC); - if (oddlen > 0) { - from_length -= oddlen; - } + oddlen = zend_multibyte_encoding_oddlen(from, from_length, from_encoding TSRMLS_CC); + if (oddlen > 0) { + from_length -= oddlen; } - if (CG(encoding_converter)(to, to_length, from, from_length, 
to_encoding, from_encoding TSRMLS_CC) != 0) { + if (zend_multibyte_encoding_converter(to, to_length, from, from_length, to_encoding, from_encoding TSRMLS_CC) != 0) { return 0; } @@ -1053,10 +1082,11 @@ static zend_encoding* zend_multibyte_find_script_encoding(zend_encoding *onetime } /* if multiple encodings specified, detect automagically */ - if (CG(script_encoding_list_size) > 1 && CG(encoding_detector)) { + if (CG(script_encoding_list_size) > 1 && + zend_multibyte_encoding_detector != dummy_encoding_detector) { list = zend_multibyte_assemble_encoding_list(CG(script_encoding_list), CG(script_encoding_list_size)); - name = CG(encoding_detector)(LANG_SCNG(script_org), + name = zend_multibyte_encoding_detector(LANG_SCNG(script_org), LANG_SCNG(script_org_size), list TSRMLS_CC); if (list) { efree(list); diff --git a/Zend/zend_multibyte.h b/Zend/zend_multibyte.h index 02421a8ab5..94d8417b80 100644 --- a/Zend/zend_multibyte.h +++ b/Zend/zend_multibyte.h @@ -36,6 +36,10 @@ typedef int (*zend_encoding_converter)(unsigned char **to, size_t *to_length, co typedef size_t (*zend_encoding_oddlen)(const unsigned char *string, size_t length, const char *encoding TSRMLS_DC); +typedef int (*zend_encoding_list_checker)(const char *encoding_list TSRMLS_DC); + +typedef const char* (*zend_encoding_name_getter)(TSRMLS_D); + typedef struct _zend_encoding { zend_encoding_filter input_filter; /* escape input filter */ zend_encoding_filter output_filter; /* escape output filter */ @@ -49,10 +53,18 @@ typedef struct _zend_encoding { * zend multibyte APIs */ BEGIN_EXTERN_C() + +/* multibyte utility functions */ +ZEND_API extern zend_encoding_detector zend_multibyte_encoding_detector; +ZEND_API extern zend_encoding_converter zend_multibyte_encoding_converter; +ZEND_API extern zend_encoding_oddlen zend_multibyte_encoding_oddlen; +ZEND_API extern zend_encoding_list_checker zend_multibyte_check_encoding_list; +ZEND_API extern zend_encoding_name_getter zend_multibyte_get_internal_encoding; + 
ZEND_API int zend_multibyte_set_script_encoding(const char *encoding_list, size_t encoding_list_size TSRMLS_DC); ZEND_API int zend_multibyte_set_internal_encoding(const char *encoding_name TSRMLS_DC); -ZEND_API int zend_multibyte_set_functions(zend_encoding_detector encoding_detector, zend_encoding_converter encoding_converter, zend_encoding_oddlen encoding_oddlen TSRMLS_DC); +ZEND_API int zend_multibyte_set_functions(zend_encoding_detector encoding_detector, zend_encoding_converter encoding_converter, zend_encoding_oddlen encoding_oddlen, zend_encoding_list_checker encoding_list_checker, zend_encoding_name_getter get_internal_encoding TSRMLS_DC); ZEND_API int zend_multibyte_set_filter(zend_encoding *onetime_encoding TSRMLS_DC); ZEND_API zend_encoding* zend_multibyte_fetch_encoding(const char *encoding_name); ZEND_API size_t zend_multibyte_script_encoding_filter(unsigned char **to, size_t diff --git a/ext/exif/exif.c b/ext/exif/exif.c index 6a4fcc4a77..381d236153 100644 --- a/ext/exif/exif.c +++ b/ext/exif/exif.c @@ -66,16 +66,6 @@ #include "ext/standard/php_image.h" #include "ext/standard/info.h" -#if defined(PHP_WIN32) || (HAVE_MBSTRING && !defined(COMPILE_DL_MBSTRING)) -#define EXIF_USE_MBSTRING 1 -#else -#define EXIF_USE_MBSTRING 0 -#endif - -#if EXIF_USE_MBSTRING -#include "ext/mbstring/mbstring.h" -#endif - /* needed for ssize_t definition */ #include @@ -176,23 +166,19 @@ ZEND_DECLARE_MODULE_GLOBALS(exif) ZEND_INI_MH(OnUpdateEncode) { -#if EXIF_USE_MBSTRING - if (new_value && strlen(new_value) && !php_mb_check_encoding_list(new_value TSRMLS_CC)) { + if (new_value && strlen(new_value) && !zend_multibyte_check_encoding_list(new_value TSRMLS_CC)) { php_error_docref(NULL TSRMLS_CC, E_WARNING, "Illegal encoding ignored: '%s'", new_value); return FAILURE; } -#endif return OnUpdateString(entry, new_value, new_value_length, mh_arg1, mh_arg2, mh_arg3, stage TSRMLS_CC); } ZEND_INI_MH(OnUpdateDecode) { -#if EXIF_USE_MBSTRING - if (!php_mb_check_encoding_list(new_value 
TSRMLS_CC)) { + if (!zend_multibyte_check_encoding_list(new_value TSRMLS_CC)) { php_error_docref(NULL TSRMLS_CC, E_WARNING, "Illegal encoding ignored: '%s'", new_value); return FAILURE; } -#endif return OnUpdateString(entry, new_value, new_value_length, mh_arg1, mh_arg2, mh_arg3, stage TSRMLS_CC); } @@ -224,7 +210,11 @@ static PHP_GINIT_FUNCTION(exif) PHP_MINIT_FUNCTION(exif) { REGISTER_INI_ENTRIES(); - REGISTER_LONG_CONSTANT("EXIF_USE_MBSTRING", EXIF_USE_MBSTRING, CONST_CS | CONST_PERSISTENT); + if (zend_hash_exists(&module_registry, "mbstring", sizeof("mbstring"))) { + REGISTER_LONG_CONSTANT("EXIF_USE_MBSTRING", 1, CONST_CS | CONST_PERSISTENT); + } else { + REGISTER_LONG_CONSTANT("EXIF_USE_MBSTRING", 0, CONST_CS | CONST_PERSISTENT); + } return SUCCESS; } /* }}} */ @@ -241,9 +231,7 @@ PHP_MSHUTDOWN_FUNCTION(exif) /* {{{ exif dependencies */ static const zend_module_dep exif_module_deps[] = { ZEND_MOD_REQUIRED("standard") -#if EXIF_USE_MBSTRING - ZEND_MOD_REQUIRED("mbstring") -#endif + ZEND_MOD_OPTIONAL("mbstring") {NULL, NULL, NULL} }; /* }}} */ @@ -2588,7 +2576,6 @@ static int exif_process_undefined(char **result, char *value, size_t byte_count /* {{{ exif_process_string_raw * Copy a string in Exif header to a character string returns length of allocated buffer if any. 
*/ -#if !EXIF_USE_MBSTRING static int exif_process_string_raw(char **result, char *value, size_t byte_count) { /* we cannot use strlcpy - here the problem is that we have to copy NUL * chars up to byte_count, we also have to add a single NUL character to @@ -2602,7 +2589,6 @@ static int exif_process_string_raw(char **result, char *value, size_t byte_count } return 0; } -#endif /* }}} */ /* {{{ exif_process_string @@ -2629,11 +2615,8 @@ static int exif_process_string(char **result, char *value, size_t byte_count TSR static int exif_process_user_comment(image_info_type *ImageInfo, char **pszInfoPtr, char **pszEncoding, char *szValuePtr, int ByteCount TSRMLS_DC) { int a; - -#if EXIF_USE_MBSTRING char *decode; size_t len;; -#endif *pszEncoding = NULL; /* Copy the comment */ @@ -2642,7 +2625,6 @@ static int exif_process_user_comment(image_info_type *ImageInfo, char **pszInfoP *pszEncoding = estrdup((const char*)szValuePtr); szValuePtr = szValuePtr+8; ByteCount -= 8; -#if EXIF_USE_MBSTRING /* First try to detect BOM: ZERO WIDTH NOBREAK SPACE (FEFF 16) * since we have no encoding support for the BOM yet we skip that. 
*/ @@ -2659,34 +2641,38 @@ static int exif_process_user_comment(image_info_type *ImageInfo, char **pszInfoP } else { decode = ImageInfo->decode_unicode_le; } - *pszInfoPtr = php_mb_convert_encoding(szValuePtr, ByteCount, ImageInfo->encode_unicode, decode, &len TSRMLS_CC); + if (zend_multibyte_encoding_converter( + pszInfoPtr, + &len, + szValuePtr, + ByteCount, + ImageInfo->encode_unicode, + decode + TSRMLS_CC) != 0) { + len = exif_process_string_raw(pszInfoPtr, szValuePtr, ByteCount); + } return len; -#else - return exif_process_string_raw(pszInfoPtr, szValuePtr, ByteCount); -#endif - } else - if (!memcmp(szValuePtr, "ASCII\0\0\0", 8)) { + } else if (!memcmp(szValuePtr, "ASCII\0\0\0", 8)) { *pszEncoding = estrdup((const char*)szValuePtr); szValuePtr = szValuePtr+8; ByteCount -= 8; - } else - if (!memcmp(szValuePtr, "JIS\0\0\0\0\0", 8)) { + } else if (!memcmp(szValuePtr, "JIS\0\0\0\0\0", 8)) { /* JIS should be tanslated to MB or we leave it to the user - leave it to the user */ *pszEncoding = estrdup((const char*)szValuePtr); szValuePtr = szValuePtr+8; ByteCount -= 8; -#if EXIF_USE_MBSTRING - if (ImageInfo->motorola_intel) { - *pszInfoPtr = php_mb_convert_encoding(szValuePtr, ByteCount, ImageInfo->encode_jis, ImageInfo->decode_jis_be, &len TSRMLS_CC); - } else { - *pszInfoPtr = php_mb_convert_encoding(szValuePtr, ByteCount, ImageInfo->encode_jis, ImageInfo->decode_jis_le, &len TSRMLS_CC); + if (zend_multibyte_encoding_converter( + pszInfoPtr, + &len, + szValuePtr, + ByteCount, + ImageInfo->encode_jis, + ImageInfo->motorola_intel ? ImageInfo->decode_jis_be : ImageInfo->decode_jis_le + TSRMLS_CC) != 0) { + len = exif_process_string_raw(pszInfoPtr, szValuePtr, ByteCount); } return len; -#else - return exif_process_string_raw(pszInfoPtr, szValuePtr, ByteCount); -#endif - } else - if (!memcmp(szValuePtr, "\0\0\0\0\0\0\0\0", 8)) { + } else if (!memcmp(szValuePtr, "\0\0\0\0\0\0\0\0", 8)) { /* 8 NULL means undefined and should be ASCII... 
*/ *pszEncoding = estrdup("UNDEFINED"); szValuePtr = szValuePtr+8; @@ -2714,19 +2700,17 @@ static int exif_process_unicode(image_info_type *ImageInfo, xp_field_type *xp_fi xp_field->tag = tag; /* Copy the comment */ -#if EXIF_USE_MBSTRING -/* What if MS supports big-endian with XP? */ -/* if (ImageInfo->motorola_intel) { - xp_field->value = php_mb_convert_encoding(szValuePtr, ByteCount, ImageInfo->encode_unicode, ImageInfo->decode_unicode_be, &xp_field->size TSRMLS_CC); - } else { - xp_field->value = php_mb_convert_encoding(szValuePtr, ByteCount, ImageInfo->encode_unicode, ImageInfo->decode_unicode_le, &xp_field->size TSRMLS_CC); - }*/ - xp_field->value = php_mb_convert_encoding(szValuePtr, ByteCount, ImageInfo->encode_unicode, ImageInfo->decode_unicode_le, &xp_field->size TSRMLS_CC); - return xp_field->size; -#else - xp_field->size = exif_process_string_raw(&xp_field->value, szValuePtr, ByteCount); + if (zend_multibyte_encoding_converter( + &xp_field->value, + &xp_field->size, + szValuePtr, + ByteCount, + ImageInfo->encode_unicode, + ImageInfo->motorola_intel ? 
ImageInfo->decode_unicode_be : ImageInfo->decode_unicode_le + TSRMLS_CC) != 0) { + xp_field->size = exif_process_string_raw(&xp_field->value, szValuePtr, ByteCount); + } return xp_field->size; -#endif } /* }}} */ diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index 6c3a3260eb..d4119dda97 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -96,6 +96,7 @@ ZEND_DECLARE_MODULE_GLOBALS(mbstring) static PHP_GINIT_FUNCTION(mbstring); static PHP_GSHUTDOWN_FUNCTION(mbstring); +static const char* php_mb_internal_encoding_name(TSRMLS_D); static size_t php_mb_oddlen(const unsigned char *string, size_t length, const char *encoding TSRMLS_DC); static int php_mb_encoding_converter(unsigned char **to, size_t *to_length, const unsigned char *from, size_t from_length, const char *encoding_to, const char *encoding_from TSRMLS_DC); static char* php_mb_encoding_detector(const unsigned char *arg_string, size_t arg_length, char *arg_list TSRMLS_DC); @@ -769,7 +770,8 @@ php_mb_parse_encoding_list(const char *value, int value_length, enum mbfl_no_enc /* }}} */ /* {{{ MBSTRING_API php_mb_check_encoding_list */ -MBSTRING_API int php_mb_check_encoding_list(const char *encoding_list TSRMLS_DC) { +MBSTRING_API int php_mb_check_encoding_list(const char *encoding_list TSRMLS_DC) +{ return php_mb_parse_encoding_list(encoding_list, strlen(encoding_list), NULL, NULL, 0 TSRMLS_CC); } /* }}} */ @@ -956,6 +958,76 @@ static int php_mb_nls_get_default_detect_order_list(enum mbfl_no_language lang, } /* }}} */ +static char *php_mb_rfc1867_substring(char *start, int len, char quote TSRMLS_DC) +{ + char *result = emalloc(len + 2); + char *resp = result; + int i; + + for (i = 0; i < len && start[i] != quote; ++i) { + if (start[i] == '\\' && (start[i + 1] == '\\' || (quote && start[i + 1] == quote))) { + *resp++ = start[++i]; + } else { + size_t j = php_mb_gpc_mbchar_bytes(start+i TSRMLS_CC); + + while (j-- > 0 && i < len) { + *resp++ = start[i++]; + } + --i; + } + } + + *resp 
= '\0'; + return result; +} + +static char *php_mb_rfc1867_getword(char *str TSRMLS_DC) /* {{{ */ +{ + while (*str && isspace(*str)) { + ++str; + } + + if (!*str) { + return estrdup(""); + } + + if (*str == '"' || *str == '\'') { + char quote = *str; + + str++; + return php_mb_rfc1867_substring(str, strlen(str), quote TSRMLS_CC); + } else { + char *strend = str; + + while (*strend && !isspace(*strend)) { + ++strend; + } + return php_mb_rfc1867_substring(str, strend - str, 0 TSRMLS_CC); + } +} +/* }}} */ + +static char *php_mb_rfc1867_basename(char *filename TSRMLS_DC) /* {{{ */ +{ + char *s, *tmp; + + /* The \ check should technically be needed for win32 systems only where + * it is a valid path separator. However, IE in all it's wisdom always sends + * the full path of the file on the user's filesystem, which means that unless + * the user does basename() they get a bogus file name. Until IE's user base drops + * to nill or problem is fixed this code must remain enabled for all systems. */ + s = php_mb_strrchr(filename, '\\' TSRMLS_CC); + if ((tmp = php_mb_strrchr(filename, '/' TSRMLS_CC)) > s) { + s = tmp; + } + if (s) { + return s + 1; + } else { + return filename; + } +} +/* }}} */ + /* {{{ php.ini directive handler */ /* {{{ static PHP_INI_MH(OnUpdate_mbstring_language) */ static PHP_INI_MH(OnUpdate_mbstring_language) @@ -1353,6 +1425,21 @@ PHP_MINIT_FUNCTION(mbstring) #if HAVE_MBREGEX PHP_MINIT(mb_regex) (INIT_FUNC_ARGS_PASSTHRU); #endif + + zend_multibyte_set_functions( + php_mb_encoding_detector, + php_mb_encoding_converter, + php_mb_oddlen, + php_mb_check_encoding_list, + php_mb_internal_encoding_name TSRMLS_CC); + + php_rfc1867_set_multibyte_callbacks( + php_mb_encoding_translation, + php_mb_gpc_encoding_detector, + php_mb_gpc_encoding_converter, + php_mb_rfc1867_getword, + php_mb_rfc1867_basename); + return SUCCESS; } /* }}} */ @@ -4697,9 +4784,6 @@ static int php_mb_set_zend_encoding(TSRMLS_D) /* 'd better use mbfl_memory_device? 
*/ char *name, *list = NULL; int n, *entry, list_size = 0; - zend_encoding_detector encoding_detector; - zend_encoding_converter encoding_converter; - zend_encoding_oddlen encoding_oddlen; /* notify script encoding to Zend Engine */ entry = MBSTRG(script_encoding_list); @@ -4724,9 +4808,6 @@ static int php_mb_set_zend_encoding(TSRMLS_D) if (list) { efree(list); } - encoding_detector = php_mb_encoding_detector; - encoding_converter = php_mb_encoding_converter; - encoding_oddlen = php_mb_oddlen; /* TODO: make independent from mbstring.encoding_translation? */ if (MBSTRG(encoding_translation)) { @@ -4735,8 +4816,6 @@ static int php_mb_set_zend_encoding(TSRMLS_D) zend_multibyte_set_internal_encoding(name TSRMLS_CC); } - zend_multibyte_set_functions(encoding_detector, encoding_converter, encoding_oddlen TSRMLS_CC); - return 0; } /* }}} */ @@ -4849,6 +4928,26 @@ static size_t php_mb_oddlen(const unsigned char *string, size_t length, const ch } /* }}} */ +/* {{{ const char* php_mb_internal_encoding_name() + * returns name of internal encoding + */ +static const char* php_mb_internal_encoding_name(TSRMLS_D) +{ + const char *name = mbfl_no_encoding2name(MBSTRG(current_internal_encoding)); + + if (!name || + !*name || + (strlen(name) == 4 && + (!memcmp("pass", name, sizeof("pass") - 1) || + !memcmp("auto", name, sizeof("auto") - 1) || + !memcmp("none", name, sizeof("none") - 1)))) { + return NULL; + } + return name; +} +/* }}} */ + + #endif /* HAVE_MBSTRING */ /* diff --git a/ext/standard/html.c b/ext/standard/html.c index a40268115e..6285feadfe 100644 --- a/ext/standard/html.c +++ b/ext/standard/html.c @@ -54,11 +54,6 @@ #include #endif -#if HAVE_MBSTRING -# include "ext/mbstring/mbstring.h" -ZEND_EXTERN_MODULE_GLOBALS(mbstring) -#endif - #include #include "html_tables.h" @@ -372,7 +367,6 @@ static enum entity_charset determine_charset(char *charset_hint TSRMLS_DC) int i; enum entity_charset charset = cs_utf_8; int len = 0; - zval *uf_result = NULL; /* Default is now UTF-8 
*/ if (charset_hint == NULL) @@ -381,79 +375,11 @@ static enum entity_charset determine_charset(char *charset_hint TSRMLS_DC) if ((len = strlen(charset_hint)) != 0) { goto det_charset; } -#if HAVE_MBSTRING -#if !defined(COMPILE_DL_MBSTRING) - /* XXX: Ugly things. Why don't we look for a more sophisticated way? */ - switch (MBSTRG(current_internal_encoding)) { - case mbfl_no_encoding_8859_1: - return cs_8859_1; - - case mbfl_no_encoding_utf8: - return cs_utf_8; - - case mbfl_no_encoding_euc_jp: - case mbfl_no_encoding_eucjp_win: - return cs_eucjp; - - case mbfl_no_encoding_sjis: - case mbfl_no_encoding_sjis_open: - case mbfl_no_encoding_cp932: - return cs_sjis; - - case mbfl_no_encoding_cp1252: - return cs_cp1252; - - case mbfl_no_encoding_8859_15: - return cs_8859_15; - - case mbfl_no_encoding_big5: - return cs_big5; - case mbfl_no_encoding_euc_cn: - case mbfl_no_encoding_hz: - case mbfl_no_encoding_cp936: - return cs_gb2312; - - case mbfl_no_encoding_koi8r: - return cs_koi8r; - - case mbfl_no_encoding_cp866: - return cs_cp866; - - case mbfl_no_encoding_cp1251: - return cs_cp1251; - - case mbfl_no_encoding_8859_5: - return cs_8859_5; - - default: - ; - } -#else - { - zval nm_mb_internal_encoding; - - ZVAL_STRING(&nm_mb_internal_encoding, "mb_internal_encoding", 0); - - if (call_user_function_ex(CG(function_table), NULL, &nm_mb_internal_encoding, &uf_result, 0, NULL, 1, NULL TSRMLS_CC) != FAILURE) { - - charset_hint = Z_STRVAL_P(uf_result); - len = Z_STRLEN_P(uf_result); - - if ((len == 4) && /* sizeof(none|auto|pass)-1 */ - (!memcmp("pass", charset_hint, sizeof("pass") - 1) || - !memcmp("auto", charset_hint, sizeof("auto") - 1) || - !memcmp("none", charset_hint, sizeof("none") - 1))) { - - charset_hint = NULL; - len = 0; - } else { - goto det_charset; - } - } + charset_hint = (char*)zend_multibyte_get_internal_encoding(TSRMLS_C); + if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) { + goto det_charset; } -#endif -#endif charset_hint = 
SG(default_charset); if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) { @@ -514,9 +440,6 @@ det_charset: charset_hint); } } - if (uf_result != NULL) { - zval_ptr_dtor(&uf_result); - } return charset; } /* }}} */ diff --git a/main/rfc1867.c b/main/rfc1867.c index 304272dd52..b0672da2eb 100644 --- a/main/rfc1867.c +++ b/main/rfc1867.c @@ -36,23 +36,49 @@ #define DEBUG_FILE_UPLOAD ZEND_DEBUG -PHPAPI int (*php_rfc1867_callback)(unsigned int event, void *event_data, void **extra TSRMLS_DC) = NULL; +static int dummy_encoding_translation(TSRMLS_D) +{ + return 0; +} -#if HAVE_MBSTRING && !defined(COMPILE_DL_MBSTRING) -#include "ext/mbstring/mbstring.h" +static php_rfc1867_encoding_translation_t php_rfc1867_encoding_translation = dummy_encoding_translation; +static php_rfc1867_encoding_detector_t php_rfc1867_encoding_detector = NULL; +static php_rfc1867_encoding_converter_t php_rfc1867_encoding_converter = NULL; +static php_rfc1867_getword_t php_rfc1867_getword = NULL; +static php_rfc1867_basename_t php_rfc1867_basename = NULL; + +PHPAPI int (*php_rfc1867_callback)(unsigned int event, void *event_data, void **extra TSRMLS_DC) = NULL; static void safe_php_register_variable(char *var, char *strval, int val_len, zval *track_vars_array, zend_bool override_protection TSRMLS_DC); static void php_flush_gpc_variables(int num_vars, char **val_list, int *len_list, zval *array_ptr TSRMLS_DC) /* {{{ */ { int i; + unsigned int new_val_len; if (num_vars > 0 && - php_mb_gpc_encoding_detector(val_list, len_list, num_vars, NULL TSRMLS_CC) == SUCCESS) { - php_mb_gpc_encoding_converter(val_list, len_list, num_vars, NULL, NULL TSRMLS_CC); + php_rfc1867_encoding_detector(val_list, len_list, num_vars, NULL TSRMLS_CC) == SUCCESS) { + php_rfc1867_encoding_converter(val_list, len_list, num_vars, NULL, NULL TSRMLS_CC); } for (i = 0; i 0 && i < len) { - *resp++ = start[i++]; - } - --i; - } else { - *resp++ = start[i]; - } -#else *resp++ = start[i]; -#endif } } @@ -564,65 +576,29 @@ static 
char *substring_conf(char *start, int len, char quote TSRMLS_DC) return result; } -static char *php_ap_getword_conf(char **line TSRMLS_DC) +static char *php_ap_getword_conf(char *str TSRMLS_DC) { - char *str = *line, *strend, *res, quote; - -#if HAVE_MBSTRING && !defined(COMPILE_DL_MBSTRING) - if (php_mb_encoding_translation(TSRMLS_C)) { - int len=strlen(str); - php_mb_gpc_encoding_detector(&str, &len, 1, NULL TSRMLS_CC); - } -#endif - while (*str && isspace(*str)) { ++str; } if (!*str) { - *line = str; return estrdup(""); } - if ((quote = *str) == '"' || quote == '\'') { - strend = str + 1; -look_for_quote: - while (*strend && *strend != quote) { - if (*strend == '\\' && strend[1] && strend[1] == quote) { - strend += 2; - } else { - ++strend; - } - } - if (*strend && *strend == quote) { - char p = *(strend + 1); - if (p != '\r' && p != '\n' && p != '\0') { - strend++; - goto look_for_quote; - } - } - - res = substring_conf(str + 1, strend - str - 1, quote TSRMLS_CC); - - if (*strend == quote) { - ++strend; - } + if (*str == '"' || *str == '\'') { + char quote = *str; + str++; + return substring_conf(str, strlen(str), quote TSRMLS_CC); } else { + char *strend = str; - strend = str; while (*strend && !isspace(*strend)) { ++strend; } - res = substring_conf(str, strend - str, 0 TSRMLS_CC); - } - - while (*strend && isspace(*strend)) { - ++strend; + return substring_conf(str, strend - str, 0 TSRMLS_CC); } - - *line = strend; - return res; } /* @@ -733,10 +709,8 @@ SAPI_API SAPI_POST_HANDLER_FUNC(rfc1867_post_handler) /* {{{ */ int max_file_size = 0, skip_upload = 0, anonindex = 0, is_anonymous; zval *http_post_files = NULL; HashTable *uploaded_files = NULL; -#if HAVE_MBSTRING && !defined(COMPILE_DL_MBSTRING) int str_len = 0, num_vars = 0, num_vars_max = 2*10, *len_list = NULL; char **val_list = NULL; -#endif multipart_buffer *mbuff; zval *array_ptr = (zval *) arg; int fd = -1; @@ -806,12 +780,11 @@ SAPI_API SAPI_POST_HANDLER_FUNC(rfc1867_post_handler) /* {{{ */ 
INIT_PZVAL(http_post_files); PG(http_globals)[TRACK_VARS_FILES] = http_post_files; -#if HAVE_MBSTRING && !defined(COMPILE_DL_MBSTRING) - if (php_mb_encoding_translation(TSRMLS_C)) { + if (php_rfc1867_encoding_translation(TSRMLS_C)) { val_list = (char **)ecalloc(num_vars_max+2, sizeof(char *)); len_list = (int *)ecalloc(num_vars_max+2, sizeof(int)); } -#endif + zend_llist_init(&header, sizeof(mime_header_entry), (llist_dtor_func_t) php_free_hdr_entry, 0); if (php_rfc1867_callback != NULL) { @@ -859,12 +832,36 @@ SAPI_API SAPI_POST_HANDLER_FUNC(rfc1867_post_handler) /* {{{ */ if (param) { efree(param); } - param = php_ap_getword_conf(&pair TSRMLS_CC); + if (php_rfc1867_encoding_translation(TSRMLS_C)) { + if (num_vars >= num_vars_max) { + php_gpc_realloc_buffer(&val_list, &len_list, &num_vars_max, 1 TSRMLS_CC); + } + val_list[num_vars] = pair; + len_list[num_vars] = strlen(pair); + num_vars++; + php_rfc1867_encoding_detector(val_list, len_list, num_vars, NULL TSRMLS_CC); + num_vars--; + param = php_rfc1867_getword(pair TSRMLS_CC); + } else { + param = php_ap_getword_conf(pair TSRMLS_CC); + } } else if (!strcasecmp(key, "filename")) { if (filename) { efree(filename); } - filename = php_ap_getword_conf(&pair TSRMLS_CC); + if (php_rfc1867_encoding_translation(TSRMLS_C)) { + if (num_vars >= num_vars_max) { + php_gpc_realloc_buffer(&val_list, &len_list, &num_vars_max, 1 TSRMLS_CC); + } + val_list[num_vars] = pair; + len_list[num_vars] = strlen(pair); + num_vars++; + php_rfc1867_encoding_detector(val_list, len_list, num_vars, NULL TSRMLS_CC); + num_vars--; + filename = php_rfc1867_getword(pair TSRMLS_CC); + } else { + filename = php_ap_getword_conf(pair TSRMLS_CC); + } } } if (key) { @@ -883,7 +880,10 @@ SAPI_API SAPI_POST_HANDLER_FUNC(rfc1867_post_handler) /* {{{ */ value = estrdup(""); } - if (sapi_module.input_filter(PARSE_POST, param, &value, value_len, &new_val_len TSRMLS_CC)) { + if (php_rfc1867_encoding_translation(TSRMLS_C)) { + /* postpone filtering, callback call 
and registration */ + php_gpc_stack_variable(param, value, &val_list, &len_list, &num_vars, &num_vars_max TSRMLS_CC); + } else if (sapi_module.input_filter(PARSE_POST, param, &value, value_len, &new_val_len TSRMLS_CC)) { if (php_rfc1867_callback != NULL) { multipart_event_formdata event_formdata; size_t newlength = new_val_len; @@ -900,16 +900,7 @@ SAPI_API SAPI_POST_HANDLER_FUNC(rfc1867_post_handler) /* {{{ */ } new_val_len = newlength; } - -#if HAVE_MBSTRING && !defined(COMPILE_DL_MBSTRING) - if (php_mb_encoding_translation(TSRMLS_C)) { - php_gpc_stack_variable(param, value, &val_list, &len_list, &num_vars, &num_vars_max TSRMLS_CC); - } else { - safe_php_register_variable(param, value, new_val_len, array_ptr, 0 TSRMLS_CC); - } -#else safe_php_register_variable(param, value, new_val_len, array_ptr, 0 TSRMLS_CC); -#endif } else if (php_rfc1867_callback != NULL) { multipart_event_formdata event_formdata; @@ -1144,30 +1135,25 @@ SAPI_API SAPI_POST_HANDLER_FUNC(rfc1867_post_handler) /* {{{ */ snprintf(lbuf, llen, "%s_name", param); } - /* The \ check should technically be needed for win32 systems only where - * it is a valid path separator. However, IE in all it's wisdom always sends - * the full path of the file on the user's filesystem, which means that unless - * the user does basename() they get a bogus file name. Until IE's user base drops - * to nill or problem is fixed this code must remain enabled for all systems. 
*/ -#if HAVE_MBSTRING && !defined(COMPILE_DL_MBSTRING) - if (php_mb_encoding_translation(TSRMLS_C)) { + if (php_rfc1867_encoding_translation(TSRMLS_C)) { if (num_vars >= num_vars_max) { php_gpc_realloc_buffer(&val_list, &len_list, &num_vars_max, 1 TSRMLS_CC); } val_list[num_vars] = filename; len_list[num_vars] = strlen(filename); num_vars++; - if (php_mb_gpc_encoding_detector(val_list, len_list, num_vars, NULL TSRMLS_CC) == SUCCESS) { + if (php_rfc1867_encoding_detector(val_list, len_list, num_vars, NULL TSRMLS_CC) == SUCCESS) { str_len = strlen(filename); - php_mb_gpc_encoding_converter(&filename, &str_len, 1, NULL, NULL TSRMLS_CC); - } - s = php_mb_strrchr(filename, '\\' TSRMLS_CC); - if ((tmp = php_mb_strrchr(filename, '/' TSRMLS_CC)) > s) { - s = tmp; + php_rfc1867_encoding_converter(&filename, &str_len, 1, NULL, NULL TSRMLS_CC); } + s = php_rfc1867_basename(filename TSRMLS_CC); num_vars--; } else { -#endif + /* The \ check should technically be needed for win32 systems only where + * it is a valid path separator. However, IE in all it's wisdom always sends + * the full path of the file on the user's filesystem, which means that unless + * the user does basename() they get a bogus file name. Until IE's user base drops + * to nill or problem is fixed this code must remain enabled for all systems. */ s = strrchr(filename, '\\'); if ((tmp = strrchr(filename, '/')) > s) { s = tmp; @@ -1181,17 +1167,15 @@ SAPI_API SAPI_POST_HANDLER_FUNC(rfc1867_post_handler) /* {{{ */ s = tmp > s ? 
tmp : s; } #endif - -#if HAVE_MBSTRING && !defined(COMPILE_DL_MBSTRING) + if (s) { + s++; + } else { + s = filename; + } } -#endif if (!is_anonymous) { - if (s && s > filename) { - safe_php_register_variable(lbuf, s+1, strlen(s+1), NULL, 0 TSRMLS_CC); - } else { - safe_php_register_variable(lbuf, filename, strlen(filename), NULL, 0 TSRMLS_CC); - } + safe_php_register_variable(lbuf, s, strlen(s), NULL, 0 TSRMLS_CC); } /* Add $foo[name] */ @@ -1200,11 +1184,7 @@ SAPI_API SAPI_POST_HANDLER_FUNC(rfc1867_post_handler) /* {{{ */ } else { snprintf(lbuf, llen, "%s[name]", param); } - if (s && s > filename) { - register_http_post_files_variable(lbuf, s+1, http_post_files, 0 TSRMLS_CC); - } else { - register_http_post_files_variable(lbuf, filename, http_post_files, 0 TSRMLS_CC); - } + register_http_post_files_variable(lbuf, s, http_post_files, 0 TSRMLS_CC); efree(filename); s = NULL; @@ -1320,11 +1300,9 @@ fileupload_done: php_rfc1867_callback(MULTIPART_EVENT_END, &event_end, &event_extra_data TSRMLS_CC); } -#if HAVE_MBSTRING && !defined(COMPILE_DL_MBSTRING) - if (php_mb_encoding_translation(TSRMLS_C)) { + if (php_rfc1867_encoding_translation(TSRMLS_C)) { php_flush_gpc_variables(num_vars, val_list, len_list, array_ptr TSRMLS_CC); } -#endif if (lbuf) efree(lbuf); if (abuf) efree(abuf); @@ -1338,6 +1316,21 @@ fileupload_done: } /* }}} */ +SAPI_API void php_rfc1867_set_multibyte_callbacks( + php_rfc1867_encoding_translation_t encoding_translation, + php_rfc1867_encoding_detector_t encoding_detector, + php_rfc1867_encoding_converter_t encoding_converter, + php_rfc1867_getword_t getword, + php_rfc1867_basename_t basename) /* {{{ */ +{ + php_rfc1867_encoding_translation = encoding_translation; + php_rfc1867_encoding_detector = encoding_detector; + php_rfc1867_encoding_converter = encoding_converter; + php_rfc1867_getword = getword; + php_rfc1867_basename = basename; +} +/* }}} */ + /* * Local variables: * tab-width: 4 diff --git a/main/rfc1867.h b/main/rfc1867.h index 
60fddf7837..d5067da48f 100644 --- a/main/rfc1867.h +++ b/main/rfc1867.h @@ -67,10 +67,23 @@ typedef struct _multipart_event_end { size_t post_bytes_processed; } multipart_event_end; +typedef int (*php_rfc1867_encoding_translation_t)(TSRMLS_D); /* gate: the upload handler takes its multibyte path only when this returns non-zero */ +typedef int (*php_rfc1867_encoding_detector_t)(char **arg_string, int *arg_length, int num, char *arg_list TSRMLS_DC); /* receives num strings with their lengths; the handler converts only when this returns SUCCESS */ +typedef int (*php_rfc1867_encoding_converter_t)(char **str, int *len, int num, const char *encoding_to, const char *encoding_from TSRMLS_DC); /* str and len are passed by address, presumably so they can be replaced -- TODO confirm; the handler passes NULL for both encodings */ +typedef char* (*php_rfc1867_getword_t)(char *str TSRMLS_DC); /* NOTE(review): no call site visible in this part of the patch -- confirm semantics against the implementation */ +typedef char* (*php_rfc1867_basename_t)(char *str TSRMLS_DC); /* the returned pointer is registered as the uploaded file's name and passed to strlen() */ + SAPI_API SAPI_POST_HANDLER_FUNC(rfc1867_post_handler); void destroy_uploaded_files_hash(TSRMLS_D); void php_rfc1867_register_constants(TSRMLS_D); extern PHPAPI int (*php_rfc1867_callback)(unsigned int event, void *event_data, void **extra TSRMLS_DC); +/* Installs the five multibyte callbacks declared above for use by the RFC 1867 upload handler. */ SAPI_API void php_rfc1867_set_multibyte_callbacks( + php_rfc1867_encoding_translation_t encoding_translation, + php_rfc1867_encoding_detector_t encoding_detector, + php_rfc1867_encoding_converter_t encoding_converter, + php_rfc1867_getword_t getword, + php_rfc1867_basename_t basename); + #endif /* RFC1867_H */ diff --git a/tests/basic/028.phpt b/tests/basic/028.phpt new file mode 100644 index 0000000000..6bdcd575ca --- /dev/null +++ b/tests/basic/028.phpt @@ -0,0 +1,104 @@ +--TEST-- +RFC1867 character quotting +--INI-- +file_uploads=1 +--POST_RAW-- +Content-Type: multipart/form-data; boundary=---------------------------20896060251896012921717172737 +-----------------------------20896060251896012921717172737 +Content-Disposition: form-data; name=name1 + +testname +-----------------------------20896060251896012921717172737 +Content-Disposition: form-data; name='name2' + +testname +-----------------------------20896060251896012921717172737 +Content-Disposition: form-data; name="name3" + +testname +-----------------------------20896060251896012921717172737 +Content-Disposition: form-data; name=name\4 
+ +testname +-----------------------------20896060251896012921717172737 +Content-Disposition: form-data; name=name\\5 + +testname +-----------------------------20896060251896012921717172737 +Content-Disposition: form-data; name=name\'6 + +testname +-----------------------------20896060251896012921717172737 +Content-Disposition: form-data; name=name\"7 + +testname +-----------------------------20896060251896012921717172737 +Content-Disposition: form-data; name='name\8' + +testname +-----------------------------20896060251896012921717172737 +Content-Disposition: form-data; name='name\\9' + +testname +-----------------------------20896060251896012921717172737 +Content-Disposition: form-data; name='name\'10' + +testname +-----------------------------20896060251896012921717172737 +Content-Disposition: form-data; name='name\"11' + +testname +-----------------------------20896060251896012921717172737 +Content-Disposition: form-data; name="name\12" + +testname +-----------------------------20896060251896012921717172737 +Content-Disposition: form-data; name="name\\13" + +testname +-----------------------------20896060251896012921717172737 +Content-Disposition: form-data; name="name\'14" + +testname +-----------------------------20896060251896012921717172737 +Content-Disposition: form-data; name="name\"15" + +testname +-----------------------------20896060251896012921717172737-- +--FILE-- + +--EXPECTF-- +array(15) { + ["name1"]=> + string(8) "testname" + ["name2"]=> + string(8) "testname" + ["name3"]=> + string(8) "testname" + ["name\\4"]=> + string(8) "testname" + ["name\\5"]=> + string(8) "testname" + ["name\\\'6"]=> + string(8) "testname" + ["name\\\"7"]=> + string(8) "testname" + ["name\\8"]=> + string(8) "testname" + ["name\\9"]=> + string(8) "testname" + ["name\'10"]=> + string(8) "testname" + ["name\\\"11"]=> + string(8) "testname" + ["name\\12"]=> + string(8) "testname" + ["name\\13"]=> + string(8) "testname" + ["name\\\'14"]=> + string(8) "testname" + ["name\"15"]=> 
+ string(8) "testname" +} diff --git a/tests/basic/029.phpt b/tests/basic/029.phpt new file mode 100644 index 0000000000..6d95c0771d --- /dev/null +++ b/tests/basic/029.phpt @@ -0,0 +1,51 @@ +--TEST-- +Shift_JIS request +--SKIPIF-- + +--INI-- +file_uploads=1 +mbstring.encoding_translation=1 +mbstring.http_input=Shift_JIS +mbstring.internal_encoding=UTF-8 +--POST_RAW-- +Content-Type: multipart/form-data; boundary=---------------------------20896060251896012921717172737 +-----------------------------20896060251896012921717172737 +Content-Disposition: form-data; name="—\Ž\”\" + +ƒhƒŒƒ~ƒtƒ@ƒ\ +-----------------------------20896060251896012921717172737 +Content-Disposition: form-data; name="pics"; filename="file1.txt" +Content-Type: text/plain + +file1 + +-----------------------------20896060251896012921717172737-- +--FILE-- + +--EXPECTF-- +array(1) { + ["pics"]=> + array(5) { + ["name"]=> + string(9) "file1.txt" + ["type"]=> + string(10) "text/plain" + ["tmp_name"]=> + string(%d) "%s" + ["error"]=> + int(0) + ["size"]=> + int(6) + } +} +array(1) { + ["予蚕能"]=> + string(18) "ドレミファソ" +} -- 2.40.0