From: Dmitry Stogov Date: Fri, 29 Jun 2007 13:58:34 +0000 (+0000) Subject: Allowed loading XML from unicode strings X-Git-Tag: BEFORE_IMPORT_OF_MYSQLND~344 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=85279a4dce572e6c3b3300cd47339a65292ad02f;p=php Allowed loading XML from unicode strings --- diff --git a/ext/dom/document.c b/ext/dom/document.c index 46cc9b775a..30b31a3240 100644 --- a/ext/dom/document.c +++ b/ext/dom/document.c @@ -32,6 +32,8 @@ #include #endif +#include "ext/libxml/php_libxml.h" + typedef struct _idsIterator idsIterator; struct _idsIterator { xmlChar *elementId; @@ -1532,7 +1534,7 @@ static void dom_parse_document(INTERNAL_FUNCTION_PARAMETERS, int mode) { xmlDoc *docp = NULL, *newdoc; dom_doc_propsptr doc_prop; dom_object *intern; - char *source; + zstr source; int source_len, refcount, ret; zend_uchar source_type = IS_STRING; long options = 0; @@ -1542,14 +1544,8 @@ static void dom_parse_document(INTERNAL_FUNCTION_PARAMETERS, int mode) { id = NULL; } - if (mode == DOM_LOAD_FILE) { - if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "t|l", &source, &source_len, &source_type, &options) == FAILURE) { - return; - } - } else { - if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "S|l", &source, &source_len, &options) == FAILURE) { - return; - } + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "t|l", &source, &source_len, &source_type, &options) == FAILURE) { + return; } if (!source_len) { @@ -1558,15 +1554,19 @@ static void dom_parse_document(INTERNAL_FUNCTION_PARAMETERS, int mode) { } if (source_type == IS_UNICODE) { - if (php_stream_path_encode(NULL, &source, &source_len, (UChar*)source, source_len, REPORT_ERRORS, NULL) == FAILURE) { - RETURN_FALSE; + if (mode == DOM_LOAD_FILE) { + if (php_stream_path_encode(NULL, &source.s, &source_len, source.u, source_len, REPORT_ERRORS, NULL) == FAILURE) { + RETURN_FALSE; + } + } else { + source.s = php_libxml_unicode_to_string(source.u, source_len, &source_len TSRMLS_CC); } } - newdoc = dom_document_parser(id, mode, source, options TSRMLS_CC); + newdoc = dom_document_parser(id, mode, source.s, options TSRMLS_CC); if (source_type == IS_UNICODE) { - efree(source); + efree(source.s); } if (!newdoc) @@ -1860,13 +1860,13 @@ PHP_FUNCTION(dom_document_validate) #if defined(LIBXML_SCHEMAS_ENABLED) -static void -_dom_document_schema_validate(INTERNAL_FUNCTION_PARAMETERS, int type) +static void _dom_document_schema_validate(INTERNAL_FUNCTION_PARAMETERS, int type) { zval *id; xmlDoc *docp; dom_object *intern; - char *source = NULL, *valid_file = NULL; + zstr source = NULL_ZSTR; + char *valid_file = NULL; int source_len = 0; xmlSchemaParserCtxtPtr parser; xmlSchemaPtr sptr; @@ -1875,14 +1875,8 @@ _dom_document_schema_validate(INTERNAL_FUNCTION_PARAMETERS, int type) char resolved_path[MAXPATHLEN + 1]; zend_uchar source_type = IS_STRING; - if (type == DOM_LOAD_FILE) { - if (zend_parse_method_parameters(ZEND_NUM_ARGS() TSRMLS_CC, getThis(), "Ot", &id, dom_document_class_entry, &source, &source_len, &source_type) == FAILURE) { - return; - } - } else { - if (zend_parse_method_parameters(ZEND_NUM_ARGS() TSRMLS_CC, getThis(), "OS", &id, dom_document_class_entry, &source, &source_len) == FAILURE) { - return; - } + if (zend_parse_method_parameters(ZEND_NUM_ARGS() TSRMLS_CC, getThis(), "Ot", &id, dom_document_class_entry, &source, &source_len, &source_type) == FAILURE) { + return; } if (source_len == 0) { @@ -1895,15 +1889,15 @@ _dom_document_schema_validate(INTERNAL_FUNCTION_PARAMETERS, int type) switch (type) { case DOM_LOAD_FILE: if (source_type == IS_UNICODE) { - if (php_stream_path_encode(NULL, &source, &source_len, (UChar*)source, source_len, REPORT_ERRORS, NULL) == FAILURE) { + if (php_stream_path_encode(NULL, &source.s, &source_len, source.u, source_len, REPORT_ERRORS, NULL) == FAILURE) { RETURN_FALSE; } } - valid_file = _dom_get_valid_file_path(source, resolved_path, MAXPATHLEN TSRMLS_CC); + valid_file = _dom_get_valid_file_path(source.s, resolved_path, MAXPATHLEN TSRMLS_CC); if (!valid_file) { if (source_type == IS_UNICODE) { - efree(source); + efree(source.s); } php_error_docref(NULL TSRMLS_CC, E_WARNING, "Invalid Schema file source"); RETURN_FALSE; @@ -1911,13 +1905,19 @@ _dom_document_schema_validate(INTERNAL_FUNCTION_PARAMETERS, int type) parser = xmlSchemaNewParserCtxt(valid_file); if (source_type == IS_UNICODE) { - efree(source); + efree(source.s); } break; case DOM_LOAD_STRING: - parser = xmlSchemaNewMemParserCtxt(source, source_len); + if (source_type == IS_UNICODE) { + source.s = php_libxml_unicode_to_string(source.u, source_len, &source_len TSRMLS_CC); + } + parser = xmlSchemaNewMemParserCtxt(source.s, source_len); /* If loading from memory, we need to set the base directory for the document but it is not apparent how to do that for schema's */ + if (source_type == IS_UNICODE) { + efree(source.s); + } break; default: return; @@ -1976,7 +1976,8 @@ _dom_document_relaxNG_validate(INTERNAL_FUNCTION_PARAMETERS, int type) zval *id; xmlDoc *docp; dom_object *intern; - char *source = NULL, *valid_file = NULL; + zstr source = NULL_ZSTR; + char *valid_file = NULL; int source_len = 0; xmlRelaxNGParserCtxtPtr parser; xmlRelaxNGPtr sptr; @@ -1985,14 +1986,8 @@ _dom_document_relaxNG_validate(INTERNAL_FUNCTION_PARAMETERS, int type) char resolved_path[MAXPATHLEN + 1]; zend_uchar source_type = IS_STRING; - if (type == DOM_LOAD_FILE) { - if (zend_parse_method_parameters(ZEND_NUM_ARGS() TSRMLS_CC, getThis(), "Ot", &id, dom_document_class_entry, &source, &source_len, &source_type) == FAILURE) { - return; - } - } else { - if (zend_parse_method_parameters(ZEND_NUM_ARGS() TSRMLS_CC, getThis(), "OS", &id, dom_document_class_entry, &source, &source_len) == FAILURE) { - return; - } + if (zend_parse_method_parameters(ZEND_NUM_ARGS() TSRMLS_CC, getThis(), "Ot", &id, dom_document_class_entry, &source, &source_len, &source_type) == FAILURE) { + return; } if (source_len == 0) { @@ -2005,27 +2000,33 @@ _dom_document_relaxNG_validate(INTERNAL_FUNCTION_PARAMETERS, int type) switch (type) { case DOM_LOAD_FILE: if (source_type == IS_UNICODE) { - if (php_stream_path_encode(NULL, &source, &source_len, (UChar*)source, source_len, REPORT_ERRORS, NULL) == FAILURE) { + if (php_stream_path_encode(NULL, &source.s, &source_len, source.u, source_len, REPORT_ERRORS, NULL) == FAILURE) { RETURN_FALSE; } } - valid_file = _dom_get_valid_file_path(source, resolved_path, MAXPATHLEN TSRMLS_CC); + valid_file = _dom_get_valid_file_path(source.s, resolved_path, MAXPATHLEN TSRMLS_CC); if (!valid_file) { if (source_type == IS_UNICODE) { - efree(source); + efree(source.s); } php_error_docref(NULL TSRMLS_CC, E_WARNING, "Invalid RelaxNG file source"); RETURN_FALSE; } parser = xmlRelaxNGNewParserCtxt(valid_file); if (source_type == IS_UNICODE) { - efree(source); + efree(source.s); } break; case DOM_LOAD_STRING: - parser = xmlRelaxNGNewMemParserCtxt(source, source_len); + if (source_type == IS_UNICODE) { + source.s = php_libxml_unicode_to_string(source.u, source_len, &source_len TSRMLS_CC); + } + parser = xmlRelaxNGNewMemParserCtxt(source.s, source_len); /* If loading from memory, we need to set the base directory for the document but it is not apparent how to do that for schema's */ + if (source_type == IS_UNICODE) { + efree(source.s); + } break; default: return; @@ -2087,21 +2088,15 @@ static void dom_load_html(INTERNAL_FUNCTION_PARAMETERS, int mode) xmlDoc *docp = NULL, *newdoc; dom_object *intern; dom_doc_propsptr doc_prop; - char *source; + zstr source; int source_len, refcount, ret; htmlParserCtxtPtr ctxt; zend_uchar source_type = IS_STRING; id = getThis(); - if (mode == DOM_LOAD_FILE) { - if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "t", &source, &source_len, &source_type) == FAILURE) { - return; - } - } else { - if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "S", &source, &source_len) == FAILURE) { - return; - } + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "t", &source, &source_len, &source_type) == FAILURE) { + return; } if (!source_len) { @@ -2111,19 +2106,26 @@ static void dom_load_html(INTERNAL_FUNCTION_PARAMETERS, int mode) if (mode == DOM_LOAD_FILE) { if (source_type == IS_UNICODE) { - if (php_stream_path_encode(NULL, &source, &source_len, (UChar*)source, source_len, REPORT_ERRORS, NULL) == FAILURE) { + if (php_stream_path_encode(NULL, &source.s, &source_len, source.u, source_len, REPORT_ERRORS, NULL) == FAILURE) { RETURN_FALSE; } } - ctxt = htmlCreateFileParserCtxt(source, NULL); + ctxt = htmlCreateFileParserCtxt(source.s, NULL); if (source_type == IS_UNICODE) { - efree(source); + efree(source.s); } } else { - source_len = xmlStrlen(source); - ctxt = htmlCreateMemoryParserCtxt(source, source_len); + if (source_type == IS_UNICODE) { + source.s = php_libxml_unicode_to_string(source.u, source_len, &source_len TSRMLS_CC); + } + + ctxt = htmlCreateMemoryParserCtxt(source.s, source_len); + + if (source_type == IS_UNICODE) { + efree(source.s); + } } if (!ctxt) { diff --git a/ext/dom/tests/dom002u.phpt b/ext/dom/tests/dom002u.phpt new file mode 100755 index 0000000000..c6576b69bd --- /dev/null +++ b/ext/dom/tests/dom002u.phpt @@ -0,0 +1,82 @@ +--TEST-- +Test 2u: getElementsByTagName() / getElementsByTagNameNS() +--SKIPIF-- + +--FILE-- + + + + + + + +HERE; + +function dump($elems) { + foreach ($elems as $elem) { + var_dump($elem->nodeName); + dump($elem->childNodes); + } +} + +$dom = new DOMDocument(); +$dom->loadXML($xml); +$doc = $dom->documentElement; +dump($dom->getElementsByTagName('bar')); +dump($doc->getElementsByTagName('bar')); +dump($dom->getElementsByTagNameNS('http://www.example.com/ns/fubar', 'bar')); +dump($doc->getElementsByTagNameNS('http://www.example.com/ns/fubar', 'bar')); +?> +--EXPECT-- +string(3) "bar" +string(5) "test1" +string(3) "bar" +string(5) "test2" +string(9) "fubar:bar" +string(5) "test3" +string(9) "fubar:bar" +string(5) "test4" +string(3) "bar" +string(5) "test1" +string(3) "bar" +string(5) "test2" +string(9) "fubar:bar" +string(5) "test3" +string(9) "fubar:bar" +string(5) "test4" +string(9) "fubar:bar" +string(5) "test3" +string(9) "fubar:bar" +string(5) "test4" +string(9) "fubar:bar" +string(5) "test3" +string(9) "fubar:bar" +string(5) "test4" +--UEXPECT-- +unicode(3) "bar" +unicode(5) "test1" +unicode(3) "bar" +unicode(5) "test2" +unicode(9) "fubar:bar" +unicode(5) "test3" +unicode(9) "fubar:bar" +unicode(5) "test4" +unicode(3) "bar" +unicode(5) "test1" +unicode(3) "bar" +unicode(5) "test2" +unicode(9) "fubar:bar" +unicode(5) "test3" +unicode(9) "fubar:bar" +unicode(5) "test4" +unicode(9) "fubar:bar" +unicode(5) "test3" +unicode(9) "fubar:bar" +unicode(5) "test4" +unicode(9) "fubar:bar" +unicode(5) "test3" +unicode(9) "fubar:bar" +unicode(5) "test4" diff --git a/ext/libxml/libxml.c b/ext/libxml/libxml.c index e69f504a4f..aacc467eae 100644 --- a/ext/libxml/libxml.c +++ b/ext/libxml/libxml.c @@ -1059,6 +1059,107 @@ void php_libxml_node_decrement_resource(php_libxml_node_object *object TSRMLS_DC } /* }}} */ +PHP_LIBXML_API char* php_libxml_unicode_to_string(UChar *ustr, int ustr_len, int *str_len TSRMLS_DC) +{ + UErrorCode errCode = 0; + char *tmp; + int tmp_len; + + zend_unicode_to_string_ex(UG(utf8_conv), &tmp, &tmp_len, ustr, ustr_len, &errCode); + *str_len = tmp_len; + + /* Substitute uncoding with "utf8" */ + if (tmp[0] == '<' && + tmp[1] == '?' && + tmp[2] == 'x' && + tmp[3] == 'm' && + tmp[4] == 'l') { + char *s = tmp + sizeof("') { + if ((*s >= 'a' && *s <= 'z') || (*s >= 'A' && *s <= 'Z')) { + char *attr = s; + char *val; + int attr_len, val_len; + + while ((*s >= 'a' && *s <= 'z') || + (*s >= 'A' && *s <= 'Z') || + (*s >= '0' && *s <= '9') || + (*s == '_')) { + ++s; + } + attr_len = s - attr; + while (*s == ' ' || *s == '\t' || *s == '\r' || *s == '\n') { + ++s; + } + if (*s == '=') { + ++s; + } else { + break; + } + while (*s == ' ' || *s == '\t' || *s == '\r' || *s == '\n') { + ++s; + } + if (*s == '"') { + ++s; + } else { + break; + } + val = s; + while (*s != 0 && *s != '"') { + ++s; + } + if (*s == '"') { + val_len = s - val; + ++s; + } else { + break; + } + + while (*s == ' ' || *s == '\t' || *s == '\r' || *s == '\n') { + ++s; + } + + if (attr_len == sizeof("encoding")-1 && + strncasecmp(attr, "encoding", sizeof("encoding")-1) == 0) { + if (val_len >= sizeof("utf-8")-1) { + val[0] = 'u'; + val[1] = 't'; + val[2] = 'f'; + val[3] = '-'; + val[4] = '8'; + val[5] = '"'; + while (val_len > sizeof("utf-8")-1) { + val[val_len] = ' '; + --val_len; + } + }else if (val_len >= sizeof("utf8")-1) { + val[0] = 'u'; + val[1] = 't'; + val[2] = 'f'; + val[3] = '8'; + val[4] = '"'; + while (val_len > sizeof("utf8")-1) { + val[val_len] = ' '; + --val_len; + } + } else { + /* Encoding name too short */ + break; + } + } + + } else { + break; + } + } + } + return tmp; +} + #ifdef PHP_WIN32 PHP_LIBXML_API BOOL WINAPI DllMain(HINSTANCE hinstDLL, DWORD fdwReason, LPVOID lpvReserved) { diff --git a/ext/libxml/php_libxml.h b/ext/libxml/php_libxml.h index fc4ca2e11c..d8145a2a10 100644 --- a/ext/libxml/php_libxml.h +++ b/ext/libxml/php_libxml.h @@ -93,6 +93,7 @@ void php_libxml_ctx_error(void *ctx, const char *msg, ...); PHP_LIBXML_API int php_libxml_xmlCheckUTF8(const unsigned char *s); PHP_LIBXML_API zval *php_libxml_switch_context(zval *context TSRMLS_DC); PHP_LIBXML_API void php_libxml_issue_error(int level, const char *msg TSRMLS_DC); +PHP_LIBXML_API char* php_libxml_unicode_to_string(UChar *ustr, int ustr_len, int *str_len TSRMLS_DC); /* Init/shutdown functions*/ PHP_LIBXML_API void php_libxml_initialize(); diff --git a/ext/simplexml/simplexml.c b/ext/simplexml/simplexml.c index f87c3c8b6a..15fbf456f1 100644 --- a/ext/simplexml/simplexml.c +++ b/ext/simplexml/simplexml.c @@ -30,6 +30,7 @@ #include "php_ini.h" #include "ext/standard/info.h" #include "ext/standard/php_string.h" +#include "ext/libxml/php_libxml.h" #include "php_simplexml.h" #include "php_simplexml_exports.h" #include "zend_exceptions.h" @@ -2059,8 +2060,9 @@ PHP_FUNCTION(simplexml_load_file) PHP_FUNCTION(simplexml_load_string) { php_sxe_object *sxe; - char *data; + zstr data; int data_len; + zend_uchar data_type; xmlDocPtr docp; char *ns = NULL; int ns_len = 0; @@ -2068,11 +2070,17 @@ PHP_FUNCTION(simplexml_load_string) zend_class_entry *ce= sxe_class_entry; zend_bool isprefix = 0; - if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "S|C!ls&b", &data, &data_len, &ce, &options, &ns, &ns_len, UG(utf8_conv), &isprefix) == FAILURE) { + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "t|C!ls&b", &data, &data_len, &data_type, &ce, &options, &ns, &ns_len, UG(utf8_conv), &isprefix) == FAILURE) { return; } - docp = xmlReadMemory(data, data_len, NULL, NULL, options); + if (data_type == IS_UNICODE) { + data.s = php_libxml_unicode_to_string(data.u, data_len, &data_len TSRMLS_CC); + } + docp = xmlReadMemory(data.s, data_len, NULL, NULL, options); + if (data_type == IS_UNICODE) { + efree(data.s); + } if (! docp) { RETURN_FALSE; diff --git a/ext/soap/soap.c b/ext/soap/soap.c index c834cf6c9d..5f48d38053 100644 --- a/ext/soap/soap.c +++ b/ext/soap/soap.c @@ -296,12 +296,9 @@ ZEND_GET_MODULE(soap) char* soap_unicode_to_string(UChar *ustr, int ustr_len TSRMLS_DC) { - UErrorCode errCode = 0; - char *tmp; - int tmp_len; + int dummy; - zend_unicode_to_string_ex(UG(utf8_conv), &tmp, &tmp_len, ustr, ustr_len, &errCode); - return tmp; + return php_libxml_unicode_to_string(ustr, ustr_len, &dummy TSRMLS_CC); } void soap_decode_string(zval *ret, char* str TSRMLS_DC) diff --git a/ext/soap/tests/bugs/bug39815.phpt b/ext/soap/tests/bugs/bug39815.phpt index e23c6b96d1..d3609d791b 100755 --- a/ext/soap/tests/bugs/bug39815.phpt +++ b/ext/soap/tests/bugs/bug39815.phpt @@ -32,10 +32,10 @@ class LocalSoapClient extends SoapClient { $x = new LocalSoapClient(NULL,array('location'=>'test://', 'uri'=>'http://testuri.org', "trace"=>1)); -setlocale(LC_ALL,"sv_SE","sv_SE.ISO8859-1"); +@setlocale(LC_ALL,"sv_SE","sv_SE.ISO8859-1"); var_dump($x->test()); echo $x->__getLastResponse(); -setlocale(LC_ALL,"en_US","en_US.ISO8859-1"); +@setlocale(LC_ALL,"en_US","en_US.ISO8859-1"); var_dump($x->test()); echo $x->__getLastResponse(); --EXPECT--