granicus.if.org Git - php/commitdiff
Allowed loading XML from unicode strings
author: Dmitry Stogov <dmitry@php.net>
Fri, 29 Jun 2007 13:58:34 +0000 (13:58 +0000)
committer: Dmitry Stogov <dmitry@php.net>
Fri, 29 Jun 2007 13:58:34 +0000 (13:58 +0000)
ext/dom/document.c
ext/dom/tests/dom002u.phpt [new file with mode: 0755]
ext/libxml/libxml.c
ext/libxml/php_libxml.h
ext/simplexml/simplexml.c
ext/soap/soap.c
ext/soap/tests/bugs/bug39815.phpt

index 46cc9b775a392c2baed80fbb476f64573a9dd2ad..30b31a3240b79fd1f4e7c0dedeabac245fa388cd 100644 (file)
@@ -32,6 +32,8 @@
 #include <libxml/xmlschemas.h>
 #endif
 
+#include "ext/libxml/php_libxml.h"
+
 typedef struct _idsIterator idsIterator;
 struct _idsIterator {
        xmlChar *elementId;
@@ -1532,7 +1534,7 @@ static void dom_parse_document(INTERNAL_FUNCTION_PARAMETERS, int mode) {
        xmlDoc *docp = NULL, *newdoc;
        dom_doc_propsptr doc_prop;
        dom_object *intern;
-       char *source;
+       zstr source;
        int source_len, refcount, ret;
        zend_uchar source_type = IS_STRING;
        long options = 0;
@@ -1542,14 +1544,8 @@ static void dom_parse_document(INTERNAL_FUNCTION_PARAMETERS, int mode) {
                id = NULL;
        }
 
-       if (mode == DOM_LOAD_FILE) {
-               if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "t|l", &source, &source_len, &source_type, &options) == FAILURE) {
-                       return;
-               }
-       } else {
-               if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "S|l", &source, &source_len, &options) == FAILURE) {
-                       return;
-               }
+       if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "t|l", &source, &source_len, &source_type, &options) == FAILURE) {
+               return;
        }
 
        if (!source_len) {
@@ -1558,15 +1554,19 @@ static void dom_parse_document(INTERNAL_FUNCTION_PARAMETERS, int mode) {
        }
 
        if (source_type == IS_UNICODE) {
-               if (php_stream_path_encode(NULL, &source, &source_len, (UChar*)source, source_len, REPORT_ERRORS, NULL) == FAILURE) {
-                       RETURN_FALSE;
+               if (mode == DOM_LOAD_FILE) {
+                       if (php_stream_path_encode(NULL, &source.s, &source_len, source.u, source_len, REPORT_ERRORS, NULL) == FAILURE) {
+                               RETURN_FALSE;
+                       }
+               } else {
+                       source.s = php_libxml_unicode_to_string(source.u, source_len, &source_len TSRMLS_CC);
                }
        }
 
-       newdoc = dom_document_parser(id, mode, source, options TSRMLS_CC);
+       newdoc = dom_document_parser(id, mode, source.s, options TSRMLS_CC);
 
        if (source_type == IS_UNICODE) {
-               efree(source);
+               efree(source.s);
        }
 
        if (!newdoc)
@@ -1860,13 +1860,13 @@ PHP_FUNCTION(dom_document_validate)
  
 
 #if defined(LIBXML_SCHEMAS_ENABLED)
-static void
-_dom_document_schema_validate(INTERNAL_FUNCTION_PARAMETERS, int type)
+static void _dom_document_schema_validate(INTERNAL_FUNCTION_PARAMETERS, int type)
 {
        zval *id;
        xmlDoc *docp;
        dom_object *intern;
-       char *source = NULL, *valid_file = NULL;
+       zstr source = NULL_ZSTR;
+       char *valid_file = NULL;
        int source_len = 0;
        xmlSchemaParserCtxtPtr  parser;
        xmlSchemaPtr            sptr;
@@ -1875,14 +1875,8 @@ _dom_document_schema_validate(INTERNAL_FUNCTION_PARAMETERS, int type)
        char resolved_path[MAXPATHLEN + 1];
        zend_uchar source_type = IS_STRING;
 
-       if (type == DOM_LOAD_FILE) {
-               if (zend_parse_method_parameters(ZEND_NUM_ARGS() TSRMLS_CC, getThis(), "Ot", &id, dom_document_class_entry, &source, &source_len, &source_type) == FAILURE) {
-                       return;
-               }
-       } else {
-               if (zend_parse_method_parameters(ZEND_NUM_ARGS() TSRMLS_CC, getThis(), "OS", &id, dom_document_class_entry, &source, &source_len) == FAILURE) {
-                       return;
-               }
+       if (zend_parse_method_parameters(ZEND_NUM_ARGS() TSRMLS_CC, getThis(), "Ot", &id, dom_document_class_entry, &source, &source_len, &source_type) == FAILURE) {
+               return;
        }
 
        if (source_len == 0) {
@@ -1895,15 +1889,15 @@ _dom_document_schema_validate(INTERNAL_FUNCTION_PARAMETERS, int type)
        switch (type) {
        case DOM_LOAD_FILE:
                if (source_type == IS_UNICODE) {
-                       if (php_stream_path_encode(NULL, &source, &source_len, (UChar*)source, source_len, REPORT_ERRORS, NULL) == FAILURE) {
+                       if (php_stream_path_encode(NULL, &source.s, &source_len, source.u, source_len, REPORT_ERRORS, NULL) == FAILURE) {
                                RETURN_FALSE;
                        }
                }
 
-               valid_file = _dom_get_valid_file_path(source, resolved_path, MAXPATHLEN  TSRMLS_CC);
+               valid_file = _dom_get_valid_file_path(source.s, resolved_path, MAXPATHLEN  TSRMLS_CC);
                if (!valid_file) {
                        if (source_type == IS_UNICODE) {
-                               efree(source);
+                               efree(source.s);
                        }
                        php_error_docref(NULL TSRMLS_CC, E_WARNING, "Invalid Schema file source");
                        RETURN_FALSE;
@@ -1911,13 +1905,19 @@ _dom_document_schema_validate(INTERNAL_FUNCTION_PARAMETERS, int type)
                parser = xmlSchemaNewParserCtxt(valid_file);
 
                if (source_type == IS_UNICODE) {
-                       efree(source);
+                       efree(source.s);
                }
                break;
        case DOM_LOAD_STRING:
-               parser = xmlSchemaNewMemParserCtxt(source, source_len);
+               if (source_type == IS_UNICODE) {
+                       source.s = php_libxml_unicode_to_string(source.u, source_len, &source_len TSRMLS_CC);
+               }
+               parser = xmlSchemaNewMemParserCtxt(source.s, source_len);
                /* If loading from memory, we need to set the base directory for the document 
                   but it is not apparent how to do that for schema's */
+               if (source_type == IS_UNICODE) {
+                       efree(source.s);
+               }
                break;
        default:
                return;
@@ -1976,7 +1976,8 @@ _dom_document_relaxNG_validate(INTERNAL_FUNCTION_PARAMETERS, int type)
        zval *id;
        xmlDoc *docp;
        dom_object *intern;
-       char *source = NULL, *valid_file = NULL;
+       zstr source = NULL_ZSTR;
+       char *valid_file = NULL;
        int source_len = 0;
        xmlRelaxNGParserCtxtPtr parser;
        xmlRelaxNGPtr           sptr;
@@ -1985,14 +1986,8 @@ _dom_document_relaxNG_validate(INTERNAL_FUNCTION_PARAMETERS, int type)
        char resolved_path[MAXPATHLEN + 1];
        zend_uchar source_type = IS_STRING;
 
-       if (type == DOM_LOAD_FILE) {
-               if (zend_parse_method_parameters(ZEND_NUM_ARGS() TSRMLS_CC, getThis(), "Ot", &id, dom_document_class_entry, &source, &source_len, &source_type) == FAILURE) {
-                       return;
-               }
-       } else {
-               if (zend_parse_method_parameters(ZEND_NUM_ARGS() TSRMLS_CC, getThis(), "OS", &id, dom_document_class_entry, &source, &source_len) == FAILURE) {
-                       return;
-               }
+       if (zend_parse_method_parameters(ZEND_NUM_ARGS() TSRMLS_CC, getThis(), "Ot", &id, dom_document_class_entry, &source, &source_len, &source_type) == FAILURE) {
+               return;
        }
 
        if (source_len == 0) {
@@ -2005,27 +2000,33 @@ _dom_document_relaxNG_validate(INTERNAL_FUNCTION_PARAMETERS, int type)
        switch (type) {
        case DOM_LOAD_FILE:
                if (source_type == IS_UNICODE) {
-                       if (php_stream_path_encode(NULL, &source, &source_len, (UChar*)source, source_len, REPORT_ERRORS, NULL) == FAILURE) {
+                       if (php_stream_path_encode(NULL, &source.s, &source_len, source.u, source_len, REPORT_ERRORS, NULL) == FAILURE) {
                                RETURN_FALSE;
                        }
                }
-               valid_file = _dom_get_valid_file_path(source, resolved_path, MAXPATHLEN  TSRMLS_CC);
+               valid_file = _dom_get_valid_file_path(source.s, resolved_path, MAXPATHLEN  TSRMLS_CC);
                if (!valid_file) {
                        if (source_type == IS_UNICODE) {
-                               efree(source);
+                               efree(source.s);
                        }
                        php_error_docref(NULL TSRMLS_CC, E_WARNING, "Invalid RelaxNG file source");
                        RETURN_FALSE;
                }
                parser = xmlRelaxNGNewParserCtxt(valid_file);
                if (source_type == IS_UNICODE) {
-                       efree(source);
+                       efree(source.s);
                }
                break;
        case DOM_LOAD_STRING:
-               parser = xmlRelaxNGNewMemParserCtxt(source, source_len);
+               if (source_type == IS_UNICODE) {
+                       source.s = php_libxml_unicode_to_string(source.u, source_len, &source_len TSRMLS_CC);
+               }
+               parser = xmlRelaxNGNewMemParserCtxt(source.s, source_len);
                /* If loading from memory, we need to set the base directory for the document 
                   but it is not apparent how to do that for schema's */
+               if (source_type == IS_UNICODE) {
+                       efree(source.s);
+               }
                break;
        default:
                return;
@@ -2087,21 +2088,15 @@ static void dom_load_html(INTERNAL_FUNCTION_PARAMETERS, int mode)
        xmlDoc *docp = NULL, *newdoc;
        dom_object *intern;
        dom_doc_propsptr doc_prop;
-       char *source;
+       zstr source;
        int source_len, refcount, ret;
        htmlParserCtxtPtr ctxt;
        zend_uchar source_type = IS_STRING;
 
        id = getThis();
 
-       if (mode == DOM_LOAD_FILE) {
-               if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "t", &source, &source_len, &source_type) == FAILURE) {
-                       return;
-               }
-       } else {
-               if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "S", &source, &source_len) == FAILURE) {
-                       return;
-               }
+       if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "t", &source, &source_len, &source_type) == FAILURE) {
+               return;
        }
 
        if (!source_len) {
@@ -2111,19 +2106,26 @@ static void dom_load_html(INTERNAL_FUNCTION_PARAMETERS, int mode)
 
        if (mode == DOM_LOAD_FILE) {
                if (source_type == IS_UNICODE) {
-                       if (php_stream_path_encode(NULL, &source, &source_len, (UChar*)source, source_len, REPORT_ERRORS, NULL) == FAILURE) {
+                       if (php_stream_path_encode(NULL, &source.s, &source_len, source.u, source_len, REPORT_ERRORS, NULL) == FAILURE) {
                                RETURN_FALSE;
                        }
                }
 
-               ctxt = htmlCreateFileParserCtxt(source, NULL);
+               ctxt = htmlCreateFileParserCtxt(source.s, NULL);
 
                if (source_type == IS_UNICODE) {
-                       efree(source);
+                       efree(source.s);
                }
        } else {
-               source_len = xmlStrlen(source);
-               ctxt = htmlCreateMemoryParserCtxt(source, source_len);
+               if (source_type == IS_UNICODE) {
+                       source.s = php_libxml_unicode_to_string(source.u, source_len, &source_len TSRMLS_CC);
+               }
+
+               ctxt = htmlCreateMemoryParserCtxt(source.s, source_len);
+
+               if (source_type == IS_UNICODE) {
+                       efree(source.s);
+               }
        }
 
        if (!ctxt) {
diff --git a/ext/dom/tests/dom002u.phpt b/ext/dom/tests/dom002u.phpt
new file mode 100755 (executable)
index 0000000..c6576b6
--- /dev/null
@@ -0,0 +1,82 @@
+--TEST--
+Test 2u: getElementsByTagName() / getElementsByTagNameNS()
+--SKIPIF--
+<?php require_once('skipif.inc'); ?>
+--FILE--
+<?php
+$xml = <<<HERE
+<?xml version="1.0" encoding="ISO-8859-1" ?>
+<foo xmlns="http://www.example.com/ns/foo"
+     xmlns:fubar="http://www.example.com/ns/fubar">
+  <bar><test1 /></bar>
+  <bar><test2 /></bar>
+  <fubar:bar><test3 /></fubar:bar>
+  <fubar:bar><test4 /></fubar:bar>
+</foo>
+HERE;
+
+function dump($elems) { // Prints the nodeName of every node in $elems, recursing into each node's childNodes right after printing it.
+       foreach ($elems as $elem) {
+               var_dump($elem->nodeName);
+               dump($elem->childNodes);
+       }
+}
+
+$dom = new DOMDocument();
+$dom->loadXML($xml);
+$doc = $dom->documentElement;
+dump($dom->getElementsByTagName('bar'));
+dump($doc->getElementsByTagName('bar'));
+dump($dom->getElementsByTagNameNS('http://www.example.com/ns/fubar', 'bar'));
+dump($doc->getElementsByTagNameNS('http://www.example.com/ns/fubar', 'bar'));
+?>
+--EXPECT--
+string(3) "bar"
+string(5) "test1"
+string(3) "bar"
+string(5) "test2"
+string(9) "fubar:bar"
+string(5) "test3"
+string(9) "fubar:bar"
+string(5) "test4"
+string(3) "bar"
+string(5) "test1"
+string(3) "bar"
+string(5) "test2"
+string(9) "fubar:bar"
+string(5) "test3"
+string(9) "fubar:bar"
+string(5) "test4"
+string(9) "fubar:bar"
+string(5) "test3"
+string(9) "fubar:bar"
+string(5) "test4"
+string(9) "fubar:bar"
+string(5) "test3"
+string(9) "fubar:bar"
+string(5) "test4"
+--UEXPECT--
+unicode(3) "bar"
+unicode(5) "test1"
+unicode(3) "bar"
+unicode(5) "test2"
+unicode(9) "fubar:bar"
+unicode(5) "test3"
+unicode(9) "fubar:bar"
+unicode(5) "test4"
+unicode(3) "bar"
+unicode(5) "test1"
+unicode(3) "bar"
+unicode(5) "test2"
+unicode(9) "fubar:bar"
+unicode(5) "test3"
+unicode(9) "fubar:bar"
+unicode(5) "test4"
+unicode(9) "fubar:bar"
+unicode(5) "test3"
+unicode(9) "fubar:bar"
+unicode(5) "test4"
+unicode(9) "fubar:bar"
+unicode(5) "test3"
+unicode(9) "fubar:bar"
+unicode(5) "test4"
index e69f504a4f3167bf8e2970d2808963a019989430..aacc467eae66db77c6852d16324602bee32b4433 100644 (file)
@@ -1059,6 +1059,107 @@ void php_libxml_node_decrement_resource(php_libxml_node_object *object TSRMLS_DC
 }
 /* }}} */
 
+/* Advances past XML white space (space, tab, CR, LF) and returns a pointer
+ * to the first byte that is not white space. */
+static char *php_libxml_skip_ws(char *s)
+{
+       while (*s == ' ' || *s == '\t' || *s == '\r' || *s == '\n') {
+               ++s;
+       }
+       return s;
+}
+
+/* {{{ php_libxml_unicode_to_string
+ * Converts ustr (ustr_len UChars) into a byte string using the UG(utf8_conv)
+ * converter and returns the buffer produced by zend_unicode_to_string_ex().
+ * The byte length is stored in *str_len.
+ *
+ * If the result starts with an XML declaration ("<?xml ..."), the value of
+ * its "encoding" pseudo-attribute is overwritten in place with "utf-8" (or
+ * "utf8" when only four bytes are available) so that the declaration agrees
+ * with the bytes that are handed to the parser. The rest of the old value is
+ * padded with spaces, so the length reported in *str_len is unaffected.
+ * Values shorter than four bytes are left untouched.
+ *
+ * Both quoting styles allowed by XML are accepted: encoding="..." and
+ * encoding='...'.
+ *
+ * The callers added together with this function release the returned buffer
+ * with efree().
+ * NOTE(review): errCode is not inspected and the buffer is assumed to be
+ * non-NULL and NUL-terminated - confirm against zend_unicode_to_string_ex().
+ */
+PHP_LIBXML_API char* php_libxml_unicode_to_string(UChar *ustr, int ustr_len, int *str_len TSRMLS_DC)
+{
+       UErrorCode errCode = 0;
+       char *tmp;
+       int tmp_len;
+
+       zend_unicode_to_string_ex(UG(utf8_conv), &tmp, &tmp_len, ustr, ustr_len, &errCode);
+       *str_len = tmp_len;
+
+       /* Substitute the declared encoding with "utf-8" */
+       if (tmp[0] == '<' &&
+           tmp[1] == '?' &&
+           tmp[2] == 'x' &&
+           tmp[3] == 'm' &&
+           tmp[4] == 'l') {
+               char *s = php_libxml_skip_ws(tmp + sizeof("<?xml")-1);
+
+               /* Walk the name=value pairs of the declaration until its end
+                  or until something unexpected is found */
+               while (*s != 0 && *s != '?' && *s != '>') {
+                       char *attr = s;
+                       char *val;
+                       char quote;
+                       int attr_len, val_len;
+
+                       /* A pseudo-attribute name has to start with a letter */
+                       if (!((*s >= 'a' && *s <= 'z') || (*s >= 'A' && *s <= 'Z'))) {
+                               break;
+                       }
+                       while ((*s >= 'a' && *s <= 'z') ||
+                              (*s >= 'A' && *s <= 'Z') ||
+                              (*s >= '0' && *s <= '9') ||
+                              (*s == '_')) {
+                               ++s;
+                       }
+                       attr_len = s - attr;
+
+                       s = php_libxml_skip_ws(s);
+                       if (*s != '=') {
+                               break;
+                       }
+                       s = php_libxml_skip_ws(s + 1);
+
+                       /* The value may be delimited by either quote character;
+                          remember which one so that it can be written back */
+                       if (*s != '"' && *s != '\'') {
+                               break;
+                       }
+                       quote = *s;
+                       ++s;
+
+                       val = s;
+                       while (*s != 0 && *s != quote) {
+                               ++s;
+                       }
+                       if (*s != quote) {
+                               /* Unterminated value */
+                               break;
+                       }
+                       val_len = s - val;
+                       s = php_libxml_skip_ws(s + 1);
+
+                       if (attr_len == (int)(sizeof("encoding")-1) &&
+                           strncasecmp(attr, "encoding", sizeof("encoding")-1) == 0) {
+                               const char *name;
+                               int name_len, i;
+
+                               if (val_len >= (int)(sizeof("utf-8")-1)) {
+                                       name = "utf-8";
+                                       name_len = sizeof("utf-8")-1;
+                               } else if (val_len >= (int)(sizeof("utf8")-1)) {
+                                       name = "utf8";
+                                       name_len = sizeof("utf8")-1;
+                               } else {
+                                       /* Encoding name too short */
+                                       break;
+                               }
+
+                               for (i = 0; i < name_len; i++) {
+                                       val[i] = name[i];
+                               }
+                               /* Close the value right after the new name ... */
+                               val[name_len] = quote;
+                               /* ... and blank the remainder of the old value,
+                                  including its original closing quote */
+                               for (i = name_len + 1; i <= val_len; i++) {
+                                       val[i] = ' ';
+                               }
+                       }
+               }
+       }
+       return tmp;
+}
+/* }}} */
+
+
 #ifdef PHP_WIN32
 PHP_LIBXML_API BOOL WINAPI DllMain(HINSTANCE hinstDLL, DWORD fdwReason, LPVOID lpvReserved)
 {
index fc4ca2e11cd0e0c4ca646a8e614306d9371cfef0..d8145a2a1095c1b9491110e987a60a5628994eb6 100644 (file)
@@ -93,6 +93,7 @@ void php_libxml_ctx_error(void *ctx, const char *msg, ...);
 PHP_LIBXML_API int php_libxml_xmlCheckUTF8(const unsigned char *s);
 PHP_LIBXML_API zval *php_libxml_switch_context(zval *context TSRMLS_DC);
 PHP_LIBXML_API void php_libxml_issue_error(int level, const char *msg TSRMLS_DC);
+PHP_LIBXML_API char* php_libxml_unicode_to_string(UChar *ustr, int ustr_len, int *str_len TSRMLS_DC);
 
 /* Init/shutdown functions*/
 PHP_LIBXML_API void php_libxml_initialize();
index f87c3c8b6a8f5eea423ec4ca2877a13f0ba052b6..15fbf456f1484f574523a487035618ba378b1621 100644 (file)
@@ -30,6 +30,7 @@
 #include "php_ini.h"
 #include "ext/standard/info.h"
 #include "ext/standard/php_string.h"
+#include "ext/libxml/php_libxml.h"
 #include "php_simplexml.h"
 #include "php_simplexml_exports.h"
 #include "zend_exceptions.h"
@@ -2059,8 +2060,9 @@ PHP_FUNCTION(simplexml_load_file)
 PHP_FUNCTION(simplexml_load_string)
 {
        php_sxe_object *sxe;
-       char           *data;
+       zstr            data;
        int             data_len;
+       zend_uchar      data_type;
        xmlDocPtr       docp;
        char           *ns = NULL;
        int             ns_len = 0;
@@ -2068,11 +2070,17 @@ PHP_FUNCTION(simplexml_load_string)
        zend_class_entry *ce= sxe_class_entry;
        zend_bool       isprefix = 0;
 
-       if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "S|C!ls&b", &data, &data_len, &ce, &options, &ns, &ns_len, UG(utf8_conv), &isprefix) == FAILURE) {
+       if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "t|C!ls&b", &data, &data_len, &data_type, &ce, &options, &ns, &ns_len, UG(utf8_conv), &isprefix) == FAILURE) {
                return;
        }
 
-       docp = xmlReadMemory(data, data_len, NULL, NULL, options);
+       if (data_type == IS_UNICODE) {
+               data.s = php_libxml_unicode_to_string(data.u, data_len, &data_len TSRMLS_CC);
+       }
+       docp = xmlReadMemory(data.s, data_len, NULL, NULL, options);
+       if (data_type == IS_UNICODE) {
+               efree(data.s);
+       }
 
        if (! docp) {
                RETURN_FALSE;
index c834cf6c9d711db5620a6826096a568b56ecad08..5f48d38053c147c317e486da28774c75d8c0ba0e 100644 (file)
@@ -296,12 +296,9 @@ ZEND_GET_MODULE(soap)
 
 char* soap_unicode_to_string(UChar *ustr, int ustr_len TSRMLS_DC)
 { /* Thin wrapper: returns whatever php_libxml_unicode_to_string() returns for the same input. */
-       UErrorCode errCode = 0;
-       char *tmp;
-       int tmp_len;
+       int dummy; /* receives the converted length, which this wrapper never reads */
 
-       zend_unicode_to_string_ex(UG(utf8_conv), &tmp, &tmp_len, ustr, ustr_len, &errCode);
-       return tmp;
+       return php_libxml_unicode_to_string(ustr, ustr_len, &dummy TSRMLS_CC); /* conversion now lives in the shared libxml helper */
 }
 
 void soap_decode_string(zval *ret, char* str TSRMLS_DC)
index e23c6b96d19989d9aa725083d86b67fa554021b3..d3609d791b7c14b8186c0f6e5d52c7ac6b5a1174 100755 (executable)
@@ -32,10 +32,10 @@ class LocalSoapClient extends SoapClient {
 $x = new LocalSoapClient(NULL,array('location'=>'test://', 
                                    'uri'=>'http://testuri.org',
                                    "trace"=>1)); 
-setlocale(LC_ALL,"sv_SE","sv_SE.ISO8859-1");
+@setlocale(LC_ALL,"sv_SE","sv_SE.ISO8859-1");
 var_dump($x->test());
 echo $x->__getLastResponse();
-setlocale(LC_ALL,"en_US","en_US.ISO8859-1");
+@setlocale(LC_ALL,"en_US","en_US.ISO8859-1");
 var_dump($x->test());
 echo $x->__getLastResponse();
 --EXPECT--