granicus.if.org Git - php/commitdiff
Fix #51903: simplexml_load_file() doesn't use HTTP headers
author Christoph M. Becker <cmbecker69@gmx.de>
Wed, 3 Mar 2021 18:23:39 +0000 (19:23 +0100)
committer Christoph M. Becker <cmbecker69@gmx.de>
Mon, 8 Mar 2021 14:07:01 +0000 (15:07 +0100)
The `encoding` attribute of the XML declaration is optional; if it is
missing, it is good practice to use external encoding information where
available.  Thus, we check for `charset` info in `Content-Type` headers,
and see whether the encoding is supported.

We cater to trailing parameters and quoted-strings, but not to escaped
backslashes and quotes in quoted-strings, since no known character
encoding name contains these anyway.

Co-authored-by: Michael Wallner <mike@php.net>
Closes GH-6747.

NEWS
ext/libxml/libxml.c
ext/libxml/tests/bug51903.phpt [new file with mode: 0644]

diff --git a/NEWS b/NEWS
index 751b79be6939033542a47ab98271f88ea1ad1030..4ccac5378f8f9f0a6600a90f8549242c489f685b 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -11,6 +11,9 @@ PHP                                                                        NEWS
   . Fixed bug #80763 (msgfmt_format() does not accept DateTime references).
     (cmb)
 
+- Libxml:
+  . Fixed bug #51903 (simplexml_load_file() doesn't use HTTP headers). (cmb)
+
 - MySQLnd:
   . Fixed bug #80713 (SegFault when disabling ATTR_EMULATE_PREPARES and
     MySQL 8.0). (Nikita)
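diff --git a/ext/libxml/libxml.c b/ext/libxml/libxml.c
index c024e1667025c86296ba0f9d3afdb85e5648e75e..e21d6fdbbe98cb6593ef0b423d7035cb638f7ba7 100644 (file)
--- a/ext/libxml/libxml.c
+++ b/ext/libxml/libxml.c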
@@ -409,6 +409,67 @@ php_libxml_input_buffer_create_filename(const char *URI, xmlCharEncoding enc)
                return(NULL);
        }
 
+       /*
+        * No encoding is known yet (enc == XML_CHAR_ENCODING_NONE): fall back to the
+        * charset parameter of a Content-Type header found in the stream's wrapper
+        * data, if xmlParseCharEncoding() recognizes the name.
+        */
+       if (enc == XML_CHAR_ENCODING_NONE) {
+               php_stream *s = (php_stream *) context;
+
+               if (Z_TYPE(s->wrapperdata) == IS_ARRAY) {
+                       zval *header;
+
+                       ZEND_HASH_FOREACH_VAL_IND(Z_ARRVAL(s->wrapperdata), header) {
+                               const char buf[] = "Content-Type:";
+                               if (Z_TYPE_P(header) == IS_STRING &&
+                                               !zend_binary_strncasecmp(Z_STRVAL_P(header), Z_STRLEN_P(header), buf, sizeof(buf)-1, sizeof(buf)-1)) {
+                                       /* Writable array, not a literal: php_stristr() presumably lowercases its arguments in place -- confirm */
+                                       char needle[] = "charset=";
+                                       /* Private copy of the header: it is NUL-terminated in place below */
+                                       char *haystack = estrndup(Z_STRVAL_P(header), Z_STRLEN_P(header));
+                                       char *encoding = php_stristr(haystack, needle, Z_STRLEN_P(header), sizeof(needle)-1);
+
+                                       if (encoding) {
+                                               char *end;
+
+                                               encoding += sizeof(needle)-1;
+                                               /* Skip the opening quote of a quoted-string value */
+                                               if (*encoding == '"') {
+                                                       encoding++;
+                                               }
+                                               /* The value ends at the next ';' or at the end of the header */
+                                               end = strchr(encoding, ';');
+                                               if (end == NULL) {
+                                                       end = encoding + strlen(encoding);
+                                               }
+                                               end--; /* end == encoding-1 isn't a buffer underrun */
+                                               /* Trim trailing whitespace and the closing quote, if any */
+                                               while (*end == ' ' || *end == '\t') {
+                                                       end--;
+                                               }
+                                               if (*end == '"') {
+                                                       end--;
+                                               }
+                                               if (encoding >= end) {
+                                                       /* Unusable value: free the copy before trying the next header */
+                                                       efree(haystack);
+                                                       continue;
+                                               }
+                                               *(end+1) = '\0';
+                                               enc = xmlParseCharEncoding(encoding);
+                                               /* Map any result at or below NONE (e.g. an error code) to "no encoding" */
+                                               if (enc <= XML_CHAR_ENCODING_NONE) {
+                                                       enc = XML_CHAR_ENCODING_NONE;
+                                               }
+                                       }
+                                       efree(haystack);
+                                       break; /* found content-type */
+                               }
+                       } ZEND_HASH_FOREACH_END();
+               }
+       }
+
        /* Allocate the Input buffer front-end. */
        ret = xmlAllocParserInputBuffer(enc);
        if (ret != NULL) {
diff --git a/ext/libxml/tests/bug51903.phpt b/ext/libxml/tests/bug51903.phpt
new file mode 100644 (file)
index 0000000..36a4b55
--- /dev/null
+++ b/ext/libxml/tests/bug51903.phpt
@@ -0,0 +1,38 @@
+--TEST--
+Bug #51903 (simplexml_load_file() doesn't use HTTP headers)
+--SKIPIF--
+<?php
+if (!extension_loaded('simplexml')) die('skip simplexml extension not available');
+if (@!include "./ext/standard/tests/http/server.inc") die('skip server.inc not available');
+http_server_skipif('tcp://127.0.0.1:12342');
+?>
+--FILE--
+<?php
+require "./ext/standard/tests/http/server.inc"; // presumably defines the http_server*() helpers used below
+$responses = [ // three canned responses; none of the XML declarations names an encoding
+    "data://text/plain,HTTP/1.1 200 OK\r\n"
+    . "Content-Type: text/xml; charset=ISO-8859-1\r\n\r\n"
+    . "<?xml version=\"1.0\"?>\n"
+    . "<root>\xE4\xF6\xFC</root>\n", // case 1: charset is the last parameter
+    "data://text/plain,HTTP/1.1 200 OK\r\n"
+    . "Content-Type: text/xml; charset=ISO-8859-1; foo=bar\r\n\r\n"
+    . "<?xml version=\"1.0\"?>\n"
+    . "<root>\xE4\xF6\xFC</root>\n", // case 2: charset is followed by another parameter
+    "data://text/plain,HTTP/1.1 200 OK\r\n"
+    . "Content-Type: text/xml; charset=\"ISO-8859-1\" ; foo=bar\r\n\r\n"
+    . "<?xml version=\"1.0\"?>\n"
+    . "<root>\xE4\xF6\xFC</root>\n", // case 3: quoted charset, whitespace before the next parameter
+];
+$pid = http_server('tcp://127.0.0.1:12342', $responses); // presumably serves $responses in order; verify in server.inc
+
+for ($i = 0; $i < count($responses); $i++) { // one request per canned response
+    $sxe = simplexml_load_file('http://127.0.0.1:12342/');
+    echo "$sxe\n"; // prints the text of <root>; each case must match one line of --EXPECT--
+}
+
+http_server_kill($pid);
+?>
+--EXPECT--
+äöü
+äöü
+äöü