bpo-30458: Disallow control chars in http URLs. (GH-12755)

author Gregory P. Smith <greg@krypto.org>

Wed, 1 May 2019 02:12:21 +0000 (19:12 -0700)

committer GitHub <noreply@github.com>

Wed, 1 May 2019 02:12:21 +0000 (19:12 -0700)
author Gregory P. Smith <greg@krypto.org>
Wed, 1 May 2019 02:12:21 +0000 (19:12 -0700)
committer GitHub <noreply@github.com>
Wed, 1 May 2019 02:12:21 +0000 (19:12 -0700)
diff --git a/Lib/http/client.py b/Lib/http/client.py

index 5a2225276b1acd630de0bcd2df249edcfa6c8d0e..99d6a68cf42823a21708075fc9e2ef33450526f0 100644 (file)
--- a/Lib/http/client.py
+++ b/Lib/http/client.py
@@ -137,6 +137,16 @@ _MAXHEADERS = 100
  _is_legal_header_name = re.compile(rb'[^:\s][^:\r\n]*').fullmatch
  _is_illegal_header_value = re.compile(rb'\n(?![ \t])|\r(?![ \t\n])').search
  
+# These characters are not allowed within HTTP URL paths.
+#  See https://tools.ietf.org/html/rfc3986#section-3.3 and the
+#  https://tools.ietf.org/html/rfc3986#appendix-A pchar definition.
+# Prevents CVE-2019-9740.  Includes control characters such as \r\n.
+# We don't restrict chars above \x7f as putrequest() limits us to ASCII.
+_contains_disallowed_url_pchar_re = re.compile('[\x00-\x20\x7f]')
+# Arguably only these _should_ be allowed:
+#  _is_allowed_url_pchars_re = re.compile(r"^[/!$&'()*+,;=:@%a-zA-Z0-9._~-]+$")
+# We are more lenient for assumed real world compatibility purposes.
+
  # We always set the Content-Length header for these methods because some
  # servers will otherwise respond with a 411
  _METHODS_EXPECTING_BODY = {'PATCH', 'POST', 'PUT'}
@@ -1079,6 +1089,10 @@ class HTTPConnection:
          self._method = method
          if not url:
              url = '/'
+        # Prevent CVE-2019-9740.
+        if match := _contains_disallowed_url_pchar_re.search(url):
+            raise ValueError(f"URL can't contain control characters. {url!r} "
+                             f"(found at least {match.group()!r})")
          request = '%s %s %s' % (method, url, self._http_vsn_str)
  
          # Non-ASCII characters should have been eliminated earlier
diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py

index 2ac73b58d832064f8328d6d06eb90135056c7c59..e87c85b92876abb8242c11889fb3d87d70e81d85 100644 (file)
--- a/Lib/test/test_urllib.py
+++ b/Lib/test/test_urllib.py
@@ -329,6 +329,55 @@ class urlopen_HttpTests(unittest.TestCase, FakeHTTPMixin, FakeFTPMixin):
          finally:
              self.unfakehttp()
  
+    def test_url_with_control_char_rejected(self):
+        for char_no in list(range(0, 0x21)) + [0x7f]:
+            char = chr(char_no)
+            schemeless_url = f"//localhost:7777/test{char}/"
+            self.fakehttp(b"HTTP/1.1 200 OK\r\n\r\nHello.")
+            try:
+                # We explicitly test urllib.request.urlopen() instead of the top
+                # level 'def urlopen()' function defined in this... (quite ugly)
+                # test suite.  They use different url opening codepaths.  Plain
+                # urlopen uses FancyURLOpener which goes via a codepath that
+                # calls urllib.parse.quote() on the URL which makes all of the
+                # above attempts at injection within the url _path_ safe.
+                escaped_char_repr = repr(char).replace('\\', r'\\')
+                with self.assertRaisesRegex(
+                    ValueError, f"contain control.*{escaped_char_repr}"):
+                    urllib.request.urlopen(f"http:{schemeless_url}")
+                with self.assertRaisesRegex(
+                    ValueError, f"contain control.*{escaped_char_repr}"):
+                    urllib.request.urlopen(f"https:{schemeless_url}")
+                # This code path quotes the URL so there is no injection.
+                resp = urlopen(f"http:{schemeless_url}")
+                self.assertNotIn(char, resp.geturl())
+            finally:
+                self.unfakehttp()
+
+    def test_url_with_newline_header_injection_rejected(self):
+        self.fakehttp(b"HTTP/1.1 200 OK\r\n\r\nHello.")
+        host = "localhost:7777?a=1 HTTP/1.1\r\nX-injected: header\r\nTEST: 123"
+        schemeless_url = "//" + host + ":8080/test/?test=a"
+        try:
+            # We explicitly test urllib.request.urlopen() instead of the top
+            # level 'def urlopen()' function defined in this... (quite ugly)
+            # test suite.  They use different url opening codepaths.  Plain
+            # urlopen uses FancyURLOpener which goes via a codepath that
+            # calls urllib.parse.quote() on the URL which makes all of the
+            # above attempts at injection within the url _path_ safe.
+            with self.assertRaisesRegex(
+                ValueError, r"contain control.*\\r.*(found at least . .)"):
+                urllib.request.urlopen(f"http:{schemeless_url}")
+            with self.assertRaisesRegex(ValueError, r"contain control.*\\n"):
+                urllib.request.urlopen(f"https:{schemeless_url}")
+            # This code path quotes the URL so there is no injection.
+            resp = urlopen(f"http:{schemeless_url}")
+            self.assertNotIn(' ', resp.geturl())
+            self.assertNotIn('\r', resp.geturl())
+            self.assertNotIn('\n', resp.geturl())
+        finally:
+            self.unfakehttp()
+
      def test_read_0_9(self):
          # "0.9" response accepted (but not "simple responses" without
          # a status line)
diff --git a/Lib/test/test_xmlrpc.py b/Lib/test/test_xmlrpc.py

index 9c8b6958c620ce44461262bcb00e1169e60ca1dd..52bacc1eafa7895d2d4757272040d2539a11b61e 100644 (file)
--- a/Lib/test/test_xmlrpc.py
+++ b/Lib/test/test_xmlrpc.py
@@ -943,8 +943,13 @@ class SimpleServerTestCase(BaseServerTestCase):
  
      def test_partial_post(self):
          # Check that a partial POST doesn't make the server loop: issue #14001.
-        with contextlib.closing(http.client.HTTPConnection(ADDR, PORT)) as conn:
-            conn.request('POST', '/RPC2 HTTP/1.0\r\nContent-Length: 100\r\n\r\nbye')
+        with contextlib.closing(socket.create_connection((ADDR, PORT))) as conn:
+            conn.send('POST /RPC2 HTTP/1.0\r\n'
+                      'Content-Length: 100\r\n\r\n'
+                      'bye HTTP/1.1\r\n'
+                      f'Host: {ADDR}:{PORT}\r\n'
+                      'Accept-Encoding: identity\r\n'
+                      'Content-Length: 0\r\n\r\n'.encode('ascii'))
  
      def test_context_manager(self):
          with xmlrpclib.ServerProxy(URL) as server:
diff --git a/Misc/NEWS.d/next/Security/2019-04-10-08-53-30.bpo-36276.51E-DA.rst b/Misc/NEWS.d/next/Security/2019-04-10-08-53-30.bpo-36276.51E-DA.rst

new file mode 100644 (file)

index 0000000..4fed4d5
--- /dev/null
+++ b/Misc/NEWS.d/next/Security/2019-04-10-08-53-30.bpo-36276.51E-DA.rst
@@ -0,0 +1 @@
+Address CVE-2019-9740 by preventing URL paths with embedded whitespace or control characters from passing through into the underlying http client request.  Such potentially malicious header injection URLs now cause a ValueError to be raised.
+\ No newline at end of file
author	Gregory P. Smith <greg@krypto.org>
	Wed, 1 May 2019 02:12:21 +0000 (19:12 -0700)
committer	GitHub <noreply@github.com>
	Wed, 1 May 2019 02:12:21 +0000 (19:12 -0700)
Lib/http/client.py		patch \| blob \| history
Lib/test/test_urllib.py		patch \| blob \| history
Lib/test/test_xmlrpc.py		patch \| blob \| history
Misc/NEWS.d/next/Security/2019-04-10-08-53-30.bpo-36276.51E-DA.rst	[new file with mode: 0644]	patch \| blob