bpo-30500: urllib: Simplify splithost by calling into urlparse. (#1849) (#2294)
author Victor Stinner <victor.stinner@gmail.com>
Tue, 20 Jun 2017 14:20:36 +0000 (16:20 +0200)
committer GitHub <noreply@github.com>
Tue, 20 Jun 2017 14:20:36 +0000 (16:20 +0200)
The current regex based splitting produces a wrong result. For example::

  http://abc#@def

Web browsers parse that URL as ``http://abc/#@def``, that is, the host
is ``abc``, the path is ``/``, and the fragment is ``#@def``.
(cherry picked from commit 90e01e50ef8a9e6c91f30d965563c378a4ad26de)

Lib/test/test_urllib.py
Lib/urllib.py
Misc/ACKS
Misc/NEWS

index 14de91e13dad19e6262f578032087898d84836ee..1ce9201c06931602af1ca2c68a2a383149544c1c 100644 (file)
@@ -879,6 +879,26 @@ class Utility_Tests(unittest.TestCase):
         self.assertEqual(splithost('/foo/bar/baz.html'),
                          (None, '/foo/bar/baz.html'))
 
+        # bpo-30500: # starts a fragment.
+        self.assertEqual(splithost('//127.0.0.1#@host.com'),
+                         ('127.0.0.1', '/#@host.com'))
+        self.assertEqual(splithost('//127.0.0.1#@host.com:80'),
+                         ('127.0.0.1', '/#@host.com:80'))
+        self.assertEqual(splithost('//127.0.0.1:80#@host.com'),
+                         ('127.0.0.1:80', '/#@host.com'))
+
+        # Empty host is returned as empty string.
+        self.assertEqual(splithost("///file"),
+                         ('', '/file'))
+
+        # Trailing semicolon, question mark and hash symbol are kept.
+        self.assertEqual(splithost("//example.net/file;"),
+                         ('example.net', '/file;'))
+        self.assertEqual(splithost("//example.net/file?"),
+                         ('example.net', '/file?'))
+        self.assertEqual(splithost("//example.net/file#"),
+                         ('example.net', '/file#'))
+
     def test_splituser(self):
         splituser = urllib.splituser
         self.assertEqual(splituser('User:Pass@www.python.org:080'),
index c3c8ef4b60048410c207d209be64d364a1771c43..d85504a5cb7e93f330dbba242c116803c32bb7c5 100644 (file)
@@ -1093,8 +1093,7 @@ def splithost(url):
     """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
     global _hostprog
     if _hostprog is None:
-        import re
-        _hostprog = re.compile('^//([^/?]*)(.*)$')
+        _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)
 
     match = _hostprog.match(url)
     if match:
index a411bc5ffc8f728d50d64df1dcd41f74473f6735..35c77b67a6c8ae9c6a4c9fbf505bdb29322d2ae4 100644 (file)
--- a/Misc/ACKS
+++ b/Misc/ACKS
@@ -993,6 +993,7 @@ Chad Netzer
 Max Neunhöffer
 George Neville-Neil
 Hieu Nguyen
+Nam Nguyen
 Johannes Nicolai
 Samuel Nicolary
 Jonathan Niehof
index 361a9d3c762abd1d019799ab5e66cdbf02b9e3c0..f85e829db10140da01e11a83727ee2e6cf8d889e 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -52,6 +52,11 @@ Extension Modules
 Library
 -------
 
+- [Security] bpo-30500: Fix urllib.splithost() to correctly parse
+  fragments. For example, ``splithost('//127.0.0.1#@evil.com/')`` now
+  correctly returns the ``127.0.0.1`` host, instead of treating ``@evil.com``
+  as the host in an authentication (``login@host``).
+
 - [Security] bpo-29591: Update expat copy from 2.1.1 to 2.2.0 to get fixes
   of CVE-2016-0718 and CVE-2016-4472. See
   https://sourceforge.net/p/expat/bugs/537/ for more information.