]> granicus.if.org Git - python/commitdiff
urllib: Simplify splithost by calling into urlparse. (#1849)
authorpostmasters <namnguyen@google.com>
Tue, 20 Jun 2017 13:02:44 +0000 (06:02 -0700)
committerVictor Stinner <victor.stinner@gmail.com>
Tue, 20 Jun 2017 13:02:44 +0000 (15:02 +0200)
The current regex based splitting produces a wrong result. For example::

  http://abc#@def

Web browsers parse that URL as ``http://abc/#@def``, that is, the host
is ``abc``, the path is ``/``, and the fragment is ``#@def``.

Lib/test/test_urlparse.py
Lib/urllib/parse.py
Misc/ACKS
Misc/NEWS

index 3c89ed928ff51fd83dac1f2140965079f931e219..e5f6130e4a075e4e0b552bb03740f9f7f7925531 100644 (file)
@@ -755,28 +755,35 @@ class UrlParseTestCase(unittest.TestCase):
     def test_parse_fragments(self):
         # Exercise the allow_fragments parameter of urlparse() and urlsplit()
         tests = (
-            ("http:#frag", "path"),
-            ("//example.net#frag", "path"),
-            ("index.html#frag", "path"),
-            (";a=b#frag", "params"),
-            ("?a=b#frag", "query"),
-            ("#frag", "path"),
+            ("http:#frag", "path", "frag"),
+            ("//example.net#frag", "path", "frag"),
+            ("index.html#frag", "path", "frag"),
+            (";a=b#frag", "params", "frag"),
+            ("?a=b#frag", "query", "frag"),
+            ("#frag", "path", "frag"),
+            ("abc#@frag", "path", "@frag"),
+            ("//abc#@frag", "path", "@frag"),
+            ("//abc:80#@frag", "path", "@frag"),
+            ("//abc#@frag:80", "path", "@frag:80"),
         )
-        for url, attr in tests:
+        for url, attr, expected_frag in tests:
             for func in (urllib.parse.urlparse, urllib.parse.urlsplit):
                 if attr == "params" and func is urllib.parse.urlsplit:
                     attr = "path"
                 with self.subTest(url=url, function=func):
                     result = func(url, allow_fragments=False)
                     self.assertEqual(result.fragment, "")
-                    self.assertTrue(getattr(result, attr).endswith("#frag"))
+                    self.assertTrue(
+                            getattr(result, attr).endswith("#" + expected_frag))
                     self.assertEqual(func(url, "", False).fragment, "")
 
                     result = func(url, allow_fragments=True)
-                    self.assertEqual(result.fragment, "frag")
-                    self.assertFalse(getattr(result, attr).endswith("frag"))
-                    self.assertEqual(func(url, "", True).fragment, "frag")
-                    self.assertEqual(func(url).fragment, "frag")
+                    self.assertEqual(result.fragment, expected_frag)
+                    self.assertFalse(
+                            getattr(result, attr).endswith(expected_frag))
+                    self.assertEqual(func(url, "", True).fragment,
+                                     expected_frag)
+                    self.assertEqual(func(url).fragment, expected_frag)
 
     def test_mixed_types_rejected(self):
         # Several functions that process either strings or ASCII encoded bytes
@@ -983,6 +990,26 @@ class Utility_Tests(unittest.TestCase):
         self.assertEqual(splithost('/foo/bar/baz.html'),
                          (None, '/foo/bar/baz.html'))
 
+        # bpo-30500: # starts a fragment.
+        self.assertEqual(splithost('//127.0.0.1#@host.com'),
+                         ('127.0.0.1', '/#@host.com'))
+        self.assertEqual(splithost('//127.0.0.1#@host.com:80'),
+                         ('127.0.0.1', '/#@host.com:80'))
+        self.assertEqual(splithost('//127.0.0.1:80#@host.com'),
+                         ('127.0.0.1:80', '/#@host.com'))
+
+        # Empty host is returned as empty string.
+        self.assertEqual(splithost("///file"),
+                         ('', '/file'))
+
+        # Trailing semicolon, question mark and hash symbol are kept.
+        self.assertEqual(splithost("//example.net/file;"),
+                         ('example.net', '/file;'))
+        self.assertEqual(splithost("//example.net/file?"),
+                         ('example.net', '/file?'))
+        self.assertEqual(splithost("//example.net/file#"),
+                         ('example.net', '/file#'))
+
     def test_splituser(self):
         splituser = urllib.parse.splituser
         self.assertEqual(splituser('User:Pass@www.python.org:080'),
index 1af2906e36bfdd2f253a887fbd7087721cf100fe..01eb54906c8a53b6e9e18ea3d1e0cf10c6b84c59 100644 (file)
@@ -947,7 +947,7 @@ def splithost(url):
     """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
     global _hostprog
     if _hostprog is None:
-        _hostprog = re.compile('//([^/?]*)(.*)', re.DOTALL)
+        _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)
 
     match = _hostprog.match(url)
     if match:
index 37905a1dbc0a4eb728c51260926185998db23b39..4f98e980bd971fb7729f1e4da8f9bf2ec4f53072 100644 (file)
--- a/Misc/ACKS
+++ b/Misc/ACKS
@@ -1091,6 +1091,7 @@ Max Neunhöffer
 Anthon van der Neut
 George Neville-Neil
 Hieu Nguyen
+Nam Nguyen
 Johannes Nicolai
 Samuel Nicolary
 Jonathan Niehof
index 0d1ed64cd62e262c5012a360f152d4d3e056a2f3..47f3c379049b8fef4d166496ba406f9d6261b775 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -12,7 +12,7 @@ Core and Builtins
 
 - bpo-30682: Removed a too-strict assertion that failed for certain f-strings,
   such as eval("f'\\\n'") and eval("f'\\\r'").
-  
+
 - bpo-30501: The compiler now produces more optimal code for complex condition
   expressions in the "if", "while" and "assert" statement, the "if" expression,
   and generator expressions and comprehensions.
@@ -365,6 +365,11 @@ Extension Modules
 Library
 -------
 
+- [Security] bpo-30500: Fix urllib.parse.splithost() to correctly parse
+  fragments. For example, ``splithost('http://127.0.0.1#@evil.com/')`` now
+  correctly returns the ``127.0.0.1`` host, instead of treating ``@evil.com``
+  as the host in an authentification (``login@host``).
+
 - bpo-30038: Fix race condition between signal delivery and wakeup file
   descriptor.  Patch by Nathaniel Smith.