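Patch #712317: In URLs such as http://www.example.com?query=spam, treat '?' as a delimiter that ends the network location (netloc), so the query string is no longer parsed as part of the netloc.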

author Johannes Gijsbers <jlg@dds.nl>

Sun, 9 Jan 2005 15:29:10 +0000 (15:29 +0000)

committer Johannes Gijsbers <jlg@dds.nl>

Sun, 9 Jan 2005 15:29:10 +0000 (15:29 +0000)
author Johannes Gijsbers <jlg@dds.nl>
Sun, 9 Jan 2005 15:29:10 +0000 (15:29 +0000)
committer Johannes Gijsbers <jlg@dds.nl>
Sun, 9 Jan 2005 15:29:10 +0000 (15:29 +0000)
diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py

index 8932b3c7be2402c4e044e50e0743628625594040..04572ba60df9b36dba8faed54f7a71f8e2c76c30 100644 (file)
--- a/Lib/test/test_urlparse.py
+++ b/Lib/test/test_urlparse.py
@@ -8,20 +8,22 @@ RFC1808_BASE = "http://a/b/c/d;p?q#f"
  RFC2396_BASE = "http://a/b/c/d;p?q"
  
  class UrlParseTestCase(unittest.TestCase):
-    def test_frags(self):
-        for url, parsed, split in [
-            ('http://www.python.org',
-             ('http', 'www.python.org', '', '', '', ''),
-             ('http', 'www.python.org', '', '', '')),
-            ('http://www.python.org#abc',
-             ('http', 'www.python.org', '', '', '', 'abc'),
-             ('http', 'www.python.org', '', '', 'abc')),
-            ('http://www.python.org/#abc',
-             ('http', 'www.python.org', '/', '', '', 'abc'),
-             ('http', 'www.python.org', '/', '', 'abc')),
-            (RFC1808_BASE,
-             ('http', 'a', '/b/c/d', 'p', 'q', 'f'),
-             ('http', 'a', '/b/c/d;p', 'q', 'f')),
+
+    def checkRoundtrips(self, url, parsed, split):
+        result = urlparse.urlparse(url)
+        self.assertEqual(result, parsed)
+        # put it back together and it should be the same
+        result2 = urlparse.urlunparse(result)
+        self.assertEqual(result2, url)
+
+        # check the roundtrip using urlsplit() as well
+        result = urlparse.urlsplit(url)
+        self.assertEqual(result, split)
+        result2 = urlparse.urlunsplit(result)
+        self.assertEqual(result2, url)
+
+    def test_roundtrips(self):
+        testcases = [
              ('file:///tmp/junk.txt',
               ('file', '', '/tmp/junk.txt', '', '', ''),
               ('file', '', '/tmp/junk.txt', '', '')),
@@ -29,20 +31,41 @@ class UrlParseTestCase(unittest.TestCase):
               ('imap', 'mail.python.org', '/mbox1', '', '', ''),
               ('imap', 'mail.python.org', '/mbox1', '', '')),
              ('mms://wms.sys.hinet.net/cts/Drama/09006251100.asf',
-             ('mms', 'wms.sys.hinet.net', '/cts/Drama/09006251100.asf', '', '', ''),
-             ('mms', 'wms.sys.hinet.net', '/cts/Drama/09006251100.asf', '', '')),
-            ]:
-            result = urlparse.urlparse(url)
-            self.assertEqual(result, parsed)
-            # put it back together and it should be the same
-            result2 = urlparse.urlunparse(result)
-            self.assertEqual(result2, url)
-
-            # check the roundtrip using urlsplit() as well
-            result = urlparse.urlsplit(url)
-            self.assertEqual(result, split)
-            result2 = urlparse.urlunsplit(result)
-            self.assertEqual(result2, url)
+             ('mms', 'wms.sys.hinet.net', '/cts/Drama/09006251100.asf',
+              '', '', ''),
+             ('mms', 'wms.sys.hinet.net', '/cts/Drama/09006251100.asf',
+              '', '')),
+            ]
+        for url, parsed, split in testcases:
+            self.checkRoundtrips(url, parsed, split)
+
+    def test_http_roundtrips(self):
+        # urlparse.urlsplit treats 'http:' as an optimized special case,
+        # so we test both 'http:' and 'https:' in all the following.
+        # Three cheers for white box knowledge!
+        testcases = [
+            ('://www.python.org',
+             ('www.python.org', '', '', '', ''),
+             ('www.python.org', '', '', '')),
+            ('://www.python.org#abc',
+             ('www.python.org', '', '', '', 'abc'),
+             ('www.python.org', '', '', 'abc')),
+            ('://www.python.org?q=abc',
+             ('www.python.org', '', '', 'q=abc', ''),
+             ('www.python.org', '', 'q=abc', '')),
+            ('://www.python.org/#abc',
+             ('www.python.org', '/', '', '', 'abc'),
+             ('www.python.org', '/', '', 'abc')),
+            ('://a/b/c/d;p?q#f',
+             ('a', '/b/c/d', 'p', 'q', 'f'),
+             ('a', '/b/c/d;p', 'q', 'f')),
+            ]
+        for scheme in ('http', 'https'):
+            for url, parsed, split in testcases:
+                url = scheme + url
+                parsed = (scheme,) + parsed
+                split = (scheme,) + split
+                self.checkRoundtrips(url, parsed, split)
  
      def checkJoin(self, base, relurl, expected):
          self.assertEqual(urlparse.urljoin(base, relurl), expected,
diff --git a/Lib/urlparse.py b/Lib/urlparse.py

index 9c762725474835e4354c09b065584a27daac983f..8469139344b614660c28b5b3ef79e73262775ee9 100644 (file)
--- a/Lib/urlparse.py
+++ b/Lib/urlparse.py
@@ -63,6 +63,15 @@ def _splitparams(url):
          i = url.find(';')
      return url[:i], url[i+1:]
  
+def _splitnetloc(url, start=0):
+    for c in '/?#': # the order is important!
+        delim = url.find(c, start)
+        if delim >= 0:
+            break
+    else:
+        delim = len(url)
+    return url[start:delim], url[delim:]
+
  def urlsplit(url, scheme='', allow_fragments=1):
      """Parse a URL into 5 components:
      <scheme>://<netloc>/<path>?<query>#<fragment>
@@ -82,13 +91,7 @@ def urlsplit(url, scheme='', allow_fragments=1):
              scheme = url[:i].lower()
              url = url[i+1:]
              if url[:2] == '//':
-                i = url.find('/', 2)
-                if i < 0:
-                    i = url.find('#')
-                    if i < 0:
-                        i = len(url)
-                netloc = url[2:i]
-                url = url[i:]
+                netloc, url = _splitnetloc(url, 2)
              if allow_fragments and '#' in url:
                  url, fragment = url.split('#', 1)
              if '?' in url:
@@ -101,12 +104,8 @@ def urlsplit(url, scheme='', allow_fragments=1):
                  break
          else:
              scheme, url = url[:i].lower(), url[i+1:]
-    if scheme in uses_netloc:
-        if url[:2] == '//':
-            i = url.find('/', 2)
-            if i < 0:
-                i = len(url)
-            netloc, url = url[2:i], url[i:]
+    if scheme in uses_netloc and url[:2] == '//':
+        netloc, url = _splitnetloc(url, 2)
      if allow_fragments and scheme in uses_fragment and '#' in url:
          url, fragment = url.split('#', 1)
      if scheme in uses_query and '?' in url:
author	Johannes Gijsbers <jlg@dds.nl>
	Sun, 9 Jan 2005 15:29:10 +0000 (15:29 +0000)
committer	Johannes Gijsbers <jlg@dds.nl>
	Sun, 9 Jan 2005 15:29:10 +0000 (15:29 +0000)
Lib/test/test_urlparse.py		patch | blob | history
Lib/urlparse.py		patch | blob | history