granicus.if.org Git - python/commitdiff
Issue #22118: Switch urllib.parse to use RFC 3986 semantics for the resolution of relative URLs, rather than RFCs 1808 and 2396.
author: Antoine Pitrou <solipsis@pitrou.net>
Thu, 21 Aug 2014 23:16:17 +0000 (19:16 -0400)
committer: Antoine Pitrou <solipsis@pitrou.net>
Thu, 21 Aug 2014 23:16:17 +0000 (19:16 -0400)
Patch by Demian Brecht.

Doc/library/urllib.parse.rst
Lib/test/test_urlparse.py
Lib/urllib/parse.py
Misc/NEWS

index b95142042d87c37604a02fa9192616561704e201..3675305cd62f46ff95c75a2c2d26b52200347eaf 100644 (file)
@@ -267,6 +267,11 @@ or on combining URL components into a URL string.
    :func:`urlunsplit`, removing possible *scheme* and *netloc* parts.
 
 
+   .. versionchanged:: 3.5
+
+      Behaviour updated to match the semantics defined in :rfc:`3986`.
+
+
 .. function:: urldefrag(url)
 
    If *url* contains a fragment identifier, return a modified version of *url*
index 393481148d2e0e868efb3f3d4440c4579e58cfc1..24c1856fbc15a145a34d1403b771577190cbd090 100644 (file)
@@ -211,10 +211,6 @@ class UrlParseTestCase(unittest.TestCase):
 
         # "abnormal" cases from RFC 1808:
         self.checkJoin(RFC1808_BASE, '', 'http://a/b/c/d;p?q#f')
-        self.checkJoin(RFC1808_BASE, '../../../g', 'http://a/../g')
-        self.checkJoin(RFC1808_BASE, '../../../../g', 'http://a/../../g')
-        self.checkJoin(RFC1808_BASE, '/./g', 'http://a/./g')
-        self.checkJoin(RFC1808_BASE, '/../g', 'http://a/../g')
         self.checkJoin(RFC1808_BASE, 'g.', 'http://a/b/c/g.')
         self.checkJoin(RFC1808_BASE, '.g', 'http://a/b/c/.g')
         self.checkJoin(RFC1808_BASE, 'g..', 'http://a/b/c/g..')
@@ -229,6 +225,13 @@ class UrlParseTestCase(unittest.TestCase):
         #self.checkJoin(RFC1808_BASE, 'http:g', 'http:g')
         #self.checkJoin(RFC1808_BASE, 'http:', 'http:')
 
+        # XXX: The following tests are no longer compatible with RFC3986
+        # self.checkJoin(RFC1808_BASE, '../../../g', 'http://a/../g')
+        # self.checkJoin(RFC1808_BASE, '../../../../g', 'http://a/../../g')
+        # self.checkJoin(RFC1808_BASE, '/./g', 'http://a/./g')
+        # self.checkJoin(RFC1808_BASE, '/../g', 'http://a/../g')
+
+
     def test_RFC2368(self):
         # Issue 11467: path that starts with a number is not parsed correctly
         self.assertEqual(urllib.parse.urlparse('mailto:1337@example.org'),
@@ -259,10 +262,6 @@ class UrlParseTestCase(unittest.TestCase):
         self.checkJoin(RFC2396_BASE, '../../', 'http://a/')
         self.checkJoin(RFC2396_BASE, '../../g', 'http://a/g')
         self.checkJoin(RFC2396_BASE, '', RFC2396_BASE)
-        self.checkJoin(RFC2396_BASE, '../../../g', 'http://a/../g')
-        self.checkJoin(RFC2396_BASE, '../../../../g', 'http://a/../../g')
-        self.checkJoin(RFC2396_BASE, '/./g', 'http://a/./g')
-        self.checkJoin(RFC2396_BASE, '/../g', 'http://a/../g')
         self.checkJoin(RFC2396_BASE, 'g.', 'http://a/b/c/g.')
         self.checkJoin(RFC2396_BASE, '.g', 'http://a/b/c/.g')
         self.checkJoin(RFC2396_BASE, 'g..', 'http://a/b/c/g..')
@@ -278,10 +277,17 @@ class UrlParseTestCase(unittest.TestCase):
         self.checkJoin(RFC2396_BASE, 'g#s/./x', 'http://a/b/c/g#s/./x')
         self.checkJoin(RFC2396_BASE, 'g#s/../x', 'http://a/b/c/g#s/../x')
 
+        # XXX: The following tests are no longer compatible with RFC3986
+        # self.checkJoin(RFC2396_BASE, '../../../g', 'http://a/../g')
+        # self.checkJoin(RFC2396_BASE, '../../../../g', 'http://a/../../g')
+        # self.checkJoin(RFC2396_BASE, '/./g', 'http://a/./g')
+        # self.checkJoin(RFC2396_BASE, '/../g', 'http://a/../g')
+
+
     def test_RFC3986(self):
         # Test cases from RFC3986
         self.checkJoin(RFC3986_BASE, '?y','http://a/b/c/d;p?y')
-        self.checkJoin(RFC2396_BASE, ';x', 'http://a/b/c/;x')
+        self.checkJoin(RFC3986_BASE, ';x', 'http://a/b/c/;x')
         self.checkJoin(RFC3986_BASE, 'g:h','g:h')
         self.checkJoin(RFC3986_BASE, 'g','http://a/b/c/g')
         self.checkJoin(RFC3986_BASE, './g','http://a/b/c/g')
@@ -305,17 +311,17 @@ class UrlParseTestCase(unittest.TestCase):
         self.checkJoin(RFC3986_BASE, '../..','http://a/')
         self.checkJoin(RFC3986_BASE, '../../','http://a/')
         self.checkJoin(RFC3986_BASE, '../../g','http://a/g')
+        self.checkJoin(RFC3986_BASE, '../../../g', 'http://a/g')
 
         #Abnormal Examples
 
         # The 'abnormal scenarios' are incompatible with RFC2986 parsing
         # Tests are here for reference.
 
-        #self.checkJoin(RFC3986_BASE, '../../../g','http://a/g')
-        #self.checkJoin(RFC3986_BASE, '../../../../g','http://a/g')
-        #self.checkJoin(RFC3986_BASE, '/./g','http://a/g')
-        #self.checkJoin(RFC3986_BASE, '/../g','http://a/g')
-
+        self.checkJoin(RFC3986_BASE, '../../../g','http://a/g')
+        self.checkJoin(RFC3986_BASE, '../../../../g','http://a/g')
+        self.checkJoin(RFC3986_BASE, '/./g','http://a/g')
+        self.checkJoin(RFC3986_BASE, '/../g','http://a/g')
         self.checkJoin(RFC3986_BASE, 'g.','http://a/b/c/g.')
         self.checkJoin(RFC3986_BASE, '.g','http://a/b/c/.g')
         self.checkJoin(RFC3986_BASE, 'g..','http://a/b/c/g..')
@@ -355,10 +361,8 @@ class UrlParseTestCase(unittest.TestCase):
         self.checkJoin(SIMPLE_BASE, '../g','http://a/b/g')
         self.checkJoin(SIMPLE_BASE, '../..','http://a/')
         self.checkJoin(SIMPLE_BASE, '../../g','http://a/g')
-        self.checkJoin(SIMPLE_BASE, '../../../g','http://a/../g')
         self.checkJoin(SIMPLE_BASE, './../g','http://a/b/g')
         self.checkJoin(SIMPLE_BASE, './g/.','http://a/b/c/g/')
-        self.checkJoin(SIMPLE_BASE, '/./g','http://a/./g')
         self.checkJoin(SIMPLE_BASE, 'g/./h','http://a/b/c/g/h')
         self.checkJoin(SIMPLE_BASE, 'g/../h','http://a/b/c/h')
         self.checkJoin(SIMPLE_BASE, 'http:g','http://a/b/c/g')
@@ -372,6 +376,10 @@ class UrlParseTestCase(unittest.TestCase):
         self.checkJoin('svn://pathtorepo/dir1', 'dir2', 'svn://pathtorepo/dir2')
         self.checkJoin('svn+ssh://pathtorepo/dir1', 'dir2', 'svn+ssh://pathtorepo/dir2')
 
+        # XXX: The following tests are no longer compatible with RFC3986
+        # self.checkJoin(SIMPLE_BASE, '../../../g','http://a/../g')
+        # self.checkJoin(SIMPLE_BASE, '/./g','http://a/./g')
+
     def test_RFC2732(self):
         str_cases = [
             ('http://Test.python.org:5432/foo/', 'test.python.org', 5432),
index dfb947c32017916d5d8b59cbaab15a5735127fba..b6ac414dfdfb5e0da036641f25583461c278f86c 100644 (file)
@@ -409,11 +409,13 @@ def urljoin(base, url, allow_fragments=True):
         return url
     if not url:
         return base
+
     base, url, _coerce_result = _coerce_args(base, url)
     bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
             urlparse(base, '', allow_fragments)
     scheme, netloc, path, params, query, fragment = \
             urlparse(url, bscheme, allow_fragments)
+
     if scheme != bscheme or scheme not in uses_relative:
         return _coerce_result(url)
     if scheme in uses_netloc:
@@ -421,9 +423,7 @@ def urljoin(base, url, allow_fragments=True):
             return _coerce_result(urlunparse((scheme, netloc, path,
                                               params, query, fragment)))
         netloc = bnetloc
-    if path[:1] == '/':
-        return _coerce_result(urlunparse((scheme, netloc, path,
-                                          params, query, fragment)))
+
     if not path and not params:
         path = bpath
         params = bparams
@@ -431,29 +431,42 @@ def urljoin(base, url, allow_fragments=True):
             query = bquery
         return _coerce_result(urlunparse((scheme, netloc, path,
                                           params, query, fragment)))
-    segments = bpath.split('/')[:-1] + path.split('/')
-    # XXX The stuff below is bogus in various ways...
-    if segments[-1] == '.':
-        segments[-1] = ''
-    while '.' in segments:
-        segments.remove('.')
-    while 1:
-        i = 1
-        n = len(segments) - 1
-        while i < n:
-            if (segments[i] == '..'
-                and segments[i-1] not in ('', '..')):
-                del segments[i-1:i+1]
-                break
-            i = i+1
+
+    base_parts = bpath.split('/')
+    if base_parts[-1] != '':
+        # the last item is not a directory, so will not be taken into account
+        # in resolving the relative path
+        del base_parts[-1]
+
+    # for rfc3986, ignore all base path should the first character be root.
+    if path[:1] == '/':
+        segments = path.split('/')
+    else:
+        segments = base_parts + path.split('/')
+
+    resolved_path = []
+
+    for seg in segments:
+        if seg == '..':
+            try:
+                resolved_path.pop()
+            except IndexError:
+                # ignore any .. segments that would otherwise cause an IndexError
+                # when popped from resolved_path if resolving for rfc3986
+                pass
+        elif seg == '.':
+            continue
         else:
-            break
-    if segments == ['', '..']:
-        segments[-1] = ''
-    elif len(segments) >= 2 and segments[-1] == '..':
-        segments[-2:] = ['']
-    return _coerce_result(urlunparse((scheme, netloc, '/'.join(segments),
-                                      params, query, fragment)))
+            resolved_path.append(seg)
+
+    if segments[-1] in ('.', '..'):
+        # do some post-processing here. if the last segment was a relative dir,
+        # then we need to append the trailing '/'
+        resolved_path.append('')
+
+    return _coerce_result(urlunparse((scheme, netloc, '/'.join(
+        resolved_path), params, query, fragment)))
+
 
 def urldefrag(url):
     """Removes any existing fragment from URL.
index 5ac80b822ac4b30177762d0f922bf87d0542d7ea..bdc53c6a7640515033d39cb220e7abad85540def 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -124,6 +124,10 @@ Core and Builtins
 Library
 -------
 
+- Issue #22118: Switch urllib.parse to use RFC 3986 semantics for the
+  resolution of relative URLs, rather than RFCs 1808 and 2396.
+  Patch by Demian Brecht.
+
 - Issue #21549: Added the "members" parameter to TarFile.list().
 
 - Issue #19628: Allow compileall recursion depth to be specified with a -r