Issue #11467: Fix urlparse behavior when handling URLs which contain scheme specific...

author Senthil Kumaran <orsenthil@gmail.com>

Fri, 15 Apr 2011 10:20:24 +0000 (18:20 +0800)

committer Senthil Kumaran <orsenthil@gmail.com>

Fri, 15 Apr 2011 10:20:24 +0000 (18:20 +0800)
author Senthil Kumaran <orsenthil@gmail.com>
Fri, 15 Apr 2011 10:20:24 +0000 (18:20 +0800)
committer Senthil Kumaran <orsenthil@gmail.com>
Fri, 15 Apr 2011 10:20:24 +0000 (18:20 +0800)
diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py

index 1c6c501d673a1a44c268240c0fa8738ea9967c65..252eb138d06f23cf61ab0c9284a7798d5986faeb 100644 (file)
--- a/Lib/test/test_urlparse.py
+++ b/Lib/test/test_urlparse.py
@@ -197,6 +197,11 @@ class UrlParseTestCase(unittest.TestCase):
          #self.checkJoin(RFC1808_BASE, 'http:g', 'http:g')
          #self.checkJoin(RFC1808_BASE, 'http:', 'http:')
  
+    def test_RFC2368(self):
+        # Issue 11467: path that starts with a number is not parsed correctly
+        self.assertEqual(urllib.parse.urlparse('mailto:1337@example.org'),
+                ('mailto', '', '1337@example.org', '', '', ''))
+
      def test_RFC2396(self):
          # cases from RFC 2396
          self.checkJoin(RFC2396_BASE, 'g:h', 'g:h')
diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py

index b3494fa23a2fc390033b3a765122c2804cd1c59c..a20a3d4156e1175864740917b153df250c592604 100644 (file)
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@@ -184,12 +184,17 @@ def urlsplit(url, scheme='', allow_fragments=True):
              v = SplitResult(scheme, netloc, url, query, fragment)
              _parse_cache[key] = v
              return v
-        if url.endswith(':') or not url[i+1].isdigit():
-            for c in url[:i]:
-                if c not in scheme_chars:
-                    break
-            else:
+        for c in url[:i]:
+            if c not in scheme_chars:
+                break
+        else:
+            try:
+                # make sure "url" is not actually a port number (in which case
+                # "scheme" is really part of the path)
+                _testportnum = int(url[i+1:])
+            except ValueError:
                  scheme, url = url[:i].lower(), url[i+1:]
+
      if url[:2] == '//':
          netloc, url = _splitnetloc(url, 2)
      if allow_fragments and scheme in uses_fragment and '#' in url:
diff --git a/Misc/NEWS b/Misc/NEWS

index 8764b1131f314b9471f611bb7976dec6a67737bc..6e971a0bc1aa4f0709d32d190a52b65a6904be88 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -51,6 +51,9 @@ Core and Builtins
  Library
  -------
  
+- Issue #11467: Fix urlparse behavior when handling URLs whose scheme-specific
+  part contains only digits. Patch by Santoso Wijaya.
+
  - Issue #11474: Fix the bug with url2pathname() handling of '/C|/' on Windows.
    Patch by Santoso Wijaya.
author	Senthil Kumaran <orsenthil@gmail.com>
	Fri, 15 Apr 2011 10:20:24 +0000 (18:20 +0800)
committer	Senthil Kumaran <orsenthil@gmail.com>
	Fri, 15 Apr 2011 10:20:24 +0000 (18:20 +0800)
Lib/test/test_urlparse.py		patch \| blob \| history
Lib/urllib/parse.py		patch \| blob \| history
Misc/NEWS		patch \| blob \| history