]> granicus.if.org Git - python/commitdiff
Fix urllib2.urlopen() handling of chunked content encoding.
authorJeremy Hylton <jeremy@alum.mit.edu>
Sat, 7 Aug 2004 17:40:50 +0000 (17:40 +0000)
committerJeremy Hylton <jeremy@alum.mit.edu>
Sat, 7 Aug 2004 17:40:50 +0000 (17:40 +0000)
The change to use the newer httplib interface admitted the possibility
that we'd get an HTTP/1.1 chunked response, but the code didn't handle
it correctly.  The raw socket object can't be passed to addinfourl(),
because it would read the undecoded response.  Instead, addinfourl()
must call HTTPResponse.read(), which will handle the decoding.

One extra wrinkle is that the HTTPResponse object can't be passed to
addinfourl() either, because it doesn't implement readline() or
readlines().  As a quick hack, use socket._fileobject(), which
implements those methods on top of a read buffer.  (suggested by mwh)

Finally, add some tests based on test_urllibnet.

Thanks to Andrew Sawyers for originally reporting the chunked problem.

Lib/test/test_urllib2.py
Lib/test/test_urllib2net.py [new file with mode: 0644]
Lib/urllib2.py

index 6e9901f5699f8991f43fc98a87bd001837cd0c44..c68d244a6c8693baf5e77cd3ad11c342e5c1e067 100644 (file)
@@ -423,6 +423,8 @@ class HandlerTests(unittest.TestCase):
                 self.msg = msg
                 self.status = status
                 self.reason = reason
+            def read(self):
+                # Stub: the mock response always yields an empty body.
+                return ''
         class MockHTTPClass:
             def __init__(self):
                 self.req_headers = []
diff --git a/Lib/test/test_urllib2net.py b/Lib/test/test_urllib2net.py
new file mode 100644 (file)
index 0000000..3c23246
--- /dev/null
@@ -0,0 +1,92 @@
+#!/usr/bin/env python
+
+import unittest
+from test import test_support
+
+import socket
+import urllib2
+import sys
+import os
+import mimetools
+
+class URLTimeoutTest(unittest.TestCase):
+    """Check that urlopen() still works with a default socket timeout set."""
+
+    # Value passed to socket.setdefaulttimeout() for the duration of a test.
+    TIMEOUT = 10.0
+
+    def setUp(self):
+        # Install the default timeout before each test.
+        socket.setdefaulttimeout(self.TIMEOUT)
+
+    def tearDown(self):
+        # Reset the default timeout so later tests are not affected.
+        socket.setdefaulttimeout(None)
+
+    def testURLread(self):
+        # Reading the whole response must succeed with the timeout in place.
+        f = urllib2.urlopen("http://www.python.org/")
+        try:
+            f.read()
+        finally:
+            # Always release the connection, even when read() fails.
+            f.close()
+
+class urlopenNetworkTests(unittest.TestCase):
+    """Tests urllib2.urlopen using the network.
+
+    These tests are not exhaustive.  Assuming that testing using files does a
+    good job overall of some of the basic interface features.  There are no
+    tests exercising the optional 'data' and 'proxies' arguments.  No tests
+    for transparent redirection have been written.
+
+    setUp is not used for always constructing a connection to
+    http://www.python.org/ since there are a few tests that don't use that
+    address and making a connection is expensive enough to warrant minimizing
+    unneeded connections.
+
+    """
+
+    def test_basic(self):
+        # Simple test expected to pass.
+        open_url = urllib2.urlopen("http://www.python.org/")
+        for attr in ("read", "close", "info", "geturl"):
+            self.assert_(hasattr(open_url, attr), "object returned from "
+                            "urlopen lacks the %s attribute" % attr)
+        try:
+            self.assert_(open_url.read(), "calling 'read' failed")
+        finally:
+            open_url.close()
+
+    def test_info(self):
+        # Test 'info'.
+        open_url = urllib2.urlopen("http://www.python.org/")
+        try:
+            info_obj = open_url.info()
+        finally:
+            open_url.close()
+        # Assert only after the try/finally: if info() raised, asserting
+        # inside 'finally' would touch an unbound info_obj and replace the
+        # real error with an UnboundLocalError.
+        self.assert_(isinstance(info_obj, mimetools.Message),
+                     "object returned by 'info' is not an instance of "
+                     "mimetools.Message")
+        self.assertEqual(info_obj.getsubtype(), "html")
+
+    def test_geturl(self):
+        # Make sure same URL as opened is returned by geturl.
+        URL = "http://www.python.org/"
+        open_url = urllib2.urlopen(URL)
+        try:
+            gotten_url = open_url.geturl()
+        finally:
+            open_url.close()
+        self.assertEqual(gotten_url, URL)
+
+    def test_bad_address(self):
+        # Make sure proper exception is raised when connecting to a bogus
+        # address.
+        self.assertRaises(IOError,
+                          # SF patch 809915:  In Sep 2003, VeriSign started
+                          # highjacking invalid .com and .net addresses to
+                          # boost traffic to their own site.  This test
+                          # started failing then.  One hopes the .invalid
+                          # domain will be spared to serve its defined
+                          # purpose.
+                          # urllib2.urlopen, "http://www.sadflkjsasadf.com/")
+                          urllib2.urlopen, "http://www.python.invalid/")
+
+def test_main():
+    """Run both network test cases; needs the 'network' resource."""
+    test_support.requires("network")
+    suites = (URLTimeoutTest, urlopenNetworkTests)
+    test_support.run_unittest(*suites)
+
+# Script entry point: run the suite when executed directly.
+if __name__ == "__main__":
+    test_main()
index c525f8ca2324b7ac5add8ecccacdc05fa9315b39..9ec8b9b4967e5e5793f1c72e8d7f1bb9f07d369d 100644 (file)
@@ -997,8 +997,20 @@ class AbstractHTTPHandler(BaseHandler):
             raise URLError(err)
 
         # Pick apart the HTTPResponse object to get the addinfourl
-        # object initialized properly
-        resp = addinfourl(r.fp, r.msg, req.get_full_url())
+        # object initialized properly.
+
+        # Wrap the HTTPResponse object in socket's file object adapter
+        # for Windows.  That adapter calls recv(), so delegate recv()
+        # to read().  This weird wrapping allows the returned object to
+        # have readline() and readlines() methods.
+        
+        # XXX It might be better to extract the read buffering code
+        # out of socket._fileobject() and into a base class.
+        
+        r.recv = r.read
+        fp = socket._fileobject(r)
+        
+        resp = addinfourl(fp, r.msg, req.get_full_url())
         resp.code = r.status
         resp.msg = r.reason
         return resp