granicus.if.org Git - python/commitdiff
Bug 3347: robotparser failed because it didn't convert bytes to string.
author: Jeremy Hylton <jeremy@alum.mit.edu>
Fri, 18 Jul 2008 20:59:44 +0000 (20:59 +0000)
committer: Jeremy Hylton <jeremy@alum.mit.edu>
Fri, 18 Jul 2008 20:59:44 +0000 (20:59 +0000)
The solution is to convert bytes to text via utf-8.  I'm not entirely
sure if this is safe, but it looks like robots.txt is expected to be
ascii.

Lib/test/test_robotparser.py
Lib/urllib/robotparser.py

index fbb02bcbc2f185eb4c9192634b6b496937a61701..f02f98660441bd6e5463f26aaf75ed369b60e6e2 100644 (file)
@@ -136,8 +136,9 @@ bad = [] # Bug report says "/" should be denied, but that is not in the RFC
 
 RobotTest(7, doc, good, bad)
 
-class TestCase(unittest.TestCase):
-    def runTest(self):
+class NetworkTestCase(unittest.TestCase):
+
+    def testPasswordProtectedSite(self):
         support.requires('network')
         # whole site is password-protected.
         url = 'http://mueblesmoraleda.com'
@@ -146,9 +147,17 @@ class TestCase(unittest.TestCase):
         parser.read()
         self.assertEqual(parser.can_fetch("*", url+"/robots.txt"), False)
 
+    def testPythonOrg(self):
+        support.requires('network')
+        parser = urllib.robotparser.RobotFileParser(
+            "http://www.python.org/robots.txt")
+        parser.read()
+        self.assertTrue(parser.can_fetch("*",
+                                         "http://www.python.org/robots.txt"))
+
 def test_main():
+    support.run_unittest(NetworkTestCase)
     support.run_unittest(tests)
-    TestCase().run()
 
 if __name__=='__main__':
     support.Verbose = 1
index a91df8d815b6aca18ad1a4bc0642d9bb822c41e0..c55fb5082f66edd562c806d9735d6fbd1fc6db3c 100644 (file)
@@ -60,7 +60,8 @@ class RobotFileParser:
             elif err.code >= 400:
                 self.allow_all = True
         else:
-            self.parse(f.read().splitlines())
+            raw = f.read()
+            self.parse(raw.decode("utf-8").splitlines())
 
     def _add_entry(self, entry):
         if "*" in entry.useragents:
@@ -123,7 +124,10 @@ class RobotFileParser:
             return True
         # search for given user agent matches
         # the first match counts
-        url = urllib.parse.quote(urllib.parse.urlparse(urllib.parse.unquote(url))[2]) or "/"
+        url = urllib.parse.quote(
+            urllib.parse.urlparse(urllib.parse.unquote(url))[2])
+        if not url:
+            url = "/"
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.allowance(url)