From: Jeremy Hylton <jeremy@alum.mit.edu>
Date: Fri, 18 Jul 2008 20:59:44 +0000 (+0000)
Subject: Bug 3347: robotparser failed because it didn't convert bytes to string.
X-Git-Tag: v3.0b3~267
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=73fd46d24e45c34f0fb87261e5471584a7c273df;p=python

Bug 3347: robotparser failed because it didn't convert bytes to string.

The solution is to decode the bytes to text as UTF-8.  I'm not entirely
sure if this is safe, but it looks like robots.txt is expected to be
ASCII, which is a subset of UTF-8.
---

diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py
index fbb02bcbc2..f02f986604 100644
--- a/Lib/test/test_robotparser.py
+++ b/Lib/test/test_robotparser.py
@@ -136,8 +136,9 @@ bad = [] # Bug report says "/" should be denied, but that is not in the RFC
 
 RobotTest(7, doc, good, bad)
 
-class TestCase(unittest.TestCase):
-    def runTest(self):
+class NetworkTestCase(unittest.TestCase):
+
+    def testPasswordProtectedSite(self):
         support.requires('network')
         # whole site is password-protected.
         url = 'http://mueblesmoraleda.com'
@@ -146,9 +147,17 @@ class TestCase(unittest.TestCase):
         parser.read()
         self.assertEqual(parser.can_fetch("*", url+"/robots.txt"), False)
 
+    def testPythonOrg(self):
+        support.requires('network')
+        parser = urllib.robotparser.RobotFileParser(
+            "http://www.python.org/robots.txt")
+        parser.read()
+        self.assertTrue(parser.can_fetch("*",
+                                         "http://www.python.org/robots.txt"))
+
 def test_main():
+    support.run_unittest(NetworkTestCase)
     support.run_unittest(tests)
-    TestCase().run()
 
 if __name__=='__main__':
     support.Verbose = 1
diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py
index a91df8d815..c55fb5082f 100644
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -60,7 +60,8 @@ class RobotFileParser:
             elif err.code >= 400:
                 self.allow_all = True
         else:
-            self.parse(f.read().splitlines())
+            raw = f.read()
+            self.parse(raw.decode("utf-8").splitlines())
 
     def _add_entry(self, entry):
         if "*" in entry.useragents:
@@ -123,7 +124,10 @@ class RobotFileParser:
             return True
         # search for given user agent matches
         # the first match counts
-        url = urllib.parse.quote(urllib.parse.urlparse(urllib.parse.unquote(url))[2]) or "/"
+        url = urllib.parse.quote(
+            urllib.parse.urlparse(urllib.parse.unquote(url))[2])
+        if not url:
+            url = "/"
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.allowance(url)