granicus.if.org Git - python/commitdiff
The bulk of the credit for these changes goes to Bastian Kleineidam
author     Skip Montanaro <skip@pobox.com>
Mon, 12 Feb 2001 20:58:30 +0000 (20:58 +0000)
committer  Skip Montanaro <skip@pobox.com>
Mon, 12 Feb 2001 20:58:30 +0000 (20:58 +0000)
* restores urllib as the file fetcher (closes bug #132000)
* allows checking URLs with empty paths (closes patches #103511 and 103721)
* properly handle user agents with versions (e.g., SpamMeister/1.5)
* added several more tests
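
As a rough illustration of the behaviour described above, a caller might exercise the module like this. This is only a sketch: the www.example.com host and /private/ path are hypothetical, and SpamMeister/1.5 is simply the versioned agent named in the log message.

import robotparser

rp = robotparser.RobotFileParser()
rp.set_url('http://www.example.com/robots.txt')   # hypothetical host
rp.read()   # fetched via urllib (FancyURLopener) again

# a versioned user agent is reduced to its name token before matching
print rp.can_fetch('SpamMeister/1.5', 'http://www.example.com/private/')

# a URL with an empty path is now checked as "/" instead of failing
print rp.can_fetch('*', 'http://www.example.com')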

Lib/robotparser.py

index d627c9af28b4ef3ef93fb0d6f02365153879acf6..ff25dfe49811892df5261047a61d2b71c1c1f4b0 100644 (file)
@@ -39,28 +39,19 @@ class RobotFileParser:
         self.host, self.path = urlparse.urlparse(url)[1:3]
 
     def read(self):
-        import httplib
-        tries = 0
-        while tries<5:
-            connection = httplib.HTTP(self.host)
-            connection.putrequest("GET", self.path)
-            connection.putheader("Host", self.host)
-            connection.endheaders()
-            status, text, mime = connection.getreply()
-            if status in [301,302] and mime:
-                tries = tries + 1
-                newurl = mime.get("Location", mime.get("Uri", ""))
-                newurl = urlparse.urljoin(self.url, newurl)
-                self.set_url(newurl)
-            else:
-                break
-        if status==401 or status==403:
+        opener = URLopener()
+        f = opener.open(self.url)
+        lines = f.readlines()
+        self.errcode = opener.errcode
+        if self.errcode == 401 or self.errcode == 403:
             self.disallow_all = 1
-        elif status>=400:
+            _debug("disallow all")
+        elif self.errcode >= 400:
             self.allow_all = 1
-        else:
-            # status < 400
-            self.parse(connection.getfile().readlines())
+            _debug("allow all")
+        elif self.errcode == 200 and lines:
+            _debug("parse lines")
+            self.parse(lines)
 
     def parse(self, lines):
         """parse the input lines from a robot.txt file.
@@ -129,15 +120,15 @@ class RobotFileParser:
 
     def can_fetch(self, useragent, url):
         """using the parsed robots.txt decide if useragent can fetch url"""
-        _debug("Checking robot.txt allowance for\n%s\n%s" % (useragent, url))
+        _debug("Checking robot.txt allowance for:\n  user agent: %s\n  url: %s" %
+               (useragent, url))
         if self.disallow_all:
             return 0
         if self.allow_all:
             return 1
         # search for given user agent matches
         # the first match counts
-        useragent = useragent.lower()
-        url = urllib.quote(urlparse.urlparse(url)[2])
+        url = urllib.quote(urlparse.urlparse(url)[2]) or "/"
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.allowance(url)
@@ -181,11 +172,16 @@ class Entry:
         return ret
 
     def applies_to(self, useragent):
-        "check if this entry applies to the specified agent"
+        """check if this entry applies to the specified agent"""
+        # split the name token and make it lower case
+        useragent = useragent.split("/")[0].lower()
         for agent in self.useragents:
-            if agent=="*":
+            if agent=='*':
+                # we have the catch-all agent
                 return 1
-            if re.match(agent, useragent):
+            agent = agent.lower()
+            # don't forget to re.escape
+            if re.search(re.escape(useragent), agent):
                 return 1
         return 0
 
@@ -194,25 +190,84 @@ class Entry:
         - our agent applies to this entry
         - filename is URL decoded"""
         for line in self.rulelines:
+            _debug((filename, str(line), line.allowance))
             if line.applies_to(filename):
                 return line.allowance
         return 1
 
+class URLopener(urllib.FancyURLopener):
+    def __init__(self, *args):
+        apply(urllib.FancyURLopener.__init__, (self,) + args)
+        self.errcode = 200
+        self.tries = 0
+        self.maxtries = 10
+        
+    def http_error_default(self, url, fp, errcode, errmsg, headers):
+        self.errcode = errcode
+        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
+                                                        errmsg, headers)
+
+    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
+        self.tries += 1
+        if self.tries >= self.maxtries:
+            return self.http_error_default(url, fp, 500,
+                                           "Internal Server Error: Redirect Recursion",
+                                           headers)
+        result = urllib.FancyURLopener.http_error_302(self, url, fp, errcode,
+                                                      errmsg, headers, data)
+        self.tries = 0
+        return result
+
+def _check(a,b):
+    if not b:
+        ac = "access denied"
+    else:
+        ac = "access allowed"
+    if a!=b:
+        print "failed"
+    else:
+        print "ok (%s)" % ac
+    print
 
 def _test():
     global debug
     import sys
     rp = RobotFileParser()
     debug = 1
-    if len(sys.argv) <= 1:
-        rp.set_url('http://www.musi-cal.com/robots.txt')
-        rp.read()
-    else:
-        rp.parse(open(sys.argv[1]).readlines())
-    print rp.can_fetch('*', 'http://www.musi-cal.com/')
-    print rp.can_fetch('Musi-Cal-Robot/1.0',
+
+    # robots.txt that exists, gotten to by redirection
+    rp.set_url('http://www.musi-cal.com/robots.txt')
+    rp.read()
+
+    # test for re.escape
+    _check(rp.can_fetch('*', 'http://www.musi-cal.com/'), 1)
+    # this should match the first rule, which is a disallow
+    _check(rp.can_fetch('', 'http://www.musi-cal.com/'), 0)
+    # various cherry pickers
+    _check(rp.can_fetch('CherryPickerSE',
+                       'http://www.musi-cal.com/cgi-bin/event-search'
+                       '?city=San+Francisco'), 0)
+    _check(rp.can_fetch('CherryPickerSE/1.0',
                        'http://www.musi-cal.com/cgi-bin/event-search'
-                       '?city=San+Francisco')
+                       '?city=San+Francisco'), 0)
+    _check(rp.can_fetch('CherryPickerSE/1.5',
+                       'http://www.musi-cal.com/cgi-bin/event-search'
+                       '?city=San+Francisco'), 0)
+    # case sensitivity
+    _check(rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba'), 0)
+    _check(rp.can_fetch('extractorpro', 'http://www.musi-cal.com/blubba'), 0)
+    # substring test
+    _check(rp.can_fetch('toolpak/1.1', 'http://www.musi-cal.com/blubba'), 0)
+    # tests for catch-all * agent
+    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/search'), 0)
+    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/Musician/me'), 1)
+    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
+    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
+
+    # robots.txt that does not exist
+    rp.set_url('http://www.lycos.com/robots.txt')
+    rp.read()
+    _check(rp.can_fetch('Mozilla', 'http://www.lycos.com/search'), 1)
 
 if __name__ == '__main__':
     _test()