granicus.if.org Git - python/commitdiff

author    Skip Montanaro <skip@pobox.com>  Mon, 28 Apr 2008 03:27:53 +0000 (03:27 +0000)
committer Skip Montanaro <skip@pobox.com>  Mon, 28 Apr 2008 03:27:53 +0000 (03:27 +0000)

Get rid of _test(), _main(), _debug() and _check(). Tests are no longer
needed (better set available in Lib/test/test_robotparser.py).  Clean up a
few PEP 8 nits (compound statements on a single line, whitespace around
operators).
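The removed _test()/_check() driver depended on live fetches of www.musi-cal.com and printed its results; the tests in Lib/test/test_robotparser.py run under unittest against canned data instead. A minimal sketch of that style of test (the class name, URLs and robots.txt content below are illustrative, not the actual contents of the test file):

    import unittest
    import robotparser

    # Canned robots.txt data; parsed directly, so no network access is needed.
    ROBOTS_TXT = """\
    User-agent: CherryPickerSE
    Disallow: /cgi-bin/event-search

    User-agent: *
    Disallow: /search
    """

    class RobotTest(unittest.TestCase):
        def setUp(self):
            self.rp = robotparser.RobotFileParser()
            self.rp.parse(ROBOTS_TXT.splitlines())

        def test_named_agent_disallowed(self):
            # Agent token is matched case-insensitively, version suffix stripped.
            self.assertFalse(self.rp.can_fetch(
                'CherryPickerSE/1.0',
                'http://www.example.com/cgi-bin/event-search?city=San+Francisco'))

        def test_catch_all_agent(self):
            self.assertFalse(self.rp.can_fetch('spam', 'http://www.example.com/search'))
            self.assertTrue(self.rp.can_fetch('spam', 'http://www.example.com/Musician/me'))

    if __name__ == '__main__':
        unittest.main()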

Lib/robotparser.py

index 52ab348be33d82ea4105618b62f70555377bbe1b..5b1d797fe86ea682e7a59586a026e7210d7c1b40 100644
@@ -9,15 +9,11 @@
     The robots.txt Exclusion Protocol is implemented as specified in
     http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
 """
-import urlparse,urllib
+import urlparse
+import urllib
 
 __all__ = ["RobotFileParser"]
 
-debug = 0
-
-def _debug(msg):
-    if debug: print msg
-
 
 class RobotFileParser:
     """ This class provides a set of methods to read, parse and answer
@@ -67,12 +63,9 @@ class RobotFileParser:
         self.errcode = opener.errcode
         if self.errcode in (401, 403):
             self.disallow_all = True
-            _debug("disallow all")
         elif self.errcode >= 400:
             self.allow_all = True
-            _debug("allow all")
         elif self.errcode == 200 and lines:
-            _debug("parse lines")
             self.parse(lines)
 
     def _add_entry(self, entry):
@@ -93,19 +86,16 @@ class RobotFileParser:
         for line in lines:
             linenumber = linenumber + 1
             if not line:
-                if state==1:
-                    _debug("line %d: warning: you should insert"
-                           " allow: or disallow: directives below any"
-                           " user-agent: line" % linenumber)
+                if state == 1:
                     entry = Entry()
                     state = 0
-                elif state==2:
+                elif state == 2:
                     self._add_entry(entry)
                     entry = Entry()
                     state = 0
             # remove optional comment and strip line
             i = line.find('#')
-            if i>=0:
+            if i >= 0:
                 line = line[:i]
             line = line.strip()
             if not line:
@@ -115,41 +105,24 @@ class RobotFileParser:
                 line[0] = line[0].strip().lower()
                 line[1] = urllib.unquote(line[1].strip())
                 if line[0] == "user-agent":
-                    if state==2:
-                        _debug("line %d: warning: you should insert a blank"
-                               " line before any user-agent"
-                               " directive" % linenumber)
+                    if state == 2:
                         self._add_entry(entry)
                         entry = Entry()
                     entry.useragents.append(line[1])
                     state = 1
                 elif line[0] == "disallow":
-                    if state==0:
-                        _debug("line %d: error: you must insert a user-agent:"
-                               " directive before this line" % linenumber)
-                    else:
+                    if state != 0:
                         entry.rulelines.append(RuleLine(line[1], False))
                         state = 2
                 elif line[0] == "allow":
-                    if state==0:
-                        _debug("line %d: error: you must insert a user-agent:"
-                               " directive before this line" % linenumber)
-                    else:
+                    if state != 0:
                         entry.rulelines.append(RuleLine(line[1], True))
-                else:
-                    _debug("line %d: warning: unknown key %s" % (linenumber,
-                               line[0]))
-            else:
-                _debug("line %d: error: malformed line %s"%(linenumber, line))
-        if state==2:
+        if state == 2:
             self.entries.append(entry)
-        _debug("Parsed rules:\n%s" % str(self))
 
 
     def can_fetch(self, useragent, url):
         """using the parsed robots.txt decide if useragent can fetch url"""
-        _debug("Checking robots.txt allowance for:\n  user agent: %s\n  url: %s" %
-               (useragent, url))
         if self.disallow_all:
             return False
         if self.allow_all:
@@ -182,10 +155,10 @@ class RuleLine:
         self.allowance = allowance
 
     def applies_to(self, filename):
-        return self.path=="*" or filename.startswith(self.path)
+        return self.path == "*" or filename.startswith(self.path)
 
     def __str__(self):
-        return (self.allowance and "Allow" or "Disallow")+": "+self.path
+        return (self.allowance and "Allow" or "Disallow") + ": " + self.path
 
 
 class Entry:
@@ -207,7 +180,7 @@ class Entry:
         # split the name token and make it lower case
         useragent = useragent.split("/")[0].lower()
         for agent in self.useragents:
-            if agent=='*':
+            if agent == '*':
                 # we have the catch-all agent
                 return True
             agent = agent.lower()
@@ -220,7 +193,6 @@ class Entry:
         - our agent applies to this entry
         - filename is URL decoded"""
         for line in self.rulelines:
-            _debug((filename, str(line), line.allowance))
             if line.applies_to(filename):
                 return line.allowance
         return True
@@ -239,56 +211,3 @@ class URLopener(urllib.FancyURLopener):
         self.errcode = errcode
         return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                         errmsg, headers)
-
-def _check(a,b):
-    if not b:
-        ac = "access denied"
-    else:
-        ac = "access allowed"
-    if a!=b:
-        print "failed"
-    else:
-        print "ok (%s)" % ac
-    print
-
-def _test():
-    global debug
-    rp = RobotFileParser()
-    debug = 1
-
-    # robots.txt that exists, gotten to by redirection
-    rp.set_url('http://www.musi-cal.com/robots.txt')
-    rp.read()
-
-    # test for re.escape
-    _check(rp.can_fetch('*', 'http://www.musi-cal.com/'), 1)
-    # this should match the first rule, which is a disallow
-    _check(rp.can_fetch('', 'http://www.musi-cal.com/'), 0)
-    # various cherry pickers
-    _check(rp.can_fetch('CherryPickerSE',
-                       'http://www.musi-cal.com/cgi-bin/event-search'
-                       '?city=San+Francisco'), 0)
-    _check(rp.can_fetch('CherryPickerSE/1.0',
-                       'http://www.musi-cal.com/cgi-bin/event-search'
-                       '?city=San+Francisco'), 0)
-    _check(rp.can_fetch('CherryPickerSE/1.5',
-                       'http://www.musi-cal.com/cgi-bin/event-search'
-                       '?city=San+Francisco'), 0)
-    # case sensitivity
-    _check(rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba'), 0)
-    _check(rp.can_fetch('extractorpro', 'http://www.musi-cal.com/blubba'), 0)
-    # substring test
-    _check(rp.can_fetch('toolpak/1.1', 'http://www.musi-cal.com/blubba'), 0)
-    # tests for catch-all * agent
-    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/search'), 0)
-    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/Musician/me'), 1)
-    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
-    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
-
-    # robots.txt that does not exist
-    rp.set_url('http://www.lycos.com/robots.txt')
-    rp.read()
-    _check(rp.can_fetch('Mozilla', 'http://www.lycos.com/search'), 1)
-
-if __name__ == '__main__':
-    _test()
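With the in-module driver gone, running the module directly no longer does anything; callers exercise RobotFileParser through its public API. A minimal usage sketch (the URLs and user-agent string are illustrative):

    import robotparser

    rp = robotparser.RobotFileParser()
    rp.set_url('http://www.example.com/robots.txt')
    rp.read()   # fetch and parse; 401/403 -> disallow all, other errors >= 400 -> allow all

    if rp.can_fetch('MyCrawler/1.0', 'http://www.example.com/private/report.html'):
        print 'fetching is allowed'
    else:
        print 'fetching is disallowed'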