Moved robotparser.py to the Lib directory.
author Guido van Rossum <guido@python.org>
Wed, 29 Mar 2000 16:02:45 +0000 (16:02 +0000)
committer Guido van Rossum <guido@python.org>
Wed, 29 Mar 2000 16:02:45 +0000 (16:02 +0000)
If you do a "cvs update" in the Lib directory, it will pop up there.

Tools/webchecker/robotparser.py [deleted file]

diff --git a/Tools/webchecker/robotparser.py b/Tools/webchecker/robotparser.py
deleted file mode 100644 (file)
index 3f4396b..0000000
+++ /dev/null
@@ -1,97 +0,0 @@
-"""
-
-Robots.txt file parser class.  Accepts a list of lines or robots.txt URL as
-input, builds a set of rules from that list, then answers questions about
-fetchability of other URLs.
-
-"""
-
-class RobotFileParser:
-
-    def __init__(self):
-        self.rules = {}
-        self.debug = 0
-        self.url = ''
-        self.last_checked = 0
-
-    def mtime(self):
-        return self.last_checked
-
-    def modified(self):
-        import time
-        self.last_checked = time.time()
-
-    def set_url(self, url):
-        self.url = url
-
-    def read(self):
-        import urllib
-        self.parse(urllib.urlopen(self.url).readlines())
-
-    def parse(self, lines):
-        """parse the input lines from a robot.txt file"""
-        import string, re
-        active = []
-        for line in lines:
-            if self.debug: print '>', line,
-            # blank line terminates current record
-            if not line[:-1]:
-                active = []
-                continue
-            # remove optional comment and strip line
-            line = string.strip(line[:string.find(line, '#')])
-            if not line:
-                continue
-            line = re.split(' *: *', line)
-            if len(line) == 2:
-                line[0] = string.lower(line[0])
-                if line[0] == 'user-agent':
-                    # this record applies to this user agent
-                    if self.debug: print '>> user-agent:', line[1]
-                    active.append(line[1])
-                    if not self.rules.has_key(line[1]):
-                        self.rules[line[1]] = []
-                elif line[0] == 'disallow':
-                    if line[1]:
-                        if self.debug: print '>> disallow:', line[1]
-                        for agent in active:
-                            self.rules[agent].append(re.compile(line[1]))
-                    else:
-                        pass
-                        for agent in active:
-                            if self.debug: print '>> allow', agent
-                            self.rules[agent] = []
-                else:
-                    if self.debug: print '>> unknown:', line
-
-        self.modified()
-
-    # returns true if agent is allowed to fetch url
-    def can_fetch(self, useragent, url):
-        """using the parsed robots.txt decide if useragent can fetch url"""
-        import urlparse
-        ag = useragent
-        if not self.rules.has_key(ag): ag = '*'
-        if not self.rules.has_key(ag):
-            if self.debug: print '>> allowing', url, 'fetch by', useragent
-            return 1
-        path = urlparse.urlparse(url)[2]
-        for rule in self.rules[ag]:
-            if rule.match(path) is not None:
-                if self.debug: print '>> disallowing', url, 'fetch by', useragent
-                return 0
-        if self.debug: print '>> allowing', url, 'fetch by', useragent
-        return 1
-
-def _test():
-    rp = RobotFileParser()
-    rp.debug = 1
-    rp.set_url('http://www.musi-cal.com/robots.txt')
-    rp.read()
-    print rp.rules
-    print rp.can_fetch('*', 'http://www.musi-cal.com.com/')
-    print rp.can_fetch('Musi-Cal-Robot',
-                       'http://www.musi-cal.com/cgi-bin/event-search?city=San+Francisco')
-
-if __name__ == "__main__":
-    _test()
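
A note on the parse() method above: string.find(line, '#') returns -1 when a line carries no comment, so line[:string.find(line, '#')] slices off the line's last character. Lines coming from readlines() keep their trailing newline, which usually hides the problem, but a final line without a newline loses a real character. A minimal Python 3 sketch of comment stripping that handles both cases (strip_comment is a hypothetical helper, not part of the module):

    def strip_comment(line):
        # str.find returns -1 when '#' is absent; slicing with that -1
        # would silently drop the line's last character.
        i = line.find('#')
        if i >= 0:
            line = line[:i]
        return line.strip()

    print(strip_comment('Disallow: /cgi-bin/  # keep robots out'))  # Disallow: /cgi-bin/
    print(strip_comment('Disallow: /tmp'))                          # Disallow: /tmp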
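
The class itself outlived this file: after the move to Lib it eventually became Python 3's urllib.robotparser, which keeps the same set_url / read / can_fetch surface. A minimal sketch of the _test() flow against that standard-library version (the musi-cal.com URL is the one from the historical test and may no longer resolve):

    from urllib.robotparser import RobotFileParser

    rp = RobotFileParser()
    rp.set_url('http://www.musi-cal.com/robots.txt')
    rp.read()  # fetch the remote robots.txt and parse it
    print(rp.can_fetch('*', 'http://www.musi-cal.com/'))
    print(rp.can_fetch('Musi-Cal-Robot',
                       'http://www.musi-cal.com/cgi-bin/event-search?city=San+Francisco'))

One behavioral difference worth noting: this early version compiles each Disallow value as a regular expression and tests rule.match(path), while later versions perform the plain prefix matching that the robots.txt convention describes.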