granicus.if.org Git - python/commitdiff
Skip Montanaro's robots.txt parser.
author Guido van Rossum <guido@python.org>
Thu, 30 Jan 1997 03:18:23 +0000 (03:18 +0000)
committer Guido van Rossum <guido@python.org>
Thu, 30 Jan 1997 03:18:23 +0000 (03:18 +0000)
Lib/robotparser.py [new file with mode: 0644]
Tools/webchecker/robotparser.py [new file with mode: 0644]

diff --git a/Lib/robotparser.py b/Lib/robotparser.py
new file mode 100644 (file)
index 0000000..634c3fe
--- /dev/null
@@ -0,0 +1,97 @@
+"""
+
+Robots.txt file parser class.  Accepts a list of lines or robots.txt URL as
+input, builds a set of rules from that list, then answers questions about
+fetchability of other URLs.
+
+"""
+
+class RobotFileParser:
+
+    def __init__(self):
+       self.rules = {}
+       self.debug = 0
+       self.url = ''
+       self.last_checked = 0
+
+    def mtime(self):
+       return self.last_checked
+
+    def modified(self):
+       import time
+       self.last_checked = time.time()
+
+    def set_url(self, url):
+       self.url = url
+##     import urlmisc
+##     self.url = urlmisc.canonical_url(url)
+
+    def read(self):
+       import urllib
+       self.parse(urllib.urlopen(self.url).readlines())
+
+    def parse(self, lines):
+       import regsub, string, regex
+       active = []
+       for line in lines:
+           if self.debug: print '>', line,
+           # blank line terminates current record
+           if not line[:-1]:
+               active = []
+               continue
+           # remove optional comment and strip line
+           line = string.strip(line[:string.find(line, '#')])
+           if not line:
+               continue
+           line = regsub.split(line, ' *: *')
+           if len(line) == 2:
+               line[0] = string.lower(line[0])
+               if line[0] == 'user-agent':
+                   # this record applies to this user agent
+                   if self.debug: print '>> user-agent:', line[1]
+                   active.append(line[1])
+                   if not self.rules.has_key(line[1]):
+                       self.rules[line[1]] = []
+               elif line[0] == 'disallow':
+                   if line[1]:
+                       if self.debug: print '>> disallow:', line[1]
+                       for agent in active:
+                           self.rules[agent].append(regex.compile(line[1]))
+                   else:
+                       pass
+                       for agent in active:
+                           if self.debug: print '>> allow', agent
+                           self.rules[agent] = []
+               else:
+                   if self.debug: print '>> unknown:', line
+
+       self.modified()
+
+    # returns true if agent is allowed to fetch url
+    def can_fetch(self, agent, url):
+       import urlparse
+       ag = agent
+       if not self.rules.has_key(ag): ag = '*'
+       if not self.rules.has_key(ag):
+           if self.debug: print '>> allowing', url, 'fetch by', agent
+           return 1
+       path = urlparse.urlparse(url)[2]
+       for rule in self.rules[ag]:
+           if rule.match(path) != -1:
+               if self.debug: print '>> disallowing', url, 'fetch by', agent
+               return 0
+       if self.debug: print '>> allowing', url, 'fetch by', agent
+       return 1
+
+def test():
+    rp = RobotFileParser()
+    rp.debug = 1
+    rp.set_url('http://www.automatrix.com/robots.txt')
+    rp.read()
+    print rp.rules
+    print rp.can_fetch('*', 'http://www.calendar.com/concerts/')
+    print rp.can_fetch('Musi-Cal-Robot',
+                      'http://dolphin:80/cgi-bin/music-search?performer=Rolling+Stones')
+
+    print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/')
+    print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/vanagon-list-001')
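
Not part of the commit above: a minimal usage sketch of the class as committed, assuming the same pre-2.0 Python the file targets (print statements, the regex/regsub/string modules). It exercises the other input path the docstring mentions, feeding a made-up robots.txt to parse() as a list of lines instead of calling set_url()/read(), then querying can_fetch(). The agent name 'ExampleBot' and the example.com URLs are illustrative only.

# Illustrative sketch -- not part of this commit.
import robotparser

lines = [
    'User-agent: *\n',
    'Disallow: /cgi-bin/\n',
    'Disallow: /tmp/\n',
    '\n',                          # blank line ends the record for '*'
    'User-agent: ExampleBot\n',
    'Disallow: /\n',
]

rp = robotparser.RobotFileParser()
rp.parse(lines)                    # build rules directly from the list of lines
print rp.can_fetch('*', 'http://example.com/index.html')         # 1 (allowed)
print rp.can_fetch('*', 'http://example.com/cgi-bin/search')     # 0 (disallowed)
print rp.can_fetch('ExampleBot', 'http://example.com/anything')  # 0 (disallowed)

Note that parse() compiles each Disallow value with regex.compile() and can_fetch() matches it against the URL path from urlparse(), so a value like '/cgi-bin/' acts as a prefix pattern blocking everything under that directory.
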
diff --git a/Tools/webchecker/robotparser.py b/Tools/webchecker/robotparser.py
new file mode 100644 (file)
index 0000000..634c3fe
--- /dev/null
@@ -0,0 +1,97 @@
+"""
+
+Robots.txt file parser class.  Accepts a list of lines or robots.txt URL as
+input, builds a set of rules from that list, then answers questions about
+fetchability of other URLs.
+
+"""
+
+class RobotFileParser:
+
+    def __init__(self):
+       self.rules = {}
+       self.debug = 0
+       self.url = ''
+       self.last_checked = 0
+
+    def mtime(self):
+       return self.last_checked
+
+    def modified(self):
+       import time
+       self.last_checked = time.time()
+
+    def set_url(self, url):
+       self.url = url
+##     import urlmisc
+##     self.url = urlmisc.canonical_url(url)
+
+    def read(self):
+       import urllib
+       self.parse(urllib.urlopen(self.url).readlines())
+
+    def parse(self, lines):
+       import regsub, string, regex
+       active = []
+       for line in lines:
+           if self.debug: print '>', line,
+           # blank line terminates current record
+           if not line[:-1]:
+               active = []
+               continue
+           # remove optional comment and strip line
+           line = string.strip(line[:string.find(line, '#')])
+           if not line:
+               continue
+           line = regsub.split(line, ' *: *')
+           if len(line) == 2:
+               line[0] = string.lower(line[0])
+               if line[0] == 'user-agent':
+                   # this record applies to this user agent
+                   if self.debug: print '>> user-agent:', line[1]
+                   active.append(line[1])
+                   if not self.rules.has_key(line[1]):
+                       self.rules[line[1]] = []
+               elif line[0] == 'disallow':
+                   if line[1]:
+                       if self.debug: print '>> disallow:', line[1]
+                       for agent in active:
+                           self.rules[agent].append(regex.compile(line[1]))
+                   else:
+                       pass
+                       for agent in active:
+                           if self.debug: print '>> allow', agent
+                           self.rules[agent] = []
+               else:
+                   if self.debug: print '>> unknown:', line
+
+       self.modified()
+
+    # returns true if agent is allowed to fetch url
+    def can_fetch(self, agent, url):
+       import urlparse
+       ag = agent
+       if not self.rules.has_key(ag): ag = '*'
+       if not self.rules.has_key(ag):
+           if self.debug: print '>> allowing', url, 'fetch by', agent
+           return 1
+       path = urlparse.urlparse(url)[2]
+       for rule in self.rules[ag]:
+           if rule.match(path) != -1:
+               if self.debug: print '>> disallowing', url, 'fetch by', agent
+               return 0
+       if self.debug: print '>> allowing', url, 'fetch by', agent
+       return 1
+
+def test():
+    rp = RobotFileParser()
+    rp.debug = 1
+    rp.set_url('http://www.automatrix.com/robots.txt')
+    rp.read()
+    print rp.rules
+    print rp.can_fetch('*', 'http://www.calendar.com/concerts/')
+    print rp.can_fetch('Musi-Cal-Robot',
+                      'http://dolphin:80/cgi-bin/music-search?performer=Rolling+Stones')
+
+    print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/')
+    print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/vanagon-list-001')