granicus.if.org Git - python/commitdiff
Several changes:
author: Guido van Rossum <guido@python.org>
Mon, 6 Oct 1997 18:54:01 +0000 (18:54 +0000)
committer: Guido van Rossum <guido@python.org>
Mon, 6 Oct 1997 18:54:01 +0000 (18:54 +0000)
- Change the code that looks for robots.txt to always look in /, even
if the "root" path is somewhere deep down below.

- Add link processing in <AREA> tags.

- Change safeclose() to avoid crashing when the file has no geturl()
method.

Tools/webchecker/webchecker.py

index dba641c848fa9bd0ce7613ee175ce387935b5695..f4120117b26d1534c4741d9a329f7a8a63b8f6af 100755 (executable)
@@ -251,11 +251,21 @@ class Checker:
 
     def addroot(self, root):
        if root not in self.roots:
-           self.roots.append(root)
+           troot = root
+           scheme, netloc, path, params, query, fragment = \
+                   urlparse.urlparse(root)
+           i = string.rfind(path, "/") + 1
+           if 0 < i < len(path):
+               path = path[:i]
+               troot = urlparse.urlunparse((scheme, netloc, path,
+                                            params, query, fragment))
+           self.roots.append(troot)
            self.addrobot(root)
            self.newlink(root, ("<root>", root))
 
     def addrobot(self, root):
+       root = urlparse.urljoin(root, "/")
+       if self.robots.has_key(root): return
        url = urlparse.urljoin(root, "/robots.txt")
        self.robots[root] = rp = robotparser.RobotFileParser()
        if verbose > 2:
@@ -357,6 +367,7 @@ class Checker:
     def inroots(self, url):
        for root in self.roots:
            if url[:len(root)] == root:
+               root = urlparse.urljoin(root, "/")
                return self.robots[root].can_fetch(AGENTNAME, url)
        return 0
 
@@ -528,6 +539,9 @@ class MyHTMLParser(sgmllib.SGMLParser):
 
     def end_a(self): pass
 
+    def do_area(self, attributes):
+       self.link_attr(attributes, 'href')
+
     def do_img(self, attributes):
        self.link_attr(attributes, 'src', 'lowsrc')
 
@@ -580,11 +594,15 @@ def sanitize(msg):
 
 
 def safeclose(f):
-    url = f.geturl()
-    if url[:4] == 'ftp:' or url[:7] == 'file://':
-       # Apparently ftp connections don't like to be closed
-       # prematurely...
-       text = f.read()
+    try:
+       url = f.geturl()
+    except AttributeError:
+       pass
+    else:
+       if url[:4] == 'ftp:' or url[:7] == 'file://':
+           # Apparently ftp connections don't like to be closed
+           # prematurely...
+           text = f.read()
     f.close()