web tree checker

author Guido van Rossum <guido@python.org>

Thu, 30 Jan 1997 02:44:48 +0000 (02:44 +0000)

committer Guido van Rossum <guido@python.org>

Thu, 30 Jan 1997 02:44:48 +0000 (02:44 +0000)
author Guido van Rossum <guido@python.org>
Thu, 30 Jan 1997 02:44:48 +0000 (02:44 +0000)
committer Guido van Rossum <guido@python.org>
Thu, 30 Jan 1997 02:44:48 +0000 (02:44 +0000)
diff --git a/Tools/webchecker/webchecker.py b/Tools/webchecker/webchecker.py

new file mode 100755 (executable)

index 0000000..255c490
--- /dev/null
+++ b/Tools/webchecker/webchecker.py
@@ -0,0 +1,488 @@
+#! /usr/bin/env python
+
+"""Web tree checker.
+
+This utility is handy to check a subweb of the world-wide web for
+errors.  A subweb is specified by giving one or more ``root URLs''; a
+page belongs to the subweb if one of the root URLs is an initial
+prefix of it.
+
+File URL extension:
+
+In order to ease the checking of subwebs via the local file system,
+the interpretation of ``file:'' URLs is extended to mimic the behavior
+of your average HTTP daemon: if a directory pathname is given, the
+file index.html in that directory is returned if it exists, otherwise
+a directory listing is returned.  Now, you can point webchecker to the
+document tree in the local file system of your HTTP daemon, and have
+most of it checked.  In fact the default works this way if your local
+web tree is located at /usr/local/etc/httpd/htdocs (the default for
+the NCSA HTTP daemon and probably others).
+
+Reports printed:
+
+When done, it reports links to pages outside the web (unless -q is
+specified), and pages with bad links within the subweb.  When
+interrupted, it prints those same reports for the pages that it has
+checked already.
+
+In verbose mode, additional messages are printed during the
+information gathering phase.  By default, it prints a summary of its
+work status every 50 URLs (adjustable with the -r option), and it
+reports errors as they are encountered.  Use the -q option to disable
+this output.
+
+Checkpoint feature:
+
+Whether interrupted or not, it dumps its state (a Python pickle) to a
+checkpoint file and the -R option allows it to restart from the
+checkpoint (assuming that the pages on the subweb that were already
+processed haven't changed).  Even when it has run till completion, -R
+can still be useful -- it will print the reports again, and -Rq prints
+the errors only.  In this case, the checkpoint file is not written
+again.  The checkpoint file can be set with the -d option.
+
+The checkpoint file is written as a Python pickle.  Remember that
+Python's pickle module is currently quite slow.  Give it the time it
+needs to load and save the checkpoint file.  When interrupted while
+writing the checkpoint file, the old checkpoint file is not
+overwritten, but all work done in the current run is lost.
+
+Miscellaneous:
+
+- Because the HTML parser is a bit slow, very large HTML files are
+  skipped.  The size limit can be set with the -m option.
+
+- Before fetching a page, it guesses its type based on its extension.
+If it is a known extension and the type is not text/html, the page is
+not fetched.  This is a huge optimization but occasionally it means
+links can be missed.  The mimetypes.py module (also in this directory)
+has a built-in table mapping most currently known suffixes, and in
+addition attempts to read the mime.types configuration files in the
+default locations of Netscape and the NCSA HTTP daemon.
+
+- It only follows links indicated by <A> tags.  It doesn't follow
+links in <FORM> or <IMG> or whatever other tags might contain
+hyperlinks.  It does honor the <BASE> tag.
+
+- It could be argued that it should also check external links for
+validity.  This is true, but it is more error-prone.  I think I will
+make this an option in the future.
+
+
+Usage: webchecker.py [option] ... [rooturl] ...
+
+Options:
+
+-R        -- restart from checkpoint file
+-d file   -- checkpoint filename (default %(DUMPFILE)s)
+-m bytes  -- skip HTML pages larger than this size (default %(MAXPAGE)d)
+-q        -- quiet operation (also suppresses external links report)
+-r number -- number of links processed per round (default %(ROUNDSIZE)d)
+-v        -- verbose operation; repeating -v will increase verbosity
+
+Arguments:
+
+rooturl   -- URL to start checking
+             (default %(DEFROOT)s)
+
+"""
+
+
+import sys
+import os
+from types import *
+import string
+import StringIO
+import getopt
+import pickle
+
+import urllib
+import urlparse
+import htmllib
+import formatter
+
+import mimetypes
+
+
+# Tunable parameters
+DEFROOT = "file:/usr/local/etc/httpd/htdocs/"  # Default root URL
+MAXPAGE = 50000                                # Ignore files bigger than this
+ROUNDSIZE = 50                         # Number of links processed per round
+DUMPFILE = "@webchecker.pickle"                # Pickled checkpoint
+
+
+# Global variables
+verbose = 1
+maxpage = MAXPAGE
+roundsize = ROUNDSIZE
+
+
+def main():
+    global verbose, maxpage, roundsize
+    dumpfile = DUMPFILE
+    restart = 0
+
+    try:
+       opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:qr:v')
+    except getopt.error, msg:
+       sys.stdout = sys.stderr
+       print msg
+       print __doc__ % globals()
+       sys.exit(2)
+    for o, a in opts:
+       if o == '-R':
+           restart = 1
+       if o == '-d':
+           dumpfile = a
+       if o == '-m':
+           maxpage = string.atoi(a)
+       if o == '-q':
+           verbose = 0
+       if o == '-r':
+           roundsize = string.atoi(a)
+       if o == '-v':
+           verbose = verbose + 1
+
+    if restart:
+       if verbose > 0:
+           print "Loading checkpoint from %s ..." % dumpfile
+       f = open(dumpfile, "rb")
+       c = pickle.load(f)
+       f.close()
+       if verbose > 0:
+           print "Done."
+           print "Root:", string.join(c.roots, "\n      ")
+    else:
+       c = Checker()
+       if not args:
+           args.append(DEFROOT)
+
+    for arg in args:
+       c.addroot(arg)
+
+    if not c.todo:
+       needsave = 0
+    else:
+       needsave = 1
+    try:
+       c.run()
+    except KeyboardInterrupt:
+       if verbose > 0:
+           print "[interrupted]"
+    c.report()
+    if not needsave:
+       if verbose > 0:
+           print
+           print "No need to save checkpoint"
+    elif dumpfile:
+       if verbose > 0:
+           print
+           print "Saving checkpoint to %s ..." % dumpfile
+       newfile = dumpfile + ".new"
+       f = open(newfile, "wb")
+       pickle.dump(c, f)
+       f.flush()
+       f.close()
+       try:
+           os.unlink(dumpfile)
+       except os.error:
+           pass
+       os.rename(newfile, dumpfile)
+       if verbose > 0:
+           print "Done."
+           if dumpfile == DUMPFILE:
+               print "Use ``%s -R'' to restart." % sys.argv[0]
+           else:
+               print "Use ``%s -R -d %s'' to restart." % (sys.argv[0],
+                                                          dumpfile)
+
+
+class Checker:
+
+    def __init__(self):
+       self.roots = []
+       self.todo = {}
+       self.done = {}
+       self.ext = {}
+       self.bad = {}
+       self.urlopener = MyURLopener()
+       self.round = 0
+
+    def addroot(self, root):
+       if root not in self.roots:
+           self.roots.append(root)
+           self.todo[root] = []
+
+    def run(self):
+       while self.todo:
+           self.round = self.round + 1
+           if verbose > 0:
+               print
+               print "Round", self.round,
+               print "(%d to do, %d done, %d external, %d bad)" % (
+                   len(self.todo), len(self.done),
+                   len(self.ext), len(self.bad))
+               print
+           urls = self.todo.keys()[:roundsize]
+           for url in urls:
+               self.dopage(url)
+               self.done[url] = self.todo[url]
+               del self.todo[url]
+
+    def report(self):
+       print
+       if not self.todo: print "Final",
+       else: print "Interim",
+       print "Report (%d to do, %d done, %d external, %d bad)" % (
+           len(self.todo), len(self.done),
+           len(self.ext), len(self.bad))
+       if verbose > 0:
+           self.report_extrefs()
+       # Report errors last because the output may get truncated
+       self.report_errors()
+
+    def report_extrefs(self):
+       if not self.ext:
+           print
+           print "No external URLs"
+           return
+       print
+       print "External URLs:"
+       print
+       urls = self.ext.keys()
+       urls.sort()
+       for url in urls:
+           show("HREF ", url, " from", self.ext[url])
+
+    def report_errors(self):
+       if not self.bad:
+           print
+           print "No errors"
+           return
+       print
+       print "Error Report:"
+       urls = self.bad.keys()
+       urls.sort()
+       bysource = {}
+       for url in urls:
+           try:
+               origins = self.done[url]
+           except KeyError:
+               origins = self.todo[url]
+           for source, rawlink in origins:
+               triple = url, rawlink, self.bad[url]
+               try:
+                   bysource[source].append(triple)
+               except KeyError:
+                   bysource[source] = [triple]
+       sources = bysource.keys()
+       sources.sort()
+       for source in sources:
+           triples = bysource[source]
+           print
+           if len(triples) > 1:
+               print len(triples), "Errors in", source
+           else:
+               print "Error in", source
+           for url, rawlink, msg in triples:
+               print "  HREF", url,
+               if rawlink != url: print "(%s)" % rawlink,
+               print
+               print "   msg", msg
+
+    def dopage(self, url):
+       if verbose > 1:
+           if verbose > 2:
+               show("Page  ", url, "  from", self.todo[url])
+           else:
+               print "Page  ", url
+       page = self.getpage(url)
+       if not page:
+           return
+       for info in page.getlinkinfos():
+           link, rawlink = info
+           origin = url, rawlink
+           if not self.inroots(link):
+               try:
+                   self.ext[link].append(origin)
+                   if verbose > 3:
+                       print "  New ext link", link,
+                       if link != rawlink: print "(%s)" % rawlink,
+                       print
+               except KeyError:
+                   if verbose > 3:
+                       print "  Seen ext link", link,
+                       if link != rawlink: print "(%s)" % rawlink,
+                       print
+                   self.ext[link] = [origin]
+           elif self.done.has_key(link):
+               if verbose > 3:
+                   print "  Done link", link
+               self.done[link].append(origin)
+           elif self.todo.has_key(link):
+               if verbose > 3:
+                   print "  Seen todo link", link
+               self.todo[link].append(origin)
+           else:
+               if verbose > 3:
+                   print "  New todo link", link
+               self.todo[link] = [origin]
+
+    def inroots(self, url):
+       for root in self.roots:
+           if url[:len(root)] == root:
+               return 1
+       return 0
+
+    def getpage(self, url):
+       ctype, encoding = mimetypes.guess_type(url)
+       if encoding:
+           if verbose > 2:
+               print "  Won't bother, URL suggests encoding %s" % `encoding`
+           return None
+       if ctype and ctype != 'text/html':
+           if verbose > 2:
+               print "  Won't bother, URL suggests mime type %s" % `ctype`
+           return None
+       try:
+           f = self.urlopener.open(url)
+       except IOError, msg:
+           if verbose > 0:
+               print "Error ", msg
+           if verbose > 0:
+               show(" HREF ", url, "  from", self.todo[url])
+           self.bad[url] = msg
+           return None
+       nurl = f.geturl()
+       info = f.info()
+       if info.has_key('content-type'):
+           ctype = string.lower(info['content-type'])
+       if nurl != url:
+           if verbose > 1:
+               print "Redirected to", nurl
+           if not ctype:
+               ctype, encoding = mimetypes.guess_type(nurl)
+       if ctype != 'text/html':
+           f.close()
+           if verbose > 2:
+               print "  Not HTML, mime type", ctype
+           return None
+       text = f.read()
+       f.close()
+       return Page(text, nurl)
+
+
+class Page:
+
+    def __init__(self, text, url):
+       self.text = text
+       self.url = url
+
+    def getlinkinfos(self):
+       size = len(self.text)
+       if size > maxpage:
+           if verbose > 0:
+               print "Skip huge file", self.url
+               print "  (%.0f Kbytes)" % (size*0.001)
+           return []
+       if verbose > 2:
+           print "  Parsing", self.url, "(%d bytes)" % size
+       parser = MyHTMLParser(formatter.NullFormatter())
+       parser.feed(self.text)
+       parser.close()
+       rawlinks = parser.getlinks()
+       base = urlparse.urljoin(self.url, parser.getbase() or "")
+       infos = []
+       for rawlink in rawlinks:
+           t = urlparse.urlparse(rawlink)
+           t = t[:-1] + ('',)
+           rawlink = urlparse.urlunparse(t)
+           link = urlparse.urljoin(base, rawlink)
+           infos.append((link, rawlink))
+       return infos
+
+
+class MyStringIO(StringIO.StringIO):
+
+    def __init__(self, url, info):
+       self.__url = url
+       self.__info = info
+       StringIO.StringIO.__init__(self)
+
+    def info(self):
+       return self.__info
+
+    def geturl(self):
+       return self.__url
+
+
+class MyURLopener(urllib.FancyURLopener):
+
+    http_error_default = urllib.URLopener.http_error_default
+
+    def open_file(self, url):
+       path = urllib.url2pathname(urllib.unquote(url))
+       if path[-1] != os.sep:
+           url = url + '/'
+       if os.path.isdir(path):
+           indexpath = os.path.join(path, "index.html")
+           if os.path.exists(indexpath):
+               return self.open_file(url + "index.html")
+           try:
+               names = os.listdir(path)
+           except os.error, msg:
+               raise IOError, msg, sys.exc_traceback
+           names.sort()
+           s = MyStringIO("file:"+url, {'content-type': 'text/html'})
+           s.write('<BASE HREF="file:%s">\n' %
+                   urllib.quote(os.path.join(path, "")))
+           for name in names:
+               q = urllib.quote(name)
+               s.write('<A HREF="%s">%s</A>\n' % (q, q))
+           s.seek(0)
+           return s
+       return urllib.FancyURLopener.open_file(self, path)
+
+
+class MyHTMLParser(htmllib.HTMLParser):
+
+    def __init__(*args):
+       self = args[0]
+       self.base = None
+       self.links = []
+       apply(htmllib.HTMLParser.__init__, args)
+
+    def start_a(self, attributes):
+       for name, value in attributes:
+           if name == 'href' and value and value not in self.links:
+               self.links.append(string.strip(value))
+
+    def do_base(self, attributes):
+       for name, value in attributes:
+           if name == 'href' and value:
+               if verbose > 1:
+                   print "  Base", value
+               self.base = value
+
+    def getlinks(self):
+       return self.links
+
+    def getbase(self):
+       return self.base
+
+
+def show(p1, link, p2, origins):
+    print p1, link
+    i = 0
+    for source, rawlink in origins:
+       i = i+1
+       if i == 2:
+           p2 = ' '*len(p2)
+       print p2, source,
+       if rawlink != link: print "(%s)" % rawlink,
+       print
+
+
+if __name__ == '__main__':
+    main()
author	Guido van Rossum <guido@python.org>
	Thu, 30 Jan 1997 02:44:48 +0000 (02:44 +0000)
committer	Guido van Rossum <guido@python.org>
	Thu, 30 Jan 1997 02:44:48 +0000 (02:44 +0000)