#!/usr/bin/env python

import os, sys, re, datetime, argparse, traceback, tempfile

# boilerplate: make the project root importable and point Django at the settings module
basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
sys.path = [ basedir ] + sys.path

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "ietf.settings")

import django.test
from django.conf import settings

# prevent memory from leaking when settings.DEBUG=True
from django.db import connection
class DontSaveQueries(object):
    def append(self, x):
        pass
connection.queries = DontSaveQueries()

MAX_URL_LENGTH = 500

# args
parser = argparse.ArgumentParser(
    description="""Perform a test crawl of the project. For each URL found, the HTTP
    response status is printed. If it's not OK/redirect, FAIL is printed - in case of
    errors, a stacktrace is also included.""")
parser.add_argument('urls', metavar='URL', nargs='*',
                    help='One or more URLs to start the crawl from')
parser.add_argument('--urls', '-u', dest='url_file',
                    help='file with URLs to start the crawl from')
parser.add_argument('--slow', dest='slow_threshold', type=float, default=1.0,
                    help='responses taking longer than this (in seconds) result in SLOW being printed')

args = parser.parse_args()

slow_threshold = args.slow_threshold

initial_urls = []
initial_urls.extend(args.urls)

if args.url_file:
    with open(args.url_file) as f:
        for line in f:
            line = line.partition("#")[0].strip()
            if line:
                initial_urls.append(line)

if not initial_urls:
    initial_urls.append("/")

visited = set()
urls = {} # url -> referrer

def strip_url(url):
    if url.startswith("http://testserver"):
        url = url[len("http://testserver"):]
    return url

def extract_html_urls(content):
    for m in re.finditer(r'(<(?:a|link) [^>]*href=[\'"]([^"]+)[\'"][^>]*>)', content):
        if re.search(r'rel=["\']?nofollow["\']', m.group(1)):
            continue
        url = strip_url(m.group(2))
        if len(url) > MAX_URL_LENGTH:
            continue # avoid infinite GET parameter appendages
        if not url.startswith("/"):
            continue # only follow site-relative URLs
        yield url

client = django.test.Client()

for url in initial_urls:
    urls[url] = "[initial]"

errors = 0
start_time = datetime.datetime.now()

# log to a uniquely named file; the fd returned by mkstemp isn't needed, so close it right away
fh, fn = tempfile.mkstemp(prefix="test-crawl-", suffix=".log", dir="../")
os.close(fh)
logfile = open(fn, "w")

while urls:
    url, referrer = urls.popitem()

    visited.add(url)

    try:
        timestamp = datetime.datetime.now()
        r = client.get(url)
        elapsed = datetime.datetime.now() - timestamp
    except KeyboardInterrupt:
        print "was fetching", url
        sys.exit(1)
    except Exception:
        print 500, "%.3fs" % (datetime.datetime.now() - timestamp).total_seconds(), url, "FAIL (from %s)" % referrer
        print "============="
        print traceback.format_exc()
        print "============="
        errors += 1
    else:
        tags = []

        if r.status_code in (301, 302):
            u = strip_url(r["Location"])
            if u not in visited and u not in urls:
                urls[u] = referrer # referrer is original referrer, not redirected url

        elif r.status_code == 200:
            ctype = r["Content-Type"]
            if ";" in ctype:
                ctype = ctype[:ctype.index(";")]

            if ctype == "text/html":
                try:
                    for u in extract_html_urls(r.content):
                        if u not in visited and u not in urls:
                            urls[u] = url
                except Exception:
                    print "error extracting HTML urls from", url
                    print "============="
                    print traceback.format_exc()
                    print "============="

        else:
            tags.append(u"FAIL (from %s)" % referrer)
            errors += 1

        if elapsed.total_seconds() > slow_threshold:
            tags.append("SLOW")

        acc_secs = (timestamp - start_time).total_seconds()
        hrs  = acc_secs // (60*60)
        mins = (acc_secs % (60*60)) // 60
        secs = acc_secs % 60

        if (len(visited) % 100) == 1:
            print ""
            print "Elapsed  Visited  Queue  Code   Time  Url  ...  Notes"

        logentry = "%s %.3fs %s %s" % (r.status_code, elapsed.total_seconds(), url, " ".join(tags))
        print "%2d:%02d:%02d" % (hrs, mins, secs), "%7d" % len(visited), "%6d" % len(urls), " ", logentry
        logfile.write(logentry + "\n")

logfile.close()

sys.stderr.write("Output written to %s\n" % logfile.name)

if errors > 0:
    sys.stderr.write("Found %s errors, grep output for FAIL for details\n" % errors)
    sys.exit(1)