datatracker/ietf/bin/test-crawl

#!/usr/bin/env python
import os, sys, re, datetime, argparse, traceback
# boilerplate
basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
sys.path = [ basedir ] + sys.path
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "ietf.settings")
import django.test
from django.conf import settings
# prevent memory from leaking when settings.DEBUG=True
from django.db import connection
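# (with DEBUG on, Django appends every executed SQL query to
# connection.queries; give it a sink that discards them instead)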
class DontSaveQueries(object):
    def append(self, x):
        pass
connection.queries = DontSaveQueries()
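# upper bound on the length of URLs the crawler will follow; guards against
# pages that keep generating ever-longer GET parameter strings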
MAX_URL_LENGTH = 500
# args
parser = argparse.ArgumentParser(
    description="""Perform a test crawl of the project. For each URL found, the HTTP
    response status is printed. If it's not an OK/redirect status, FAIL is
    printed; in case of an exception, a stacktrace is also included.""")
parser.add_argument('urls', metavar='URL', nargs='*',
                    help='one or more URLs to start the crawl from')
parser.add_argument('--urls', '-u', dest='url_file',
                    help='file with URLs to start the crawl from')
parser.add_argument('--slow', dest='slow_threshold', type=float, default=1.0,
                    help='responses taking longer than this (in seconds) result in SLOW being printed')
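
# Example invocations ("start-urls.txt" is just an illustrative file name):
#   ietf/bin/test-crawl                      # crawl outward from "/"
#   ietf/bin/test-crawl /doc/ /ipr/          # crawl from the given start pages
#   ietf/bin/test-crawl -u start-urls.txt --slow 2.0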
args = parser.parse_args()
slow_threshold = args.slow_threshold
initial_urls = []
initial_urls.extend(args.urls)
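# lines in the URL file may carry '#' comments; everything after the first
# '#' on a line is ignored, as are blank lines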
if args.url_file:
    with open(args.url_file) as f:
        for line in f:
            line = line.partition("#")[0].strip()
            if line:
                initial_urls.append(line)
if not initial_urls:
    initial_urls.append("/")
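# crawl state: 'visited' holds every URL already fetched; 'urls' maps each
# pending URL to the page that referenced it, for use in failure reports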
visited = set()
urls = {} # url -> referrer
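# Django's test client serves the site under the fake hostname "testserver";
# strip that prefix so redirect targets compare equal to site-relative paths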
def strip_url(url):
    if url.startswith("http://testserver"):
        url = url[len("http://testserver"):]
    return url
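# pull links out of HTML with a simple regex rather than a full parser: take
# href attributes of <a> and <link> tags, honour rel=nofollow, and keep only
# site-relative URLs of sane length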
def extract_html_urls(content):
    for m in re.finditer(r'(<(?:a|link) [^>]*href=[\'"]([^\'"]+)[\'"][^>]*>)', content):
        if re.search(r'rel=["\']?nofollow["\']', m.group(1)):
            continue
        url = strip_url(m.group(2))
        if len(url) > MAX_URL_LENGTH:
            continue # avoid infinite GET parameter appendages
        if not url.startswith("/"):
            continue
        yield url
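# all requests go through Django's in-process test client, so the crawl
# exercises the application directly, without a running web server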
client = django.test.Client()
for url in initial_urls:
    urls[url] = "[initial]"
errors = 0
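# main loop: pop a pending URL, fetch it, print status and timing, and queue
# any new URLs harvested from HTML responses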
while urls:
    url, referrer = urls.popitem()
    visited.add(url)
    try:
        timestamp = datetime.datetime.now()
        r = client.get(url)
        elapsed = datetime.datetime.now() - timestamp
    except KeyboardInterrupt:
        print "was fetching", url
        sys.exit(1)
    except:
        print 500, "%.3fs" % (datetime.datetime.now() - timestamp).total_seconds(), url, "FAIL (from %s)" % referrer
        print "============="
        print traceback.format_exc()
        print "============="
        errors += 1
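    # the request completed without raising; classify the response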
    else:
        tags = []
        if r.status_code in (301, 302):
            u = strip_url(r["Location"])
            if u not in visited and u not in urls:
                urls[u] = referrer # referrer is original referrer, not redirected url
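        # 200 OK: if the body is HTML, harvest further links; the Content-Type
        # header may carry a "; charset=..." suffix, so compare only the type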
        elif r.status_code == 200:
            ctype = r["Content-Type"]
            if ";" in ctype:
                ctype = ctype[:ctype.index(";")]
            if ctype == "text/html":
                try:
                    for u in extract_html_urls(r.content):
                        if u not in visited and u not in urls:
                            urls[u] = url
                except:
                    print "error extracting HTML urls from", url
                    print "============="
                    print traceback.format_exc()
                    print "============="
        else:
            tags.append(u"FAIL (from %s)" % referrer)
            errors += 1
        if elapsed.total_seconds() > slow_threshold:
            tags.append("SLOW")
        print r.status_code, "%.3fs" % elapsed.total_seconds(), url, " ".join(tags)
if errors > 0:
    sys.stderr.write("Found %s errors, grep output for FAIL for details\n" % errors)
    sys.exit(1)