#!/usr/bin/env python

import os, sys, re, datetime, argparse, traceback, tempfile

# args
parser = argparse.ArgumentParser(
    description="""Perform a test crawl of the project. For each found URL, the HTTP
    response status is printed. If it's not OK/redirect, FAIL is
    printed - in case of errors, a stacktrace is also included.""")
parser.add_argument('urls', metavar='URL', nargs='*',
                    help='One or more URLs to start the crawl from')
parser.add_argument('--urls', '-u', dest='url_file',
                    help='file with URLs to start the crawl from')
parser.add_argument('--slow', dest='slow_threshold', type=float, default=1.0,
                    help='responses taking longer than this (in seconds) result in SLOW being printed')
parser.add_argument('--settings', dest='settings', help='custom settings file')

args = parser.parse_args()

# boilerplate
basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../"))
sys.path = [ basedir ] + sys.path

settings_module = args.settings or "ietf.settings"

os.environ.setdefault("DJANGO_SETTINGS_MODULE", settings_module)

import django
import django.test

# prevent memory from leaking when settings.DEBUG=True
from django.db import connection
class DontSaveQueries(object):
    def append(self, x):
        pass
connection.queries = DontSaveQueries()
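
# Crawl configuration and state: MAX_URL_LENGTH caps how long a discovered URL
# may be before it is skipped, initial_urls seeds the crawl queue from the
# command line and/or the --urls file (defaulting to "/"), and
# visited/urls/referrers track what has been fetched, what is queued, and
# which page linked to what.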

MAX_URL_LENGTH = 500

slow_threshold = args.slow_threshold

initial_urls = []
initial_urls.extend(args.urls)

if args.url_file:
    with open(args.url_file) as f:
        for line in f:
            line = line.partition("#")[0].strip()
            if line:
                initial_urls.append(line)

if not initial_urls:
    initial_urls.append("/")

visited = set()
urls = {} # url -> referrer
referrers = {}
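
# Helpers: strip_url() normalizes URLs by dropping the Django test-server
# prefix, and extract_html_urls() yields followable site-local links found in
# an HTML page, skipping nofollow, over-long, external and protocol-relative URLs.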

def strip_url(url):
    if url.startswith("http://testserver"):
        url = url[len("http://testserver"):]
    return url

def extract_html_urls(content):
    for m in re.finditer(r'(<(?:a|link) [^>]*href=[\'"]([^"]+)[\'"][^>]*>)', content):
        if re.search(r'rel=["\']?nofollow["\']', m.group(1)):
            continue

        url = strip_url(m.group(2))
        if len(url) > MAX_URL_LENGTH:
            continue # avoid infinite GET parameter appendages

        if not url.startswith("/"):
            continue

        if url.startswith("//"):
            continue

        yield url
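
# Set up Django and the test client, seed the crawl queue with the initial
# URLs (marked as having no real referrer), and open a log file next to the
# project directory.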

django.setup()
client = django.test.Client()

for url in initial_urls:
    urls[url] = "[initial]"

errors = 0
count = 0

start_time = datetime.datetime.now()
fh, fn = tempfile.mkstemp(prefix="test-crawl-", suffix=".log", dir="../")
logfile = open(fn, "w")
os.close(fh)
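
# log() writes each line both to stdout and to the log file; get_referrers()
# walks the referrer chain back towards the initial URL for error reporting.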

def log(s):
    print(s)
    logfile.write(s)
    logfile.write('\n')

def get_referrers(url):
    ref_list = []
    while url in referrers:
        url = referrers[url]
        if url in ref_list:
            log("Circular referral list, discovered at %s" % url)
            break
        ref_list.append(url)
    return ref_list
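
# Main crawl loop: pop a URL from the queue, fetch it with the test client,
# queue any new local links found in HTML responses or redirects, and log one
# line per URL with status code, response time and any FAIL/SLOW tags.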

while urls:
    url, referrer = urls.popitem()

    visited.add(url)

    try:
        timestamp = datetime.datetime.now()
        r = client.get(url)
        elapsed = datetime.datetime.now() - timestamp
    except KeyboardInterrupt:
        log(" ... was fetching %s" % url)
        sys.exit(1)
    except Exception:
        log("500 %.3fs %s FAIL (from: [ %s ])" % ((datetime.datetime.now() - timestamp).total_seconds(), url, ",\n\t".join(get_referrers(url))))
        log("=============")
        log(traceback.format_exc())
        log("=============")
        errors += 1
    else:
        tags = []

        if r.status_code in (301, 302):
            u = strip_url(r["Location"])
            if u not in visited and u not in urls:
                urls[u] = referrer # referrer is original referrer, not redirected url
                referrers[u] = referrer

        elif r.status_code == 200:
            ctype = r["Content-Type"]
            if ";" in ctype:
                ctype = ctype[:ctype.index(";")]

            if ctype == "text/html":
                try:
                    # r.content is bytes on Python 3; decode before regex matching
                    content = r.content.decode() if isinstance(r.content, bytes) else r.content
                    for u in extract_html_urls(content):
                        if u not in visited and u not in urls:
                            urls[u] = url
                            referrers[u] = url
                except Exception:
                    log("error extracting HTML urls from %s" % url)
                    log("=============")
                    log(traceback.format_exc())
                    log("=============")
        else:
            tags.append("FAIL (from %s)" % referrer)
            errors += 1

        if elapsed.total_seconds() > slow_threshold:
            tags.append("SLOW")

        acc_secs = (timestamp - start_time).total_seconds()
        hrs = acc_secs // (60*60)
        mins = (acc_secs % (60*60)) // 60
        secs = acc_secs % 60

        if (len(visited) % 100) == 1:
            log("\nElapsed Visited Queue Code Time Url ... Notes")

        log("%2d:%02d:%02d %7d %6d %s %.3fs %s %s" % (hrs, mins, secs, len(visited), len(urls), r.status_code, elapsed.total_seconds(), url, " ".join(tags)))

logfile.close()
sys.stderr.write("Output written to %s\n\n" % logfile.name)

if errors > 0:
    sys.stderr.write("Found %s errors, grep output for FAIL for details\n" % errors)
    sys.exit(1)
else:
    sys.stderr.write("Found no errors.\n")