Add an option to crawl as a logged-in user (--user). Add a --pedantic option for the validator.nu (vnu) crawl, which stops the crawl on (most) errors. Randomize the order in which URLs are crawled, so that repeated crawls don't hit the same URLs in the same order.
 - Legacy-Id: 9785
Note: SVN reference [9765] has been migrated to Git commit 9b4e61049a704127e1200549fcc410326efffddb
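
Example invocation (illustrative; the script path, username and logfile name are placeholders, not part of this change):

    <path-to-this-script> --user=<username> --pedantic --validator-nu --logfile=crawl.log
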
#!/usr/bin/env python

import os, sys, re, datetime, argparse, traceback, tempfile, json, subprocess
import html5lib
import debug    # pyflakes:ignore
import random

# Set up import path to find our own Django
basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../"))
if basedir not in sys.path:
    sys.path.insert(0, basedir)

# Parse args now, so we can use custom settings when importing django
parser = argparse.ArgumentParser(
    description="""Perform a test crawl of the project. For each found URL, the HTTP
    response status is printed. If it's not OK/redirect, FAIL is
    printed - in case of errors, a stacktrace is also included.""")
parser.add_argument('urls', metavar='URL', nargs='*',
                    help='One or more URLs to start the crawl from')
parser.add_argument('--urls', '-u', dest='url_file',
                    help='File with URLs to start the crawl from')
parser.add_argument('--slow', dest='slow_threshold', type=float, default=1.0,
                    help='Responses taking longer than this (in seconds) result in SLOW being printed')
parser.add_argument('--settings', help='Custom settings file')
parser.add_argument('--logfile', help='Write to logfile')
parser.add_argument('--user', help='Crawl logged in as this user', default=None)
parser.add_argument('--validator-nu', dest='validator_nu', action='store_true',
                    help='Use validator.nu instead of html5lib for HTML validation')
parser.add_argument('--pedantic', action='store_true',
                    help='Stop the crawl on the first HTML validation issue')
parser.add_argument('--validate-all', dest='validate_all', action='store_true', default=False,
                    help='Run HTML5 validation on all pages, without skipping similar URLs. '
                         '(The default is to only run validation on one of /foo/1/, /foo/2/, /foo/3/, etc.)')

args = parser.parse_args()

# Import Django, call setup()
os.environ.setdefault("DJANGO_SETTINGS_MODULE", args.settings or "ietf.settings")

import django
import django.test

django.setup()

# prevent memory from leaking when settings.DEBUG=True
from django.db import connection
class DontSaveQueries(object):
    def append(self, x):
        pass
connection.queries = DontSaveQueries()
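
# Note: with settings.DEBUG = True, Django records every executed SQL query in
# connection.queries; over a long crawl that list would grow without bound, so
# it is replaced above with an object whose append() silently discards entries.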

from ietf.name.models import DocTypeName

# --- Constants ---

MAX_URL_LENGTH = 500

# --- Functions ---

def strip_url(url):
    if url.startswith("http://testserver"):
        url = url[len("http://testserver"):]
    return url

def extract_html_urls(content):
    for m in re.finditer(r'(<(?:a|link) [^>]*href=[\'"]([^"]+)[\'"][^>]*>)', content):
        if re.search(r'rel=["\']?nofollow["\']', m.group(1)):
            continue

        url = strip_url(m.group(2))
        if len(url) > MAX_URL_LENGTH:
            continue # avoid infinite GET parameter appendages

        if not url.startswith("/"):
            continue

        if url.startswith("//"):
            continue

        yield url
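
# Illustrative example (not used by the crawl): given the markup
#   <a href="/doc/draft-foo/">x</a> <a rel="nofollow" href="/help/">y</a> <a href="https://example.com/">z</a>
# extract_html_urls() yields only "/doc/draft-foo/": rel=nofollow links,
# non-site-relative URLs and protocol-relative ("//host/...") URLs are skipped,
# as are URLs longer than MAX_URL_LENGTH.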

def extract_tastypie_urls(content):
    VISIT_OBJECTS = False
    VISIT_NEXT = False
    data = json.loads(content)
    for item in data:
        if type(data[item]) is dict:
            if "list_endpoint" in data[item]:
                uri = data[item]["list_endpoint"]
                yield uri
    if VISIT_NEXT:
        if "meta" in data and "next" in data["meta"]:
            uri = data["meta"]["next"]
            if uri is not None:
                yield uri
    if VISIT_OBJECTS:
        if "objects" in data:
            for obj in data["objects"]:
                if "resource_uri" in obj:
                    yield obj["resource_uri"]
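
# extract_tastypie_urls() walks the JSON returned by the tastypie API index
# (e.g. /api/v1, one of the default start URLs): each top-level resource entry
# carries a "list_endpoint", which is queued for crawling.  VISIT_NEXT and
# VISIT_OBJECTS are left off so that the crawl neither pages through every
# result list ("meta"/"next") nor fetches every individual "resource_uri".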

def check_html_valid(url, response, args):
    global parser, validated_urls, doc_types, warnings
    # These URLs have known issues; skip them until those are fixed
    if re.search('(/secr|admin/)|/doc/.*/edit/info/', url):
        log("%s blacklisted; skipping HTML validation" % url)
        return
    key = url
    if not args.validate_all:
        # derive a common key for similar URLs by replacing primary keys
        key = re.sub(r"/[0-9.]+/", "/nnnn/", key)
        key = re.sub(r"/.+@.+/", "/x@x.org/", key)
        key = re.sub(r"#.*$", "", key)
        key = re.sub(r"\?.*$", "", key)
        key = re.sub(r"/rfc[0-9]+/", "/rfcnnnn/", key)
        key = re.sub(r"/wg/[a-z0-9-]+/", "/wg/foo/", key)
        key = re.sub(r"/rg/[a-z0-9-]+/", "/rg/foo/", key)
        key = re.sub(r"/ipr/[0-9]+/", "/ipr/nnnn/", key)
        key = re.sub(r"/draft-[a-z0-9-]+/", "/draft-foo/", key)
        for slug in doc_types:
            key = re.sub("/%s-.*/" % slug, "/%s-nnnn/" % slug, key)

    if key not in validated_urls:
        if hasattr(response, "content"):
            content = response.content
        else:
            content = response.streaming_content
        validated_urls[key] = True
        if args.validator_nu:
            v = subprocess.Popen(["java", "-jar", basedir + "/bin/vnu.jar",
                                  "--format", "json", "-"],
                                 stdin=subprocess.PIPE, stderr=subprocess.PIPE)
            # the validator's JSON report is read from stderr
            for m in json.loads(v.communicate(content)[1])["messages"]:
                t = m["subType"] if m["type"] == "info" else m["type"]
                tags.append("\n%s\tLine %d: %s" %
                            (t.upper(), m["lastLine"], m["message"]))
                tags.append("\n\t%s" % m["extract"].replace('\n', ' '))
                tags.append("\n\t%s%s" %
                            (" " * m["hiliteStart"], "^" * m["hiliteLength"]))
                # disregard some HTML issues that are (usually) due to invalid
                # database content
                if not re.search('Forbidden code point|Bad value|seamless|The first child', m["message"]):
                    warnings += 1
        else:
            try:
                parser.parse(content)
            except Exception:
                for err in parser.errors:
                    pos, code, data = err
                    tags.append(u"WARN invalid html: Position %s: %s" %
                                (pos, code))
                warnings += 1
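
# Illustrative example of the key derivation in check_html_valid(): without
# --validate-all, "/doc/draft-ietf-foo-bar/history/" and
# "/doc/draft-someone-else/history/" collapse to the same key, so HTML
# validation runs only on whichever of them is crawled first; --validate-all
# validates every page individually.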

def log(s):
    print(s)
    if logfile:
        if not type(s) is str:
            s = s.encode('utf-8')
        logfile.write(s)
        logfile.write('\n')

def get_referrers(url):
    ref_list = []
    while url in referrers:
        url = referrers[url]
        if url in ref_list:
            log("Circular referral list, discovered at %s" % url)
            break
        ref_list.append(url)
    return ref_list

# --- Globals ---

slow_threshold = args.slow_threshold

visited = set()
urls = {} # url -> referrer
referrers = {}

initial_urls = []
initial_urls.extend(args.urls)

if args.url_file:
    with open(args.url_file) as f:
        for line in f:
            line = line.partition("#")[0].strip()
            if line:
                initial_urls.append(line)

if not initial_urls:
    initial_urls.append("/")
    initial_urls.append("/api/v1")

for url in initial_urls:
    urls[url] = "[initial]"

parser = html5lib.HTMLParser(strict=True)

validated_urls = {}

doc_types = [ t.slug for t in DocTypeName.objects.all() ]

errors = 0
warnings = 0
count = 0

start_time = datetime.datetime.now()

client = django.test.Client(Accept='text/html,text/plain,application/json')
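
# The Django test client used above issues requests in-process against the
# application (no real HTTP server is started); responses carry the synthetic
# host "testserver", which strip_url() removes so that discovered links stay
# site-relative.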

logfile = None
if args.logfile:
    logfile = open(args.logfile, "w")

# --- Main ---

if __name__ == "__main__":
    if args.user:
        # log in as the given user, so the crawl sees the HTML that the
        # templates generate for a logged-in session
        response = client.post('/accounts/login/',
                               {'username': args.user, 'password': 'password'},
                               secure=True, follow=True)
        if response.status_code != 200:
            log("Could not log in as %s, HTTP status %d" %
                (args.user, response.status_code))
            sys.exit(1)
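
    # The test client keeps the session cookie from the login above, so every
    # request issued by the crawl below renders as this logged-in user.  (The
    # fixed password 'password' assumes a development/test database whose
    # accounts are provisioned with that password.)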

    while urls:
        # dict.popitem() returns an arbitrary (not really random) item, so pick
        # a key at random instead, to vary the crawl order between runs
        url = random.choice(urls.keys())
        referrer = urls.pop(url)

        visited.add(url)

        try:
            timestamp = datetime.datetime.now()
            r = client.get(url, secure=True, follow=True)
            elapsed = datetime.datetime.now() - timestamp
        except KeyboardInterrupt:
            log(" ... was fetching %s" % url)
            sys.exit(1)
        except:
            log("500 %.3fs %s FAIL (from: [ %s ])" % ((datetime.datetime.now() - timestamp).total_seconds(), url, (",\n\t".join(get_referrers(url)))))
            log("=============")
            log(traceback.format_exc())
            log("=============")
            errors += 1
        else:
            tags = []

            if r.status_code in (301, 302):
                u = strip_url(r["Location"])
                if u not in visited and u not in urls:
                    urls[u] = referrer # referrer is the original referrer, not the redirected URL
                    referrers[u] = referrer

            elif r.status_code == 200:
                ctype = r["Content-Type"]
                if ";" in ctype:
                    ctype = ctype[:ctype.index(";")]

                if ctype == "text/html":
                    try:
                        for u in extract_html_urls(r.content):
                            if u not in visited and u not in urls:
                                urls[u] = url
                                referrers[u] = url

                        check_html_valid(url, r, args)

                    except:
                        log("error extracting HTML URLs from %s" % url)
                        log("=============")
                        log(traceback.format_exc())
                        log("=============")

                elif ctype == "application/json":
                    try:
                        for u in extract_tastypie_urls(r.content):
                            if u not in visited and u not in urls:
                                urls[u] = url
                                referrers[u] = url
                    except:
                        log("error extracting URLs from %s" % url)
                        log("=============")
                        log(traceback.format_exc())
                        log("=============")

            else:
                tags.append(u"FAIL for %s\n (from %s)" % (url, referrer))
                errors += 1

            if elapsed.total_seconds() > slow_threshold:
                tags.append("SLOW")

            acc_secs = (timestamp - start_time).total_seconds()
            hrs = acc_secs // (60*60)
            mins = (acc_secs % (60*60)) // 60
            sec = acc_secs % 60

            if (len(visited) % 100) == 1:
                log("\nElapsed Visited Queue Code Time Url ... Notes")

            log("%2d:%02d:%02d %7d %6d %s %6.3fs %s %s" % (hrs, mins, sec, len(visited), len(urls), r.status_code, elapsed.total_seconds(), url, " ".join(tags)))
            # with --pedantic, stop the crawl as soon as any page has produced
            # an error or an HTML validation warning
            if (errors or warnings) and args.pedantic:
                sys.exit(1)

    if logfile:
        logfile.close()
        sys.stderr.write("Output written to %s\n\n" % logfile.name)

    if errors > 0:
        sys.stderr.write("Found %s errors, grep output for FAIL for details\n" % errors)
        sys.exit(1)
    else:
        sys.stderr.write("Found no errors.\n")
    if warnings > 0:
        sys.stderr.write("Found %s warnings, grep output for WARN for details\n" % warnings)
    else:
        sys.stderr.write("Found no warnings.\n")