datatracker/bin/test-crawl

#!/usr/bin/env python
# -*- indent-with-tabs: 0 -*-
# Copyright The IETF Trust 2013-2022, All Rights Reserved
import argparse
import datetime
import difflib
import html5lib
import json
import os
import random
import re
import requests
import sys
import time
import traceback
import urllib.parse
import warnings
warnings.filterwarnings("ignore", message=r"group\.HistoricalGroupFeatures\.\w+ failed to load invalid json")
# Set up import path to find our own Django
basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../"))
if not basedir in sys.path:
    sys.path.insert(0, basedir)
# Parse args now, so we can use custom settings when importing django
parser = argparse.ArgumentParser(
    description="""Perform a test crawl of the project. For each found URL, the HTTP
    response status is printed. If it's not OK/redirect, FAIL is
    printed - in case of errors, a stacktrace is also included.""")
parser.add_argument('urls', metavar='URL', nargs='*',
                    help='One or more URLs to start the crawl from')
parser.add_argument('--diff', dest='exthost', type=str, metavar="SITE", help='Diff pages against external site')
parser.add_argument('--failfast', action='store_true',
                    help='Stop the crawl on the first page failure')
parser.add_argument('--logfile', help='Write to logfile')
parser.add_argument('--no-follow', dest='follow', action='store_false', default=True,
                    help='Do not follow URLs found in fetched pages, just check the given URLs')
parser.add_argument('--pedantic', action='store_true',
                    help='Stop the crawl on the first error or warning')
parser.add_argument('--random', action='store_true',
                    help='Crawl URLs randomly')
parser.add_argument('-R', '--no-revisit', action='store_true', default=False, help="Don't revisit already visited URLs")
parser.add_argument('--slow', dest='slow_threshold', type=float, default=1.0,
                    help='Responses taking longer than this (in seconds) result in SLOW being printed')
parser.add_argument('--settings', help='Custom settings file')
parser.add_argument('--urls', '-u', dest='url_file',
                    help='File with URLs to start the crawl from')
parser.add_argument('--user', help='Crawl logged in as this user', default=None)
parser.add_argument('--validator-nu', dest='validator_nu', action='store_true',
                    help='Use validator.nu instead of html5lib for HTML validation')
parser.add_argument('--validate-all', dest='validate_all', action='store_true', default=False,
                    help='Run html 5 validation on all pages, without skipping similar urls. '
                         '(The default is to only run validation on one of /foo/1/, /foo/2/, /foo/3/, etc.)')
parser.add_argument('--skip-html-validation', dest='skip_html_validation', action='store_true', default=False,
                    help='Skip HTML validation.')
parser.add_argument('-v', '--verbose', action='store_true', default=False,
                    help='Be more verbose')
parser.add_argument('-x', '--exclude', action='append', default=[], help="Exclude URLs matching pattern")
parser.add_argument('-X', '--exclude-from', metavar='FILE', help="URL exclusion pattern file")
args = parser.parse_args()
# Import Django, call setup()
os.environ.setdefault("DJANGO_SETTINGS_MODULE", args.settings or "ietf.settings_testcrawl")
import django
import django.test
import django.core.checks
from django.conf import settings
from django.utils import timezone
django.setup()
# This needs to come after we set up sys path to include the local django
import debug # pyflakes:ignore
# prevent memory from leaking when settings.DEBUG=True
# from django.db import connection
# class DontSaveQueries(list):
#     def append(self, x):
#         pass
# connection.queries = DontSaveQueries()
from ietf.name.models import DocTypeName
from ietf.utils.html import unescape
from ietf.utils.test_utils import unicontent
from ietf.utils.test_runner import start_vnu_server, vnu_validate, vnu_fmt_message, vnu_filter_message
# --- Constants ---
MAX_URL_LENGTH = 500
# --- Functions ---
def note(s):
    if args.verbose:
        sys.stderr.write(s)
        sys.stderr.write('\n')
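
# Strip the test server prefix, any trailing #fragment, and any ?next=... query from a URL.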
def strip_url(url):
    if url.startswith("http://testserver"):
        url = url[len("http://testserver"):]
    fragment_url = re.search(r"^(.+)#[a-z_.-]+$", url)
    if fragment_url:
        url = fragment_url.group(1)
    next_url = re.search(r"^(.+)\?next=.+$", url)
    if next_url:
        url = next_url.group(1)
    return url
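
# Yield site-relative URLs found in href/src attributes of <a>, <link>, <img> and <script>
# tags, skipping rel=nofollow links and over-long URLs.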
def extract_html_urls(content):
    for m in re.finditer(r'(<(?:(?:a|link) [^>]*href|(?:img|script) [^>]*src)=[\'"]([^"]+)[\'"][^>]*>)', content):
        if re.search(r'rel=["\']?nofollow["\']', m.group(1)):
            continue
        url = strip_url(m.group(2))
        if len(url) > MAX_URL_LENGTH:
            continue # avoid infinite GET parameter appendages
        if not url.startswith("/"):
            continue
        if url.startswith("//"):
            continue
        yield unescape(url)
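
# Yield API URLs from a tastypie JSON response: list endpoints, and optionally the
# paging 'next' URL and per-object resource URIs.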
def extract_tastypie_urls(content):
    VISIT_OBJECTS = False
    VISIT_NEXT = False
    data = json.loads(content)
    for item in data:
        if type(data[item]) is dict:
            if "list_endpoint" in data[item]:
                uri = data[item]["list_endpoint"]
                yield uri
    if VISIT_NEXT:
        if "meta" in data and "next" in data["meta"]:
            uri = data["meta"]["next"]
            if uri != None:
                yield uri
    if VISIT_OBJECTS:
        if "objects" in data:
            object_list = data["objects"]
            for i in range(len(object_list)):
                if "resource_uri" in object_list[i]:
                    uri = object_list[i]["resource_uri"]
                    yield uri
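
# Validate the HTML of a response with validator.nu or html5lib. Unless --validate-all is
# given, URLs are collapsed to a pattern key so each page type is only validated once.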
def check_html_valid(url, response, args):
    if args.skip_html_validation:
        return
    global parser, validated_urls, doc_types, warnings
    key = url
    if not args.validate_all:
        # derive a key for urls like this by replacing primary keys
        key = re.sub("#.*$", "", key)
        key = re.sub("/.+@.+/", "/x@x.org/", key)
        key = re.sub("/[0-9.]+/", "/mmmm/", key)
        key = re.sub("/[0-9.]+/", "/nnnn/", key)
        key = re.sub("/ag/[a-z0-9-]+/", "/ag/foo/", key)
        key = re.sub("/area/[a-z0-9-]+/", "/area/foo/", key)
        key = re.sub("/bcp[0-9]+/", "/bcpnnn/", key)
        key = re.sub("/conflict-review-[a-z0-9-]+/", "/conflrev-foo/", key)
        key = re.sub("/dir/[a-z0-9-]+/", "/dir/foo/", key)
        key = re.sub("/draft-[a-z0-9-]+/", "/draft-foo/", key)
        key = re.sub("/group/[a-z0-9-]+/", "/group/foo/", key)
        key = re.sub("/html/[a-z0-9-]+", "/html/foo/", key)
        key = re.sub("/ipr/search/.*", "/ipr/search/", key)
        key = re.sub("/meeting/[-0-9a-z]+/agenda/[0-9a-z]+/", "/meeting/nn/agenda/foo/", key)
        key = re.sub("/release/[0-9dev.]+/", "/release/n.n.n/", key)
        key = re.sub("/rfc[0-9]+/", "/rfcnnnn/", key)
        key = re.sub("/rg/[a-z0-9-]+/", "/rg/foo/", key)
        key = re.sub("/secr/srec/nnnn/[0-9a-z-]+/", "/secr/sreq/nn/bar/", key)
        key = re.sub("/state/[a-z0-9-]+/", "/state/foo/", key)
        key = re.sub("/state/[a-z0-9-]+/[a-z0-9-]+/", "/state/foo/bar/", key)
        key = re.sub("/status-change-[a-z0-9-]+/", "/statchg-foo/", key)
        key = re.sub("/std[0-9]+/", "/stdnnn/", key)
        key = re.sub("/submit/status/nnnn/[0-9a-f]+/", "/submit/status/nnnn/bar/", key)
        key = re.sub("/team/[a-z0-9-]+/", "/team/foo/", key)
        key = re.sub("/wg/[a-z0-9-]+/", "/wg/foo/", key)
        key = re.sub(r"\?.*$", "", key)
        for slug in doc_types:
            key = re.sub("/%s-.*/"%slug, "/%s-nnnn/"%slug, key)
    if not key in validated_urls:
        note('Validate: %-32s: %s' % (url[:32], key))
        if hasattr(response, "content"):
            content = response.content
        else:
            content = response.streaming_content
        validated_urls[key] = True
        if args.validator_nu:
            ret = vnu_validate(content, response["Content-Type"], port=8887)
            assert ret
            for m in json.loads(ret)["messages"]:
                if "lastLine" not in m:
                    tag = m["message"]
                else:
                    tag = vnu_fmt_message(url, m, content.decode())
                # disregard some HTML issues that are (usually) due to invalid
                # database content
                if not vnu_filter_message(m, True, False):
                    warnings += 1
                    tags.append(tag)
        else:
            try:
                parser.parse(content)
            except Exception as e:
                for err in parser.errors:
                    pos, _, _ = err
                    tags.append("WARN invalid html at line, pos {}: {}".format(pos, e))
                warnings += 1
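
# Pages whose content should not be mined for further URLs to crawl.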
def skip_extract_from(url):
    for pattern in (
            r'^/doc/html/[a-z0-9-]+',
            r'^/meeting/[a-z0-9-]+/agenda/[a-z0-9-]+',
            r'^/static/coverage/',
    ):
        if re.search(pattern, url):
            return True
    return False
def skip_url(url):
    for pattern in skip_patterns + args.exclude:
        if pattern.search(url):
            return True
    return False
def log(s):
    print(s)
    if logfile:
        if not type(s) is str:
            s = s.encode('utf-8')
        logfile.write(s)
        logfile.write('\n')
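
# Walk the referrer chain back from a URL, guarding against circular referrals.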
def get_referrers(url):
    ref_list = []
    while url in referrers:
        url = referrers[url]
        if url in ref_list:
            log("Circular referral list, discovered at %s" % url)
            break
        ref_list.append(url)
    return ref_list
nowtime = None
nowstrn = None
def nowstr():
    global nowtime, nowstrn
    t = time.time_ns()/(10**9)
    if nowtime != t:
        nowtime = t
        nowstrn = timezone.now().strftime('%H:%M:%S').encode()
    return nowstrn
b_exthost = re.sub(b'https?', b'', args.exthost.encode()) if args.exthost else None
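
# Normalize a page body before diffing: drop comments, CSRF tokens, timestamps and
# scheme/host differences.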
def normalize_for_diff(page):
    # pages containing 'current time' can differ if they're fetched on different seconds:
    #page = page.replace(nowstr(), b'00:00:00')
    page = page.replace(b'https://', b'http://')
    # regex replacements
    page = re.sub(b'<!--.*?-->', b'', page)
    page = re.sub(b' -- (Test|Development) Mode', b'', page)
    page = re.sub(rb'\n\s*\n+\s*', b'\n', page)
    page = re.sub(rb'name="csrfmiddlewaretoken" value="\w+"', b'', page)
    page = re.sub(b'urn:uuid:[0-9a-f-]+', b'urn:uuid:00000000-0000-0000-0000-000000000000', page)
    page = re.sub(b'<updated>[0-9T.:+-]+</updated>', b'', page)
    page = re.sub(b'<published>[0-9T.:+-]+</published>', b'', page)
    if b_exthost:
        page = re.sub(b_exthost, b'://testserver/', page)
    return page
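
# Yield diff lines ('-'/'+' prefixed) for two page bodies, suppressing line pairs that
# differ only by dates shifted forward one day.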
def get_differences(a, b):
    #open('a.html','wb').write(a)
    #open('b.html','wb').write(b)
    a = a.decode().splitlines()
    b = b.decode().splitlines()
    for group in difflib.SequenceMatcher(None,a,b).get_grouped_opcodes():
        for tag, i1, i2, j1, j2 in group:
            if tag == 'equal':
                pass
            elif tag == 'replace':
                # see if the lines have next-day date matches
                if i2-i1 == j2-j1:
                    matches = []
                    for i in range(i2-i1):
                        aline = a[i1+i]
                        bline = b[j1+i]
                        if len(aline) == len(bline):
                            adates = list(re.finditer(r'\d\d\d\d-\d\d-\d\d', aline))
                            bdates = list(re.finditer(r'\d\d\d\d-\d\d-\d\d', bline))
                            # See if all date matches are in the same places
                            if len(adates) and [ match.start() for match in adates ] == [ match.start() for match in bdates ]:
                                # try to transform a into b by date shifting
                                adates = [ match[0] for match in adates ]
                                bdates = [ match[0] for match in bdates ]
                                for i in range(len(adates)):
                                    if adates[i] != bdates[i]:
                                        d = datetime.datetime.strptime(adates[i], '%Y-%m-%d').date()
                                        # shift date
                                        d += datetime.timedelta(days=1)
                                        adates[i] = d.strftime('%Y-%m-%d')
                                matches.append(adates == bdates)
                else:
                    matches = [ False ]
                if not all(matches):
                    for line in a[i1:i2]:
                        yield '-' + line
                    for line in b[j1:j2]:
                        yield '+' + line
            elif tag == 'delete':
                for line in a[i1:i2]:
                    yield '-' + line
            elif tag == 'insert':
                for line in b[j1:j2]:
                    yield '+' + line
# --- Globals ---
slow_threshold = args.slow_threshold
visited_fn = 'visited.json'
visited = set()
if args.no_revisit:
    if os.path.exists(visited_fn):
        with open(visited_fn, "r") as f:
            visited = set(json.load(f))
else:
    if os.path.exists(visited_fn):
        os.unlink(visited_fn)
urls_fn = 'urls.json'
urls = {} # url -> referrer
if args.no_revisit:
    if os.path.exists(urls_fn):
        with open(urls_fn, "r") as f:
            urls = json.load(f)
else:
    if os.path.exists(urls_fn):
        os.unlink(urls_fn)
referrers = {}
initial_urls = []
initial_urls.extend(args.urls)
if args.url_file:
    with open(args.url_file) as f:
        for line in f:
            line = line.partition("#")[0].strip()
            if line:
                initial_urls.append(line)
if not initial_urls:
    initial_urls.append("/")
    initial_urls.append("/api/v1")
for url in initial_urls:
    urls[url] = "[initial]"
if args.exclude_from:
    with open(args.exclude_from) as f:
        args.exclude += [ l.strip() for l in f.readlines() ]
args.exclude = [ re.compile(p) for p in args.exclude if p ]
# pre-set exclusion patterns
skip_patterns = [
    r"^/community/[0-9]+/remove_document/",
    r"^/community/personal/",
    # Skip most of the slow pdf composite generation urls and svg urls
    r"^/meeting/[0-9]+/agenda/[0-9b-z].*-drafts\.pdf",
    r"^/wg/[a-z0-9-]+/deps/svg/",
    # This bad url occurs in an uploaded html agenda:
    r"/site/ietfdhcwg/_/rsrc/1311005436000/system/app/css/overlay.css\?cb=simple100%250150goog-ws-left",
    r"/dir/tsvdir/reviews/",
    r"draft-touch-msword-template-v2\.0",
    # There is a long list of urls that will always 404, but we include only those not excluded above
    r"^/doc/html/draft-balakrishnan-cm-03",
    r"^/doc/html/draft-ballardie-cbt-02",
    #
    r"^/doc/html/draft-[0-9ac-z]",
    r"^/doc/html/draft-b[0-9b-z]",
    r"^/doc/pdf/draft-[0-9ac-z]",
    r"^/doc/pdf/draft-b[0-9b-z]",
    r"^/doc/html/charter-.*",
    r"^/doc/html/status-.*",
    r"^/doc/html/rfc.*",
    r"^/static/coverage/",
    r"^/meeting/\d{,2}/agenda", # no agendas < 100
]
skip_patterns = [ re.compile(p) for p in skip_patterns ]
parser = html5lib.HTMLParser(strict=True)
# initialise validated_urls with some patterns we don't want to check,
# because they aren't under our control, such as uploaded group agendas.
validated_urls = {'/meeting/nn/agenda/foo/': True, }
doc_types = [ t.slug for t in DocTypeName.objects.all() ]
errors = 0
warnings = 0
count = 0
start_time = datetime.datetime.now()
client = django.test.Client(Accept='text/html,text/plain,application/json')
logfile = None
if args.logfile:
    logfile = open(args.logfile, "w")
if args.exthost:
    curdir = './cur/'
    os.makedirs(curdir, exist_ok=True)
    extdir = './ext/'
    os.makedirs(extdir, exist_ok=True)
vnu = None
# --- Main ---
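# Terminate the validator.nu server (if one was started) before exiting.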
def do_exit(code):
    if vnu:
        vnu.terminate()
    sys.exit(code)
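
# Crawl loop: pop URLs from the queue, fetch each with the Django test client, log status
# and timing, and optionally validate HTML and diff pages against an external site.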
if __name__ == "__main__":
    if (args.user):
        # log in as user, to have the respective HTML generated by the templates
        response = client.post('/accounts/login/',
                               {'username': args.user, 'password': 'password'},
                               secure=True, follow=True)
        if (response.status_code != 200):
            log("Could not log in as %s, HTML response %d" %
                (args.user, response.status_code))
            do_exit(1)
    # Run django system checks and checks from ietf.checks:
    error_list = django.core.checks.run_checks()
    silenced = []
    for i in range(len(error_list)):
        if error_list[i].id in settings.SILENCED_SYSTEM_CHECKS:
            silenced.append(i)
    silenced.sort(reverse=True)
    for i in silenced:
        del error_list[i]
    if error_list:
        print("")
        for entry in error_list:
            print(entry)
    if args.validator_nu:
        vnu = start_vnu_server(port=8887)
    try:
        while urls:
            if args.random:
                url = random.choice(list(urls.keys()))
                referrer = urls.pop(url)
            else:
                url, referrer = urls.popitem()
            visited.add(url)
            if (len(visited) % 100) == 1:
                if args.exthost:
                    # Check that we have the same dump on both sides
                    exturl = urllib.parse.urljoin(args.exthost, 'api/version')
                    extres = requests.get(exturl)
                    extdumptime = extres.json()['dumptime']
                    intres = client.get('/api/version')
                    intdumptime = intres.json()['dumptime']
                    if extdumptime != intdumptime:
                        sys.stderr.write("Was trying to diff output from different dumps:\n"
                                         f"  External site dump time: {extdumptime}\n"
                                         f"  Internal site dump time: {intdumptime}\n")
                        sys.exit(2)
                log("\nElapsed Visited Queue Code Time Url ... Notes")
            if (len(visited) % 1000) == 0:
                with open(visited_fn, "w") as f:
                    json.dump(list(visited), f, indent=1)
                with open(urls_fn, "w") as f:
                    json.dump(urls, f, indent=1)
            if skip_url(url):
                continue
            timestamp = datetime.datetime.now()
            acc_time = (timestamp - start_time).total_seconds()
            acc_secs = (timestamp - start_time).total_seconds()
            hrs = acc_secs // (60*60)
            min = (acc_secs % (60*60)) // 60
            sec = acc_secs % 60
            try:
                request_start = datetime.datetime.now()
                if args.verbose:
                    sys.stderr.write(url+'\n')
                r = client.get(url, secure=True)
                elapsed = datetime.datetime.now() - request_start
            except KeyboardInterrupt:
                log(" ... was fetching %s" % url)
                visited.remove(url)
                do_exit(1)
            except:
                elapsed = datetime.datetime.now() - request_start
                tags = [ "FAIL (from [ %s ])" % (",\n\t".join(get_referrers(url))) ]
                log("%2d:%02d:%02d %7d %6d %s %6.3fs %s %s" % (hrs,min,sec, len(visited), len(urls), 500, elapsed.total_seconds(), url, " ".join(tags)))
                log("=============")
                log(traceback.format_exc())
                log("=============")
                errors += 1
            else:
                tags = []
                ctype = r["Content-Type"]
                if r.status_code in (301, 302):
                    u = strip_url(r["Location"])
                    if not url.startswith("/") and u not in visited and u not in urls:
                        urls[u] = referrer # referrer is original referrer, not redirected url
                        referrers[u] = referrer
                    ctype = ''
                elif r.status_code == 200:
                    ctype = r["Content-Type"]
                    if ";" in ctype:
                        ctype = ctype[:ctype.index(";")]
                    if ctype == "text/html":
                        try:
                            if args.follow and not skip_extract_from(url):
                                for u in extract_html_urls(unicontent(r)):
                                    if u not in visited and u not in urls:
                                        urls[u] = url
                                        referrers[u] = url
                            check_html_valid(url, r, args)
                        except:
                            log("error extracting HTML urls from %s" % url)
                            log("=============")
                            log(traceback.format_exc())
                            log("=============")
                    elif ctype == "application/json" and url.startswith('/api/v1/'):
                        try:
                            if args.follow:
                                for u in extract_tastypie_urls(unicontent(r)):
                                    if u not in visited and u not in urls:
                                        urls[u] = url
                                        referrers[u] = url
                        except:
                            log("error extracting urls from %s" % url)
                            log("=============")
                            log(traceback.format_exc())
                            log("=============")
                    if args.exthost and ctype not in ('application/json', 'application/pdf',
                                                      'application/x-gtar', 'application/octet-stream', ):
                        urlpath = urllib.parse.urljoin(args.exthost, url)
                        try:
                            x = requests.get(urlpath)
                            if hasattr(r, 'content') and hasattr(x, 'content'):
                                # Remove comments (which can contain template paths and
                                # version strings), and do some newline normalization:
                                cur = normalize_for_diff(r.content)
                                ext = normalize_for_diff(x.content)
                                #
                                if cur != ext:
                                    try:
                                        diff = list(get_differences(ext, cur))
                                    except Exception as e:
                                        log(f"Error computing diff for {url} ({ctype}):\n  {e}")
                                        sys.exit(1)
                                    if diff:
                                        fn = url.strip('/').replace('/', '_') or 'root'
                                        with open(curdir+fn, 'wb') as f:
                                            f.write(cur)
                                        with open(extdir+fn, 'wb') as f:
                                            f.write(ext)
                                        tags.append(f"DIFF ({len(diff)} lines)")
                                        if args.failfast:
                                            sys.stderr.write('\n'.join(diff)+'\n')
                                            sys.exit(1)
                        except requests.exceptions.ConnectionError as e:
                            sys.exit(e)
                else:
                    tags.append("FAIL (from {})".format(referrer))
                    if not url.startswith("/person/"): # FIXME: those fail sometimes
                        errors += 1
                if elapsed.total_seconds() > slow_threshold:
                    tags.append("SLOW")
                acc_time = (timestamp - start_time).total_seconds()
                acc_secs = (timestamp - start_time).total_seconds()
                hrs = acc_secs // (60*60)
                min = (acc_secs % (60*60)) // 60
                sec = acc_secs % 60
                log("%2d:%02d:%02d %7d %6d %s %6.3fs %s %s" % (hrs,min,sec, len(visited), len(urls), r.status_code, elapsed.total_seconds(), url, " ".join(tags)))
            if ((errors or warnings) and args.pedantic):
                log(f"Errors : {errors}")
                log(f"Warnings: {warnings}")
                do_exit(1)
        if logfile:
            logfile.close()
            sys.stderr.write("Output written to %s\n\n" % logfile.name)
        if errors > 0:
            sys.stderr.write("Found %s errors, grep output for FAIL for details\n" % errors)
            do_exit(1)
        else:
            sys.stderr.write("Found no errors.\n")
        if warnings > 0:
            sys.stderr.write("Found %s warnings, grep output for WARN for details\n" % warnings)
        else:
            sys.stderr.write("Found no warnings.\n")
    finally:
        if args.no_revisit:
            sys.stderr.write("Saving list of visited URLs\n")
            with open(visited_fn, "w") as f:
                json.dump(list(visited), f, indent=1)
            with open(urls_fn, "w") as f:
                json.dump(urls, f, indent=1)