#!/usr/bin/env python # -*- indent-with-tabs: 0 -*- # Copyright The IETF Trust 2013-2022, All Rights Reserved import argparse import datetime import difflib import html5lib import json import os import random import re import requests import sys import time import traceback import urllib.parse import warnings warnings.filterwarnings("ignore", message=r"group\.HistoricalGroupFeatures\.\w+ failed to load invalid json") # Set up import path to find our own Django basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../")) if not basedir in sys.path: sys.path.insert(0, basedir) # Parse args now, so we can use custom settings when importing django parser = argparse.ArgumentParser( description="""Perform a test crawl of the project. For each found URL, the HTTP response status is printed. If it's not OK/redirect, FAIL is printed - in case of errors, a stacktrace is also included.""") parser.add_argument('urls', metavar='URL', nargs='*', help='One or more URLs to start the crawl from') parser.add_argument('--diff', dest='exthost', type=str, metavar="SITE", help='Diff pages against external site') parser.add_argument('--failfast', action='store_true', help='Stop the crawl on the first page failure') parser.add_argument('--logfile', help='Write to logfile') parser.add_argument('--no-follow', dest='follow', action='store_false', default=True, help='Do not follow URLs found in fetched pages, just check the given URLs') parser.add_argument('--pedantic', action='store_true', help='Stop the crawl on the first error or warning') parser.add_argument('--random', action='store_true', help='Crawl URLs randomly') parser.add_argument('-R', '--no-revisit', action='store_true', default=False, help="Don't revisit already visited URLs") parser.add_argument('--slow', dest='slow_threshold', type=float, default=1.0, help='Responses taking longer than this (in seconds) results in SLOW being printed') parser.add_argument('--settings', help='Custom settings file') parser.add_argument('--urls', '-u', dest='url_file', help='File with URLs to start the crawl from') parser.add_argument('--user', help='Crawl logged in as this user', default=None) parser.add_argument('--validator-nu', dest='validator_nu', action='store_true', help='Use validator.nu instead of html5lib for HTML validation') parser.add_argument('--validate-all', dest='validate_all', action='store_true', default=False, help='Run html 5 validation on all pages, without skipping similar urls. ' '(The default is to only run validation on one of /foo/1/, /foo/2/, /foo/3/, etc.)') parser.add_argument('--skip-html-validation', dest='skip_html_validation', action='store_true', help='Skip HTML validation.',default=False) parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Be more verbose') parser.add_argument('-x', '--exclude', action='append', default=[], help="Exclude URLs matching pattern") parser.add_argument('-X', '--exclude-from', metavar='FILE', help="URL exclusion pattern file") args = parser.parse_args() # Import Django, call setup() os.environ.setdefault("DJANGO_SETTINGS_MODULE", args.settings or "ietf.settings_testcrawl") import django import django.test import django.core.checks from django.conf import settings from django.utils import timezone django.setup() # This needs to come after we set up sys path to include the local django import debug # pyflakes:ignore # prevent memory from leaking when settings.DEBUG=True # from django.db import connection # class DontSaveQueries(list): # def append(self, x): # pass # connection.queries = DontSaveQueries() from ietf.name.models import DocTypeName from ietf.utils.html import unescape from ietf.utils.test_utils import unicontent from ietf.utils.test_runner import start_vnu_server, vnu_validate, vnu_fmt_message, vnu_filter_message # --- Constants --- MAX_URL_LENGTH = 500 # --- Functions --- def note(s): if args.verbose: sys.stderr.write(s) sys.stderr.write('\n') def strip_url(url): if url.startswith("http://testserver"): url = url[len("http://testserver"):] fragment_url = re.search(r"^(.+)#[a-z_.-]+$", url) if fragment_url: url = fragment_url.group(1) next_url = re.search(r"^(.+)\?next=.+$", url) if next_url: url = next_url.group(1) return url def extract_html_urls(content): for m in re.finditer(r'(<(?:(?:a|link) [^>]*href|(?:img|script) [^>]*src)=[\'"]([^"]+)[\'"][^>]*>)', content): if re.search(r'rel=["\']?nofollow["\']', m.group(1)): continue url = strip_url(m.group(2)) if len(url) > MAX_URL_LENGTH: continue # avoid infinite GET parameter appendages if not url.startswith("/"): continue if url.startswith("//"): continue yield unescape(url) def extract_tastypie_urls(content): VISIT_OBJECTS = False VISIT_NEXT = False data = json.loads(content) for item in data: if type(data[item]) is dict: if "list_endpoint" in data[item]: uri = data[item]["list_endpoint"] yield uri if VISIT_NEXT: if "meta" in data and "next" in data["meta"]: uri = data["meta"]["next"] if uri != None: yield uri if VISIT_OBJECTS: if "objects" in data: object_list = data["objects"] for i in range(len(object_list)): if "resource_uri" in object_list[i]: uri = object_list[i]["resource_uri"] yield uri def check_html_valid(url, response, args): if args.skip_html_validation: return global parser, validated_urls, doc_types, warnings key = url if not args.validate_all: # derive a key for urls like this by replacing primary keys key = re.sub("#.*$", "", key) key = re.sub("/.+@.+/", "/x@x.org/", key) key = re.sub("/[0-9.]+/", "/mmmm/", key) key = re.sub("/[0-9.]+/", "/nnnn/", key) key = re.sub("/ag/[a-z0-9-]+/", "/ag/foo/", key) key = re.sub("/area/[a-z0-9-]+/", "/area/foo/", key) key = re.sub("/bcp[0-9]+/", "/bcpnnn/", key) key = re.sub("/conflict-review-[a-z0-9-]+/", "/conflrev-foo/", key) key = re.sub("/dir/[a-z0-9-]+/", "/dir/foo/", key) key = re.sub("/draft-[a-z0-9-]+/", "/draft-foo/", key) key = re.sub("/group/[a-z0-9-]+/", "/group/foo/", key) key = re.sub("/html/[a-z0-9-]+", "/html/foo/", key) key = re.sub("/ipr/search/.*", "/ipr/search/", key) key = re.sub("/meeting/[-0-9a-z]+/agenda/[0-9a-z]+/", "/meeting/nn/agenda/foo/", key) key = re.sub("/release/[0-9dev.]+/", "/release/n.n.n/", key) key = re.sub("/rfc[0-9]+/", "/rfcnnnn/", key) key = re.sub("/rg/[a-z0-9-]+/", "/rg/foo/", key) key = re.sub("/secr/srec/nnnn/[0-9a-z-]+/", "/secr/sreq/nn/bar/", key) key = re.sub("/state/[a-z0-9-]+/", "/state/foo/", key) key = re.sub("/state/[a-z0-9-]+/[a-z0-9-]+/", "/state/foo/bar/", key) key = re.sub("/status-change-[a-z0-9-]+/", "/statchg-foo/", key) key = re.sub("/std[0-9]+/", "/stdnnn/", key) key = re.sub("/submit/status/nnnn/[0-9a-f]+/", "/submit/status/nnnn/bar/", key) key = re.sub("/team/[a-z0-9-]+/", "/team/foo/", key) key = re.sub("/wg/[a-z0-9-]+/", "/wg/foo/", key) key = re.sub(r"\?.*$", "", key) for slug in doc_types: key = re.sub("/%s-.*/"%slug, "/%s-nnnn/"%slug, key) if not key in validated_urls: note('Validate: %-32s: %s' % (url[:32], key)) if hasattr(response, "content"): content = response.content else: content = response.streaming_content validated_urls[key] = True if args.validator_nu: ret = vnu_validate(content, response["Content-Type"], port=8887) assert ret for m in json.loads(ret)["messages"]: if "lastLine" not in m: tag = m["message"] else: tag = vnu_fmt_message(url, m, content.decode()) # disregard some HTML issues that are (usually) due to invalid # database content if not vnu_filter_message(m, True, False): warnings += 1 tags.append(tag) else: try: parser.parse(content) except Exception as e: for err in parser.errors: pos, _, _ = err tags.append("WARN invalid html at line, pos {}: {}".format(pos, e)) warnings += 1 def skip_extract_from(url): for pattern in ( r'^/doc/html/[a-z0-9-]+', r'^/meeting/[a-z0-9-]+/agenda/[a-z0-9-]+', r'^/static/coverage/', ): if re.search(pattern, url): return True return False def skip_url(url): for pattern in skip_patterns + args.exclude: if pattern.search(url): return True return False def log(s): print(s) if logfile: if not type(s) is str: s = s.encode('utf-8') logfile.write(s) logfile.write('\n') def get_referrers(url): ref_list = [] while url in referrers: url = referrers[url] if url in ref_list: log("Circular referral list, discovered at %s" % url) break ref_list.append(url) return ref_list nowtime = None nowstrn = None def nowstr(): global nowtime, nowstrn t = time.time_ns()/(10**9) if nowtime != t: nowtime = t nowstrn = timezone.now().strftime('%H:%M:%S').encode() return nowstrn b_exthost = re.sub(b'https?', b'', args.exthost.encode()) if args.exthost else None def normalize_for_diff(page): # pages containing 'current time' can differ if they're fetched on different seconds: #page = page.replace(nowstr(), b'00:00:00') page = page.replace(b'https://', b'http://') # regex replacements page = re.sub(b'', b'', page) page = re.sub(b' -- (Test|Development) Mode', b'', page) page = re.sub(b'\n\s*\n+\s*', b'\n', page) page = re.sub(b'name="csrfmiddlewaretoken" value="\w+"', b'', page) page = re.sub(b'urn:uuid:[0-9a-f-]+', b'urn:uuid:00000000-0000-0000-0000-000000000000', page) page = re.sub(b'[0-9T.:+-]+', b'', page) page = re.sub(b'[0-9T.:+-]+', b'', page) if b_exthost: page = re.sub(b_exthost, b'://testserver/', page) return page def get_differences(a, b): #open('a.html','wb').write(a) #open('b.html','wb').write(b) a = a.decode().splitlines() b = b.decode().splitlines() for group in difflib.SequenceMatcher(None,a,b).get_grouped_opcodes(): for tag, i1, i2, j1, j2 in group: if tag == 'equal': pass elif tag == 'replace': # see if the lines have next-day date matches if i2-i1 == j2-j1: matches = [] for i in range(i2-i1): aline = a[i1+i] bline = b[j1+i] if len(aline) == len(bline): adates = list(re.finditer(r'\d\d\d\d-\d\d-\d\d', aline)) bdates = list(re.finditer(r'\d\d\d\d-\d\d-\d\d', bline)) # See if all date matches are in the same places if len(adates) and [ match.start() for match in adates ] == [ match.start() for match in bdates ]: # try to transform a into b by date shifting adates = [ match[0] for match in adates ] bdates = [ match[0] for match in bdates ] for i in range(len(adates)): if adates[i] != bdates[i]: d = datetime.datetime.strptime(adates[i], '%Y-%m-%d').date() # shift date d += datetime.timedelta(days=1) adates[i] = d.strftime('%Y-%m-%d') matches.append(adates == bdates) else: matches = [ False ] if not all(matches): for line in a[i1:i2]: yield '-' + line for line in b[j1:j2]: yield '+' + line elif tag == 'delete': for line in a[i1:i2]: yield '-' + line elif tag == 'insert': for line in b[j1:j2]: yield '+' + line # --- GLobals --- slow_threshold = args.slow_threshold visited_fn = 'visited.json' visited = set() if args.no_revisit: if os.path.exists(visited_fn): with open(visited_fn, "r") as f: visited = set(json.load(f)) else: if os.path.exists(visited_fn): os.unlink(visited_fn) urls_fn = 'urls.json' urls = {} # url -> referrer if args.no_revisit: if os.path.exists(urls_fn): with open(urls_fn, "r") as f: urls = json.load(f) else: if os.path.exists(urls_fn): os.unlink(urls_fn) referrers = {} initial_urls = [] initial_urls.extend(args.urls) if args.url_file: with open(args.url_file) as f: for line in f: line = line.partition("#")[0].strip() if line: initial_urls.append(line) if not initial_urls: initial_urls.append("/") initial_urls.append("/api/v1") for url in initial_urls: urls[url] = "[initial]" if args.exclude_from: with open(args.exclude_from) as f: args.exclude += [ l.strip() for l in f.readlines() ] args.exclude = [ re.compile(p) for p in args.exclude if p ] # pre-set exclusion patterns skip_patterns = [ r"^/community/[0-9]+/remove_document/", r"^/community/personal/", # Skip most of the slow pdf composite generation urls and svg urls r"^/meeting/[0-9]+/agenda/[0-9b-z].*-drafts\\.pdf", r"^/wg/[a-z0-9-]+/deps/svg/", # This bad url occurs in an uploaded html agenda: r"/site/ietfdhcwg/_/rsrc/1311005436000/system/app/css/overlay.css\?cb=simple100%250150goog-ws-left", r"/dir/tsvdir/reviews/", r"draft-touch-msword-template-v2\.0", # There is a long list of urls that will always 404, but we include only those not excluded above r"^/doc/html/draft-balakrishnan-cm-03", r"^/doc/html/draft-ballardie-cbt-02", # r"^/doc/html/draft-[0-9ac-z]", r"^/doc/html/draft-b[0-9b-z]", r"^/doc/pdf/draft-[0-9ac-z]", r"^/doc/pdf/draft-b[0-9b-z]", r"^/doc/html/charter-.*", r"^/doc/html/status-.*", r"^/doc/html/rfc.*", r"^/static/coverage/", r"^/meeting/\d{,2}/agenda", # no agendas < 100 ] skip_patterns = [ re.compile(p) for p in skip_patterns ] parser = html5lib.HTMLParser(strict=True) # initialise validated_urls with some patterns we don't want to check, # because they aren't under our control, such as uploaded group agendas. validated_urls = {'/meeting/nn/agenda/foo/': True, } doc_types = [ t.slug for t in DocTypeName.objects.all() ] errors = 0 warnings = 0 count = 0 start_time = datetime.datetime.now() client = django.test.Client(Accept='text/html,text/plain,application/json') logfile = None if args.logfile: logfile = open(args.logfile, "w") if args.exthost: curdir = './cur/' os.makedirs(curdir, exist_ok=True) extdir = './ext/' os.makedirs(extdir, exist_ok=True) vnu = None # --- Main --- def do_exit(code): if vnu: vnu.terminate() sys.exit(code) if __name__ == "__main__": if (args.user): # log in as user, to have the respective HTML generated by the templates response = client.post('/accounts/login/', {'username': args.user, 'password': 'password'}, secure=True, follow=True) if (response.status_code != 200): log("Could not log in as %s, HTML response %d" % (args.user, response.status_code)) do_exit(1) # Run django system checks and checks from ietf.checks: error_list = django.core.checks.run_checks() silenced = [] for i in range(len(error_list)): if error_list[i].id in settings.SILENCED_SYSTEM_CHECKS: silenced.append(i) silenced.sort(reverse=True) for i in silenced: del error_list[i] if error_list: print("") for entry in error_list: print(entry) if args.validator_nu: vnu = start_vnu_server(port=8887) try: while urls: if args.random: url = random.choice(list(urls.keys())) referrer = urls.pop(url) else: url, referrer = urls.popitem() visited.add(url) if (len(visited) % 100) == 1: if args.exthost: # Check that we have the same dump on both sides exturl = urllib.parse.urljoin(args.exthost, 'api/version') extres = requests.get(exturl) extdumptime = extres.json()['dumptime'] intres = client.get('/api/version') intdumptime = intres.json()['dumptime'] if extdumptime != intdumptime: sys.stderr.write("Was trying to diff output from different dumps:\n" f" External site dump time: {extdumptime}\n" f" Internal site dump time: {intdumptime}\n") sys.exit(2) log("\nElapsed Visited Queue Code Time Url ... Notes") if (len(visited) % 1000) == 0: with open(visited_fn, "w") as f: json.dump(list(visited), f, indent=1) with open(urls_fn, "w") as f: json.dump(urls, f, indent=1) if skip_url(url): continue timestamp = datetime.datetime.now() acc_time = (timestamp - start_time).total_seconds() acc_secs = (timestamp - start_time).total_seconds() hrs = acc_secs // (60*60) min = (acc_secs % (60*60)) // 60 sec = acc_secs % 60 try: request_start = datetime.datetime.now() if args.verbose: sys.stderr.write(url+'\n') r = client.get(url, secure=True) elapsed = datetime.datetime.now() - request_start except KeyboardInterrupt: log(" ... was fetching %s" % url) visited.remove(url) do_exit(1) except: elapsed = datetime.datetime.now() - request_start tags = [ "FAIL (from [ %s ])" % (",\n\t".join(get_referrers(url))) ] log("%2d:%02d:%02d %7d %6d %s %6.3fs %s %s" % (hrs,min,sec, len(visited), len(urls), 500, elapsed.total_seconds(), url, " ".join(tags))) log("=============") log(traceback.format_exc()) log("=============") errors += 1 else: tags = [] ctype = r["Content-Type"] if r.status_code in (301, 302): u = strip_url(r["Location"]) if not url.startswith("/") and u not in visited and u not in urls: urls[u] = referrer # referrer is original referrer, not redirected url referrers[u] = referrer ctype = '' elif r.status_code == 200: ctype = r["Content-Type"] if ";" in ctype: ctype = ctype[:ctype.index(";")] if ctype == "text/html": try: if args.follow and not skip_extract_from(url): for u in extract_html_urls(unicontent(r)): if u not in visited and u not in urls: urls[u] = url referrers[u] = url check_html_valid(url, r, args) except: log("error extracting HTML urls from %s" % url) log("=============") log(traceback.format_exc()) log("=============") elif ctype == "application/json" and url.startswith('/api/v1/'): try: if args.follow: for u in extract_tastypie_urls(unicontent(r)): if u not in visited and u not in urls: urls[u] = url referrers[u] = url except: log("error extracting urls from %s" % url) log("=============") log(traceback.format_exc()) log("=============") if args.exthost and ctype not in ('application/json', 'application/pdf', 'application/x-gtar', 'application/octet-stream', ): urlpath = urllib.parse.urljoin(args.exthost, url) try: x = requests.get(urlpath) if hasattr(r, 'content') and hasattr(x, 'content'): # Remove comments (which can contain template paths and # version strings, and do some newline normalization: cur = normalize_for_diff(r.content) ext = normalize_for_diff(x.content) # if cur != ext: try: diff = list(get_differences(ext, cur)) except Exception as e: log(f"Error computing diff for {url} ({ctype}):\n {e}") sys.exit(1) if diff: fn = url.strip('/').replace('/', '_') or 'root' with open(curdir+fn, 'wb') as f: f.write(cur) with open(extdir+fn, 'wb') as f: f.write(ext) tags.append(f"DIFF ({len(diff)} lines)") if args.failfast: sys.stderr.write('\n'.join(diff)+'\n') sys.exit(1) except requests.exceptions.ConnectionError as e: sys.exit(e) else: tags.append("FAIL (from {})".format(referrer)) if not url.startswith("/person/"): # FIXME: those fail sometimes errors += 1 if elapsed.total_seconds() > slow_threshold: tags.append("SLOW") acc_time = (timestamp - start_time).total_seconds() acc_secs = (timestamp - start_time).total_seconds() hrs = acc_secs // (60*60) min = (acc_secs % (60*60)) // 60 sec = acc_secs % 60 log("%2d:%02d:%02d %7d %6d %s %6.3fs %s %s" % (hrs,min,sec, len(visited), len(urls), r.status_code, elapsed.total_seconds(), url, " ".join(tags))) if ((errors or warnings) and args.pedantic): log(f"Errors : {errors}") log(f"Warnings: {warnings}") do_exit(1) if logfile: logfile.close() sys.stderr.write("Output written to %s\n\n" % logfile.name) if errors > 0: sys.stderr.write("Found %s errors, grep output for FAIL for details\n" % errors) do_exit(1) else: sys.stderr.write("Found no errors.\n") if warnings > 0: sys.stderr.write("Found %s warnings, grep output for WARN for details\n" % warnings) else: sys.stderr.write("Found no warnings.\n") finally: if args.no_revisit: sys.stderr.write("Saving list of visited URLs\n") with open(visited_fn, "w") as f: json.dump(list(visited), f, indent=1) with open(urls_fn, "w") as f: json.dump(urls, f, indent=1)