feat: test-crawl with diff (#4142)

* feat: move differencing test-crawl forward from tzaware-obe.

* fix: port html validation changes from main into test-crawl

* fix: address review comments
Robert Sparks 2022-07-08 16:39:58 -05:00 committed by GitHub
parent 629bff0f88
commit 57818a0131


@@ -1,11 +1,24 @@
#!/usr/bin/env python
# -*- indent-with-tabs: 0 -*-
# Copyright The IETF Trust 2013-2019, All Rights Reserved
# Copyright The IETF Trust 2013-2022, All Rights Reserved
import os, sys, re, datetime, argparse, traceback, json
import argparse
import datetime
import difflib
import html5lib
import json
import os
import random
import re
import requests
import sys
import time
import traceback
import urllib.parse
import warnings
warnings.filterwarnings("ignore", message=r"group\.HistoricalGroupFeatures\.\w+ failed to load invalid json")
# Set up import path to find our own Django
basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../"))
@@ -19,26 +32,33 @@ parser = argparse.ArgumentParser(
printed - in case of errors, a stacktrace is also included.""")
parser.add_argument('urls', metavar='URL', nargs='*',
help='One or more URLs to start the crawl from')
parser.add_argument('--urls', '-u', dest='url_file',
help='File with URLs to start the crawl from')
parser.add_argument('--slow', dest='slow_threshold', type=float, default=1.0,
help='Responses taking longer than this (in seconds) result in SLOW being printed')
parser.add_argument('--settings', help='Custom settings file')
parser.add_argument('--diff', dest='exthost', type=str, metavar="SITE", help='Diff pages against external site')
parser.add_argument('--failfast', action='store_true',
help='Stop the crawl on the first page failure')
parser.add_argument('--logfile', help='Write to logfile')
parser.add_argument('--user', help='Crawl logged in as this user', default=None)
parser.add_argument('--no-follow', dest='follow', action='store_false', default=True,
help='Do not follow URLs found in fetched pages, just check the given URLs')
parser.add_argument('--validator-nu', dest='validator_nu', action='store_true',
help='Use validator.nu instead of html5lib for HTML validation')
parser.add_argument('--pedantic', action='store_true',
help='Stop the crawl on the first error or warning')
parser.add_argument('--random', action='store_true',
help='Crawl URLs randomly')
parser.add_argument('-R', '--no-revisit', action='store_true', default=False, help="Don't revisit already visited URLs")
parser.add_argument('--slow', dest='slow_threshold', type=float, default=1.0,
help='Responses taking longer than this (in seconds) result in SLOW being printed')
parser.add_argument('--settings', help='Custom settings file')
parser.add_argument('--urls', '-u', dest='url_file',
help='File with URLs to start the crawl from')
parser.add_argument('--user', help='Crawl logged in as this user', default=None)
parser.add_argument('--validator-nu', dest='validator_nu', action='store_true',
help='Use validator.nu instead of html5lib for HTML validation')
parser.add_argument('--validate-all', dest='validate_all', action='store_true', default=False,
help='Run html 5 validation on all pages, without skipping similar urls. '
'(The default is to only run validation on one of /foo/1/, /foo/2/, /foo/3/, etc.)')
parser.add_argument('-v', '--verbose', action='store_true', default=False,
help='Be more verbose')
parser.add_argument('-x', '--exclude', action='append', default=[], help="Exclude URLs matching pattern")
parser.add_argument('-X', '--exclude-from', metavar='FILE', help="URL exclusion pattern file")
args = parser.parse_args()
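# Example invocation (hypothetical command line, not part of the commit) showing how
# the new options combine: --diff fetches each crawled page from SITE as well and
# reports DIFF when the normalized contents differ, -R/--no-revisit resumes from
# visited.json/urls.json, and -x/-X add exclusion patterns on top of the built-in
# skip_patterns:
#
#   test-crawl --diff https://datatracker.ietf.org -R --random -x '^/doc/html/'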
@@ -49,6 +69,7 @@ import django
import django.test
import django.core.checks
from django.conf import settings
from django.utils import timezone
django.setup()
@@ -81,7 +102,7 @@ def note(s):
def strip_url(url):
if url.startswith("http://testserver"):
url = url[len("http://testserver"):]
fragment_url = re.search("^(.+)#[a-z_.-]+$", url)
fragment_url = re.search(r"^(.+)#[a-z_.-]+$", url)
if fragment_url:
url = fragment_url.group(1)
next_url = re.search(r"^(.+)\?next=.+$", url)
@@ -202,27 +223,8 @@ def skip_extract_from(url):
return False
def skip_url(url):
for pattern in (
r"^/community/[0-9]+/remove_document/",
r"^/community/personal/",
# Skip most of the slow pdf composite generation urls and svg urls
r"^/meeting/[0-9]+/agenda/[0-9b-z].*-drafts\\.pdf",
r"^/wg/[a-z0-9-]+/deps/svg/",
# Skip other bad urls
r"^/dir/tsvdir/reviews/",
# r"^/ipr/\d{,3}/history/",
# Skip most html conversions, not worth the time
r"^/doc/html/draft-[0-9ac-z]",
r"^/doc/html/draft-b[0-9b-z]",
r"^/doc/pdf/draft-[0-9ac-z]",
r"^/doc/pdf/draft-b[0-9b-z]",
r"^/doc/html/charter-.*",
r"^/doc/html/status-.*",
r"^/doc/html/rfc.*",
r"^/static/coverage/",
r"^/meeting/\d{,2}/agenda", # no agendas < 100
):
if re.search(pattern, url):
for pattern in skip_patterns + args.exclude:
if pattern.search(url):
return True
return False
@@ -244,12 +246,105 @@ def get_referrers(url):
ref_list.append(url)
return ref_list
nowtime = None
nowstrn = None
def nowstr():
global nowtime, nowstrn
t = time.time_ns()/(10**9)
if nowtime != t:
nowtime = t
nowstrn = timezone.now().strftime('%H:%M:%S').encode()
return nowstrn
b_exthost = re.sub(b'https?', b'', args.exthost.encode()) if args.exthost else None
def normalize_for_diff(page):
# pages containing 'current time' can differ if they're fetched on different seconds:
#page = page.replace(nowstr(), b'00:00:00')
page = page.replace(b'https://', b'http://')
# regex replacements
page = re.sub(b'<!--.*?-->', b'', page)
page = re.sub(b' -- (Test|Development) Mode', b'', page)
page = re.sub(rb'\n\s*\n+\s*', b'\n', page)
page = re.sub(rb'name="csrfmiddlewaretoken" value="\w+"', b'', page)
page = re.sub(b'urn:uuid:[0-9a-f-]+', b'urn:uuid:00000000-0000-0000-0000-000000000000', page)
page = re.sub(b'<updated>[0-9T.:+-]+</updated>', b'', page)
page = re.sub(b'<published>[0-9T.:+-]+</published>', b'', page)
if b_exthost:
page = re.sub(b_exthost, b'://testserver/', page)
return page
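# A standalone sketch (helper name and sample markup are hypothetical) of the
# normalization idea above: strip volatile fragments such as HTML comments, the CSRF
# token and the https/http scheme difference, so that only meaningful differences
# survive the comparison. Relies only on the `re` import above.
def _normalize_sketch(page):
    page = page.replace(b'https://', b'http://')
    page = re.sub(b'<!--.*?-->', b'', page)
    page = re.sub(rb'name="csrfmiddlewaretoken" value="\w+"', b'', page)
    return page
assert (_normalize_sketch(b'<!-- base.html --><a href="https://testserver/doc/">x</a>')
        == _normalize_sketch(b'<!-- 1970-01-01 --><a href="http://testserver/doc/">x</a>'))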
def get_differences(a, b):
#open('a.html','wb').write(a)
#open('b.html','wb').write(b)
a = a.decode().splitlines()
b = b.decode().splitlines()
for group in difflib.SequenceMatcher(None,a,b).get_grouped_opcodes():
for tag, i1, i2, j1, j2 in group:
if tag == 'equal':
pass
elif tag == 'replace':
# see if the lines have next-day date matches
if i2-i1 == j2-j1:
matches = []
for i in range(i2-i1):
aline = a[i1+i]
bline = b[j1+i]
if len(aline) == len(bline):
adates = list(re.finditer(r'\d\d\d\d-\d\d-\d\d', aline))
bdates = list(re.finditer(r'\d\d\d\d-\d\d-\d\d', bline))
# See if all date matches are in the same places
if len(adates) and [ match.start() for match in adates ] == [ match.start() for match in bdates ]:
# try to transform a into b by date shifting
adates = [ match[0] for match in adates ]
bdates = [ match[0] for match in bdates ]
for i in range(len(adates)):
if adates[i] != bdates[i]:
d = datetime.datetime.strptime(adates[i], '%Y-%m-%d').date()
# shift date
d += datetime.timedelta(days=1)
adates[i] = d.strftime('%Y-%m-%d')
matches.append(adates == bdates)
else:
matches = [ False ]
if not all(matches):
for line in a[i1:i2]:
yield '-' + line
for line in b[j1:j2]:
yield '+' + line
elif tag == 'delete':
for line in a[i1:i2]:
yield '-' + line
elif tag == 'insert':
for line in b[j1:j2]:
yield '+' + line
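# Standalone illustration (sample lines are made up) of the difflib machinery that
# get_differences() builds on: SequenceMatcher.get_grouped_opcodes() yields
# (tag, i1, i2, j1, j2) tuples, and only non-'equal' groups become '-'/'+' lines,
# except for 'replace' groups that differ solely by a one-day date shift, which the
# loop above tolerates.
_old = ['<td>2022-07-07</td>', '<td>unchanged</td>']
_new = ['<td>2022-07-08</td>', '<td>unchanged</td>']
for _group in difflib.SequenceMatcher(None, _old, _new).get_grouped_opcodes():
    for _tag, _i1, _i2, _j1, _j2 in _group:
        print(_tag, _old[_i1:_i2], _new[_j1:_j2])
# prints: replace ['<td>2022-07-07</td>'] ['<td>2022-07-08</td>']
#         equal ['<td>unchanged</td>'] ['<td>unchanged</td>']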
# --- Globals ---
slow_threshold = args.slow_threshold
visited_fn = 'visited.json'
visited = set()
if args.no_revisit:
if os.path.exists(visited_fn):
with open(visited_fn, "r") as f:
visited = set(json.load(f))
else:
if os.path.exists(visited_fn):
os.unlink(visited_fn)
urls_fn = 'urls.json'
urls = {} # url -> referrer
if args.no_revisit:
if os.path.exists(urls_fn):
with open(urls_fn, "r") as f:
urls = json.load(f)
else:
if os.path.exists(urls_fn):
os.unlink(urls_fn)
referrers = {}
initial_urls = []
@@ -269,6 +364,40 @@ if not initial_urls:
for url in initial_urls:
urls[url] = "[initial]"
if args.exclude_from:
with open(args.exclude_from) as f:
args.exclude += [ l.strip() for l in f.readlines() ]
args.exclude = [ re.compile(p) for p in args.exclude if p ]
# pre-set exclusion patterns
skip_patterns = [
r"^/community/[0-9]+/remove_document/",
r"^/community/personal/",
# Skip most of the slow pdf composite generation urls and svg urls
r"^/meeting/[0-9]+/agenda/[0-9b-z].*-drafts\\.pdf",
r"^/wg/[a-z0-9-]+/deps/svg/",
# This bad url occurs in an uploaded html agenda:
r"/site/ietfdhcwg/_/rsrc/1311005436000/system/app/css/overlay.css\?cb=simple100%250150goog-ws-left",
r"/dir/tsvdir/reviews/",
r"draft-touch-msword-template-v2\.0",
# There is a long list of urls that will always 404, but we include only those not excluded above
r"^/doc/html/draft-balakrishnan-cm-03",
r"^/doc/html/draft-ballardie-cbt-02",
#
r"^/doc/html/draft-[0-9ac-z]",
r"^/doc/html/draft-b[0-9b-z]",
r"^/doc/pdf/draft-[0-9ac-z]",
r"^/doc/pdf/draft-b[0-9b-z]",
r"^/doc/html/charter-.*",
r"^/doc/html/status-.*",
r"^/doc/html/rfc.*",
r"^/static/coverage/",
r"^/meeting/\d{,2}/agenda", # no agendas < 100
]
skip_patterns = [ re.compile(p) for p in skip_patterns ]
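# Quick sanity sketch (URLs are made up): skip_url() checks these compiled built-in
# patterns together with any user-supplied --exclude/-x patterns, and the first regex
# whose search() matches keeps the URL out of the crawl.
assert any(p.search('/doc/html/rfc9000/') for p in skip_patterns)
assert not any(p.search('/doc/rfc9000/') for p in skip_patterns)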
parser = html5lib.HTMLParser(strict=True)
# initialise validated_urls with some patterns we don't want to check,
@@ -289,6 +418,12 @@ logfile = None
if args.logfile:
logfile = open(args.logfile, "w")
if args.exthost:
curdir = './cur/'
os.makedirs(curdir, exist_ok=True)
extdir = './ext/'
os.makedirs(extdir, exist_ok=True)
vnu = None
# --- Main ---
@@ -326,118 +461,175 @@ if __name__ == "__main__":
if args.validator_nu:
vnu = start_vnu_server(port=8887)
while urls:
if args.random:
# popitem() is documented to be random, but really isn't
url = random.choice(list(urls.keys()))
referrer = urls.pop(url)
else:
url, referrer = urls.popitem()
visited.add(url)
if skip_url(url):
continue
timestamp = datetime.datetime.now()
acc_time = (timestamp - start_time).total_seconds()
acc_secs = (timestamp - start_time).total_seconds()
hrs = acc_secs // (60*60)
min = (acc_secs % (60*60)) // 60
sec = acc_secs % 60
try:
request_start = datetime.datetime.now()
if args.verbose:
sys.stderr.write(url+'\n')
r = client.get(url, secure=True)
elapsed = datetime.datetime.now() - request_start
except KeyboardInterrupt:
log(" ... was fetching %s" % url)
do_exit(1)
except:
elapsed = datetime.datetime.now() - request_start
tags = [ "FAIL (from [ %s ])" % (",\n\t".join(get_referrers(url))) ]
log("%2d:%02d:%02d %7d %6d %s %6.3fs %s %s" % (hrs,min,sec, len(visited), len(urls), 500, elapsed.total_seconds(), url, " ".join(tags)))
log("=============")
log(traceback.format_exc())
log("=============")
errors += 1
else:
tags = []
if r.status_code in (301, 302):
u = strip_url(r["Location"])
if not url.startswith("/") and u not in visited and u not in urls:
urls[u] = referrer # referrer is original referrer, not redirected url
referrers[u] = referrer
elif r.status_code == 200:
ctype = r["Content-Type"]
if ";" in ctype:
ctype = ctype[:ctype.index(";")]
if ctype == "text/html":
try:
if args.follow and not skip_extract_from(url):
for u in extract_html_urls(unicontent(r)):
if u not in visited and u not in urls:
urls[u] = url
referrers[u] = url
check_html_valid(url, r, args)
except:
log("error extracting HTML urls from %s" % url)
log("=============")
log(traceback.format_exc())
log("=============")
elif ctype == "application/json":
try:
if args.follow:
for u in extract_tastypie_urls(unicontent(r)):
if u not in visited and u not in urls:
urls[u] = url
referrers[u] = url
except:
log("error extracting urls from %s" % url)
log("=============")
log(traceback.format_exc())
log("=============")
try:
while urls:
if args.random:
url = random.choice(list(urls.keys()))
referrer = urls.pop(url)
else:
tags.append("FAIL (from {})".format(referrer))
if not url.startswith("/person/"): # FIXME: those fail sometimes
errors += 1
url, referrer = urls.popitem()
if elapsed.total_seconds() > slow_threshold:
tags.append("SLOW")
visited.add(url)
if (len(visited) % 100) == 1:
if args.exthost:
# Check that we have the same dump on both sides
exturl = urllib.parse.urljoin(args.exthost, 'api/version')
extres = requests.get(exturl)
extdumptime = extres.json()['dumptime']
intres = client.get('/api/version')
intdumptime = intres.json()['dumptime']
if extdumptime != intdumptime:
sys.stderr.write("Was trying to diff output from different dumps:\n"
f" External site dump time: {extdumptime}\n"
f" Internal site dump time: {intdumptime}\n")
sys.exit(2)
log("\nElapsed Visited Queue Code Time Url ... Notes")
if (len(visited) % 1000) == 0:
with open(visited_fn, "w") as f:
json.dump(list(visited), f, indent=1)
with open(urls_fn, "w") as f:
json.dump(urls, f, indent=1)
if skip_url(url):
continue
timestamp = datetime.datetime.now()
acc_time = (timestamp - start_time).total_seconds()
acc_secs = (timestamp - start_time).total_seconds()
hrs = acc_secs // (60*60)
min = (acc_secs % (60*60)) // 60
sec = acc_secs % 60
if (len(visited) % 100) == 1:
log("\nElapsed Visited Queue Code Time Url ... Notes")
log("%2d:%02d:%02d %7d %6d %s %6.3fs %s %s" % (hrs,min,sec, len(visited), len(urls), r.status_code, elapsed.total_seconds(), url, " ".join(tags)))
if ((errors or warnings) and args.pedantic):
try:
request_start = datetime.datetime.now()
if args.verbose:
sys.stderr.write(url+'\n')
r = client.get(url, secure=True)
elapsed = datetime.datetime.now() - request_start
except KeyboardInterrupt:
log(" ... was fetching %s" % url)
visited.remove(url)
do_exit(1)
except:
elapsed = datetime.datetime.now() - request_start
tags = [ "FAIL (from [ %s ])" % (",\n\t".join(get_referrers(url))) ]
log("%2d:%02d:%02d %7d %6d %s %6.3fs %s %s" % (hrs,min,sec, len(visited), len(urls), 500, elapsed.total_seconds(), url, " ".join(tags)))
log("=============")
log(traceback.format_exc())
log("=============")
errors += 1
else:
tags = []
if logfile:
logfile.close()
sys.stderr.write("Output written to %s\n\n" % logfile.name)
ctype = r["Content-Type"]
if r.status_code in (301, 302):
u = strip_url(r["Location"])
if not url.startswith("/") and u not in visited and u not in urls:
urls[u] = referrer # referrer is original referrer, not redirected url
referrers[u] = referrer
ctype = ''
elif r.status_code == 200:
ctype = r["Content-Type"]
if ";" in ctype:
ctype = ctype[:ctype.index(";")]
if errors > 0:
sys.stderr.write("Found %s errors, grep output for FAIL for details\n" % errors)
do_exit(1)
else:
sys.stderr.write("Found no errors.\n")
if warnings > 0:
sys.stderr.write("Found %s warnings, grep output for WARN for details\n" % warnings)
else:
sys.stderr.write("Found no warnings.\n")
if ctype == "text/html":
try:
if args.follow and not skip_extract_from(url):
for u in extract_html_urls(unicontent(r)):
if u not in visited and u not in urls:
urls[u] = url
referrers[u] = url
check_html_valid(url, r, args)
except:
log("error extracting HTML urls from %s" % url)
log("=============")
log(traceback.format_exc())
log("=============")
elif ctype == "application/json" and url.startswith('/api/v1/'):
try:
if args.follow:
for u in extract_tastypie_urls(unicontent(r)):
if u not in visited and u not in urls:
urls[u] = url
referrers[u] = url
except:
log("error extracting urls from %s" % url)
log("=============")
log(traceback.format_exc())
log("=============")
if args.exthost and ctype not in ('application/json', 'application/pdf',
'application/x-gtar', 'application/octet-stream', ):
urlpath = urllib.parse.urljoin(args.exthost, url)
try:
x = requests.get(urlpath)
if hasattr(r, 'content') and hasattr(x, 'content'):
# Remove comments (which can contain template paths and
# version strings), and do some newline normalization:
cur = normalize_for_diff(r.content)
ext = normalize_for_diff(x.content)
#
if cur != ext:
try:
diff = list(get_differences(ext, cur))
except Exception as e:
log(f"Error computing diff for {url} ({ctype}):\n {e}")
sys.exit(1)
if diff:
fn = url.strip('/').replace('/', '_') or 'root'
with open(curdir+fn, 'wb') as f:
f.write(cur)
with open(extdir+fn, 'wb') as f:
f.write(ext)
tags.append(f"DIFF ({len(diff)} lines)")
if args.failfast:
sys.stderr.write('\n'.join(diff)+'\n')
sys.exit(1)
except requests.exceptions.ConnectionError as e:
sys.exit(e)
else:
tags.append("FAIL (from {})".format(referrer))
if not url.startswith("/person/"): # FIXME: those fail sometimes
errors += 1
if elapsed.total_seconds() > slow_threshold:
tags.append("SLOW")
acc_time = (timestamp - start_time).total_seconds()
acc_secs = (timestamp - start_time).total_seconds()
hrs = acc_secs // (60*60)
min = (acc_secs % (60*60)) // 60
sec = acc_secs % 60
log("%2d:%02d:%02d %7d %6d %s %6.3fs %s %s" % (hrs,min,sec, len(visited), len(urls), r.status_code, elapsed.total_seconds(), url, " ".join(tags)))
if ((errors or warnings) and args.pedantic):
log(f"Errors : {errors}")
log(f"Warnings: {warnings}")
do_exit(1)
if logfile:
logfile.close()
sys.stderr.write("Output written to %s\n\n" % logfile.name)
if errors > 0:
sys.stderr.write("Found %s errors, grep output for FAIL for details\n" % errors)
do_exit(1)
else:
sys.stderr.write("Found no errors.\n")
if warnings > 0:
sys.stderr.write("Found %s warnings, grep output for WARN for details\n" % warnings)
else:
sys.stderr.write("Found no warnings.\n")
finally:
if args.no_revisit:
sys.stderr.write("Saving list of visited URLs\n")
with open(visited_fn, "w") as f:
json.dump(list(visited), f, indent=1)
with open(urls_fn, "w") as f:
json.dump(urls, f, indent=1)
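# Resume note (hypothetical command line): when -R/--no-revisit is given, the finally:
# block above saves visited.json and urls.json, and the same files are reloaded at
# startup instead of being deleted, so an interrupted crawl can be continued simply by
# re-running the same command:
#
#   test-crawl -R --diff https://datatracker.ietf.org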