Merged in [9765] from lars@netapp.com:

Add an option to crawl as a logged-in user (--user).
Add a --pedantic option for the validator.nu (vnu) crawl, which stops the crawl on (most) errors.
Randomize the order in which URLs are crawled, so that repeated crawls don't
hit the same URLs in the same order.
 - Legacy-Id: 9785
Note: SVN reference [9765] has been migrated to Git commit 9b4e61049a704127e1200549fcc410326efffddb
Author: Henrik Levkowetz
Date:   2015-07-18 12:00:37 +00:00
Parent: 23bcde63dc
Commit: 8612ce92c0


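As a minimal sketch of what the new --user option does (not the patch itself; it assumes a configured Django project with DJANGO_SETTINGS_MODULE set, an '/accounts/login/' URL, and a test account whose password is literally 'password', as in the patch below):

    import django
    from django.test import Client

    django.setup()
    client = Client()
    # Log in once; the test client keeps the session cookie, so later
    # requests render pages as this user would see them.
    response = client.post('/accounts/login/',
                           {'username': 'some.user', 'password': 'password'},
                           secure=True, follow=True)
    if response.status_code != 200:
        raise SystemExit("could not log in")
    r = client.get('/doc/', secure=True, follow=True)   # sample URL
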
@@ -3,6 +3,7 @@
import os, sys, re, datetime, argparse, traceback, tempfile, json, subprocess
import html5lib
import debug # pyflakes:ignore
import random
# Set up import path to find our own Django
basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../"))
@@ -17,13 +18,19 @@ parser = argparse.ArgumentParser(
parser.add_argument('urls', metavar='URL', nargs='*',
help='One or more URLs to start the crawl from')
parser.add_argument('--urls', '-u', dest='url_file',
help='file with URLs to start the crawl from')
help='File with URLs to start the crawl from')
parser.add_argument('--slow', dest='slow_threshold', type=float, default=1.0,
help='responses taking longer than this (in seconds) results in SLOW being printed')
parser.add_argument('--settings', dest='settings', help='custom settings file')
parser.add_argument('--logfile', dest='logfile', help='write to logfile')
help='Responses taking longer than this (in seconds) results in SLOW being printed')
parser.add_argument('--settings', help='Custom settings file')
parser.add_argument('--logfile', help='Write to logfile')
parser.add_argument('--user', help='Crawl logged in as this user', default=None)
parser.add_argument('--validator-nu', dest='validator_nu', action='store_true',
help='Use validator.nu instead of html5lib for HTML validation')
parser.add_argument('--pedantic', action='store_true',
help='Stop the crawl on the first HTML validation issue')
parser.add_argument('--validate-all', dest='validate_all', action='store_true', default=False,
help='Run html 5 validation on all pages, without skipping similar urls. '
'(The default is to only run validation on one of /foo/1/, /foo/2/, /foo/3/, etc.)')
args = parser.parse_args()
@@ -96,17 +103,25 @@ def extract_tastypie_urls(content):
def check_html_valid(url, response, args):
global parser, validated_urls, doc_types, warnings
# derive a key for urls like this by replacing primary keys
# These URLs have known issues, skip them until those are fixed
if re.search('(/secr|admin/)|/doc/.*/edit/info/', url):
log("%s blacklisted; skipping HTML validation" % url)
return
key = url
key = re.sub("/[0-9.]+/", "/nnnn/", key)
key = re.sub("/.+@.+/", "/x@x.org/", key)
key = re.sub("#.*$", "", key)
key = re.sub("\?.*$", "", key)
key = re.sub("/rfc[0-9]+/", "/rfcnnnn/", key)
key = re.sub("/wg/[a-z0-9-]+/", "/wg/foo/", key)
key = re.sub("/rg/[a-z0-9-]+/", "/rg/foo/", key)
for slug in doc_types:
key = re.sub("/%s-.*/"%slug, "/%s-nnnn/"%slug, key)
if not args.validate_all:
# derive a key for urls like this by replacing primary keys
key = re.sub("/[0-9.]+/", "/nnnn/", key)
key = re.sub("/.+@.+/", "/x@x.org/", key)
key = re.sub("#.*$", "", key)
key = re.sub("\?.*$", "", key)
key = re.sub("/rfc[0-9]+/", "/rfcnnnn/", key)
key = re.sub("/wg/[a-z0-9-]+/", "/wg/foo/", key)
key = re.sub("/rg/[a-z0-9-]+/", "/rg/foo/", key)
key = re.sub("/ipr/[0-9]+/", "/ipr/nnnn/", key)
key = re.sub("/draft-[a-z0-9-]+/", "/draft-foo/", key)
for slug in doc_types:
key = re.sub("/%s-.*/"%slug, "/%s-nnnn/"%slug, key)
if not key in validated_urls:
if hasattr(response, "content"):
content = response.content
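To make the key derivation above concrete, here is a standalone sketch of the same idea (regexes taken from the patch, sample URLs invented). Unless --validate-all is given, structurally similar URLs collapse to one key, so only the first of /ipr/1/, /ipr/2/, ... is HTML-validated:

    import re

    def url_key(url):
        # Strip fragments and query strings, then collapse variable path
        # components into fixed placeholders.
        key = re.sub(r"#.*$", "", url)
        key = re.sub(r"\?.*$", "", key)
        key = re.sub(r"/[0-9.]+/", "/nnnn/", key)
        key = re.sub(r"/rfc[0-9]+/", "/rfcnnnn/", key)
        key = re.sub(r"/ipr/[0-9]+/", "/ipr/nnnn/", key)
        key = re.sub(r"/draft-[a-z0-9-]+/", "/draft-foo/", key)
        return key

    print(url_key("/ipr/123/"))    # -> /ipr/nnnn/
    print(url_key("/ipr/124/"))    # -> /ipr/nnnn/  (same key, so skipped)
    print(url_key("/doc/draft-foo-bar-01/history/"))  # -> /doc/draft-foo/history/
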
@@ -124,7 +139,10 @@ def check_html_valid(url, response, args):
tags.append("\n\t%s" % m["extract"].replace('\n', ' '))
tags.append("\n\t%s%s" %
(" " * m["hiliteStart"], "^" * m["hiliteLength"]))
warnings += 1
# disregard some HTML issues that are (usually) due to invalid
# database content
if not re.search('Forbidden code point|Bad value|seamless|The first child', m["message"]):
warnings += 1
else:
try:
parser.parse(content)
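The message filter above keeps known, content-induced validator.nu complaints from inflating the warning count. A minimal standalone sketch of the same idea (the pattern is from the patch; the sample messages are invented):

    import re

    # Issues that are (usually) caused by invalid database content rather
    # than by the templates themselves.
    IGNORED = 'Forbidden code point|Bad value|seamless|The first child'

    messages = [
        "Forbidden code point U+0008.",            # ignored
        "Element p not allowed as child of span.", # counted
    ]
    warnings = sum(1 for m in messages if not re.search(IGNORED, m))
    print(warnings)   # -> 1
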
@@ -140,6 +158,8 @@ def check_html_valid(url, response, args):
def log(s):
print(s)
if logfile:
if not type(s) is str:
s = s.encode('utf-8')
logfile.write(s)
logfile.write('\n')
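The encode step added to log() matters under Python 2, where writing a unicode object containing non-ASCII characters to a file opened in text mode raises UnicodeEncodeError. A minimal sketch (file name invented):

    logfile = open("crawl.log", "w")
    s = u"r\xe9ponse lente"          # unicode under Python 2
    if not type(s) is str:           # same check as in the patch
        s = s.encode('utf-8')
    logfile.write(s)
    logfile.write('\n')
    logfile.close()
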
@@ -157,6 +177,8 @@ def get_referrers(url):
slow_threshold = args.slow_threshold
visited = set()
urls = {} # url -> referrer
referrers = {}
@@ -196,20 +218,29 @@ logfile = None
if args.logfile:
logfile = open(args.logfile, "w")
validated_urls = {}
# --- Main ---
if __name__ == "__main__":
if (args.user):
# log in as user, to have the respective HTML generated by the templates
response = client.post('/accounts/login/',
{'username': args.user, 'password': 'password'},
secure=True, follow=True)
if (response.status_code != 200):
log("Could not log in as %s, HTML response %d" %
(args.user, response.status_code))
sys.exit(1)
while urls:
url, referrer = urls.popitem()
# popitem() is documented to be random, but really isn't
url = random.choice(urls.keys())
referrer = urls.pop(url)
visited.add(url)
try:
timestamp = datetime.datetime.now()
r = client.get(url)
r = client.get(url, secure=True, follow=True)
elapsed = datetime.datetime.now() - timestamp
except KeyboardInterrupt:
log(" ... was fetching %s" % url)
@@ -278,6 +309,8 @@ if __name__ == "__main__":
log("\nElapsed Visited Queue Code Time Url ... Notes")
log("%2d:%02d:%02d %7d %6d %s %6.3fs %s %s" % (hrs,min,sec, len(visited), len(urls), r.status_code, elapsed.total_seconds(), url, " ".join(tags)))
if ((errors or warnings) and args.pedantic):
sys.exit(1)
if logfile:
logfile.close()