commit 8612ce92c0
parent 23bcde63dc

Merged in [9765] from lars@netapp.com:

Add option to crawl as a logged-in user (--user). Add --pedantic option for
the vnu crawl, which stops the crawl on (most) errors. Randomize the order in
which URLs are crawled, so that repeated crawls don't hit the same URLs in
the same order.

Legacy-Id: 9785
Note: SVN reference [9765] has been migrated to Git commit 9b4e61049a704127e1200549fcc410326efffddb
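A minimal standalone sketch of the randomized crawl order described above
(the example URLs are invented; the list() call is only needed under
Python 3, while the script itself appears to target Python 2, where
dict.keys() already returns a list):

    import random

    urls = {"/doc/": None, "/ipr/2042/": "/doc/", "/wg/httpbis/": "/doc/"}  # url -> referrer

    while urls:
        # dict.popitem() is deterministic in practice, so pick a random key instead
        url = random.choice(list(urls.keys()))
        referrer = urls.pop(url)
        print("fetching %s (referred from %s)" % (url, referrer))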
@@ -3,6 +3,7 @@
 import os, sys, re, datetime, argparse, traceback, tempfile, json, subprocess
 import html5lib
 import debug # pyflakes:ignore
+import random
 
 # Set up import path to find our own Django
 basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../"))
@@ -17,13 +18,19 @@ parser = argparse.ArgumentParser(
 parser.add_argument('urls', metavar='URL', nargs='*',
                     help='One or more URLs to start the crawl from')
 parser.add_argument('--urls', '-u', dest='url_file',
-                    help='file with URLs to start the crawl from')
+                    help='File with URLs to start the crawl from')
 parser.add_argument('--slow', dest='slow_threshold', type=float, default=1.0,
-                    help='responses taking longer than this (in seconds) results in SLOW being printed')
-parser.add_argument('--settings', dest='settings', help='custom settings file')
-parser.add_argument('--logfile', dest='logfile', help='write to logfile')
+                    help='Responses taking longer than this (in seconds) results in SLOW being printed')
+parser.add_argument('--settings', help='Custom settings file')
+parser.add_argument('--logfile', help='Write to logfile')
+parser.add_argument('--user', help='Crawl logged in as this user', default=None)
 parser.add_argument('--validator-nu', dest='validator_nu', action='store_true',
                     help='Use validator.nu instead of html5lib for HTML validation')
+parser.add_argument('--pedantic', action='store_true',
+                    help='Stop the crawl on the first HTML validation issue')
 parser.add_argument('--validate-all', dest='validate_all', action='store_true', default=False,
                     help='Run html 5 validation on all pages, without skipping similar urls. '
                          '(The default is to only run validation on one of /foo/1/, /foo/2/, /foo/3/, etc.)')
 
 args = parser.parse_args()
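To illustrate how the two new options behave once parsed, a standalone
argparse sketch (the username is invented, and argv is passed explicitly
for demonstration):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--user', help='Crawl logged in as this user', default=None)
    parser.add_argument('--pedantic', action='store_true',
                        help='Stop the crawl on the first HTML validation issue')

    args = parser.parse_args(['--user', 'plain', '--pedantic'])
    print("user=%s pedantic=%s" % (args.user, args.pedantic))  # -> user=plain pedantic=True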
@@ -96,17 +103,25 @@ def extract_tastypie_urls(content):
 def check_html_valid(url, response, args):
     global parser, validated_urls, doc_types, warnings
-    # derive a key for urls like this by replacing primary keys
+    # These URLs have known issues, skip them until those are fixed
+    if re.search('(/secr|admin/)|/doc/.*/edit/info/', url):
+        log("%s blacklisted; skipping HTML validation" % url)
+        return
+
     key = url
-    key = re.sub("/[0-9.]+/", "/nnnn/", key)
-    key = re.sub("/.+@.+/", "/x@x.org/", key)
-    key = re.sub("#.*$", "", key)
-    key = re.sub("\?.*$", "", key)
-    key = re.sub("/rfc[0-9]+/", "/rfcnnnn/", key)
-    key = re.sub("/wg/[a-z0-9-]+/", "/wg/foo/", key)
-    key = re.sub("/rg/[a-z0-9-]+/", "/rg/foo/", key)
-    for slug in doc_types:
-        key = re.sub("/%s-.*/"%slug, "/%s-nnnn/"%slug, key)
+    if not args.validate_all:
+        # derive a key for urls like this by replacing primary keys
+        key = re.sub("/[0-9.]+/", "/nnnn/", key)
+        key = re.sub("/.+@.+/", "/x@x.org/", key)
+        key = re.sub("#.*$", "", key)
+        key = re.sub("\?.*$", "", key)
+        key = re.sub("/rfc[0-9]+/", "/rfcnnnn/", key)
+        key = re.sub("/wg/[a-z0-9-]+/", "/wg/foo/", key)
+        key = re.sub("/rg/[a-z0-9-]+/", "/rg/foo/", key)
+        key = re.sub("/ipr/[0-9]+/", "/ipr/nnnn/", key)
+        key = re.sub("/draft-[a-z0-9-]+/", "/draft-foo/", key)
+        for slug in doc_types:
+            key = re.sub("/%s-.*/"%slug, "/%s-nnnn/"%slug, key)
 
     if not key in validated_urls:
         if hasattr(response, "content"):
             content = response.content
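A worked example of the key derivation above, with invented URLs and only a
few of the substitutions repeated, in the script's order. The point is that
similar URLs collapse to one key, so only one representative of each family
gets HTML-validated:

    import re

    def derive_key(url):
        key = url
        key = re.sub(r"/[0-9.]+/", "/nnnn/", key)
        key = re.sub(r"/rfc[0-9]+/", "/rfcnnnn/", key)
        key = re.sub(r"/wg/[a-z0-9-]+/", "/wg/foo/", key)
        key = re.sub(r"/draft-[a-z0-9-]+/", "/draft-foo/", key)
        return key

    print(derive_key("/doc/draft-ietf-foo-bar/03/"))  # -> /doc/draft-foo/nnnn/
    print(derive_key("/wg/httpbis/documents/"))       # -> /wg/foo/documents/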
@@ -124,7 +139,10 @@ def check_html_valid(url, response, args):
                     tags.append("\n\t%s" % m["extract"].replace('\n', ' '))
                     tags.append("\n\t%s%s" %
                         (" " * m["hiliteStart"], "^" * m["hiliteLength"]))
-                    warnings += 1
+                    # disregard some HTML issues that are (usually) due to invalid
+                    # database content
+                    if not re.search('Forbidden code point|Bad value|seamless|The first child', m["message"]):
+                        warnings += 1
         else:
             try:
                 parser.parse(content)
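The new disregard-list is a plain regex alternation over the validator's
message text; a standalone illustration with invented messages:

    import re

    IGNORE = 'Forbidden code point|Bad value|seamless|The first child'

    for message in ["Bad value X for attribute Y on element Z",
                    "Element style not allowed as child of element div"]:
        if not re.search(IGNORE, message):
            print("counted as warning: %s" % message)  # only the second one counts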
@@ -140,6 +158,8 @@ def check_html_valid(url, response, args):
 def log(s):
     print(s)
     if logfile:
+        if not type(s) is str:
+            s = s.encode('utf-8')
         logfile.write(s)
         logfile.write('\n')
 
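The added guard is Python 2 idiom: a unicode object is encoded to UTF-8
bytes before the write, while a byte string (type str on Python 2) passes
through untouched. As a standalone sketch (function name invented, Python 2
semantics assumed):

    def encode_for_log(s):
        if not type(s) is str:       # i.e. s is unicode, not bytes
            s = s.encode('utf-8')
        return s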
@@ -157,6 +177,8 @@ def get_referrers(url):
 
 slow_threshold = args.slow_threshold
 
 visited = set()
 urls = {} # url -> referrer
 referrers = {}
@@ -196,20 +218,29 @@ logfile = None
 if args.logfile:
     logfile = open(args.logfile, "w")
 
 validated_urls = {}
 
 # --- Main ---
 
 if __name__ == "__main__":
+    if (args.user):
+        # log in as user, to have the respective HTML generated by the templates
+        response = client.post('/accounts/login/',
+                               {'username': args.user, 'password': 'password'},
+                               secure=True, follow=True)
+        if (response.status_code != 200):
+            log("Could not log in as %s, HTML response %d" %
+                (args.user, response.status_code))
+            sys.exit(1)
+
     while urls:
-        url, referrer = urls.popitem()
+        # popitem() is documented to be random, but really isn't
+        url = random.choice(urls.keys())
+        referrer = urls.pop(url)
 
         visited.add(url)
 
         try:
             timestamp = datetime.datetime.now()
-            r = client.get(url)
+            r = client.get(url, secure=True, follow=True)
             elapsed = datetime.datetime.now() - timestamp
         except KeyboardInterrupt:
             log(" ... was fetching %s" % url)
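For reference, a standalone sketch of the login step added above. It assumes
a configured Django settings module and a test account whose password is
literally 'password', as the hardcoded credential in the diff implies; the
function name and username handling are illustrative:

    import sys
    from django.test import Client

    client = Client()

    def login_as(username):
        # POST the login form over HTTPS and follow redirects; anything but a
        # final 200 means the login failed and crawling as that user is pointless.
        r = client.post('/accounts/login/',
                        {'username': username, 'password': 'password'},
                        secure=True, follow=True)
        if r.status_code != 200:
            print("Could not log in as %s, HTTP response %d" % (username, r.status_code))
            sys.exit(1)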
@@ -278,6 +309,8 @@ if __name__ == "__main__":
             log("\nElapsed Visited Queue Code Time Url ... Notes")
 
         log("%2d:%02d:%02d %7d %6d %s %6.3fs %s %s" % (hrs,min,sec, len(visited), len(urls), r.status_code, elapsed.total_seconds(), url, " ".join(tags)))
+        if ((errors or warnings) and args.pedantic):
+            sys.exit(1)
 
     if logfile:
         logfile.close()