Put a proper command-line interface on the test crawler and have it report the number of errors found (if any). Also add a wrapper script for running the test crawler and the idindex generation scripts; other safe, non-mutating scripts that operate on real data can be added to it in the future.
- Legacy-Id: 7107
parent be8eb96bec
commit 591e90a7a0
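With this change the wrapper and the crawler can be invoked directly. A typical invocation, assuming the working directory is the one above ietf/ and with the extra start URLs purely illustrative, might look like:

    ietf/bin/run-real-data-tests
    ietf/bin/test-crawl --urls ietf/utils/crawlurls.txt
    ietf/bin/test-crawl /doc/all/ /iesg/decisions/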
38  ietf/bin/run-real-data-tests  Executable file
@@ -0,0 +1,38 @@
#!/usr/bin/env python
#
# Run some non-modifying tests on top of the real database, to
# exercise the code with real data.
#

import os, subprocess, datetime

base_dir = os.path.relpath(os.path.join(os.path.dirname(os.path.abspath(__file__)), ".."))

path = os.path.abspath(os.path.join(base_dir, ".."))
if os.environ.get("PYTHONPATH"):
    path += ":" + os.environ.get("PYTHONPATH")
os.environ["PYTHONPATH"] = path



def run_script(script, *args):
    script_base = os.path.splitext(os.path.basename(script))[0]
    script_path = os.path.join(base_dir, script)
    output_path = os.path.join(base_dir, script_base)
    arg_str = " " + " ".join(args) if args else ""
    cmd_line = "%s%s > %s.output" % (script_path, arg_str, output_path)
    print "Running %s" % cmd_line
    before = datetime.datetime.now()
    returncode = subprocess.call(cmd_line, shell=True)
    print " (took %.3f seconds)" % (datetime.datetime.now() - before).total_seconds()
    return returncode

# idindex
run_script("idindex/generate_id_abstracts_txt.py")
run_script("idindex/generate_id_index_txt.py")
run_script("idindex/generate_all_id_txt.py")
run_script("idindex/generate_all_id2_txt.py")

# test crawler
crawl_input = os.path.join(base_dir, "utils/crawlurls.txt")
run_script("bin/test-crawl", "--urls %s" % crawl_input)
ietf/bin/test-crawl
@@ -1,7 +1,6 @@
#!/usr/bin/env python

import os, sys, re, datetime, optparse, traceback
import syslog
import os, sys, re, datetime, argparse, traceback

# boilerplate
basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
@@ -19,9 +18,35 @@ class DontSaveQueries(object):
connection.queries = DontSaveQueries()

MAX_URL_LENGTH = 500
SLOW_THRESHOLD = 1.0

initial = ["/doc/all/", "/doc/in-last-call/", "/iesg/decisions/"]
# args
parser = argparse.ArgumentParser(
    description="""Perform a test crawl of the project. For each found URL, the HTTP
    response status is printed. If it's not OK/redirect, FAIL is
    printed - in case of errors, a stacktrace is also included.""")
parser.add_argument('urls', metavar='URL', nargs='*',
                    help='One or more URLs to start the crawl from')
parser.add_argument('--urls', '-u', dest='url_file',
                    help='file with URLs to start the crawl from')
parser.add_argument('--slow', dest='slow_threshold', type=float, default=1.0,
                    help='responses taking longer than this (in seconds) results in SLOW being printed')

args = parser.parse_args()

slow_threshold = args.slow_threshold

initial_urls = []
initial_urls.extend(args.urls)

if args.url_file:
    with open(args.url_file) as f:
        for line in f:
            line = line.partition("#")[0].strip()
            if line:
                initial_urls.append(line)

if not initial_urls:
    initial_urls.append("/")

visited = set()
urls = {} # url -> referrer
@@ -33,8 +58,11 @@ def strip_url(url):
    return url

def extract_html_urls(content):
    for m in re.finditer(r'<(?:a|link) [^>]*href="([^"]+)"', content):
        url = strip_url(m.group(1))
    for m in re.finditer(r'(<(?:a|link) [^>]*href=[\'"]([^"]+)[\'"][^>]*>)', content):
        if re.search(r'rel=["\']?nofollow["\']', m.group(1)):
            continue

        url = strip_url(m.group(2))
        if len(url) > MAX_URL_LENGTH:
            continue # avoid infinite GET parameter appendages

@@ -45,9 +73,11 @@ def extract_html_urls(content):

client = django.test.Client()

for url in initial:
for url in initial_urls:
    urls[url] = "[initial]"

errors = 0

while urls:
    url, referrer = urls.popitem()

@@ -65,6 +95,7 @@ while urls:
        print "============="
        print traceback.format_exc()
        print "============="
        errors += 1
    else:
        tags = []

@@ -90,9 +121,13 @@ while urls:
                    print "============="
        else:
            tags.append(u"FAIL (from %s)" % referrer)
            errors += 1

        if elapsed.total_seconds() > SLOW_THRESHOLD:
        if elapsed.total_seconds() > slow_threshold:
            tags.append("SLOW")

        print r.status_code, "%.3fs" % elapsed.total_seconds(), url, " ".join(tags)

if errors > 0:
    sys.stderr.write("Found %s errors, grep output for FAIL for details\n" % errors)
    sys.exit(1)
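With these changes each crawled URL yields one output line of the form "status elapsed url tags", and a non-zero error count is reported on stderr before the crawler exits with status 1. Made-up sample output (the broken URL is purely hypothetical) might look like:

    200 0.412s /doc/all/
    500 0.087s /doc/example-broken/ FAIL (from /doc/all/)
    200 2.731s /sitemap.xml SLOW
    Found 1 errors, grep output for FAIL for details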
1  ietf/idindex/generate_all_id2_txt.py  Normal file → Executable file
@@ -1,3 +1,4 @@
#!/usr/bin/env python
# Portions Copyright (C) 2009-2010 Nokia Corporation and/or its subsidiary(-ies).
# All rights reserved. Contact: Pasi Eronen <pasi.eronen@nokia.com>
#
1  ietf/idindex/generate_all_id_txt.py  Normal file → Executable file
@@ -1,3 +1,4 @@
#!/usr/bin/env python
# Portions Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
# All rights reserved. Contact: Pasi Eronen <pasi.eronen@nokia.com>
#
1  ietf/idindex/generate_id_abstracts_txt.py  Normal file → Executable file
@@ -1,3 +1,4 @@
#!/usr/bin/env python
# Portions Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
# All rights reserved. Contact: Pasi Eronen <pasi.eronen@nokia.com>
#
1  ietf/idindex/generate_id_index_txt.py  Normal file → Executable file
@@ -1,3 +1,4 @@
#!/usr/bin/env python
# Portions Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
# All rights reserved. Contact: Pasi Eronen <pasi.eronen@nokia.com>
#
19  ietf/utils/crawlurls.txt  Normal file
@@ -0,0 +1,19 @@
# List of starting points for the test crawler (see
# bin/run-real-data-tests). Add URLs that have no link from inside the
# project.

/
/doc/all/
/doc/iesg/last-call/
/doc/in-last-call/
/iesg/decisions/
/iesg/agenda/documents.txt
/iesg/agenda/agenda.json
/iesg/agenda/scribe_template.html
/wg/1wg-summary.txt
/wg/1wg-summary-by-acronym.txt
/wg/1wg-charters.txt
/wg/1wg-charters-by-acronym.txt
/sitemap.xml
/sitemap-ipr.xml
/sitemap-liaison.xml