From 591e90a7a0d4c45b16a98d4bdb951b1f85a5b4d1 Mon Sep 17 00:00:00 2001
From: Ole Laursen
Date: Sun, 12 Jan 2014 16:55:42 +0000
Subject: [PATCH] Put a proper command-line interface on the test crawler and
 report the number of errors found (if any). Also add a wrapper script for
 running the test crawler and the idindex generation scripts; other (safe,
 non-mutating) scripts operating on real data can be added to it in the
 future.
 - Legacy-Id: 7107
---
 ietf/bin/run-real-data-tests              | 38 +++++++++++++++++
 ietf/bin/test-crawl                       | 51 +++++++++++++++++++----
 ietf/idindex/generate_all_id2_txt.py      |  1 +
 ietf/idindex/generate_all_id_txt.py       |  1 +
 ietf/idindex/generate_id_abstracts_txt.py |  1 +
 ietf/idindex/generate_id_index_txt.py     |  1 +
 ietf/utils/crawlurls.txt                  | 19 +++++++++
 7 files changed, 104 insertions(+), 8 deletions(-)
 create mode 100755 ietf/bin/run-real-data-tests
 mode change 100644 => 100755 ietf/idindex/generate_all_id2_txt.py
 mode change 100644 => 100755 ietf/idindex/generate_all_id_txt.py
 mode change 100644 => 100755 ietf/idindex/generate_id_abstracts_txt.py
 mode change 100644 => 100755 ietf/idindex/generate_id_index_txt.py
 create mode 100644 ietf/utils/crawlurls.txt

diff --git a/ietf/bin/run-real-data-tests b/ietf/bin/run-real-data-tests
new file mode 100755
index 000000000..9a8b51ffd
--- /dev/null
+++ b/ietf/bin/run-real-data-tests
@@ -0,0 +1,38 @@
+#!/usr/bin/env python
+#
+# Run some non-modifying tests on top of the real database, to
+# exercise the code with real data.
+#
+
+import os, subprocess, datetime
+
+base_dir = os.path.relpath(os.path.join(os.path.dirname(os.path.abspath(__file__)), ".."))
+
+path = os.path.abspath(os.path.join(base_dir, ".."))
+if os.environ.get("PYTHONPATH"):
+    path += ":" + os.environ.get("PYTHONPATH")
+os.environ["PYTHONPATH"] = path
+
+
+
+def run_script(script, *args):
+    script_base = os.path.splitext(os.path.basename(script))[0]
+    script_path = os.path.join(base_dir, script)
+    output_path = os.path.join(base_dir, script_base)
+    arg_str = " " + " ".join(args) if args else ""
+    cmd_line = "%s%s > %s.output" % (script_path, arg_str, output_path)
+    print "Running %s" % cmd_line
+    before = datetime.datetime.now()
+    returncode = subprocess.call(cmd_line, shell=True)
+    print "  (took %.3f seconds)" % (datetime.datetime.now() - before).total_seconds()
+    return returncode
+
+# idindex
+run_script("idindex/generate_id_abstracts_txt.py")
+run_script("idindex/generate_id_index_txt.py")
+run_script("idindex/generate_all_id_txt.py")
+run_script("idindex/generate_all_id2_txt.py")
+
+# test crawler
+crawl_input = os.path.join(base_dir, "utils/crawlurls.txt")
+run_script("bin/test-crawl", "--urls %s" % crawl_input)
diff --git a/ietf/bin/test-crawl b/ietf/bin/test-crawl
index 655bbb362..f4500d6e4 100755
--- a/ietf/bin/test-crawl
+++ b/ietf/bin/test-crawl
@@ -1,7 +1,6 @@
 #!/usr/bin/env python
 
-import os, sys, re, datetime, optparse, traceback
-import syslog
+import os, sys, re, datetime, argparse, traceback
 
 # boilerplate
 basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
@@ -19,9 +18,35 @@ class DontSaveQueries(object):
 connection.queries = DontSaveQueries()
 
 MAX_URL_LENGTH = 500
-SLOW_THRESHOLD = 1.0
 
-initial = ["/doc/all/", "/doc/in-last-call/", "/iesg/decisions/"]
+# args
+parser = argparse.ArgumentParser(
+    description="""Perform a test crawl of the project. For each found URL, the HTTP
+    response status is printed. If it's not OK/redirect, FAIL is
+    printed - in case of errors, a stacktrace is also included.""")
+parser.add_argument('urls', metavar='URL', nargs='*',
+                    help='One or more URLs to start the crawl from')
+parser.add_argument('--urls', '-u', dest='url_file',
+                    help='file with URLs to start the crawl from')
+parser.add_argument('--slow', dest='slow_threshold', type=float, default=1.0,
+                    help='responses taking longer than this (in seconds) result in SLOW being printed')
+
+args = parser.parse_args()
+
+slow_threshold = args.slow_threshold
+
+initial_urls = []
+initial_urls.extend(args.urls)
+
+if args.url_file:
+    with open(args.url_file) as f:
+        for line in f:
+            line = line.partition("#")[0].strip()
+            if line:
+                initial_urls.append(line)
+
+if not initial_urls:
+    initial_urls.append("/")
 
 visited = set()
 urls = {} # url -> referrer
@@ -33,8 +58,11 @@ def strip_url(url):
     return url
 
 def extract_html_urls(content):
-    for m in re.finditer(r'<(?:a|link) [^>]*href="([^"]+)"', content):
-        url = strip_url(m.group(1))
+    for m in re.finditer(r'(<(?:a|link) [^>]*href=[\'"]([^"]+)[\'"][^>]*>)', content):
+        if re.search(r'rel=["\']?nofollow["\']', m.group(1)):
+            continue
+
+        url = strip_url(m.group(2))
         if len(url) > MAX_URL_LENGTH:
             continue # avoid infinite GET parameter appendages
 
@@ -45,9 +73,11 @@
 
 client = django.test.Client()
 
-for url in initial:
+for url in initial_urls:
     urls[url] = "[initial]"
 
+errors = 0
+
 while urls:
     url, referrer = urls.popitem()
 
@@ -65,6 +95,7 @@ while urls:
         print "============="
         print traceback.format_exc()
         print "============="
+        errors += 1
     else:
         tags = []
 
@@ -90,9 +121,13 @@ while urls:
             print "============="
         else:
             tags.append(u"FAIL (from %s)" % referrer)
+            errors += 1
 
-        if elapsed.total_seconds() > SLOW_THRESHOLD:
+        if elapsed.total_seconds() > slow_threshold:
             tags.append("SLOW")
 
     print r.status_code, "%.3fs" % elapsed.total_seconds(), url, " ".join(tags)
 
+if errors > 0:
+    sys.stderr.write("Found %s errors, grep output for FAIL for details\n" % errors)
+    sys.exit(1)
diff --git a/ietf/idindex/generate_all_id2_txt.py b/ietf/idindex/generate_all_id2_txt.py
old mode 100644
new mode 100755
index 5d63368b9..846a38348
--- a/ietf/idindex/generate_all_id2_txt.py
+++ b/ietf/idindex/generate_all_id2_txt.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 # Portions Copyright (C) 2009-2010 Nokia Corporation and/or its subsidiary(-ies).
 # All rights reserved. Contact: Pasi Eronen
 #
diff --git a/ietf/idindex/generate_all_id_txt.py b/ietf/idindex/generate_all_id_txt.py
old mode 100644
new mode 100755
index 1a4128f7a..e7ee5f6fd
--- a/ietf/idindex/generate_all_id_txt.py
+++ b/ietf/idindex/generate_all_id_txt.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 # Portions Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
 # All rights reserved. Contact: Pasi Eronen
 #
diff --git a/ietf/idindex/generate_id_abstracts_txt.py b/ietf/idindex/generate_id_abstracts_txt.py
old mode 100644
new mode 100755
index 91b5ca462..5136b884b
--- a/ietf/idindex/generate_id_abstracts_txt.py
+++ b/ietf/idindex/generate_id_abstracts_txt.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 # Portions Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
 # All rights reserved. Contact: Pasi Eronen
 #
diff --git a/ietf/idindex/generate_id_index_txt.py b/ietf/idindex/generate_id_index_txt.py
old mode 100644
new mode 100755
index a47903d7c..af9de9388 100755
--- a/ietf/idindex/generate_id_index_txt.py
+++ b/ietf/idindex/generate_id_index_txt.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 # Portions Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
 # All rights reserved. Contact: Pasi Eronen
 #
diff --git a/ietf/utils/crawlurls.txt b/ietf/utils/crawlurls.txt
new file mode 100644
index 000000000..e2f7bdf2d
--- /dev/null
+++ b/ietf/utils/crawlurls.txt
@@ -0,0 +1,19 @@
+# List of starting points for the test crawler (see
+# bin/run-real-data-tests). Add URLs that have no link from inside the
+# project.
+
+/
+/doc/all/
+/doc/iesg/last-call/
+/doc/in-last-call/
+/iesg/decisions/
+/iesg/agenda/documents.txt
+/iesg/agenda/agenda.json
+/iesg/agenda/scribe_template.html
+/wg/1wg-summary.txt
+/wg/1wg-summary-by-acronym.txt
+/wg/1wg-charters.txt
+/wg/1wg-charters-by-acronym.txt
+/sitemap.xml
+/sitemap-ipr.xml
+/sitemap-liaison.xml
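
Example invocations of the new interface (a usage sketch, not part of the patch; it assumes the commands are run from the top of a checkout and uses only the options added above):

    ietf/bin/test-crawl                                  # no arguments: crawl starts from /
    ietf/bin/test-crawl /doc/all/ --slow 2.0             # explicit start URL, 2-second SLOW threshold
    ietf/bin/test-crawl --urls ietf/utils/crawlurls.txt  # start URLs read from a file, as the wrapper does
    ietf/bin/run-real-data-tests                         # idindex scripts plus the crawl, output captured in ietf/*.output

With these changes test-crawl exits with status 1 and writes "Found N errors, grep output for FAIL for details" to stderr whenever any crawled URL failed or raised an exception.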