datatracker/ietf/bin/test-crawl
Ole Laursen · b552ff31aa · 2013-10-02 15:37:44 +00:00

Replace the announcement pages with a new page collecting all IESG
review decisions. Add migration to split up iesg_approve/disapprove
events of the form "IESG has approved ... and state has been changed
..." into the approve/disapprove event and a synthesized state change
event. Also regularize the descriptions a bit. This simplifies the
code in the new page.
 - Legacy-Id: 6340
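
The event-splitting part of that migration reduces to dividing each combined
description in two. A minimal sketch of that step, assuming descriptions of
the quoted form (the regex and function name here are illustrative, not taken
from the actual migration):

    import re

    # Illustrative only: split "IESG has approved ... and state has been
    # changed ..." into the approve/disapprove part and the state-change part.
    COMBINED = re.compile(r"(?P<action>IESG has .*?) and (?P<state>[Ss]tate has been changed.*)", re.DOTALL)

    def split_description(desc):
        m = COMBINED.match(desc)
        if not m:
            return None  # not a combined description; leave the event as-is
        return m.group("action").strip(), m.group("state").strip()

The migration would then rewrite the original iesg_approve/disapprove event
with the first part and synthesize a separate state change event from the
second.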


#!/usr/bin/env python

import os, sys, re, datetime, optparse, traceback
import syslog

# boilerplate
basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
sys.path = [ basedir ] + sys.path

from ietf import settings
from django.core import management
management.setup_environ(settings)

import django.test
from django.conf import settings

# prevent memory from leaking when settings.DEBUG=True
from django.db import connection
class DontSaveQueries(object):
    def append(self, x):
        pass

connection.queries = DontSaveQueries()
MAX_URL_LENGTH = 500
SLOW_THRESHOLD = 1.0
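# MAX_URL_LENGTH guards link extraction against runaway URLs; SLOW_THRESHOLD
# (seconds) marks responses as "SLOW" in the report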
initial = ["/doc/all/", "/doc/in-last-call/", "/iesg/decisions/"]
visited = set()
urls = {} # url -> referrer
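# the frontier: pending URLs mapped to the page that linked to them, so
# failures can name a referrer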
def strip_url(url):
    if url.startswith("http://testserver"):
        url = url[len("http://testserver"):]
    return url
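# crude link extraction: only double-quoted hrefs in <a> tags are considered,
# and only site-relative ones are followed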
def extract_html_urls(content):
    for m in re.finditer(r'<a.*href="([^"]+)">', content):
        url = strip_url(m.group(1))
        if len(url) > MAX_URL_LENGTH:
            continue # avoid infinite GET parameter appendages
        if not url.startswith("/"):
            continue

        yield url
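# the Django test client exercises the application in-process, so no web
# server needs to be running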
client = django.test.Client()
for url in initial:
    urls[url] = "[initial]"
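# main loop: fetch each pending URL, report status and timing, and queue any
# new links found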
while urls:
    url, referrer = urls.popitem()

    visited.add(url)

    try:
        timestamp = datetime.datetime.now()
        r = client.get(url)
        elapsed = datetime.datetime.now() - timestamp
    except KeyboardInterrupt:
        print "was fetching", url
        sys.exit(1)
    except:
        print 500, "%.3fs" % (datetime.datetime.now() - timestamp).total_seconds(), url, "FAIL (from %s)" % referrer
        print "============="
        print traceback.format_exc()
        print "============="
    else:
        tags = []

        if r.status_code in (301, 302):
            u = strip_url(r["Location"])
            if u not in visited and u not in urls:
                urls[u] = referrer # referrer is original referrer, not redirected url
        elif r.status_code == 200:
            ctype = r["Content-Type"]
            if ";" in ctype:
                ctype = ctype[:ctype.index(";")]

            if ctype == "text/html":
                try:
                    for u in extract_html_urls(r.content):
                        if u not in visited and u not in urls:
                            urls[u] = url
                except:
                    print "error extracting HTML urls from", url
                    print "============="
                    print traceback.format_exc()
                    print "============="
        else:
            tags.append(u"FAIL (from %s)" % referrer)

        if elapsed.total_seconds() > SLOW_THRESHOLD:
            tags.append("SLOW")

        print r.status_code, "%.3fs" % elapsed.total_seconds(), url, " ".join(tags)