diff --git a/bin/test-crawl b/bin/test-crawl index 6d94a24fc..36bf327c2 100755 --- a/bin/test-crawl +++ b/bin/test-crawl @@ -1,6 +1,6 @@ #!/usr/bin/env python -import os, sys, re, datetime, argparse, traceback, tempfile, json +import os, sys, re, datetime, argparse, traceback, tempfile, json, subprocess import html5lib import debug # pyflakes:ignore @@ -22,6 +22,8 @@ parser.add_argument('--slow', dest='slow_threshold', type=float, default=1.0, help='responses taking longer than this (in seconds) results in SLOW being printed') parser.add_argument('--settings', dest='settings', help='custom settings file') parser.add_argument('--logfile', dest='logfile', help='write to logfile') +parser.add_argument('--vnu', action='store_true', + help='Use validator.nu instead of html5lib for HTML validation') args = parser.parse_args() @@ -92,7 +94,7 @@ def extract_tastypie_urls(content): uri = object_list[i]["resource_uri"] yield uri -def check_html_valid(url, response): +def check_html_valid(url, response, vnu): global parser, validated_urls, doc_types, warnings # derive a key for urls like this by replacing primary keys key = url @@ -110,15 +112,30 @@ def check_html_valid(url, response): content = response.content else: content = response.streaming_content - try: - validated_urls[key] = True - parser.parse(content) - except Exception: - e = SyntaxWarning("ParseError") - for err in parser.errors: - pos, code, data = err - tags.append(u"WARN invalid html: Position %s: %s" % (pos, code)) - warnings += 1 + validated_urls[key] = True + if vnu: + v = subprocess.Popen(["java", "-jar", basedir + "/bin/vnu.jar", + "--format", "json", "-"], + stdin=subprocess.PIPE, stderr=subprocess.PIPE) + for m in json.loads(v.communicate(content)[1])["messages"]: + t = m["subType"] if m["type"] == "info" else m["type"] + tags.append("\n%s\tLine %d: %s" % + (t.upper(), m["lastLine"], m["message"])) + tags.append("\n\t%s" % m["extract"].replace('\n', ' ')) + tags.append("\n\t%s%s" % + (" " * m["hiliteStart"], "^" * m["hiliteLength"])) + warnings += 1 + else: + try: + parser.parse(content) + except Exception: + e = SyntaxWarning("ParseError") + for err in parser.errors: + pos, code, data = err + tags.append(u"WARN invalid html: Position %s: %s" % + (pos, code)) + warnings += 1 + def log(s): print(s) @@ -139,6 +156,7 @@ def get_referrers(url): # --- GLobals --- slow_threshold = args.slow_threshold +vnu = args.vnu visited = set() urls = {} # url -> referrer @@ -224,7 +242,7 @@ if __name__ == "__main__": urls[u] = url referrers[u] = url - check_html_valid(url, r) + check_html_valid(url, r, vnu) except: log("error extracting HTML urls from %s" % url) diff --git a/bin/vnu.jar b/bin/vnu.jar new file mode 100644 index 000000000..298e3a68a Binary files /dev/null and b/bin/vnu.jar differ