Add HTML5 validation based on validator.nu to test-crawl. Commit ready for merge.

- Legacy-Id: 9726
This commit is contained in:
Lars Eggert 2015-07-15 12:41:09 +00:00
parent cfe7442449
commit 5826bcbf80
2 changed files with 30 additions and 12 deletions

View file

@ -1,6 +1,6 @@
#!/usr/bin/env python
import os, sys, re, datetime, argparse, traceback, tempfile, json
import os, sys, re, datetime, argparse, traceback, tempfile, json, subprocess
import html5lib
import debug # pyflakes:ignore
@ -22,6 +22,8 @@ parser.add_argument('--slow', dest='slow_threshold', type=float, default=1.0,
help='responses taking longer than this (in seconds) results in SLOW being printed')
parser.add_argument('--settings', dest='settings', help='custom settings file')
parser.add_argument('--logfile', dest='logfile', help='write to logfile')
parser.add_argument('--vnu', action='store_true',
help='Use validator.nu instead of html5lib for HTML validation')
args = parser.parse_args()
@ -92,7 +94,7 @@ def extract_tastypie_urls(content):
uri = object_list[i]["resource_uri"]
yield uri
def check_html_valid(url, response):
def check_html_valid(url, response, vnu):
global parser, validated_urls, doc_types, warnings
# derive a key for urls like this by replacing primary keys
key = url
@ -110,15 +112,30 @@ def check_html_valid(url, response):
content = response.content
else:
content = response.streaming_content
try:
validated_urls[key] = True
parser.parse(content)
except Exception:
e = SyntaxWarning("ParseError")
for err in parser.errors:
pos, code, data = err
tags.append(u"WARN invalid html: Position %s: %s" % (pos, code))
warnings += 1
validated_urls[key] = True
if vnu:
v = subprocess.Popen(["java", "-jar", basedir + "/bin/vnu.jar",
"--format", "json", "-"],
stdin=subprocess.PIPE, stderr=subprocess.PIPE)
for m in json.loads(v.communicate(content)[1])["messages"]:
t = m["subType"] if m["type"] == "info" else m["type"]
tags.append("\n%s\tLine %d: %s" %
(t.upper(), m["lastLine"], m["message"]))
tags.append("\n\t%s" % m["extract"].replace('\n', ' '))
tags.append("\n\t%s%s" %
(" " * m["hiliteStart"], "^" * m["hiliteLength"]))
warnings += 1
else:
try:
parser.parse(content)
except Exception:
e = SyntaxWarning("ParseError")
for err in parser.errors:
pos, code, data = err
tags.append(u"WARN invalid html: Position %s: %s" %
(pos, code))
warnings += 1
def log(s):
print(s)
@ -139,6 +156,7 @@ def get_referrers(url):
# --- Globals ---
slow_threshold = args.slow_threshold
vnu = args.vnu
visited = set()
urls = {} # url -> referrer
@ -224,7 +242,7 @@ if __name__ == "__main__":
urls[u] = url
referrers[u] = url
check_html_valid(url, r)
check_html_valid(url, r, vnu)
except:
log("error extracting HTML urls from %s" % url)

BIN
bin/vnu.jar Normal file

Binary file not shown.