Add HTML5 validation based on validator.nu to test-crawl. Commit ready for merge.

- Legacy-Id: 9726
Lars Eggert 2015-07-15 12:41:09 +00:00
parent cfe7442449
commit 5826bcbf80
2 changed files with 30 additions and 12 deletions
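
With --vnu given, test-crawl validates pages by piping them to the bundled
vnu.jar (the validator.nu engine) via java instead of parsing them with
html5lib, so a Java runtime needs to be on PATH. A typical invocation might
look like the following; the script path and settings value are illustrative,
not part of this commit:

    bin/test-crawl --vnu --settings=settings_local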

test-crawl

@@ -1,6 +1,6 @@
 #!/usr/bin/env python

-import os, sys, re, datetime, argparse, traceback, tempfile, json
+import os, sys, re, datetime, argparse, traceback, tempfile, json, subprocess
 import html5lib
 import debug                            # pyflakes:ignore
@@ -22,6 +22,8 @@ parser.add_argument('--slow', dest='slow_threshold', type=float, default=1.0,
     help='responses taking longer than this (in seconds) results in SLOW being printed')
 parser.add_argument('--settings', dest='settings', help='custom settings file')
 parser.add_argument('--logfile', dest='logfile', help='write to logfile')
+parser.add_argument('--vnu', action='store_true',
+    help='Use validator.nu instead of html5lib for HTML validation')

 args = parser.parse_args()
@@ -92,7 +94,7 @@ def extract_tastypie_urls(content):
             uri = object_list[i]["resource_uri"]
             yield uri

-def check_html_valid(url, response):
+def check_html_valid(url, response, vnu):
     global parser, validated_urls, doc_types, warnings
     # derive a key for urls like this by replacing primary keys
     key = url
@@ -110,15 +112,30 @@ def check_html_valid(url, response):
             content = response.content
         else:
             content = response.streaming_content
-        try:
-            validated_urls[key] = True
-            parser.parse(content)
-        except Exception:
-            e = SyntaxWarning("ParseError")
-            for err in parser.errors:
-                pos, code, data = err
-                tags.append(u"WARN invalid html: Position %s: %s" % (pos, code))
-            warnings += 1
+        validated_urls[key] = True
+        if vnu:
+            v = subprocess.Popen(["java", "-jar", basedir + "/bin/vnu.jar",
+                                  "--format", "json", "-"],
+                                 stdin=subprocess.PIPE, stderr=subprocess.PIPE)
+            for m in json.loads(v.communicate(content)[1])["messages"]:
+                t = m["subType"] if m["type"] == "info" else m["type"]
+                tags.append("\n%s\tLine %d: %s" %
+                            (t.upper(), m["lastLine"], m["message"]))
+                tags.append("\n\t%s" % m["extract"].replace('\n', ' '))
+                tags.append("\n\t%s%s" %
+                            (" " * m["hiliteStart"], "^" * m["hiliteLength"]))
+                warnings += 1
+        else:
+            try:
+                parser.parse(content)
+            except Exception:
+                e = SyntaxWarning("ParseError")
+                for err in parser.errors:
+                    pos, code, data = err
+                    tags.append(u"WARN invalid html: Position %s: %s" %
+                                (pos, code))
+                warnings += 1

 def log(s):
     print(s)
@@ -139,6 +156,7 @@ def get_referrers(url):
 # --- GLobals ---

 slow_threshold = args.slow_threshold
+vnu = args.vnu

 visited = set()
 urls = {}   # url -> referrer
@@ -224,7 +242,7 @@ if __name__ == "__main__":
                     urls[u] = url
                     referrers[u] = url

-                check_html_valid(url, r)
+                check_html_valid(url, r, vnu)
             except:
                 log("error extracting HTML urls from %s" % url)

bin/vnu.jar (new binary file)

Binary file not shown.
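
For context, here is a minimal standalone sketch of the validator.nu pattern
that the new branch of check_html_valid relies on. It is an illustration, not
part of the commit: the VNU_JAR path and the sample document are hypothetical.
It mirrors the diff's assumptions that "-" makes vnu.jar read the document
from stdin and that "--format json" sends the report to stderr (hence the
use of communicate(content)[1] above).

    #!/usr/bin/env python
    # Standalone sketch (not from this commit): run validator.nu's vnu.jar
    # over an HTML document and print each reported message.
    import json
    import subprocess

    VNU_JAR = "bin/vnu.jar"   # hypothetical path; adjust to your checkout

    def vnu_messages(html_bytes):
        # "-" = read the document from stdin; with --format json the report
        # is written to stderr, so index 1 of communicate() holds the JSON.
        v = subprocess.Popen(["java", "-jar", VNU_JAR, "--format", "json", "-"],
                             stdin=subprocess.PIPE, stderr=subprocess.PIPE)
        stderr = v.communicate(html_bytes)[1]
        return json.loads(stderr)["messages"]

    if __name__ == "__main__":
        for m in vnu_messages(b"<!DOCTYPE html><title>t</title><p>unclosed"):
            # As in check_html_valid: "info" messages carry their level in
            # subType (falling back to the type if subType is absent).
            if m["type"] == "info":
                t = m.get("subType", "info")
            else:
                t = m["type"]
            print("%s line %s: %s" % (t.upper(), m.get("lastLine"), m["message"]))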