Add HTML5 validation based on validator.nu to test-crawl. Commit ready for merge.
- Legacy-Id: 9726
This commit is contained in:
parent
cfe7442449
commit
5826bcbf80
|
@ -1,6 +1,6 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
import os, sys, re, datetime, argparse, traceback, tempfile, json
|
import os, sys, re, datetime, argparse, traceback, tempfile, json, subprocess
|
||||||
import html5lib
|
import html5lib
|
||||||
import debug # pyflakes:ignore
|
import debug # pyflakes:ignore
|
||||||
|
|
||||||
|
@ -22,6 +22,8 @@ parser.add_argument('--slow', dest='slow_threshold', type=float, default=1.0,
|
||||||
help='responses taking longer than this (in seconds) results in SLOW being printed')
|
help='responses taking longer than this (in seconds) results in SLOW being printed')
|
||||||
parser.add_argument('--settings', dest='settings', help='custom settings file')
|
parser.add_argument('--settings', dest='settings', help='custom settings file')
|
||||||
parser.add_argument('--logfile', dest='logfile', help='write to logfile')
|
parser.add_argument('--logfile', dest='logfile', help='write to logfile')
|
||||||
|
parser.add_argument('--vnu', action='store_true',
|
||||||
|
help='Use validator.nu instead of html5lib for HTML validation')
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
@ -92,7 +94,7 @@ def extract_tastypie_urls(content):
|
||||||
uri = object_list[i]["resource_uri"]
|
uri = object_list[i]["resource_uri"]
|
||||||
yield uri
|
yield uri
|
||||||
|
|
||||||
def check_html_valid(url, response):
|
def check_html_valid(url, response, vnu):
|
||||||
global parser, validated_urls, doc_types, warnings
|
global parser, validated_urls, doc_types, warnings
|
||||||
# derive a key for urls like this by replacing primary keys
|
# derive a key for urls like this by replacing primary keys
|
||||||
key = url
|
key = url
|
||||||
|
@ -110,15 +112,30 @@ def check_html_valid(url, response):
|
||||||
content = response.content
|
content = response.content
|
||||||
else:
|
else:
|
||||||
content = response.streaming_content
|
content = response.streaming_content
|
||||||
try:
|
validated_urls[key] = True
|
||||||
validated_urls[key] = True
|
if vnu:
|
||||||
parser.parse(content)
|
v = subprocess.Popen(["java", "-jar", basedir + "/bin/vnu.jar",
|
||||||
except Exception:
|
"--format", "json", "-"],
|
||||||
e = SyntaxWarning("ParseError")
|
stdin=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||||
for err in parser.errors:
|
for m in json.loads(v.communicate(content)[1])["messages"]:
|
||||||
pos, code, data = err
|
t = m["subType"] if m["type"] == "info" else m["type"]
|
||||||
tags.append(u"WARN invalid html: Position %s: %s" % (pos, code))
|
tags.append("\n%s\tLine %d: %s" %
|
||||||
warnings += 1
|
(t.upper(), m["lastLine"], m["message"]))
|
||||||
|
tags.append("\n\t%s" % m["extract"].replace('\n', ' '))
|
||||||
|
tags.append("\n\t%s%s" %
|
||||||
|
(" " * m["hiliteStart"], "^" * m["hiliteLength"]))
|
||||||
|
warnings += 1
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
parser.parse(content)
|
||||||
|
except Exception:
|
||||||
|
e = SyntaxWarning("ParseError")
|
||||||
|
for err in parser.errors:
|
||||||
|
pos, code, data = err
|
||||||
|
tags.append(u"WARN invalid html: Position %s: %s" %
|
||||||
|
(pos, code))
|
||||||
|
warnings += 1
|
||||||
|
|
||||||
|
|
||||||
def log(s):
|
def log(s):
|
||||||
print(s)
|
print(s)
|
||||||
|
@ -139,6 +156,7 @@ def get_referrers(url):
|
||||||
# --- Globals ---
|
# --- Globals ---
|
||||||
|
|
||||||
slow_threshold = args.slow_threshold
|
slow_threshold = args.slow_threshold
|
||||||
|
vnu = args.vnu
|
||||||
|
|
||||||
visited = set()
|
visited = set()
|
||||||
urls = {} # url -> referrer
|
urls = {} # url -> referrer
|
||||||
|
@ -224,7 +242,7 @@ if __name__ == "__main__":
|
||||||
urls[u] = url
|
urls[u] = url
|
||||||
referrers[u] = url
|
referrers[u] = url
|
||||||
|
|
||||||
check_html_valid(url, r)
|
check_html_valid(url, r, vnu)
|
||||||
|
|
||||||
except:
|
except:
|
||||||
log("error extracting HTML urls from %s" % url)
|
log("error extracting HTML urls from %s" % url)
|
||||||
|
|
BIN
bin/vnu.jar
Normal file
BIN
bin/vnu.jar
Normal file
Binary file not shown.
Loading…
Reference in a new issue