Fixed an issue with the test-crawler which could cause false positives for URLs containing an apostrophe.
- Legacy-Id: 12851
This commit is contained in:
parent
7b7a220df4
commit
c344a18bdf
|
@ -59,6 +59,7 @@ import debug # pyflakes:ignore
|
|||
# connection.queries = DontSaveQueries()
|
||||
|
||||
from ietf.name.models import DocTypeName
|
||||
from ietf.utils.html import unescape
|
||||
|
||||
# --- Constants ---
|
||||
|
||||
|
@ -83,7 +84,6 @@ def extract_html_urls(content):
|
|||
for m in re.finditer(r'(<(?:(?:a|link) [^>]*href|(?:img|script) [^>]*src)=[\'"]([^"]+)[\'"][^>]*>)', content):
|
||||
if re.search(r'rel=["\']?nofollow["\']', m.group(1)):
|
||||
continue
|
||||
|
||||
url = strip_url(m.group(2))
|
||||
if len(url) > MAX_URL_LENGTH:
|
||||
continue # avoid infinite GET parameter appendages
|
||||
|
@ -94,7 +94,7 @@ def extract_html_urls(content):
|
|||
if url.startswith("//"):
|
||||
continue
|
||||
|
||||
yield url
|
||||
yield unescape(url)
|
||||
|
||||
def extract_tastypie_urls(content):
|
||||
VISIT_OBJECTS = False
|
||||
|
@ -352,6 +352,7 @@ if __name__ == "__main__":
|
|||
try:
|
||||
for u in extract_html_urls(r.content):
|
||||
if u not in visited and u not in urls:
|
||||
debug.show('u')
|
||||
urls[u] = url
|
||||
referrers[u] = url
|
||||
|
||||
|
|
|
@ -48,3 +48,15 @@ def sanitize_html(html):
|
|||
quote_attr_values=True)
|
||||
output_generator = s.serialize(stream)
|
||||
return u''.join(output_generator)
|
||||
|
||||
|
||||
def unescape(text):
|
||||
"""
|
||||
Returns the given text with ampersands, quotes and angle brackets decoded
|
||||
for use in URLs.
|
||||
|
||||
This function undoes what django.utils.html.escape() does
|
||||
"""
|
||||
return text.replace('&#39;', "'").replace('&quot;', '"').replace('&gt;', '>').replace('&lt;', '<' ).replace('&amp;', '&')
|
||||
|
||||
|
||||
|
|
Loading…
Reference in a new issue