Fixed an issue with the test-crawler which could cause false positives for URLs containing an apostrophe.

- Legacy-Id: 12851
This commit is contained in:
Henrik Levkowetz 2017-02-16 09:58:34 +00:00
parent 7b7a220df4
commit c344a18bdf
2 changed files with 15 additions and 2 deletions

View file

@ -59,6 +59,7 @@ import debug # pyflakes:ignore
# connection.queries = DontSaveQueries()
from ietf.name.models import DocTypeName
from ietf.utils.html import unescape
# --- Constants ---
@ -83,7 +84,6 @@ def extract_html_urls(content):
for m in re.finditer(r'(<(?:(?:a|link) [^>]*href|(?:img|script) [^>]*src)=[\'"]([^"]+)[\'"][^>]*>)', content):
if re.search(r'rel=["\']?nofollow["\']', m.group(1)):
continue
url = strip_url(m.group(2))
if len(url) > MAX_URL_LENGTH:
continue # avoid infinite GET parameter appendages
@ -94,7 +94,7 @@ def extract_html_urls(content):
if url.startswith("//"):
continue
yield url
yield unescape(url)
def extract_tastypie_urls(content):
VISIT_OBJECTS = False
@ -352,6 +352,7 @@ if __name__ == "__main__":
try:
for u in extract_html_urls(r.content):
if u not in visited and u not in urls:
debug.show('u')
urls[u] = url
referrers[u] = url

View file

@ -48,3 +48,15 @@ def sanitize_html(html):
quote_attr_values=True)
output_generator = s.serialize(stream)
return u''.join(output_generator)
def unescape(text):
    """
    Return the given text with HTML-escaped ampersands, quotes and angle
    brackets decoded, for use in URLs.

    This function undoes what django.utils.html.escape() does.  Both
    spellings of the escaped apostrophe are accepted: '&#39;' and '&#x27;'
    (the latter is what Django 3.0 and later, and Python's html.escape(),
    produce).

    Exactly one level of escaping is removed: '&amp;lt;' becomes '&lt;',
    not '<'.
    """
    # Order matters: '&amp;' has to be decoded last.  Decoding it first would
    # turn a doubly escaped sequence such as '&amp;lt;' into '&lt;' and then
    # into '<', removing two levels of escaping instead of one.
    replacements = (
        ('&#39;', "'"),
        ('&#x27;', "'"),
        ('&quot;', '"'),
        ('&gt;', '>'),
        ('&lt;', '<'),
        ('&amp;', '&'),
    )
    for entity, char in replacements:
        text = text.replace(entity, char)
    return text