From c344a18bdf9dc98a941348755083b0e41f7b4852 Mon Sep 17 00:00:00 2001 From: Henrik Levkowetz Date: Thu, 16 Feb 2017 09:58:34 +0000 Subject: [PATCH] Fixed an issue with the test-crawler which could cause false positives for urls containing apostrophe. - Legacy-Id: 12851 --- bin/test-crawl | 5 +++-- ietf/utils/html.py | 12 ++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/bin/test-crawl b/bin/test-crawl index 8823a50d2..e9e221792 100755 --- a/bin/test-crawl +++ b/bin/test-crawl @@ -59,6 +59,7 @@ import debug # pyflakes:ignore # connection.queries = DontSaveQueries() from ietf.name.models import DocTypeName +from ietf.utils.html import unescape # --- Constants --- @@ -83,7 +84,6 @@ def extract_html_urls(content): for m in re.finditer(r'(<(?:(?:a|link) [^>]*href|(?:img|script) [^>]*src)=[\'"]([^"]+)[\'"][^>]*>)', content): if re.search(r'rel=["\']?nofollow["\']', m.group(1)): continue - url = strip_url(m.group(2)) if len(url) > MAX_URL_LENGTH: continue # avoid infinite GET parameter appendages @@ -94,7 +94,7 @@ def extract_html_urls(content): if url.startswith("//"): continue - yield url + yield unescape(url) def extract_tastypie_urls(content): VISIT_OBJECTS = False @@ -352,6 +352,7 @@ if __name__ == "__main__": try: for u in extract_html_urls(r.content): if u not in visited and u not in urls: + debug.show('u') urls[u] = url referrers[u] = url diff --git a/ietf/utils/html.py b/ietf/utils/html.py index c0f717cc9..30ccd00ee 100644 --- a/ietf/utils/html.py +++ b/ietf/utils/html.py @@ -48,3 +48,15 @@ def sanitize_html(html): quote_attr_values=True) output_generator = s.serialize(stream) return u''.join(output_generator) + + +def unescape(text): + """ + Returns the given text with ampersands, quotes and angle brackets decoded + for use in URLs. + + This function undoes what django.utils.html.escape() does + """ + return text.replace('&#39;', "'").replace('&quot;', '"').replace('&gt;', '>').replace('&lt;', '<' ).replace('&amp;', '&') + +