From 2828683cee14c65beb0efc2cb4b0b69fd6fee937 Mon Sep 17 00:00:00 2001 From: Henrik Levkowetz Date: Tue, 6 Mar 2018 18:35:34 +0000 Subject: [PATCH] Replaced html sanitization code that called html5lib directly with calls to bleach, and upgraded the requirements to let us use the latest html5lib and bleach. - Legacy-Id: 14739 --- PLAN | 4 ---- ietf/secr/proceedings/utils.py | 4 ++-- ietf/utils/html.py | 37 ++-------------------------------- requirements.txt | 4 ++-- 4 files changed, 6 insertions(+), 43 deletions(-) diff --git a/PLAN b/PLAN index 2729f3d98..4c7b8a921 100644 --- a/PLAN +++ b/PLAN @@ -9,13 +9,9 @@ Planned work in rough order * Revisit the review tool, work through the accumulated tickets. -* Add sanitization of uploaded html documents. - * Introduce an API for Meetecho to use to associate recordings with sessions (and perhaps automate making copies of those videos) -* Upgrade html5lib to the latest release, the same for bleach which uses it. - * Reworked UI and refactored backend for the scretariat meeting scheduling tool. diff --git a/ietf/secr/proceedings/utils.py b/ietf/secr/proceedings/utils.py index a83fa7008..ea4a5f114 100644 --- a/ietf/secr/proceedings/utils.py +++ b/ietf/secr/proceedings/utils.py @@ -7,7 +7,7 @@ from django.contrib import messages import debug # pyflakes:ignore -from ietf.utils.html import sanitize +from ietf.utils.html import sanitize_html def handle_upload_file(file,filename,meeting,subdir, request=None): ''' @@ -38,7 +38,7 @@ def handle_upload_file(file,filename,meeting,subdir, request=None): file.open() text = file.read() # Whole file sanitization; add back '' (sanitize will remove it) - clean = u"\n%s\n\n" % sanitize(text) + clean = u"\n%s\n\n" % sanitize_html(text) destination.write(clean.encode('utf8')) if request and clean != text: messages.warning(request, "Uploaded html content is sanitized to prevent unsafe content. 
" diff --git a/ietf/utils/html.py b/ietf/utils/html.py index 398bc3203..5bbb7d354 100644 --- a/ietf/utils/html.py +++ b/ietf/utils/html.py @@ -1,9 +1,7 @@ # Taken from http://code.google.com/p/soclone/source/browse/trunk/soclone/utils/html.py """Utilities for working with HTML.""" -import html5lib import bleach -from html5lib import sanitizer, serializer, tokenizer, treebuilders, treewalkers import debug # pyflakes:ignore @@ -26,36 +24,6 @@ acceptable_attributes = ('abbr', 'align', 'alt', 'axis', 'border', 'span', 'src', 'start', 'summary', 'title', 'type', 'valign', 'vspace', 'width') - -class HTMLSanitizerMixin(sanitizer.HTMLSanitizerMixin): - allowed_elements = acceptable_elements - allowed_attributes = acceptable_attributes - allowed_css_properties = () - allowed_css_keywords = () - allowed_svg_properties = () - -class HTMLSanitizer(tokenizer.HTMLTokenizer, HTMLSanitizerMixin): - def __init__(self, *args, **kwargs): - tokenizer.HTMLTokenizer.__init__(self, *args, **kwargs) - - def __iter__(self): - for token in tokenizer.HTMLTokenizer.__iter__(self): - token = self.sanitize_token(token) - if token: - yield token - -def sanitize_html(html): - """Sanitizes an HTML fragment.""" - p = html5lib.HTMLParser(tokenizer=HTMLSanitizer, - tree=treebuilders.getTreeBuilder("dom")) - dom_tree = p.parseFragment(html) - walker = treewalkers.getTreeWalker("dom") - stream = walker(dom_tree) - s = serializer.HTMLSerializer(omit_optional_tags=False, - quote_attr_values=True) - output_generator = s.serialize(stream) - return u''.join(output_generator) - def unescape(text): """ Returns the given text with ampersands, quotes and angle brackets decoded @@ -71,10 +39,9 @@ def remove_tags(html, tags): return bleach.clean(html, tags=allowed) remove_tags = keep_lazy(remove_tags, six.text_type) -def sanitize(html, tags=acceptable_elements, extra=[], remove=[], strip=True): - tags = list(set(tags) | set(extra) ^ set(remove)) +def sanitize_html(html, tags=acceptable_elements, extra=[], 
remove=[], strip=True): + tags = list(set(tags) | set(t.lower() for t in extra) ^ set(t.lower() for t in remove)) return bleach.clean(html, tags=tags, strip=strip) def clean_html(html): return bleach.clean(html) - diff --git a/requirements.txt b/requirements.txt index d18ed910f..b5b8bb631 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ anora>=0.1.2 argon2-cffi>=16.1.0 # For the Argon2 password hasher option beautifulsoup4>=4.4 bibtexparser>=0.6.2,<1.0 # Version 1.0 doesn't work under python 2.7. 1.0.1 doesn't recognize month names or abbreviations. -bleach>=1.5.0,<2.0.0 +bleach>=2.0.0 coverage>=4.0.1,!=4.0.2 #cssselect>=0.6.1 # for PyQuery decorator>=3.4.0 @@ -23,7 +23,7 @@ factory-boy>=2.9.0 google-api-python-client Faker!=0.8.9,!=0.8.10 # from factory-boy # Faker 0.8.9,0.8.10 sometimes return string names instead of unicode. hashids>=1.1.0 -html5lib>=0.90,<0.99999999 # ietf.utils.html needs a rewrite for html5lib 1.x -- major code changes in sanitizer +html5lib>=1.0.1 httplib2>=0.10.3 jsonfield>=1.0.3 # for SubmissionCheck. This is https://github.com/bradjasper/django-jsonfield/. jwcrypto>=0.4.0 # for signed notifications