Replaced html sanitization code that called html5lib directly with calls to bleach, and upgraded the requirements to let us use the latest html5lib and bleach.

- Legacy-Id: 14739
This commit is contained in:
Henrik Levkowetz 2018-03-06 18:35:34 +00:00
parent b92ad2f992
commit 2828683cee
4 changed files with 6 additions and 43 deletions

4
PLAN
View file

@ -9,13 +9,9 @@ Planned work in rough order
* Revisit the review tool, work through the accumulated tickets.
* Add sanitization of uploaded html documents.
* Introduce an API for Meetecho to use to associate recordings with sessions
(and perhaps automate making copies of those videos)
* Upgrade html5lib to the latest release, the same for bleach which uses it.
* Reworked UI and refactored backend for the secretariat meeting scheduling
tool.

View file

@ -7,7 +7,7 @@ from django.contrib import messages
import debug # pyflakes:ignore
from ietf.utils.html import sanitize
from ietf.utils.html import sanitize_html
def handle_upload_file(file,filename,meeting,subdir, request=None):
'''
@ -38,7 +38,7 @@ def handle_upload_file(file,filename,meeting,subdir, request=None):
file.open()
text = file.read()
# Whole file sanitization; add back '<html>' (sanitize will remove it)
clean = u"<html>\n%s\n</html>\n" % sanitize(text)
clean = u"<html>\n%s\n</html>\n" % sanitize_html(text)
destination.write(clean.encode('utf8'))
if request and clean != text:
messages.warning(request, "Uploaded html content is sanitized to prevent unsafe content. "

View file

@ -1,9 +1,7 @@
# Taken from http://code.google.com/p/soclone/source/browse/trunk/soclone/utils/html.py
"""Utilities for working with HTML."""
import html5lib
import bleach
from html5lib import sanitizer, serializer, tokenizer, treebuilders, treewalkers
import debug # pyflakes:ignore
@ -26,36 +24,6 @@ acceptable_attributes = ('abbr', 'align', 'alt', 'axis', 'border',
'span', 'src', 'start', 'summary', 'title', 'type', 'valign', 'vspace',
'width')
class HTMLSanitizerMixin(sanitizer.HTMLSanitizerMixin):
    """Sanitizer mixin configured with this module's allow-lists.

    Elements and attributes are limited to ``acceptable_elements`` and
    ``acceptable_attributes``; the CSS property, CSS keyword and SVG
    property allow-lists are all set to the empty tuple.
    """
    allowed_css_properties = allowed_css_keywords = allowed_svg_properties = ()
    allowed_attributes = acceptable_attributes
    allowed_elements = acceptable_elements
class HTMLSanitizer(tokenizer.HTMLTokenizer, HTMLSanitizerMixin):
    """Tokenizer that passes every token through ``sanitize_token``."""

    def __init__(self, *args, **kwargs):
        # Only the tokenizer base needs explicit initialisation here.
        tokenizer.HTMLTokenizer.__init__(self, *args, **kwargs)

    def __iter__(self):
        """Yield sanitized tokens, skipping any that sanitize to a falsy value."""
        for raw_token in tokenizer.HTMLTokenizer.__iter__(self):
            cleaned = self.sanitize_token(raw_token)
            if not cleaned:
                continue
            yield cleaned
def sanitize_html(html):
    """Sanitize an HTML fragment and return it serialized as unicode.

    The fragment is parsed into a DOM tree using ``HTMLSanitizer`` as the
    tokenizer, then walked and serialized with optional tags kept and
    attribute values quoted.
    """
    dom_builder = treebuilders.getTreeBuilder("dom")
    parser = html5lib.HTMLParser(tokenizer=HTMLSanitizer, tree=dom_builder)
    fragment = parser.parseFragment(html)
    make_walker = treewalkers.getTreeWalker("dom")
    token_stream = make_walker(fragment)
    html_serializer = serializer.HTMLSerializer(omit_optional_tags=False,
                                                quote_attr_values=True)
    return u''.join(html_serializer.serialize(token_stream))
def unescape(text):
"""
Returns the given text with ampersands, quotes and angle brackets decoded
@ -71,10 +39,9 @@ def remove_tags(html, tags):
return bleach.clean(html, tags=allowed)
remove_tags = keep_lazy(remove_tags, six.text_type)
def sanitize(html, tags=acceptable_elements, extra=[], remove=[], strip=True):
tags = list(set(tags) | set(extra) ^ set(remove))
def sanitize_html(html, tags=acceptable_elements, extra=[], remove=[], strip=True):
    """Clean *html* with bleach, permitting only an allow-list of tags.

    The allow-list is ``tags`` plus the lower-cased names in ``extra``,
    minus the lower-cased names in ``remove``.

    Args:
        html: markup to clean; handed to ``bleach.clean`` unchanged.
        tags: base collection of permitted tag names.
        extra: additional tag names to permit.
        remove: tag names to take out of the allow-list.
        strip: forwarded to ``bleach.clean`` as its ``strip`` argument.

    Returns:
        The result of ``bleach.clean`` for the computed allow-list.
    """
    # The list defaults are only read, never mutated, so sharing them
    # between calls is harmless.
    permitted = set(tags) | set(name.lower() for name in extra)
    # Use set difference, not '^': XOR binds tighter than '|', so the old
    # expression never removed anything from ``tags`` and could even add
    # the names listed in ``remove``.  Also call .lower() -- the bare
    # attribute put bound-method objects into the tag list.
    permitted -= set(name.lower() for name in remove)
    return bleach.clean(html, tags=list(permitted), strip=strip)
def clean_html(html):
    """Return *html* after running it through ``bleach.clean`` with bleach's defaults."""
    cleaned = bleach.clean(html)
    return cleaned

View file

@ -5,7 +5,7 @@ anora>=0.1.2
argon2-cffi>=16.1.0 # For the Argon2 password hasher option
beautifulsoup4>=4.4
bibtexparser>=0.6.2,<1.0 # Version 1.0 doesn't work under python 2.7. 1.0.1 doesn't recognize month names or abbreviations.
bleach>=1.5.0,<2.0.0
bleach>=2.0.0
coverage>=4.0.1,!=4.0.2
#cssselect>=0.6.1 # for PyQuery
decorator>=3.4.0
@ -23,7 +23,7 @@ factory-boy>=2.9.0
google-api-python-client
Faker!=0.8.9,!=0.8.10 # from factory-boy # Faker 0.8.9,0.8.10 sometimes return string names instead of unicode.
hashids>=1.1.0
html5lib>=0.90,<0.99999999 # ietf.utils.html needs a rewrite for html5lib 1.x -- major code changes in sanitizer
html5lib>=1.0.1
httplib2>=0.10.3
jsonfield>=1.0.3 # for SubmissionCheck. This is https://github.com/bradjasper/django-jsonfield/.
jwcrypto>=0.4.0 # for signed notifications