Replaced html sanitization code that called html5lib directly with calls to bleach, and upgraded the requirements to let us use the latest html5lib and bleach.
- Legacy-Id: 14739
This commit is contained in:
parent
b92ad2f992
commit
2828683cee
4
PLAN
4
PLAN
|
@ -9,13 +9,9 @@ Planned work in rough order
|
|||
|
||||
* Revisit the review tool, work through the accumulated tickets.
|
||||
|
||||
* Add sanitization of uploaded html documents.
|
||||
|
||||
* Introduce an API for Meetecho to use to associate recordings with sessions
|
||||
(and perhaps automate making copies of those videos)
|
||||
|
||||
* Upgrade html5lib to the latest release, the same for bleach which uses it.
|
||||
|
||||
* Reworked UI and refactored backend for the secretariat meeting scheduling
|
||||
tool.
|
||||
|
||||
|
|
|
@ -7,7 +7,7 @@ from django.contrib import messages
|
|||
|
||||
import debug # pyflakes:ignore
|
||||
|
||||
from ietf.utils.html import sanitize
|
||||
from ietf.utils.html import sanitize_html
|
||||
|
||||
def handle_upload_file(file,filename,meeting,subdir, request=None):
|
||||
'''
|
||||
|
@ -38,7 +38,7 @@ def handle_upload_file(file,filename,meeting,subdir, request=None):
|
|||
file.open()
|
||||
text = file.read()
|
||||
# Whole file sanitization; add back '<html>' (sanitize will remove it)
|
||||
clean = u"<html>\n%s\n</html>\n" % sanitize(text)
|
||||
clean = u"<html>\n%s\n</html>\n" % sanitize_html(text)
|
||||
destination.write(clean.encode('utf8'))
|
||||
if request and clean != text:
|
||||
messages.warning(request, "Uploaded html content is sanitized to prevent unsafe content. "
|
||||
|
|
|
@ -1,9 +1,7 @@
|
|||
# Taken from http://code.google.com/p/soclone/source/browse/trunk/soclone/utils/html.py
|
||||
|
||||
"""Utilities for working with HTML."""
|
||||
import html5lib
|
||||
import bleach
|
||||
from html5lib import sanitizer, serializer, tokenizer, treebuilders, treewalkers
|
||||
|
||||
import debug # pyflakes:ignore
|
||||
|
||||
|
@ -26,36 +24,6 @@ acceptable_attributes = ('abbr', 'align', 'alt', 'axis', 'border',
|
|||
'span', 'src', 'start', 'summary', 'title', 'type', 'valign', 'vspace',
|
||||
'width')
|
||||
|
||||
|
||||
class HTMLSanitizerMixin(sanitizer.HTMLSanitizerMixin):
|
||||
allowed_elements = acceptable_elements
|
||||
allowed_attributes = acceptable_attributes
|
||||
allowed_css_properties = ()
|
||||
allowed_css_keywords = ()
|
||||
allowed_svg_properties = ()
|
||||
|
||||
class HTMLSanitizer(tokenizer.HTMLTokenizer, HTMLSanitizerMixin):
|
||||
def __init__(self, *args, **kwargs):
|
||||
tokenizer.HTMLTokenizer.__init__(self, *args, **kwargs)
|
||||
|
||||
def __iter__(self):
|
||||
for token in tokenizer.HTMLTokenizer.__iter__(self):
|
||||
token = self.sanitize_token(token)
|
||||
if token:
|
||||
yield token
|
||||
|
||||
def sanitize_html(html):
|
||||
"""Sanitizes an HTML fragment."""
|
||||
p = html5lib.HTMLParser(tokenizer=HTMLSanitizer,
|
||||
tree=treebuilders.getTreeBuilder("dom"))
|
||||
dom_tree = p.parseFragment(html)
|
||||
walker = treewalkers.getTreeWalker("dom")
|
||||
stream = walker(dom_tree)
|
||||
s = serializer.HTMLSerializer(omit_optional_tags=False,
|
||||
quote_attr_values=True)
|
||||
output_generator = s.serialize(stream)
|
||||
return u''.join(output_generator)
|
||||
|
||||
def unescape(text):
|
||||
"""
|
||||
Returns the given text with ampersands, quotes and angle brackets decoded
|
||||
|
@ -71,10 +39,9 @@ def remove_tags(html, tags):
|
|||
return bleach.clean(html, tags=allowed)
|
||||
remove_tags = keep_lazy(remove_tags, six.text_type)
|
||||
|
||||
def sanitize(html, tags=acceptable_elements, extra=[], remove=[], strip=True):
|
||||
tags = list(set(tags) | set(extra) ^ set(remove))
|
||||
def sanitize_html(html, tags=acceptable_elements, extra=(), remove=(), strip=True):
    """Sanitize an HTML string with bleach, keeping only allowed tags.

    The allow-list is ``tags`` plus the lower-cased names in ``extra``,
    minus the lower-cased names in ``remove``.

    html   -- the markup to sanitize.
    tags   -- base iterable of allowed tag names.
    extra  -- additional tag names to allow.
    remove -- tag names to drop from the allow-list.
    strip  -- passed through to ``bleach.clean`` unchanged.

    Returns whatever ``bleach.clean`` returns for the computed allow-list.
    """
    # Explicit parentheses and set difference: the previous expression,
    # ``tags | extra ^ remove``, parsed as ``tags | (extra ^ remove)`` and
    # therefore *added* the tags that were supposed to be removed.  It also
    # used ``t.lower`` without calling it, yielding bound methods, not names.
    allowed = set(tags) | set(t.lower() for t in extra)
    allowed -= set(t.lower() for t in remove)
    return bleach.clean(html, tags=list(allowed), strip=strip)
|
||||
|
||||
def clean_html(html):
    """Return ``html`` passed through ``bleach.clean`` with no extra arguments."""
    cleaned = bleach.clean(html)
    return cleaned
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@ anora>=0.1.2
|
|||
argon2-cffi>=16.1.0 # For the Argon2 password hasher option
|
||||
beautifulsoup4>=4.4
|
||||
bibtexparser>=0.6.2,<1.0 # Version 1.0 doesn't work under python 2.7. 1.0.1 doesn't recognize month names or abbreviations.
|
||||
bleach>=1.5.0,<2.0.0
|
||||
bleach>=2.0.0
|
||||
coverage>=4.0.1,!=4.0.2
|
||||
#cssselect>=0.6.1 # for PyQuery
|
||||
decorator>=3.4.0
|
||||
|
@ -23,7 +23,7 @@ factory-boy>=2.9.0
|
|||
google-api-python-client
|
||||
Faker!=0.8.9,!=0.8.10 # from factory-boy # Faker 0.8.9,0.8.10 sometimes return string names instead of unicode.
|
||||
hashids>=1.1.0
|
||||
html5lib>=0.90,<0.99999999 # ietf.utils.html needs a rewrite for html5lib 1.x -- major code changes in sanitizer
|
||||
html5lib>=1.0.1
|
||||
httplib2>=0.10.3
|
||||
jsonfield>=1.0.3 # for SubmissionCheck. This is https://github.com/bradjasper/django-jsonfield/.
|
||||
jwcrypto>=0.4.0 # for signed notifications
|
||||
|
|
Loading…
Reference in a new issue