Replaced html sanitization code that called html5lib directly with calls to bleach, and upgraded the requirements to let us use the latest html5lib and bleach.
- Legacy-Id: 14739
This commit is contained in:
parent
b92ad2f992
commit
2828683cee
4
PLAN
4
PLAN
|
@ -9,13 +9,9 @@ Planned work in rough order
|
||||||
|
|
||||||
* Revisit the review tool, work through the accumulated tickets.
|
* Revisit the review tool, work through the accumulated tickets.
|
||||||
|
|
||||||
* Add sanitization of uploaded html documents.
|
|
||||||
|
|
||||||
* Introduce an API for Meetecho to use to associate recordings with sessions
|
* Introduce an API for Meetecho to use to associate recordings with sessions
|
||||||
(and perhaps automate making copies of those videos)
|
(and perhaps automate making copies of those videos)
|
||||||
|
|
||||||
* Upgrade html5lib to the latest release, the same for bleach which uses it.
|
|
||||||
|
|
||||||
* Reworked UI and refactored backend for the secretariat meeting scheduling
|
* Reworked UI and refactored backend for the secretariat meeting scheduling
|
||||||
tool.
|
tool.
|
||||||
|
|
||||||
|
|
|
@ -7,7 +7,7 @@ from django.contrib import messages
|
||||||
|
|
||||||
import debug # pyflakes:ignore
|
import debug # pyflakes:ignore
|
||||||
|
|
||||||
from ietf.utils.html import sanitize
|
from ietf.utils.html import sanitize_html
|
||||||
|
|
||||||
def handle_upload_file(file,filename,meeting,subdir, request=None):
|
def handle_upload_file(file,filename,meeting,subdir, request=None):
|
||||||
'''
|
'''
|
||||||
|
@ -38,7 +38,7 @@ def handle_upload_file(file,filename,meeting,subdir, request=None):
|
||||||
file.open()
|
file.open()
|
||||||
text = file.read()
|
text = file.read()
|
||||||
# Whole file sanitization; add back '<html>' (sanitize will remove it)
|
# Whole file sanitization; add back '<html>' (sanitize will remove it)
|
||||||
clean = u"<html>\n%s\n</html>\n" % sanitize(text)
|
clean = u"<html>\n%s\n</html>\n" % sanitize_html(text)
|
||||||
destination.write(clean.encode('utf8'))
|
destination.write(clean.encode('utf8'))
|
||||||
if request and clean != text:
|
if request and clean != text:
|
||||||
messages.warning(request, "Uploaded html content is sanitized to prevent unsafe content. "
|
messages.warning(request, "Uploaded html content is sanitized to prevent unsafe content. "
|
||||||
|
|
|
@ -1,9 +1,7 @@
|
||||||
# Taken from http://code.google.com/p/soclone/source/browse/trunk/soclone/utils/html.py
|
# Taken from http://code.google.com/p/soclone/source/browse/trunk/soclone/utils/html.py
|
||||||
|
|
||||||
"""Utilities for working with HTML."""
|
"""Utilities for working with HTML."""
|
||||||
import html5lib
|
|
||||||
import bleach
|
import bleach
|
||||||
from html5lib import sanitizer, serializer, tokenizer, treebuilders, treewalkers
|
|
||||||
|
|
||||||
import debug # pyflakes:ignore
|
import debug # pyflakes:ignore
|
||||||
|
|
||||||
|
@ -26,36 +24,6 @@ acceptable_attributes = ('abbr', 'align', 'alt', 'axis', 'border',
|
||||||
'span', 'src', 'start', 'summary', 'title', 'type', 'valign', 'vspace',
|
'span', 'src', 'start', 'summary', 'title', 'type', 'valign', 'vspace',
|
||||||
'width')
|
'width')
|
||||||
|
|
||||||
|
|
||||||
class HTMLSanitizerMixin(sanitizer.HTMLSanitizerMixin):
|
|
||||||
allowed_elements = acceptable_elements
|
|
||||||
allowed_attributes = acceptable_attributes
|
|
||||||
allowed_css_properties = ()
|
|
||||||
allowed_css_keywords = ()
|
|
||||||
allowed_svg_properties = ()
|
|
||||||
|
|
||||||
class HTMLSanitizer(tokenizer.HTMLTokenizer, HTMLSanitizerMixin):
|
|
||||||
def __init__(self, *args, **kwargs):
|
|
||||||
tokenizer.HTMLTokenizer.__init__(self, *args, **kwargs)
|
|
||||||
|
|
||||||
def __iter__(self):
|
|
||||||
for token in tokenizer.HTMLTokenizer.__iter__(self):
|
|
||||||
token = self.sanitize_token(token)
|
|
||||||
if token:
|
|
||||||
yield token
|
|
||||||
|
|
||||||
def sanitize_html(html):
|
|
||||||
"""Sanitizes an HTML fragment."""
|
|
||||||
p = html5lib.HTMLParser(tokenizer=HTMLSanitizer,
|
|
||||||
tree=treebuilders.getTreeBuilder("dom"))
|
|
||||||
dom_tree = p.parseFragment(html)
|
|
||||||
walker = treewalkers.getTreeWalker("dom")
|
|
||||||
stream = walker(dom_tree)
|
|
||||||
s = serializer.HTMLSerializer(omit_optional_tags=False,
|
|
||||||
quote_attr_values=True)
|
|
||||||
output_generator = s.serialize(stream)
|
|
||||||
return u''.join(output_generator)
|
|
||||||
|
|
||||||
def unescape(text):
|
def unescape(text):
|
||||||
"""
|
"""
|
||||||
Returns the given text with ampersands, quotes and angle brackets decoded
|
Returns the given text with ampersands, quotes and angle brackets decoded
|
||||||
|
@ -71,10 +39,9 @@ def remove_tags(html, tags):
|
||||||
return bleach.clean(html, tags=allowed)
|
return bleach.clean(html, tags=allowed)
|
||||||
remove_tags = keep_lazy(remove_tags, six.text_type)
|
remove_tags = keep_lazy(remove_tags, six.text_type)
|
||||||
|
|
||||||
def sanitize(html, tags=acceptable_elements, extra=[], remove=[], strip=True):
|
def sanitize_html(html, tags=acceptable_elements, extra=[], remove=[], strip=True):
|
||||||
tags = list(set(tags) | set(extra) ^ set(remove))
|
# Build the whitelist as (tags + extra) - remove. The union is parenthesised because '^' binds tighter than '|', which would otherwise add the `remove` entries instead of dropping them; lower() must be called so the set holds strings, not bound methods.
tags = list((set(tags) | set(t.lower() for t in extra)) - set(t.lower() for t in remove))
|
||||||
return bleach.clean(html, tags=tags, strip=strip)
|
return bleach.clean(html, tags=tags, strip=strip)
|
||||||
|
|
||||||
def clean_html(html):
|
def clean_html(html):
|
||||||
return bleach.clean(html)
|
return bleach.clean(html)
|
||||||
|
|
||||||
|
|
|
@ -5,7 +5,7 @@ anora>=0.1.2
|
||||||
argon2-cffi>=16.1.0 # For the Argon2 password hasher option
|
argon2-cffi>=16.1.0 # For the Argon2 password hasher option
|
||||||
beautifulsoup4>=4.4
|
beautifulsoup4>=4.4
|
||||||
bibtexparser>=0.6.2,<1.0 # Version 1.0 doesn't work under python 2.7. 1.0.1 doesn't recognize month names or abbreviations.
|
bibtexparser>=0.6.2,<1.0 # Version 1.0 doesn't work under python 2.7. 1.0.1 doesn't recognize month names or abbreviations.
|
||||||
bleach>=1.5.0,<2.0.0
|
bleach>=2.0.0
|
||||||
coverage>=4.0.1,!=4.0.2
|
coverage>=4.0.1,!=4.0.2
|
||||||
#cssselect>=0.6.1 # for PyQuery
|
#cssselect>=0.6.1 # for PyQuery
|
||||||
decorator>=3.4.0
|
decorator>=3.4.0
|
||||||
|
@ -23,7 +23,7 @@ factory-boy>=2.9.0
|
||||||
google-api-python-client
|
google-api-python-client
|
||||||
Faker!=0.8.9,!=0.8.10 # from factory-boy # Faker 0.8.9,0.8.10 sometimes return string names instead of unicode.
|
Faker!=0.8.9,!=0.8.10 # from factory-boy # Faker 0.8.9,0.8.10 sometimes return string names instead of unicode.
|
||||||
hashids>=1.1.0
|
hashids>=1.1.0
|
||||||
html5lib>=0.90,<0.99999999 # ietf.utils.html needs a rewrite for html5lib 1.x -- major code changes in sanitizer
|
html5lib>=1.0.1
|
||||||
httplib2>=0.10.3
|
httplib2>=0.10.3
|
||||||
jsonfield>=1.0.3 # for SubmissionCheck. This is https://github.com/bradjasper/django-jsonfield/.
|
jsonfield>=1.0.3 # for SubmissionCheck. This is https://github.com/bradjasper/django-jsonfield/.
|
||||||
jwcrypto>=0.4.0 # for signed notifications
|
jwcrypto>=0.4.0 # for signed notifications
|
||||||
|
|
Loading…
Reference in a new issue