datatracker/ietf/utils/html.py
Henrik Levkowetz aa5e61d958 Updated all urlpatterns to use ietf.utils.urls.url() instead of django's,
in order to autogenerate dotted path url pattern names.  Updated a number
of url reverses to use dotted path, and removed explicit url pattern names
as needed.

Changed some imports to prevent import of ietf.urls before django
initialization was complete.


Changed 3 cases of form classes being curried to functions; django 1.10
didn't accept that.

Started converting old-style middleware classes to new-style middleware
functions (incomplete).

Tweaked a nomcom decorator to preserve function names and attributes, like
a good decorator should.

Replaced the removed django templatetag 'removetags' with our own version
which uses bleach, and does sanitizing in addition to removing explicitly
mentioned html tags.

Rewrote the filename argument handling in a management command which had
broken with the upgrade.
 - Legacy-Id: 12818
2017-02-11 14:43:01 +00:00

65 lines
2.6 KiB
Python

# Taken from http://code.google.com/p/soclone/source/browse/trunk/soclone/utils/html.py
"""Utilities for working with HTML."""
import html5lib
import bleach
from html5lib import sanitizer, serializer, tokenizer, treebuilders, treewalkers
from django.utils.functional import allow_lazy
from django.utils import six
# Whitelist of HTML element names used by the sanitizers in this module.
# Kept as a tuple in alphabetical order, one initial-letter group per line.
acceptable_elements = (
    'a', 'abbr', 'acronym', 'address',
    'b', 'big', 'blockquote', 'br',
    'caption', 'center', 'cite', 'code', 'col', 'colgroup',
    'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
    'em', 'font',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr',
    'i', 'img', 'ins',
    'kbd', 'li', 'ol',
    'p', 'pre', 'q',
    's', 'samp', 'small', 'span', 'strike', 'strong', 'sub', 'sup',
    'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr', 'tt',
    'u', 'ul', 'var',
)

# Whitelist of HTML attribute names, same layout as the element list above.
acceptable_attributes = (
    'abbr', 'align', 'alt', 'axis',
    'border',
    'cellpadding', 'cellspacing', 'char', 'charoff', 'charset', 'cite',
    'cols', 'colspan',
    'datetime', 'dir',
    'frame',
    'headers', 'height', 'href', 'hreflang', 'hspace',
    'lang', 'longdesc',
    'name', 'nohref', 'noshade', 'nowrap',
    'rel', 'rev', 'rows', 'rowspan', 'rules',
    'scope', 'span', 'src', 'start', 'summary',
    'title', 'type',
    'valign', 'vspace',
    'width',
)
class HTMLSanitizerMixin(sanitizer.HTMLSanitizerMixin):
    """Sanitizer mixin configured with this module's whitelists.

    Overrides the whitelist attributes of html5lib's ``HTMLSanitizerMixin``:
    elements and attributes come from the module-level ``acceptable_elements``
    and ``acceptable_attributes`` tuples, while the CSS and SVG property
    whitelists are set to empty tuples.
    """

    # Markup whitelists.
    allowed_elements = acceptable_elements
    allowed_attributes = acceptable_attributes

    # No CSS properties, CSS keywords or SVG properties are whitelisted.
    allowed_css_properties = allowed_css_keywords = allowed_svg_properties = ()
class HTMLSanitizer(tokenizer.HTMLTokenizer, HTMLSanitizerMixin):
    """Tokenizer whose token stream is filtered through ``sanitize_token``."""

    def __init__(self, *args, **kwargs):
        # Construction is forwarded unchanged to the tokenizer base class.
        tokenizer.HTMLTokenizer.__init__(self, *args, **kwargs)

    def __iter__(self):
        """Yield each sanitized token, skipping those that sanitize to a falsy value."""
        raw_tokens = tokenizer.HTMLTokenizer.__iter__(self)
        for raw in raw_tokens:
            cleaned = self.sanitize_token(raw)
            if not cleaned:
                continue
            yield cleaned
def sanitize_html(html):
    """Sanitize an HTML fragment and return it re-serialized as unicode text.

    The fragment is parsed with html5lib into a DOM tree, using
    ``HTMLSanitizer`` as the tokenizer, and the tree is then walked and
    serialized back to markup.
    """
    parser = html5lib.HTMLParser(
        tokenizer=HTMLSanitizer,
        tree=treebuilders.getTreeBuilder("dom"),
    )
    fragment = parser.parseFragment(html)

    # Walk the parsed DOM fragment to obtain a token stream for serialization.
    make_walker = treewalkers.getTreeWalker("dom")
    token_stream = make_walker(fragment)

    html_serializer = serializer.HTMLSerializer(
        omit_optional_tags=False,
        quote_attr_values=True,
    )
    return u''.join(html_serializer.serialize(token_stream))
def remove_tags(html, tags):
    """Return the given HTML sanitized, with the given tags removed.

    ``tags`` is an iterable of tag names; each is lower-cased and dropped
    from ``acceptable_elements`` before the remainder is passed to
    ``bleach.clean`` as the set of allowed tags.
    """
    unwanted = {name.lower() for name in tags}
    permitted = set(acceptable_elements).difference(unwanted)
    return bleach.clean(html, tags=permitted)


# Rebind the name to a wrapper produced by django's allow_lazy, declaring
# six.text_type as the result class.
remove_tags = allow_lazy(remove_tags, six.text_type)
def clean_html(html):
    """Return ``html`` passed through ``bleach.clean`` with bleach's default arguments."""
    cleaned = bleach.clean(html)
    return cleaned