* refactor: isolate bleach code * refactor: move html fns to html.py * refactor: lose the bleach.py module; refactor * refactor: sanitize_document -> clean_html Drops <meta charset="utf-8"> addition after cleaning. * fix: disambiguate import * feat: restore <meta charset="utf-8"> tag * chore: comments * chore(deps): drop lxml_html_clean package * refactor: on second thought, no meta charset * refactor: sanitize_fragment -> clean_html * test: remove check for charset * chore: fix lint
101 lines
3.1 KiB
Python
101 lines
3.1 KiB
Python
# Copyright The IETF Trust 2010-2020, All Rights Reserved
|
|
# -*- coding: utf-8 -*-
|
|
# Taken from http://code.google.com/p/soclone/source/browse/trunk/soclone/utils/html.py
|
|
"""Utilities for working with HTML."""
|
|
|
|
|
|
import bleach
|
|
import html2text
|
|
|
|
import debug # pyflakes:ignore
|
|
|
|
from django import forms
|
|
from django.utils.functional import keep_lazy
|
|
|
|
from ietf.utils.mime import get_mime_type
|
|
|
|
|
|
# Sanitizer allowlists: everything bleach itself declares safe, plus the
# protocols/tags/attributes this project explicitly wants. As of 2025-01-27 the
# explicit protocol and tag lists are strict supersets of bleach's defaults.
acceptable_protocols = bleach.sanitizer.ALLOWED_PROTOCOLS | {
    "http",
    "https",
    "mailto",
    "ftp",
    "xmpp",
}
|
|
# Tags that survive sanitizing: bleach's defaults plus the explicit names below
# (grouped by role purely for readability; set order is irrelevant).
acceptable_tags = bleach.sanitizer.ALLOWED_TAGS | {
    # fmt: off
    # document structure
    "html", "head", "title", "style", "body",
    # headings
    "h1", "h2", "h3", "h4", "h5", "h6",
    # block-level content
    "address", "blockquote", "center", "div", "hr", "p", "pre",
    # lists
    "dd", "dir", "dl", "dt", "li", "ol", "ul",
    # tables
    "caption", "col", "colgroup", "table", "tbody", "td", "tfoot", "th", "thead", "tr",
    # inline / phrasing content
    "a", "abbr", "acronym", "b", "big", "br", "cite", "code", "del", "dfn", "em",
    "font", "i", "ins", "kbd", "q", "s", "samp", "small", "span", "strike", "strong",
    "sub", "sup", "tt", "u", "var",
    # fmt: on
}
|
|
# Per-tag attribute allowlist: bleach's defaults, extended so that `id` is listed
# under the "*" key and `start` under "ol". Later keys override bleach's entry
# for the same key, exactly as a dict merge would.
acceptable_attributes = {
    **bleach.sanitizer.ALLOWED_ATTRIBUTES,
    "*": ["id"],
    "ol": ["start"],
}
|
|
|
|
|
|
# Sanitizer instances are created once, at import time, and reused by the
# helper functions in this module.

# Default cleaner: uses the acceptable_* allowlists as-is, with strip=True.
_bleach_cleaner = bleach.sanitizer.Cleaner(
    strip=True,
    protocols=acceptable_protocols,
    attributes=acceptable_attributes,
    tags=acceptable_tags,
)
|
|
|
|
|
|
# Liberal cleaner: the same allowlists, additionally admitting <img> (with its
# `src` and `alt` attributes), <figure> and <figcaption>.
_liberal_bleach_cleaner = bleach.sanitizer.Cleaner(
    strip=True,
    protocols=acceptable_protocols,
    attributes={**acceptable_attributes, "img": ["src", "alt"]},
    tags=acceptable_tags | {"img", "figure", "figcaption"},
)
|
|
|
|
|
|
def clean_html(text: str):
    """Sanitize the HTML in `text` using the module's default bleach cleaner.

    Returns whatever `_bleach_cleaner.clean()` produces for the input.
    """
    sanitized = _bleach_cleaner.clean(text)
    return sanitized
|
|
|
|
|
|
def liberal_clean_html(text: str):
    """Sanitize the HTML in `text` using the module's more permissive cleaner.

    Returns whatever `_liberal_bleach_cleaner.clean()` produces for the input.
    """
    sanitized = _liberal_bleach_cleaner.clean(text)
    return sanitized
|
|
|
|
|
|
@keep_lazy(str)
def remove_tags(html, tags):
    """Returns the given HTML sanitized, and with the given tags removed.

    `tags` is an iterable of tag names; they are matched case-insensitively by
    lower-casing them before removing them from the allowed set.
    """
    dropped = {name.lower() for name in tags}
    permitted = acceptable_tags - dropped
    # Only the tag list is customised here; attributes and protocols are not
    # passed, so bleach.clean() applies its own defaults for those.
    return bleach.clean(html, tags=permitted, strip=True)
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
# Text field cleaning
|
|
|
|
def clean_text_field(text):
    """Return `text` as plain text, based on its detected MIME type.

    Plain-text input (and input detected as 'application/x-empty') is returned
    unchanged; HTML input is converted with html2text.

    Raises forms.ValidationError for any other detected MIME type.
    """
    mime_type, _encoding = get_mime_type(text.encode('utf8'))
    if mime_type in ('text/plain', 'application/x-empty'):
        return text
    if mime_type == 'text/html':
        return html2text.html2text(text)
    raise forms.ValidationError("Unexpected text field mime type: %s" % mime_type)
|
|
|
|
|
|
def unescape(text):
    """
    Returns the given text with ampersands, quotes and angle brackets decoded
    for use in URLs.

    This function undoes what django.utils.html.escape() does: it decodes
    ``&lt;``, ``&gt;``, ``&quot;``, the apostrophe entity (both the decimal
    ``&#39;`` and the hexadecimal ``&#x27;`` spelling) and ``&amp;``. Any other
    entity is left untouched.
    """
    return (
        text.replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", '"')
        .replace("&#39;", "'")
        .replace("&#x27;", "'")
        # "&amp;" must be decoded last: doing it first would turn an escaped
        # literal such as "&amp;lt;" into "&lt;" and then wrongly into "<".
        .replace("&amp;", "&")
    )
|
|
|
|
|