feat: consolidate HTML sanitizing (#8471)

* refactor: isolate bleach code

* refactor: move html fns to html.py

* refactor: lose the bleach.py module; refactor

* refactor: sanitize_document -> clean_html

Drops <meta charset="utf-8"> addition after cleaning.

* fix: disambiguate import

* feat: restore <meta charset="utf-8"> tag

* chore: comments

* chore(deps): drop lxml_html_clean package

* refactor: on second thought, no meta charset

* refactor: sanitize_fragment -> clean_html

* test: remove check for charset

* chore: fix lint
This commit is contained in:
Jennifer Richards 2025-01-28 13:28:19 -04:00 committed by GitHub
parent 56f723a3bc
commit e91bda7e5e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 110 additions and 114 deletions

View file

@@ -26,10 +26,11 @@ import debug # pyflakes:ignore
from ietf.doc.models import BallotDocEvent, Document
from ietf.doc.models import ConsensusDocEvent
from ietf.ietfauth.utils import can_request_rfc_publication as utils_can_request_rfc_publication
from ietf.utils.html import sanitize_fragment
from ietf.utils import log
from ietf.doc.utils import prettify_std_name
from ietf.utils.text import wordwrap, fill, wrap_text_if_unwrapped, bleach_linker, bleach_cleaner, validate_url
from ietf.utils.html import clean_html
from ietf.utils.text import wordwrap, fill, wrap_text_if_unwrapped, linkify
from ietf.utils.validators import validate_url
register = template.Library()
@@ -98,7 +99,7 @@ def sanitize(value):
attributes to those deemed acceptable. See ietf/utils/html.py
for the details.
"""
return mark_safe(sanitize_fragment(value))
return mark_safe(clean_html(value))
# For use with ballot view
@@ -446,16 +447,16 @@ def ad_area(user):
@register.filter
def format_history_text(text, trunc_words=25):
"""Run history text through some cleaning and add ellipsis if it's too long."""
full = mark_safe(bleach_cleaner.clean(text))
full = bleach_linker.linkify(urlize_ietf_docs(full))
full = mark_safe(clean_html(text))
full = linkify(urlize_ietf_docs(full))
return format_snippet(full, trunc_words)
@register.filter
def format_snippet(text, trunc_words=25):
# urlize if there aren't already links present
text = bleach_linker.linkify(text)
full = keep_spacing(collapsebr(linebreaksbr(mark_safe(sanitize_fragment(text)))))
text = linkify(text)
full = keep_spacing(collapsebr(linebreaksbr(mark_safe(clean_html(text)))))
snippet = truncatewords_html(full, trunc_words)
if snippet != full:
return mark_safe('<div class="snippet">%s<button type="button" aria-label="Expand" class="btn btn-sm btn-primary show-all"><i class="bi bi-caret-down"></i></button></div><div class="d-none full">%s</div>' % (snippet, full))

View file

@@ -6423,8 +6423,7 @@ class MaterialsTests(TestCase):
text = doc.text()
self.assertIn('Some text', text)
self.assertNotIn('<section>', text)
self.assertIn('charset="utf-8"', text)
# txt upload
test_file = BytesIO(b'This is some text for a test, with the word\nvirtual at the beginning of a line.')
test_file.name = "some.txt"

View file

@@ -30,7 +30,7 @@ from ietf.group.utils import can_manage_materials
from ietf.name.models import SessionStatusName, ConstraintName, DocTypeName
from ietf.person.models import Person
from ietf.stats.models import MeetingRegistration
from ietf.utils.html import sanitize_document
from ietf.utils.html import clean_html
from ietf.utils.log import log
from ietf.utils.timezone import date_today
@@ -773,8 +773,8 @@ def handle_upload_file(file, filename, meeting, subdir, request=None, encoding=N
return "Failure trying to save '%s'. Hint: Try to upload as UTF-8: %s..." % (filename, str(e)[:120])
# Whole file sanitization; add back what's missing from a complete
# document (sanitize will remove these).
clean = sanitize_document(text)
destination.write(clean.encode('utf8'))
clean = clean_html(text)
destination.write(clean.encode("utf8"))
if request and clean != text:
messages.warning(request,
(

View file

@@ -5,11 +5,7 @@
import bleach
import copy
import html2text
import lxml.etree
import lxml.html
import lxml.html.clean
import debug # pyflakes:ignore
@@ -17,62 +13,66 @@ from django import forms
from django.utils.functional import keep_lazy
from ietf.utils.mime import get_mime_type
from ietf.utils.text import bleach_cleaner, tags as acceptable_tags
acceptable_protocols = ['http', 'https', 'mailto', 'xmpp', ]
def unescape(text):
"""
Returns the given text with ampersands, quotes and angle brackets decoded
for use in URLs.
# Allow the protocols/tags/attributes we specifically want, plus anything that bleach declares
# to be safe. As of 2025-01-27, the explicit lists for protocols and tags are a strict superset
# of bleach's defaults.
acceptable_protocols = bleach.sanitizer.ALLOWED_PROTOCOLS.union(
{"http", "https", "mailto", "ftp", "xmpp"}
)
acceptable_tags = bleach.sanitizer.ALLOWED_TAGS.union(
{
# fmt: off
"a", "abbr", "acronym", "address", "b", "big",
"blockquote", "body", "br", "caption", "center", "cite", "code", "col",
"colgroup", "dd", "del", "dfn", "dir", "div", "dl", "dt", "em", "font",
"h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i", "ins", "kbd",
"li", "ol", "p", "pre", "q", "s", "samp", "small", "span", "strike", "style",
"strong", "sub", "sup", "table", "title", "tbody", "td", "tfoot", "th", "thead",
"tr", "tt", "u", "ul", "var"
# fmt: on
}
)
acceptable_attributes = bleach.sanitizer.ALLOWED_ATTRIBUTES | {
"*": ["id"],
"ol": ["start"],
}
# Instantiate sanitizer classes
_bleach_cleaner = bleach.sanitizer.Cleaner(
tags=acceptable_tags,
attributes=acceptable_attributes,
protocols=acceptable_protocols,
strip=True,
)
_liberal_bleach_cleaner = bleach.sanitizer.Cleaner(
tags=acceptable_tags.union({"img", "figure", "figcaption"}),
attributes=acceptable_attributes | {"img": ["src", "alt"]},
protocols=acceptable_protocols,
strip=True,
)
def clean_html(text: str):
"""Clean the HTML in a string"""
return _bleach_cleaner.clean(text)
def liberal_clean_html(text: str):
"""More permissively clean the HTML in a string"""
return _liberal_bleach_cleaner.clean(text)
This function undoes what django.utils.html.escape() does
"""
return text.replace('&amp;', '&').replace('&#39;', "'").replace('&quot;', '"').replace('&gt;', '>').replace('&lt;', '<' )
@keep_lazy(str)
def remove_tags(html, tags):
"""Returns the given HTML sanitized, and with the given tags removed."""
allowed = set(acceptable_tags) - set([ t.lower() for t in tags ])
allowed = acceptable_tags - set(t.lower() for t in tags)
return bleach.clean(html, tags=allowed, strip=True)
# ----------------------------------------------------------------------
# Html fragment cleaning
def sanitize_fragment(html):
return bleach_cleaner.clean(html)
# ----------------------------------------------------------------------
# Page cleaning
class Cleaner(lxml.html.clean.Cleaner):
charset = 'utf-8'
def __init__(self, charset='utf-8', **kw):
self.charset = charset
super(Cleaner, self).__init__(**kw)
# Copied from lxml 4.2.0 and modified to insert charset meta:
def clean_html(self, html):
result_type = type(html)
if isinstance(html, (str, bytes)):
doc = lxml.html.fromstring(html)
else:
doc = copy.deepcopy(html)
self(doc)
head = doc.find('head')
if head != None:
meta = lxml.etree.Element('meta', charset=self.charset)
meta.tail = '\n'
head.insert(0, meta)
return lxml.html._transform_result(result_type, doc)
# We will be saving as utf-8 later, so set that in the meta tag.
lxml_cleaner = Cleaner(allow_tags=acceptable_tags, remove_unknown_tags=None, style=False, page_structure=False, charset='utf-8')
def sanitize_document(html):
return lxml_cleaner.clean_html(html)
# ----------------------------------------------------------------------
# Text field cleaning
@@ -86,4 +86,15 @@ def clean_text_field(text):
else:
raise forms.ValidationError("Unexpected text field mime type: %s" % mime_type)
return text
def unescape(text):
"""
Returns the given text with ampersands, quotes and angle brackets decoded
for use in URLs.
This function undoes what django.utils.html.escape() does
"""
return text.replace('&amp;', '&').replace('&#39;', "'").replace('&quot;', '"').replace('&gt;', '>').replace('&lt;', '<' )

View file

@@ -12,13 +12,15 @@ from markdown.postprocessors import Postprocessor
from django.utils.safestring import mark_safe
from ietf.doc.templatetags.ietf_filters import urlize_ietf_docs
from ietf.utils.text import bleach_cleaner, liberal_bleach_cleaner, bleach_linker
from .html import clean_html, liberal_clean_html
from .text import linkify
class LinkifyExtension(Extension):
"""
Simple Markdown extension inspired by https://github.com/daGrevis/mdx_linkify,
but using our bleach_linker directly. Doing the linkification on the converted
but using our own linker directly. Doing the linkification on the converted
Markdown output introduces artifacts.
"""
@@ -31,12 +33,12 @@ class LinkifyExtension(Extension):
class LinkifyPostprocessor(Postprocessor):
def run(self, text):
return urlize_ietf_docs(bleach_linker.linkify(text))
return urlize_ietf_docs(linkify(text))
def markdown(text):
return mark_safe(
bleach_cleaner.clean(
clean_html(
python_markdown.markdown(
text,
extensions=[
@@ -52,7 +54,7 @@ def markdown(text):
def liberal_markdown(text):
return mark_safe(
liberal_bleach_cleaner.clean(
liberal_clean_html(
python_markdown.markdown(
text,
extensions=[

View file

@@ -11,7 +11,7 @@ from django.utils.safestring import mark_safe
import debug # pyflakes:ignore
from ietf.utils.text import xslugify as _xslugify, texescape, bleach_linker
from ietf.utils.text import linkify as _linkify, xslugify as _xslugify, texescape
register = template.Library()
@@ -74,7 +74,7 @@ def texescape_filter(value):
@register.filter
@stringfilter
def linkify(value):
text = mark_safe(bleach_linker.linkify(value))
text = mark_safe(_linkify(value))
return text
@register.filter

View file

@@ -1,17 +1,15 @@
# Copyright The IETF Trust 2016-2020, All Rights Reserved
# -*- coding: utf-8 -*-
import bleach # type: ignore
import copy
import bleach
import email
import re
import textwrap
import tlds
import unicodedata
from django.core.validators import URLValidator
from django.core.exceptions import ValidationError
from django.core.validators import URLValidator
from django.utils.functional import keep_lazy
from django.utils.safestring import mark_safe
@@ -19,66 +17,52 @@ import debug # pyflakes:ignore
from .texescape import init as texescape_init, tex_escape_map
tlds_sorted = sorted(tlds.tld_set, key=len, reverse=True)
protocols = set(bleach.sanitizer.ALLOWED_PROTOCOLS)
protocols.add("ftp") # we still have some ftp links
protocols.add("xmpp") # we still have some xmpp links
# Sort in reverse so substrings are considered later - e.g., so ".co" comes after ".com".
tlds_sorted = sorted(tlds.tld_set, reverse=True)
tags = set(bleach.sanitizer.ALLOWED_TAGS).union(
{
# fmt: off
'a', 'abbr', 'acronym', 'address', 'b', 'big',
'blockquote', 'body', 'br', 'caption', 'center', 'cite', 'code', 'col',
'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'font',
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'hr', 'html', 'i', 'ins', 'kbd',
'li', 'ol', 'p', 'pre', 'q', 's', 'samp', 'small', 'span', 'strike', 'style',
'strong', 'sub', 'sup', 'table', 'title', 'tbody', 'td', 'tfoot', 'th', 'thead',
'tr', 'tt', 'u', 'ul', 'var'
# fmt: on
}
)
# Protocols we're interested in auto-linking. See also ietf.utils.html.acceptable_protocols,
# which is protocols we allow people to include explicitly in sanitized html.
linkable_protocols = ["http", "https", "mailto", "ftp", "xmpp"]
attributes = copy.copy(bleach.sanitizer.ALLOWED_ATTRIBUTES)
attributes["*"] = ["id"]
attributes["ol"] = ["start"]
bleach_cleaner = bleach.sanitizer.Cleaner(
tags=tags, attributes=attributes, protocols=protocols, strip=True
)
liberal_tags = copy.copy(tags)
liberal_attributes = copy.copy(attributes)
liberal_tags.update(["img","figure","figcaption"])
liberal_attributes["img"] = ["src","alt"]
liberal_bleach_cleaner = bleach.sanitizer.Cleaner(
tags=liberal_tags, attributes=liberal_attributes, protocols=protocols, strip=True
)
validate_url = URLValidator()
_validate_url = URLValidator()
def check_url_validity(attrs, new=False):
"""Callback for bleach linkify
:param attrs: dict of attributes of the <a> tag
:param new: boolean - True if the link is new; False if <a> was found in text
:return: new dict of attributes for the link, or None to block link creation
Attributes are namespaced, so normally look like `(None, "SomeAttribute")`.
This includes as the keys in the `attrs` argument, so `attrs[(None, "href")]`
would be the value of the href attribute.
"""
if (None, "href") not in attrs:
# rfc2html creates a tags without href
return attrs
url = attrs[(None, "href")]
try:
if url.startswith("http"):
validate_url(url)
_validate_url(url)
except ValidationError:
return None
return attrs
bleach_linker = bleach.Linker(
_bleach_linker = bleach.Linker(
callbacks=[check_url_validity],
url_re=bleach.linkifier.build_url_re(tlds=tlds_sorted, protocols=protocols),
url_re=bleach.linkifier.build_url_re(tlds=tlds_sorted, protocols=linkable_protocols),
email_re=bleach.linkifier.build_email_re(tlds=tlds_sorted), # type: ignore
parse_email=True,
)
def linkify(text):
return _bleach_linker.linkify(text)
@keep_lazy(str)
def xslugify(value):
"""

View file

@@ -43,8 +43,7 @@ jsonfield>=3.1.0 # for SubmissionCheck. This is https://github.com/bradjaspe
jsonschema[format]>=4.2.1
jwcrypto>=1.2 # for signed notifications - this is aspirational, and is not really used.
logging_tree>=1.9 # Used only by the showloggers management command
lxml>=5.3.0 # lxml[html_clean] fails on some architectures
lxml_html_clean>=0.4.1
lxml>=5.3.0
markdown>=3.3.6
types-markdown>=3.3.6
mock>=4.0.3 # Used only by tests, of course