feat: consolidate HTML sanitizing (#8471)

* refactor: isolate bleach code

* refactor: move html fns to html.py

* refactor: lose the bleach.py module; refactor

* refactor: sanitize_document -> clean_html

Drops <meta charset="utf-8"> addition after cleaning.

* fix: disambiguate import

* feat: restore <meta charset="utf-8"> tag

* chore: comments

* chore(deps): drop lxml_html_clean package

* refactor: on second thought, no meta charset

* refactor: sanitize_fragment -> clean_html

* test: remove check for charset

* chore: fix lint
This commit is contained in:
Jennifer Richards 2025-01-28 13:28:19 -04:00 committed by GitHub
parent 56f723a3bc
commit e91bda7e5e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 110 additions and 114 deletions

View file

@@ -26,10 +26,11 @@ import debug # pyflakes:ignore
from ietf.doc.models import BallotDocEvent, Document from ietf.doc.models import BallotDocEvent, Document
from ietf.doc.models import ConsensusDocEvent from ietf.doc.models import ConsensusDocEvent
from ietf.ietfauth.utils import can_request_rfc_publication as utils_can_request_rfc_publication from ietf.ietfauth.utils import can_request_rfc_publication as utils_can_request_rfc_publication
from ietf.utils.html import sanitize_fragment
from ietf.utils import log from ietf.utils import log
from ietf.doc.utils import prettify_std_name from ietf.doc.utils import prettify_std_name
from ietf.utils.text import wordwrap, fill, wrap_text_if_unwrapped, bleach_linker, bleach_cleaner, validate_url from ietf.utils.html import clean_html
from ietf.utils.text import wordwrap, fill, wrap_text_if_unwrapped, linkify
from ietf.utils.validators import validate_url
register = template.Library() register = template.Library()
@@ -98,7 +99,7 @@ def sanitize(value):
attributes to those deemed acceptable. See ietf/utils/html.py attributes to those deemed acceptable. See ietf/utils/html.py
for the details. for the details.
""" """
return mark_safe(sanitize_fragment(value)) return mark_safe(clean_html(value))
# For use with ballot view # For use with ballot view
@@ -446,16 +447,16 @@ def ad_area(user):
@register.filter @register.filter
def format_history_text(text, trunc_words=25): def format_history_text(text, trunc_words=25):
"""Run history text through some cleaning and add ellipsis if it's too long.""" """Run history text through some cleaning and add ellipsis if it's too long."""
full = mark_safe(bleach_cleaner.clean(text)) full = mark_safe(clean_html(text))
full = bleach_linker.linkify(urlize_ietf_docs(full)) full = linkify(urlize_ietf_docs(full))
return format_snippet(full, trunc_words) return format_snippet(full, trunc_words)
@register.filter @register.filter
def format_snippet(text, trunc_words=25): def format_snippet(text, trunc_words=25):
# urlize if there aren't already links present # urlize if there aren't already links present
text = bleach_linker.linkify(text) text = linkify(text)
full = keep_spacing(collapsebr(linebreaksbr(mark_safe(sanitize_fragment(text))))) full = keep_spacing(collapsebr(linebreaksbr(mark_safe(clean_html(text)))))
snippet = truncatewords_html(full, trunc_words) snippet = truncatewords_html(full, trunc_words)
if snippet != full: if snippet != full:
return mark_safe('<div class="snippet">%s<button type="button" aria-label="Expand" class="btn btn-sm btn-primary show-all"><i class="bi bi-caret-down"></i></button></div><div class="d-none full">%s</div>' % (snippet, full)) return mark_safe('<div class="snippet">%s<button type="button" aria-label="Expand" class="btn btn-sm btn-primary show-all"><i class="bi bi-caret-down"></i></button></div><div class="d-none full">%s</div>' % (snippet, full))

View file

@@ -6423,8 +6423,7 @@ class MaterialsTests(TestCase):
text = doc.text() text = doc.text()
self.assertIn('Some text', text) self.assertIn('Some text', text)
self.assertNotIn('<section>', text) self.assertNotIn('<section>', text)
self.assertIn('charset="utf-8"', text)
# txt upload # txt upload
test_file = BytesIO(b'This is some text for a test, with the word\nvirtual at the beginning of a line.') test_file = BytesIO(b'This is some text for a test, with the word\nvirtual at the beginning of a line.')
test_file.name = "some.txt" test_file.name = "some.txt"

View file

@@ -30,7 +30,7 @@ from ietf.group.utils import can_manage_materials
from ietf.name.models import SessionStatusName, ConstraintName, DocTypeName from ietf.name.models import SessionStatusName, ConstraintName, DocTypeName
from ietf.person.models import Person from ietf.person.models import Person
from ietf.stats.models import MeetingRegistration from ietf.stats.models import MeetingRegistration
from ietf.utils.html import sanitize_document from ietf.utils.html import clean_html
from ietf.utils.log import log from ietf.utils.log import log
from ietf.utils.timezone import date_today from ietf.utils.timezone import date_today
@@ -773,8 +773,8 @@ def handle_upload_file(file, filename, meeting, subdir, request=None, encoding=N
return "Failure trying to save '%s'. Hint: Try to upload as UTF-8: %s..." % (filename, str(e)[:120]) return "Failure trying to save '%s'. Hint: Try to upload as UTF-8: %s..." % (filename, str(e)[:120])
# Whole file sanitization; add back what's missing from a complete # Whole file sanitization; add back what's missing from a complete
# document (sanitize will remove these). # document (sanitize will remove these).
clean = sanitize_document(text) clean = clean_html(text)
destination.write(clean.encode('utf8')) destination.write(clean.encode("utf8"))
if request and clean != text: if request and clean != text:
messages.warning(request, messages.warning(request,
( (

View file

@@ -5,11 +5,7 @@
import bleach import bleach
import copy
import html2text import html2text
import lxml.etree
import lxml.html
import lxml.html.clean
import debug # pyflakes:ignore import debug # pyflakes:ignore
@@ -17,62 +13,66 @@ from django import forms from django import forms
from django.utils.functional import keep_lazy from django.utils.functional import keep_lazy
from ietf.utils.mime import get_mime_type from ietf.utils.mime import get_mime_type
from ietf.utils.text import bleach_cleaner, tags as acceptable_tags
acceptable_protocols = ['http', 'https', 'mailto', 'xmpp', ]
def unescape(text): # Allow the protocols/tags/attributes we specifically want, plus anything that bleach declares
""" # to be safe. As of 2025-01-27, the explicit lists for protocols and tags are a strict superset
Returns the given text with ampersands, quotes and angle brackets decoded # of bleach's defaults.
for use in URLs. acceptable_protocols = bleach.sanitizer.ALLOWED_PROTOCOLS.union(
{"http", "https", "mailto", "ftp", "xmpp"}
)
acceptable_tags = bleach.sanitizer.ALLOWED_TAGS.union(
{
# fmt: off
"a", "abbr", "acronym", "address", "b", "big",
"blockquote", "body", "br", "caption", "center", "cite", "code", "col",
"colgroup", "dd", "del", "dfn", "dir", "div", "dl", "dt", "em", "font",
"h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i", "ins", "kbd",
"li", "ol", "p", "pre", "q", "s", "samp", "small", "span", "strike", "style",
"strong", "sub", "sup", "table", "title", "tbody", "td", "tfoot", "th", "thead",
"tr", "tt", "u", "ul", "var"
# fmt: on
}
)
acceptable_attributes = bleach.sanitizer.ALLOWED_ATTRIBUTES | {
"*": ["id"],
"ol": ["start"],
}
# Instantiate sanitizer classes
_bleach_cleaner = bleach.sanitizer.Cleaner(
tags=acceptable_tags,
attributes=acceptable_attributes,
protocols=acceptable_protocols,
strip=True,
)
_liberal_bleach_cleaner = bleach.sanitizer.Cleaner(
tags=acceptable_tags.union({"img", "figure", "figcaption"}),
attributes=acceptable_attributes | {"img": ["src", "alt"]},
protocols=acceptable_protocols,
strip=True,
)
def clean_html(text: str):
"""Clean the HTML in a string"""
return _bleach_cleaner.clean(text)
def liberal_clean_html(text: str):
"""More permissively clean the HTML in a string"""
return _liberal_bleach_cleaner.clean(text)
This function undoes what django.utils.html.escape() does
"""
return text.replace('&amp;', '&').replace('&#39;', "'").replace('&quot;', '"').replace('&gt;', '>').replace('&lt;', '<' )
@keep_lazy(str) @keep_lazy(str)
def remove_tags(html, tags): def remove_tags(html, tags):
"""Returns the given HTML sanitized, and with the given tags removed.""" """Returns the given HTML sanitized, and with the given tags removed."""
allowed = set(acceptable_tags) - set([ t.lower() for t in tags ]) allowed = acceptable_tags - set(t.lower() for t in tags)
return bleach.clean(html, tags=allowed, strip=True) return bleach.clean(html, tags=allowed, strip=True)
# ----------------------------------------------------------------------
# Html fragment cleaning
def sanitize_fragment(html):
return bleach_cleaner.clean(html)
# ----------------------------------------------------------------------
# Page cleaning
class Cleaner(lxml.html.clean.Cleaner):
charset = 'utf-8'
def __init__(self, charset='utf-8', **kw):
self.charset = charset
super(Cleaner, self).__init__(**kw)
# Copied from lxml 4.2.0 and modified to insert charset meta:
def clean_html(self, html):
result_type = type(html)
if isinstance(html, (str, bytes)):
doc = lxml.html.fromstring(html)
else:
doc = copy.deepcopy(html)
self(doc)
head = doc.find('head')
if head != None:
meta = lxml.etree.Element('meta', charset=self.charset)
meta.tail = '\n'
head.insert(0, meta)
return lxml.html._transform_result(result_type, doc)
# We will be saving as utf-8 later, so set that in the meta tag.
lxml_cleaner = Cleaner(allow_tags=acceptable_tags, remove_unknown_tags=None, style=False, page_structure=False, charset='utf-8')
def sanitize_document(html):
return lxml_cleaner.clean_html(html)
# ---------------------------------------------------------------------- # ----------------------------------------------------------------------
# Text field cleaning # Text field cleaning
@@ -86,4 +86,15 @@ def clean_text_field(text):
else: else:
raise forms.ValidationError("Unexpected text field mime type: %s" % mime_type) raise forms.ValidationError("Unexpected text field mime type: %s" % mime_type)
return text return text
def unescape(text):
"""
Returns the given text with ampersands, quotes and angle brackets decoded
for use in URLs.
This function undoes what django.utils.html.escape() does
"""
return text.replace('&amp;', '&').replace('&#39;', "'").replace('&quot;', '"').replace('&gt;', '>').replace('&lt;', '<' )

View file

@@ -12,13 +12,15 @@ from markdown.postprocessors import Postprocessor
from django.utils.safestring import mark_safe from django.utils.safestring import mark_safe
from ietf.doc.templatetags.ietf_filters import urlize_ietf_docs from ietf.doc.templatetags.ietf_filters import urlize_ietf_docs
from ietf.utils.text import bleach_cleaner, liberal_bleach_cleaner, bleach_linker from .html import clean_html, liberal_clean_html
from .text import linkify
class LinkifyExtension(Extension): class LinkifyExtension(Extension):
""" """
Simple Markdown extension inspired by https://github.com/daGrevis/mdx_linkify, Simple Markdown extension inspired by https://github.com/daGrevis/mdx_linkify,
but using our bleach_linker directly. Doing the linkification on the converted but using our own linker directly. Doing the linkification on the converted
Markdown output introduces artifacts. Markdown output introduces artifacts.
""" """
@@ -31,12 +33,12 @@ class LinkifyExtension(Extension):
class LinkifyPostprocessor(Postprocessor): class LinkifyPostprocessor(Postprocessor):
def run(self, text): def run(self, text):
return urlize_ietf_docs(bleach_linker.linkify(text)) return urlize_ietf_docs(linkify(text))
def markdown(text): def markdown(text):
return mark_safe( return mark_safe(
bleach_cleaner.clean( clean_html(
python_markdown.markdown( python_markdown.markdown(
text, text,
extensions=[ extensions=[
@@ -52,7 +54,7 @@ def markdown(text):
def liberal_markdown(text): def liberal_markdown(text):
return mark_safe( return mark_safe(
liberal_bleach_cleaner.clean( liberal_clean_html(
python_markdown.markdown( python_markdown.markdown(
text, text,
extensions=[ extensions=[

View file

@@ -11,7 +11,7 @@ from django.utils.safestring import mark_safe
import debug # pyflakes:ignore import debug # pyflakes:ignore
from ietf.utils.text import xslugify as _xslugify, texescape, bleach_linker from ietf.utils.text import linkify as _linkify, xslugify as _xslugify, texescape
register = template.Library() register = template.Library()
@@ -74,7 +74,7 @@ def texescape_filter(value):
@register.filter @register.filter
@stringfilter @stringfilter
def linkify(value): def linkify(value):
text = mark_safe(bleach_linker.linkify(value)) text = mark_safe(_linkify(value))
return text return text
@register.filter @register.filter

View file

@@ -1,17 +1,15 @@
# Copyright The IETF Trust 2016-2020, All Rights Reserved # Copyright The IETF Trust 2016-2020, All Rights Reserved
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import bleach
import bleach # type: ignore
import copy
import email import email
import re import re
import textwrap import textwrap
import tlds import tlds
import unicodedata import unicodedata
from django.core.validators import URLValidator
from django.core.exceptions import ValidationError from django.core.exceptions import ValidationError
from django.core.validators import URLValidator
from django.utils.functional import keep_lazy from django.utils.functional import keep_lazy
from django.utils.safestring import mark_safe from django.utils.safestring import mark_safe
@@ -19,66 +17,52 @@ import debug  # pyflakes:ignore
from .texescape import init as texescape_init, tex_escape_map from .texescape import init as texescape_init, tex_escape_map
tlds_sorted = sorted(tlds.tld_set, key=len, reverse=True) # Sort in reverse so substrings are considered later - e.g., so ".co" comes after ".com".
protocols = set(bleach.sanitizer.ALLOWED_PROTOCOLS) tlds_sorted = sorted(tlds.tld_set, reverse=True)
protocols.add("ftp") # we still have some ftp links
protocols.add("xmpp") # we still have some xmpp links
tags = set(bleach.sanitizer.ALLOWED_TAGS).union( # Protocols we're interested in auto-linking. See also ietf.utils.html.acceptable_protocols,
{ # which is protocols we allow people to include explicitly in sanitized html.
# fmt: off linkable_protocols = ["http", "https", "mailto", "ftp", "xmpp"]
'a', 'abbr', 'acronym', 'address', 'b', 'big',
'blockquote', 'body', 'br', 'caption', 'center', 'cite', 'code', 'col',
'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'font',
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'hr', 'html', 'i', 'ins', 'kbd',
'li', 'ol', 'p', 'pre', 'q', 's', 'samp', 'small', 'span', 'strike', 'style',
'strong', 'sub', 'sup', 'table', 'title', 'tbody', 'td', 'tfoot', 'th', 'thead',
'tr', 'tt', 'u', 'ul', 'var'
# fmt: on
}
)
attributes = copy.copy(bleach.sanitizer.ALLOWED_ATTRIBUTES)
attributes["*"] = ["id"]
attributes["ol"] = ["start"]
bleach_cleaner = bleach.sanitizer.Cleaner( _validate_url = URLValidator()
tags=tags, attributes=attributes, protocols=protocols, strip=True
)
liberal_tags = copy.copy(tags)
liberal_attributes = copy.copy(attributes)
liberal_tags.update(["img","figure","figcaption"])
liberal_attributes["img"] = ["src","alt"]
liberal_bleach_cleaner = bleach.sanitizer.Cleaner(
tags=liberal_tags, attributes=liberal_attributes, protocols=protocols, strip=True
)
validate_url = URLValidator()
def check_url_validity(attrs, new=False): def check_url_validity(attrs, new=False):
"""Callback for bleach linkify
:param attrs: dict of attributes of the <a> tag
:param new: boolean - True if the link is new; False if <a> was found in text
:return: new dict of attributes for the link, or None to block link creation
Attributes are namespaced, so normally look like `(None, "SomeAttribute")`.
This includes as the keys in the `attrs` argument, so `attrs[(None, "href")]`
would be the value of the href attribute.
"""
if (None, "href") not in attrs: if (None, "href") not in attrs:
# rfc2html creates a tags without href # rfc2html creates a tags without href
return attrs return attrs
url = attrs[(None, "href")] url = attrs[(None, "href")]
try: try:
if url.startswith("http"): if url.startswith("http"):
validate_url(url) _validate_url(url)
except ValidationError: except ValidationError:
return None return None
return attrs return attrs
bleach_linker = bleach.Linker( _bleach_linker = bleach.Linker(
callbacks=[check_url_validity], callbacks=[check_url_validity],
url_re=bleach.linkifier.build_url_re(tlds=tlds_sorted, protocols=protocols), url_re=bleach.linkifier.build_url_re(tlds=tlds_sorted, protocols=linkable_protocols),
email_re=bleach.linkifier.build_email_re(tlds=tlds_sorted), # type: ignore email_re=bleach.linkifier.build_email_re(tlds=tlds_sorted), # type: ignore
parse_email=True, parse_email=True,
) )
def linkify(text):
return _bleach_linker.linkify(text)
@keep_lazy(str) @keep_lazy(str)
def xslugify(value): def xslugify(value):
""" """

View file

@@ -43,8 +43,7 @@ jsonfield>=3.1.0 # for SubmissionCheck. This is https://github.com/bradjaspe
jsonschema[format]>=4.2.1 jsonschema[format]>=4.2.1
jwcrypto>=1.2 # for signed notifications - this is aspirational, and is not really used. jwcrypto>=1.2 # for signed notifications - this is aspirational, and is not really used.
logging_tree>=1.9 # Used only by the showloggers management command logging_tree>=1.9 # Used only by the showloggers management command
lxml>=5.3.0 # lxml[html_clean] fails on some architectures lxml>=5.3.0
lxml_html_clean>=0.4.1
markdown>=3.3.6 markdown>=3.3.6
types-markdown>=3.3.6 types-markdown>=3.3.6
mock>=4.0.3 # Used only by tests, of course mock>=4.0.3 # Used only by tests, of course