feat: consolidate HTML sanitizing (#8471)
* refactor: isolate bleach code
* refactor: move html fns to html.py
* refactor: lose the bleach.py module; refactor
* refactor: sanitize_document -> clean_html
  Drops the <meta charset="utf-8"> addition after cleaning.
* fix: disambiguate import
* feat: restore <meta charset="utf-8"> tag
* chore: comments
* chore(deps): drop lxml_html_clean package
* refactor: on second thought, no meta charset
* refactor: sanitize_fragment -> clean_html
* test: remove check for charset
* chore: fix lint
parent 56f723a3bc
commit e91bda7e5e
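In short: the scattered sanitizing entry points (bleach_cleaner, bleach_linker, sanitize_fragment, sanitize_document) are consolidated into clean_html, liberal_clean_html, and linkify. A minimal usage sketch of the new surface follows; the input strings are illustrative, not taken from the diff or the test suite.

# Sketch of the consolidated entry points introduced by this commit.
from ietf.utils.html import clean_html       # replaces sanitize_fragment / sanitize_document
from ietf.utils.text import linkify          # replaces direct bleach_linker.linkify calls

dirty = '<p onclick="alert(1)">hello <script>boom()</script></p>'
cleaned = clean_html(dirty)                  # disallowed tags/attributes stripped (bleach Cleaner, strip=True)
linked = linkify("see https://example.com")  # bare URLs and email addresses become <a> links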
@@ -26,10 +26,11 @@ import debug  # pyflakes:ignore
 from ietf.doc.models import BallotDocEvent, Document
 from ietf.doc.models import ConsensusDocEvent
 from ietf.ietfauth.utils import can_request_rfc_publication as utils_can_request_rfc_publication
-from ietf.utils.html import sanitize_fragment
 from ietf.utils import log
 from ietf.doc.utils import prettify_std_name
-from ietf.utils.text import wordwrap, fill, wrap_text_if_unwrapped, bleach_linker, bleach_cleaner, validate_url
+from ietf.utils.html import clean_html
+from ietf.utils.text import wordwrap, fill, wrap_text_if_unwrapped, linkify
+from ietf.utils.validators import validate_url

 register = template.Library()

@@ -98,7 +99,7 @@ def sanitize(value):
     attributes to those deemed acceptable.  See ietf/utils/html.py
     for the details.
     """
-    return mark_safe(sanitize_fragment(value))
+    return mark_safe(clean_html(value))


 # For use with ballot view
@@ -446,16 +447,16 @@ def ad_area(user):
 @register.filter
 def format_history_text(text, trunc_words=25):
     """Run history text through some cleaning and add ellipsis if it's too long."""
-    full = mark_safe(bleach_cleaner.clean(text))
-    full = bleach_linker.linkify(urlize_ietf_docs(full))
+    full = mark_safe(clean_html(text))
+    full = linkify(urlize_ietf_docs(full))

     return format_snippet(full, trunc_words)

 @register.filter
 def format_snippet(text, trunc_words=25):
     # urlize if there aren't already links present
-    text = bleach_linker.linkify(text)
-    full = keep_spacing(collapsebr(linebreaksbr(mark_safe(sanitize_fragment(text)))))
+    text = linkify(text)
+    full = keep_spacing(collapsebr(linebreaksbr(mark_safe(clean_html(text)))))
     snippet = truncatewords_html(full, trunc_words)
     if snippet != full:
         return mark_safe('<div class="snippet">%s<button type="button" aria-label="Expand" class="btn btn-sm btn-primary show-all"><i class="bi bi-caret-down"></i></button></div><div class="d-none full">%s</div>' % (snippet, full))
@@ -6423,8 +6423,7 @@ class MaterialsTests(TestCase):
         text = doc.text()
         self.assertIn('Some text', text)
         self.assertNotIn('<section>', text)
-        self.assertIn('charset="utf-8"', text)

         # txt upload
         test_file = BytesIO(b'This is some text for a test, with the word\nvirtual at the beginning of a line.')
         test_file.name = "some.txt"
@@ -30,7 +30,7 @@ from ietf.group.utils import can_manage_materials
 from ietf.name.models import SessionStatusName, ConstraintName, DocTypeName
 from ietf.person.models import Person
 from ietf.stats.models import MeetingRegistration
-from ietf.utils.html import sanitize_document
+from ietf.utils.html import clean_html
 from ietf.utils.log import log
 from ietf.utils.timezone import date_today

@@ -773,8 +773,8 @@ def handle_upload_file(file, filename, meeting, subdir, request=None, encoding=None):
         return "Failure trying to save '%s'. Hint: Try to upload as UTF-8: %s..." % (filename, str(e)[:120])
     # Whole file sanitization; add back what's missing from a complete
     # document (sanitize will remove these).
-    clean = sanitize_document(text)
-    destination.write(clean.encode('utf8'))
+    clean = clean_html(text)
+    destination.write(clean.encode("utf8"))
     if request and clean != text:
         messages.warning(request,
             (
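Note that whole uploaded documents now go through the same bleach-based cleaner as fragments. Judging from the acceptable tag list later in this diff, "html", "head", and "body" are allowed but "meta" is not, so a charset declaration is stripped and, per the commit message, no longer re-added. An illustrative sketch, not code from the commit:

# Illustrative only: document-structure tags survive clean_html, <meta> does not.
from ietf.utils.html import clean_html

page = '<html><head><meta charset="utf-8"><title>t</title></head><body><p>hi</p></body></html>'
cleaned = clean_html(page)  # structure kept; the <meta> tag is dropped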
@@ -5,11 +5,7 @@


-import copy
+import bleach
 import html2text
-import lxml.etree
-import lxml.html
-import lxml.html.clean

 import debug  # pyflakes:ignore

@@ -17,62 +13,66 @@ from django import forms
 from django.utils.functional import keep_lazy

 from ietf.utils.mime import get_mime_type
-from ietf.utils.text import bleach_cleaner, tags as acceptable_tags

-acceptable_protocols = ['http', 'https', 'mailto', 'xmpp', ]
-
-def unescape(text):
-    """
-    Returns the given text with ampersands, quotes and angle brackets decoded
-    for use in URLs.
-
-    This function undoes what django.utils.html.escape() does
-    """
-    return text.replace('&amp;', '&').replace('&#39;', "'").replace('&quot;', '"').replace('&gt;', '>').replace('&lt;', '<' )
-
+# Allow the protocols/tags/attributes we specifically want, plus anything that bleach declares
+# to be safe. As of 2025-01-27, the explicit lists for protocols and tags are a strict superset
+# of bleach's defaults.
+acceptable_protocols = bleach.sanitizer.ALLOWED_PROTOCOLS.union(
+    {"http", "https", "mailto", "ftp", "xmpp"}
+)
+acceptable_tags = bleach.sanitizer.ALLOWED_TAGS.union(
+    {
+        # fmt: off
+        "a", "abbr", "acronym", "address", "b", "big",
+        "blockquote", "body", "br", "caption", "center", "cite", "code", "col",
+        "colgroup", "dd", "del", "dfn", "dir", "div", "dl", "dt", "em", "font",
+        "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i", "ins", "kbd",
+        "li", "ol", "p", "pre", "q", "s", "samp", "small", "span", "strike", "style",
+        "strong", "sub", "sup", "table", "title", "tbody", "td", "tfoot", "th", "thead",
+        "tr", "tt", "u", "ul", "var"
+        # fmt: on
+    }
+)
+acceptable_attributes = bleach.sanitizer.ALLOWED_ATTRIBUTES | {
+    "*": ["id"],
+    "ol": ["start"],
+}
+
+
+# Instantiate sanitizer classes
+_bleach_cleaner = bleach.sanitizer.Cleaner(
+    tags=acceptable_tags,
+    attributes=acceptable_attributes,
+    protocols=acceptable_protocols,
+    strip=True,
+)
+
+
+_liberal_bleach_cleaner = bleach.sanitizer.Cleaner(
+    tags=acceptable_tags.union({"img", "figure", "figcaption"}),
+    attributes=acceptable_attributes | {"img": ["src", "alt"]},
+    protocols=acceptable_protocols,
+    strip=True,
+)
+
+
+def clean_html(text: str):
+    """Clean the HTML in a string"""
+    return _bleach_cleaner.clean(text)
+
+
+def liberal_clean_html(text: str):
+    """More permissively clean the HTML in a string"""
+    return _liberal_bleach_cleaner.clean(text)
+

 @keep_lazy(str)
 def remove_tags(html, tags):
     """Returns the given HTML sanitized, and with the given tags removed."""
-    allowed = set(acceptable_tags) - set([ t.lower() for t in tags ])
+    allowed = acceptable_tags - set(t.lower() for t in tags)
     return bleach.clean(html, tags=allowed, strip=True)

-# ----------------------------------------------------------------------
-# Html fragment cleaning
-
-def sanitize_fragment(html):
-    return bleach_cleaner.clean(html)
-
-# ----------------------------------------------------------------------
-# Page cleaning
-
-
-class Cleaner(lxml.html.clean.Cleaner):
-    charset = 'utf-8'
-    def __init__(self, charset='utf-8', **kw):
-        self.charset = charset
-        super(Cleaner, self).__init__(**kw)
-
-    # Copied from lxml 4.2.0 and modified to insert charset meta:
-    def clean_html(self, html):
-        result_type = type(html)
-        if isinstance(html, (str, bytes)):
-            doc = lxml.html.fromstring(html)
-        else:
-            doc = copy.deepcopy(html)
-        self(doc)
-        head = doc.find('head')
-        if head != None:
-            meta = lxml.etree.Element('meta', charset=self.charset)
-            meta.tail = '\n'
-            head.insert(0, meta)
-        return lxml.html._transform_result(result_type, doc)
-
-# We will be saving as utf-8 later, so set that in the meta tag.
-lxml_cleaner = Cleaner(allow_tags=acceptable_tags, remove_unknown_tags=None, style=False, page_structure=False, charset='utf-8')
-
-def sanitize_document(html):
-    return lxml_cleaner.clean_html(html)
-

 # ----------------------------------------------------------------------
 # Text field cleaning
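The only difference between the two cleaners is the image-related additions, so the distinction is easy to show. An illustrative sketch (input not from the diff):

# Illustrative: the strict cleaner strips images, the liberal one admits them.
snippet = '<p>logo: <img src="https://example.com/x.png" alt="x"></p>'
clean_html(snippet)          # the <img> element is removed (strip=True, img not in acceptable_tags)
liberal_clean_html(snippet)  # <img src=... alt=...> is preserved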
@@ -86,4 +86,15 @@ def clean_text_field(text):
     else:
         raise forms.ValidationError("Unexpected text field mime type: %s" % mime_type)
     return text
+
+
+def unescape(text):
+    """
+    Returns the given text with ampersands, quotes and angle brackets decoded
+    for use in URLs.
+
+    This function undoes what django.utils.html.escape() does
+    """
+    return text.replace('&amp;', '&').replace('&#39;', "'").replace('&quot;', '"').replace('&gt;', '>').replace('&lt;', '<' )
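Since unescape is the inverse of django.utils.html.escape(), a quick round-trip check illustrates it. A sketch only; it avoids single quotes, whose entity form varies across Django versions:

# Sketch: unescape undoes django.utils.html.escape for these entities.
from django.utils.html import escape

s = 'a < b & "c"'
assert unescape(escape(s)) == s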
@@ -12,13 +12,15 @@ from markdown.postprocessors import Postprocessor
 from django.utils.safestring import mark_safe

 from ietf.doc.templatetags.ietf_filters import urlize_ietf_docs
-from ietf.utils.text import bleach_cleaner, liberal_bleach_cleaner, bleach_linker
+from .html import clean_html, liberal_clean_html
+from .text import linkify


 class LinkifyExtension(Extension):
     """
     Simple Markdown extension inspired by https://github.com/daGrevis/mdx_linkify,
-    but using our bleach_linker directly. Doing the linkification on the converted
+    but using our own linker directly. Doing the linkification on the converted
     Markdown output introduces artifacts.
     """

@@ -31,12 +33,12 @@ class LinkifyExtension(Extension):

 class LinkifyPostprocessor(Postprocessor):
     def run(self, text):
-        return urlize_ietf_docs(bleach_linker.linkify(text))
+        return urlize_ietf_docs(linkify(text))


 def markdown(text):
     return mark_safe(
-        bleach_cleaner.clean(
+        clean_html(
             python_markdown.markdown(
                 text,
                 extensions=[
@@ -52,7 +54,7 @@ def markdown(text):

 def liberal_markdown(text):
     return mark_safe(
-        liberal_bleach_cleaner.clean(
+        liberal_clean_html(
             python_markdown.markdown(
                 text,
                 extensions=[
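Both entry points keep the same shape: render with python-markdown (linkification happens inside the pipeline via LinkifyExtension), then sanitize the rendered output. A usage sketch, assuming this diff is the ietf.utils.markdown module (inferred from the relative .html/.text imports above):

# Usage sketch; module path assumed, output details depend on the extensions configured above.
from ietf.utils.markdown import markdown

safe_html = markdown("See https://example.com and **bold**")
# the bare URL is auto-linked during conversion; the result is run through clean_html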
@@ -11,7 +11,7 @@ from django.utils.safestring import mark_safe

 import debug  # pyflakes:ignore

-from ietf.utils.text import xslugify as _xslugify, texescape, bleach_linker
+from ietf.utils.text import linkify as _linkify, xslugify as _xslugify, texescape

 register = template.Library()

@@ -74,7 +74,7 @@ def texescape_filter(value):
 @register.filter
 @stringfilter
 def linkify(value):
-    text = mark_safe(bleach_linker.linkify(value))
+    text = mark_safe(_linkify(value))
     return text

 @register.filter
@@ -1,17 +1,15 @@
 # Copyright The IETF Trust 2016-2020, All Rights Reserved
 # -*- coding: utf-8 -*-


-import bleach  # type: ignore
-import copy
+import bleach
 import email
 import re
 import textwrap
 import tlds
 import unicodedata

-from django.core.validators import URLValidator
 from django.core.exceptions import ValidationError
+from django.core.validators import URLValidator
 from django.utils.functional import keep_lazy
 from django.utils.safestring import mark_safe

@@ -19,66 +17,52 @@ import debug  # pyflakes:ignore

 from .texescape import init as texescape_init, tex_escape_map

-tlds_sorted = sorted(tlds.tld_set, key=len, reverse=True)
-protocols = set(bleach.sanitizer.ALLOWED_PROTOCOLS)
-protocols.add("ftp")  # we still have some ftp links
-protocols.add("xmpp")  # we still have some xmpp links
-
-tags = set(bleach.sanitizer.ALLOWED_TAGS).union(
-    {
-        # fmt: off
-        'a', 'abbr', 'acronym', 'address', 'b', 'big',
-        'blockquote', 'body', 'br', 'caption', 'center', 'cite', 'code', 'col',
-        'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'font',
-        'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'hr', 'html', 'i', 'ins', 'kbd',
-        'li', 'ol', 'p', 'pre', 'q', 's', 'samp', 'small', 'span', 'strike', 'style',
-        'strong', 'sub', 'sup', 'table', 'title', 'tbody', 'td', 'tfoot', 'th', 'thead',
-        'tr', 'tt', 'u', 'ul', 'var'
-        # fmt: on
-    }
-)
-
-attributes = copy.copy(bleach.sanitizer.ALLOWED_ATTRIBUTES)
-attributes["*"] = ["id"]
-attributes["ol"] = ["start"]
-
-bleach_cleaner = bleach.sanitizer.Cleaner(
-    tags=tags, attributes=attributes, protocols=protocols, strip=True
-)
-
-liberal_tags = copy.copy(tags)
-liberal_attributes = copy.copy(attributes)
-liberal_tags.update(["img","figure","figcaption"])
-liberal_attributes["img"] = ["src","alt"]
-
-liberal_bleach_cleaner = bleach.sanitizer.Cleaner(
-    tags=liberal_tags, attributes=liberal_attributes, protocols=protocols, strip=True
-)
-
-validate_url = URLValidator()
+# Sort in reverse so substrings are considered later - e.g., so ".co" comes after ".com".
+tlds_sorted = sorted(tlds.tld_set, reverse=True)
+
+# Protocols we're interested in auto-linking. See also ietf.utils.html.acceptable_protocols,
+# which is protocols we allow people to include explicitly in sanitized html.
+linkable_protocols = ["http", "https", "mailto", "ftp", "xmpp"]
+
+_validate_url = URLValidator()


 def check_url_validity(attrs, new=False):
     """Callback for bleach linkify

     :param attrs: dict of attributes of the <a> tag
     :param new: boolean - True if the link is new; False if <a> was found in text
     :return: new dict of attributes for the link, or None to block link creation

     Attributes are namespaced, so normally look like `(None, "SomeAttribute")`.
     This includes as the keys in the `attrs` argument, so `attrs[(None, "href")]`
     would be the value of the href attribute.
     """
     if (None, "href") not in attrs:
         # rfc2html creates a tags without href
         return attrs
     url = attrs[(None, "href")]
     try:
         if url.startswith("http"):
-            validate_url(url)
+            _validate_url(url)
     except ValidationError:
         return None
     return attrs


-bleach_linker = bleach.Linker(
+_bleach_linker = bleach.Linker(
     callbacks=[check_url_validity],
-    url_re=bleach.linkifier.build_url_re(tlds=tlds_sorted, protocols=protocols),
+    url_re=bleach.linkifier.build_url_re(tlds=tlds_sorted, protocols=linkable_protocols),
     email_re=bleach.linkifier.build_email_re(tlds=tlds_sorted),  # type: ignore
     parse_email=True,
 )


+def linkify(text):
+    return _bleach_linker.linkify(text)
+
+
 @keep_lazy(str)
 def xslugify(value):
     """
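The module now exposes linkify as the public wrapper around the private _bleach_linker, and check_url_validity vetoes links whose http(s) target fails Django's URLValidator. Illustrative usage (output shape, not exact markup):

# Illustrative: both URLs and email addresses come back wrapped in <a> tags
# (parse_email=True); http(s) targets that fail validation are left unlinked.
from ietf.utils.text import linkify

print(linkify("Contact someone@example.com or see https://example.com"))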
@@ -43,8 +43,7 @@ jsonfield>=3.1.0  # for SubmissionCheck. This is https://github.com/bradjaspe
 jsonschema[format]>=4.2.1
 jwcrypto>=1.2  # for signed notifications - this is aspirational, and is not really used.
 logging_tree>=1.9  # Used only by the showloggers management command
-lxml>=5.3.0  # lxml[html_clean] fails on some architectures
-lxml_html_clean>=0.4.1
+lxml>=5.3.0
 markdown>=3.3.6
 types-markdown>=3.3.6
 mock>=4.0.3  # Used only by tests, of course