diff --git a/ietf/doc/templatetags/ietf_filters.py b/ietf/doc/templatetags/ietf_filters.py index 5a8afd995..e1a80a26b 100644 --- a/ietf/doc/templatetags/ietf_filters.py +++ b/ietf/doc/templatetags/ietf_filters.py @@ -26,10 +26,11 @@ import debug # pyflakes:ignore from ietf.doc.models import BallotDocEvent, Document from ietf.doc.models import ConsensusDocEvent from ietf.ietfauth.utils import can_request_rfc_publication as utils_can_request_rfc_publication -from ietf.utils.html import sanitize_fragment from ietf.utils import log from ietf.doc.utils import prettify_std_name -from ietf.utils.text import wordwrap, fill, wrap_text_if_unwrapped, bleach_linker, bleach_cleaner, validate_url +from ietf.utils.html import clean_html +from ietf.utils.text import wordwrap, fill, wrap_text_if_unwrapped, linkify +from ietf.utils.validators import validate_url register = template.Library() @@ -98,7 +99,7 @@ def sanitize(value): attributes to those deemed acceptable. See ietf/utils/html.py for the details. """ - return mark_safe(sanitize_fragment(value)) + return mark_safe(clean_html(value)) # For use with ballot view @@ -446,16 +447,16 @@ def ad_area(user): @register.filter def format_history_text(text, trunc_words=25): """Run history text through some cleaning and add ellipsis if it's too long.""" - full = mark_safe(bleach_cleaner.clean(text)) - full = bleach_linker.linkify(urlize_ietf_docs(full)) + full = mark_safe(clean_html(text)) + full = linkify(urlize_ietf_docs(full)) return format_snippet(full, trunc_words) @register.filter def format_snippet(text, trunc_words=25): # urlize if there aren't already links present - text = bleach_linker.linkify(text) - full = keep_spacing(collapsebr(linebreaksbr(mark_safe(sanitize_fragment(text))))) + text = linkify(text) + full = keep_spacing(collapsebr(linebreaksbr(mark_safe(clean_html(text))))) snippet = truncatewords_html(full, trunc_words) if snippet != full: return mark_safe('
%s
%s
' % (snippet, full)) diff --git a/ietf/meeting/tests_views.py b/ietf/meeting/tests_views.py index 0647da52a..b52f8de9b 100644 --- a/ietf/meeting/tests_views.py +++ b/ietf/meeting/tests_views.py @@ -6423,8 +6423,7 @@ class MaterialsTests(TestCase): text = doc.text() self.assertIn('Some text', text) self.assertNotIn('
', text) - self.assertIn('charset="utf-8"', text) - + # txt upload test_file = BytesIO(b'This is some text for a test, with the word\nvirtual at the beginning of a line.') test_file.name = "some.txt" diff --git a/ietf/meeting/utils.py b/ietf/meeting/utils.py index cfe7adfae..4f800980c 100644 --- a/ietf/meeting/utils.py +++ b/ietf/meeting/utils.py @@ -30,7 +30,7 @@ from ietf.group.utils import can_manage_materials from ietf.name.models import SessionStatusName, ConstraintName, DocTypeName from ietf.person.models import Person from ietf.stats.models import MeetingRegistration -from ietf.utils.html import sanitize_document +from ietf.utils.html import clean_html from ietf.utils.log import log from ietf.utils.timezone import date_today @@ -773,8 +773,8 @@ def handle_upload_file(file, filename, meeting, subdir, request=None, encoding=N return "Failure trying to save '%s'. Hint: Try to upload as UTF-8: %s..." % (filename, str(e)[:120]) # Whole file sanitization; add back what's missing from a complete # document (sanitize will remove these). - clean = sanitize_document(text) - destination.write(clean.encode('utf8')) + clean = clean_html(text) + destination.write(clean.encode("utf8")) if request and clean != text: messages.warning(request, ( diff --git a/ietf/utils/html.py b/ietf/utils/html.py index 9d0cd7c84..3f3efe2f3 100644 --- a/ietf/utils/html.py +++ b/ietf/utils/html.py @@ -5,11 +5,7 @@ import bleach -import copy import html2text -import lxml.etree -import lxml.html -import lxml.html.clean import debug # pyflakes:ignore @@ -17,62 +13,66 @@ from django import forms from django.utils.functional import keep_lazy from ietf.utils.mime import get_mime_type -from ietf.utils.text import bleach_cleaner, tags as acceptable_tags -acceptable_protocols = ['http', 'https', 'mailto', 'xmpp', ] -def unescape(text): - """ - Returns the given text with ampersands, quotes and angle brackets decoded - for use in URLs. 
+# Allow the protocols/tags/attributes we specifically want, plus anything that bleach declares +# to be safe. As of 2025-01-27, the explicit lists for protocols and tags are a strict superset +# of bleach's defaults. +acceptable_protocols = bleach.sanitizer.ALLOWED_PROTOCOLS.union( + {"http", "https", "mailto", "ftp", "xmpp"} +) +acceptable_tags = bleach.sanitizer.ALLOWED_TAGS.union( + { + # fmt: off + "a", "abbr", "acronym", "address", "b", "big", + "blockquote", "body", "br", "caption", "center", "cite", "code", "col", + "colgroup", "dd", "del", "dfn", "dir", "div", "dl", "dt", "em", "font", + "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i", "ins", "kbd", + "li", "ol", "p", "pre", "q", "s", "samp", "small", "span", "strike", "style", + "strong", "sub", "sup", "table", "title", "tbody", "td", "tfoot", "th", "thead", + "tr", "tt", "u", "ul", "var" + # fmt: on + } +) +acceptable_attributes = bleach.sanitizer.ALLOWED_ATTRIBUTES | { + "*": ["id"], + "ol": ["start"], +} + + +# Instantiate sanitizer classes +_bleach_cleaner = bleach.sanitizer.Cleaner( + tags=acceptable_tags, + attributes=acceptable_attributes, + protocols=acceptable_protocols, + strip=True, +) + + +_liberal_bleach_cleaner = bleach.sanitizer.Cleaner( + tags=acceptable_tags.union({"img", "figure", "figcaption"}), + attributes=acceptable_attributes | {"img": ["src", "alt"]}, + protocols=acceptable_protocols, + strip=True, +) + + +def clean_html(text: str): + """Clean the HTML in a string""" + return _bleach_cleaner.clean(text) + + +def liberal_clean_html(text: str): + """More permissively clean the HTML in a string""" + return _liberal_bleach_cleaner.clean(text) - This function undoes what django.utils.html.escape() does - """ - return text.replace('&amp;', '&').replace('&#39;', "'").replace('&quot;', '"').replace('&gt;', '>').replace('&lt;', '<' ) @keep_lazy(str) def remove_tags(html, tags): """Returns the given HTML sanitized, and with the given tags removed.""" - allowed = set(acceptable_tags) - set([ 
t.lower() for t in tags ]) + allowed = acceptable_tags - set(t.lower() for t in tags) return bleach.clean(html, tags=allowed, strip=True) -# ---------------------------------------------------------------------- -# Html fragment cleaning - -def sanitize_fragment(html): - return bleach_cleaner.clean(html) - -# ---------------------------------------------------------------------- -# Page cleaning - - -class Cleaner(lxml.html.clean.Cleaner): - charset = 'utf-8' - def __init__(self, charset='utf-8', **kw): - self.charset = charset - super(Cleaner, self).__init__(**kw) - - # Copied from lxml 4.2.0 and modified to insert charset meta: - def clean_html(self, html): - result_type = type(html) - if isinstance(html, (str, bytes)): - doc = lxml.html.fromstring(html) - else: - doc = copy.deepcopy(html) - self(doc) - head = doc.find('head') - if head != None: - meta = lxml.etree.Element('meta', charset=self.charset) - meta.tail = '\n' - head.insert(0, meta) - return lxml.html._transform_result(result_type, doc) - -# We will be saving as utf-8 later, so set that in the meta tag. -lxml_cleaner = Cleaner(allow_tags=acceptable_tags, remove_unknown_tags=None, style=False, page_structure=False, charset='utf-8') - -def sanitize_document(html): - return lxml_cleaner.clean_html(html) - # ---------------------------------------------------------------------- # Text field cleaning @@ -86,4 +86,15 @@ def clean_text_field(text): else: raise forms.ValidationError("Unexpected text field mime type: %s" % mime_type) return text - + + +def unescape(text): + """ + Returns the given text with ampersands, quotes and angle brackets decoded + for use in URLs. 
+ + This function undoes what django.utils.html.escape() does + """ + return text.replace('&amp;', '&').replace('&#39;', "'").replace('&quot;', '"').replace('&gt;', '>').replace('&lt;', '<' ) + + diff --git a/ietf/utils/markdown.py b/ietf/utils/markdown.py index 446d34895..0b522685b 100644 --- a/ietf/utils/markdown.py +++ b/ietf/utils/markdown.py @@ -12,13 +12,15 @@ from markdown.postprocessors import Postprocessor from django.utils.safestring import mark_safe from ietf.doc.templatetags.ietf_filters import urlize_ietf_docs -from ietf.utils.text import bleach_cleaner, liberal_bleach_cleaner, bleach_linker +from .html import clean_html, liberal_clean_html +from .text import linkify + class LinkifyExtension(Extension): """ Simple Markdown extension inspired by https://github.com/daGrevis/mdx_linkify, - but using our bleach_linker directly. Doing the linkification on the converted + but using our own linker directly. Doing the linkification on the converted Markdown output introduces artifacts. """ @@ -31,12 +33,12 @@ class LinkifyExtension(Extension): class LinkifyPostprocessor(Postprocessor): def run(self, text): - return urlize_ietf_docs(bleach_linker.linkify(text)) + return urlize_ietf_docs(linkify(text)) def markdown(text): return mark_safe( - bleach_cleaner.clean( + clean_html( python_markdown.markdown( text, extensions=[ @@ -52,7 +54,7 @@ def markdown(text): def liberal_markdown(text): return mark_safe( - liberal_bleach_cleaner.clean( + liberal_clean_html( python_markdown.markdown( text, extensions=[ diff --git a/ietf/utils/templatetags/textfilters.py b/ietf/utils/templatetags/textfilters.py index 70b94cf67..3b240740e 100644 --- a/ietf/utils/templatetags/textfilters.py +++ b/ietf/utils/templatetags/textfilters.py @@ -11,7 +11,7 @@ from django.utils.safestring import mark_safe import debug # pyflakes:ignore -from ietf.utils.text import xslugify as _xslugify, texescape, bleach_linker +from ietf.utils.text import linkify as _linkify, xslugify as _xslugify, texescape register = 
template.Library() @@ -74,7 +74,7 @@ def texescape_filter(value): @register.filter @stringfilter def linkify(value): - text = mark_safe(bleach_linker.linkify(value)) + text = mark_safe(_linkify(value)) return text @register.filter diff --git a/ietf/utils/text.py b/ietf/utils/text.py index 2fba113d0..4e5d5b6cd 100644 --- a/ietf/utils/text.py +++ b/ietf/utils/text.py @@ -1,17 +1,15 @@ # Copyright The IETF Trust 2016-2020, All Rights Reserved # -*- coding: utf-8 -*- - -import bleach # type: ignore -import copy +import bleach import email import re import textwrap import tlds import unicodedata -from django.core.validators import URLValidator from django.core.exceptions import ValidationError +from django.core.validators import URLValidator from django.utils.functional import keep_lazy from django.utils.safestring import mark_safe @@ -19,66 +17,52 @@ import debug # pyflakes:ignore from .texescape import init as texescape_init, tex_escape_map -tlds_sorted = sorted(tlds.tld_set, key=len, reverse=True) -protocols = set(bleach.sanitizer.ALLOWED_PROTOCOLS) -protocols.add("ftp") # we still have some ftp links -protocols.add("xmpp") # we still have some xmpp links +# Sort in reverse so substrings are considered later - e.g., so ".co" comes after ".com". +tlds_sorted = sorted(tlds.tld_set, reverse=True) -tags = set(bleach.sanitizer.ALLOWED_TAGS).union( - { - # fmt: off - 'a', 'abbr', 'acronym', 'address', 'b', 'big', - 'blockquote', 'body', 'br', 'caption', 'center', 'cite', 'code', 'col', - 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'font', - 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'hr', 'html', 'i', 'ins', 'kbd', - 'li', 'ol', 'p', 'pre', 'q', 's', 'samp', 'small', 'span', 'strike', 'style', - 'strong', 'sub', 'sup', 'table', 'title', 'tbody', 'td', 'tfoot', 'th', 'thead', - 'tr', 'tt', 'u', 'ul', 'var' - # fmt: on - } -) +# Protocols we're interested in auto-linking. 
See also ietf.utils.html.acceptable_protocols, +# which is protocols we allow people to include explicitly in sanitized html. +linkable_protocols = ["http", "https", "mailto", "ftp", "xmpp"] -attributes = copy.copy(bleach.sanitizer.ALLOWED_ATTRIBUTES) -attributes["*"] = ["id"] -attributes["ol"] = ["start"] -bleach_cleaner = bleach.sanitizer.Cleaner( - tags=tags, attributes=attributes, protocols=protocols, strip=True -) - -liberal_tags = copy.copy(tags) -liberal_attributes = copy.copy(attributes) -liberal_tags.update(["img","figure","figcaption"]) -liberal_attributes["img"] = ["src","alt"] - -liberal_bleach_cleaner = bleach.sanitizer.Cleaner( - tags=liberal_tags, attributes=liberal_attributes, protocols=protocols, strip=True -) - -validate_url = URLValidator() +_validate_url = URLValidator() def check_url_validity(attrs, new=False): + """Callback for bleach linkify + + :param attrs: dict of attributes of the tag + :param new: boolean - True if the link is new; False if was found in text + :return: new dict of attributes for the link, or None to block link creation + + Attributes are namespaced, so normally look like `(None, "SomeAttribute")`. + This includes as the keys in the `attrs` argument, so `attrs[(None, "href")]` + would be the value of the href attribute. 
+ """ if (None, "href") not in attrs: # rfc2html creates a tags without href return attrs url = attrs[(None, "href")] try: if url.startswith("http"): - validate_url(url) + _validate_url(url) except ValidationError: return None return attrs -bleach_linker = bleach.Linker( +_bleach_linker = bleach.Linker( callbacks=[check_url_validity], - url_re=bleach.linkifier.build_url_re(tlds=tlds_sorted, protocols=protocols), + url_re=bleach.linkifier.build_url_re(tlds=tlds_sorted, protocols=linkable_protocols), email_re=bleach.linkifier.build_email_re(tlds=tlds_sorted), # type: ignore parse_email=True, ) +def linkify(text): + return _bleach_linker.linkify(text) + + @keep_lazy(str) def xslugify(value): """ diff --git a/requirements.txt b/requirements.txt index ae8b06fae..073a6bfa0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -43,8 +43,7 @@ jsonfield>=3.1.0 # for SubmissionCheck. This is https://github.com/bradjaspe jsonschema[format]>=4.2.1 jwcrypto>=1.2 # for signed notifications - this is aspirational, and is not really used. logging_tree>=1.9 # Used only by the showloggers management command -lxml>=5.3.0 # lxml[html_clean] fails on some architectures -lxml_html_clean>=0.4.1 +lxml>=5.3.0 markdown>=3.3.6 types-markdown>=3.3.6 mock>=4.0.3 # Used only by tests, of course